fastembed 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -0
- data/.yardopts +6 -0
- data/BENCHMARKS.md +124 -1
- data/CHANGELOG.md +14 -0
- data/README.md +395 -74
- data/benchmark/compare_all.rb +167 -0
- data/benchmark/compare_python.py +60 -0
- data/benchmark/memory_profile.rb +70 -0
- data/benchmark/profile.rb +198 -0
- data/benchmark/reranker_benchmark.rb +158 -0
- data/exe/fastembed +6 -0
- data/fastembed.gemspec +3 -0
- data/lib/fastembed/async.rb +193 -0
- data/lib/fastembed/base_model.rb +247 -0
- data/lib/fastembed/base_model_info.rb +61 -0
- data/lib/fastembed/cli.rb +745 -0
- data/lib/fastembed/custom_model_registry.rb +255 -0
- data/lib/fastembed/image_embedding.rb +313 -0
- data/lib/fastembed/late_interaction_embedding.rb +260 -0
- data/lib/fastembed/late_interaction_model_info.rb +91 -0
- data/lib/fastembed/model_info.rb +59 -19
- data/lib/fastembed/model_management.rb +82 -23
- data/lib/fastembed/onnx_embedding_model.rb +25 -4
- data/lib/fastembed/pooling.rb +39 -3
- data/lib/fastembed/progress.rb +52 -0
- data/lib/fastembed/quantization.rb +75 -0
- data/lib/fastembed/reranker_model_info.rb +91 -0
- data/lib/fastembed/sparse_embedding.rb +261 -0
- data/lib/fastembed/sparse_model_info.rb +80 -0
- data/lib/fastembed/text_cross_encoder.rb +217 -0
- data/lib/fastembed/text_embedding.rb +161 -28
- data/lib/fastembed/validators.rb +59 -0
- data/lib/fastembed/version.rb +1 -1
- data/lib/fastembed.rb +42 -1
- data/plan.md +257 -0
- data/scripts/verify_models.rb +229 -0
- metadata +70 -3
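
The bulk of the release is new surface area: reranking (text_cross_encoder.rb), sparse, late-interaction, and image embeddings, async helpers, and a CLI. As a quick orientation, here is a minimal sketch of the two call patterns the added benchmark scripts below actually exercise; the constructor and method names are taken from those scripts, while the sample strings are illustrative:

    require 'fastembed'

    # Dense text embedding, as driven by benchmark/profile.rb
    embedding = Fastembed::TextEmbedding.new(model_name: 'BAAI/bge-small-en-v1.5',
                                             show_progress: false)
    vectors = embedding.embed(['a short document'], batch_size: 64).to_a
    puts vectors.first.length # embedding dimension

    # New in 1.1.0: cross-encoder reranking, as driven by benchmark/reranker_benchmark.rb
    reranker = Fastembed::TextCrossEncoder.new
    hits = reranker.rerank_with_scores(query: 'What is machine learning?',
                                       documents: ['ML learns from data.', 'Rails is a web framework.'],
                                       top_k: 2)
    hits.each { |hit| puts "#{hit[:score].round(3)} #{hit[:document]}" }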
data/benchmark/compare_all.rb
ADDED
@@ -0,0 +1,167 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Unified benchmark comparing Ruby fastembed with Python fastembed
+# Runs both implementations and reports side-by-side results
+
+require 'bundler/setup'
+require 'fastembed'
+require 'benchmark'
+require 'json'
+require 'open3'
+
+TEXTS = [
+  'Machine learning is a subset of artificial intelligence that enables systems to learn from data.',
+  'Ruby on Rails is a server-side web application framework written in Ruby under the MIT License.',
+  'Vector databases store embeddings and enable fast similarity search across millions of documents.',
+  'Natural language processing helps computers understand, interpret, and generate human language.',
+  'The quick brown fox jumps over the lazy dog. This is a classic pangram used in typing tests.'
+].freeze
+
+def run_ruby_benchmark
+  puts 'Running Ruby benchmark...'
+  results = {}
+
+  # Model loading
+  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+  embedding = Fastembed::TextEmbedding.new(model_name: 'BAAI/bge-small-en-v1.5', show_progress: false)
+  results[:load_time] = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start) * 1000).round(1)
+
+  # Warmup
+  embedding.embed(['warmup']).to_a
+
+  # Single document latency
+  times = []
+  10.times do
+    start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+    embedding.embed([TEXTS.first]).to_a
+    times << (Process.clock_gettime(Process::CLOCK_MONOTONIC) - start) * 1000
+  end
+  results[:single_latency] = times.min.round(2)
+
+  # Throughput tests
+  [100, 500, 1000].each do |count|
+    texts = TEXTS.cycle.take(count)
+    start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+    embedding.embed(texts, batch_size: 64).to_a
+    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
+    results[:"throughput_#{count}"] = (count / elapsed).round(1)
+  end
+
+  results
+end
+
+def run_python_benchmark
+  puts 'Running Python benchmark...'
+
+  python_script = <<~PYTHON
+    import json
+    import time
+    from fastembed import TextEmbedding
+
+    TEXTS = [
+        "Machine learning is a subset of artificial intelligence that enables systems to learn from data.",
+        "Ruby on Rails is a server-side web application framework written in Ruby under the MIT License.",
+        "Vector databases store embeddings and enable fast similarity search across millions of documents.",
+        "Natural language processing helps computers understand, interpret, and generate human language.",
+        "The quick brown fox jumps over the lazy dog. This is a classic pangram used in typing tests."
+    ]
+
+    results = {}
+
+    # Model loading
+    start = time.time()
+    embedding = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
+    results["load_time"] = round((time.time() - start) * 1000, 1)
+
+    # Warmup
+    list(embedding.embed(["warmup"]))
+
+    # Single document latency
+    times = []
+    for _ in range(10):
+        start = time.time()
+        list(embedding.embed([TEXTS[0]]))
+        times.append((time.time() - start) * 1000)
+    results["single_latency"] = round(min(times), 2)
+
+    # Throughput tests
+    for count in [100, 500, 1000]:
+        texts = (TEXTS * (count // len(TEXTS) + 1))[:count]
+        start = time.time()
+        list(embedding.embed(texts, batch_size=64))
+        elapsed = time.time() - start
+        results[f"throughput_{count}"] = round(count / elapsed, 1)
+
+    print(json.dumps(results))
+  PYTHON
+
+  stdout, status = Open3.capture2('python3', '-c', python_script)
+
+  if status.success?
+    JSON.parse(stdout)
+  else
+    puts 'Warning: Python benchmark failed. Is fastembed installed? (pip install fastembed)'
+    nil
+  end
+rescue Errno::ENOENT
+  puts 'Warning: Python not found'
+  nil
+end
+
+def print_comparison(ruby_results, python_results)
+  puts
+  puts '=' * 70
+  puts 'RUBY vs PYTHON FASTEMBED COMPARISON'
+  puts '=' * 70
+  puts
+
+  metrics = [
+    [:load_time, 'Model load time', 'ms', :lower_better],
+    [:single_latency, 'Single doc latency', 'ms', :lower_better],
+    [:throughput_100, '100 docs throughput', 'docs/sec', :higher_better],
+    [:throughput_500, '500 docs throughput', 'docs/sec', :higher_better],
+    [:throughput_1000, '1000 docs throughput', 'docs/sec', :higher_better]
+  ]
+
+  puts format('%-25s %15s %15s %10s', 'Metric', 'Ruby', 'Python', 'Winner')
+  puts '-' * 70
+
+  metrics.each do |key, label, unit, direction|
+    ruby_val = ruby_results[key]
+    python_val = python_results&.fetch(key.to_s, nil)
+
+    if python_val
+      if direction == :lower_better
+        winner = ruby_val < python_val ? 'Ruby' : 'Python'
+        ratio = python_val / ruby_val
+      else
+        winner = ruby_val > python_val ? 'Ruby' : 'Python'
+        ratio = ruby_val / python_val
+      end
+      ratio_str = winner == 'Ruby' ? "(#{ratio.round(1)}x)" : ''
+      winner_str = "#{winner} #{ratio_str}"
+    else
+      winner_str = 'N/A'
+    end
+
+    ruby_str = "#{ruby_val} #{unit}"
+    python_str = python_val ? "#{python_val} #{unit}" : 'N/A'
+
+    puts format('%-25s %15s %15s %10s', label, ruby_str, python_str, winner_str)
+  end
+
+  puts
+end
+
+# Run benchmarks
+ruby_results = run_ruby_benchmark
+python_results = run_python_benchmark
+
+print_comparison(ruby_results, python_results)
+
+puts 'Summary:'
+puts '- Both use the same ONNX Runtime and HuggingFace Tokenizers'
+puts '- Performance differences come from language overhead and batching'
+puts '- Ruby tends to win on latency, Python on large batch throughput'
+puts
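
Worth noting in compare_all.rb: the Python side is not a separate invocation of compare_python.py but an inline heredoc handed to `python3 -c`, with results exchanged as a single JSON line on stdout. A stripped-down sketch of that pattern (the Python body here is a placeholder):

    require 'json'
    require 'open3'

    # Hand an inline Python program to `python3 -c` and read one JSON line back.
    script = <<~PYTHON
      import json
      print(json.dumps({"ok": True}))
    PYTHON

    stdout, status = Open3.capture2('python3', '-c', script)
    puts JSON.parse(stdout) if status.success?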
data/benchmark/compare_python.py
ADDED
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+"""Compare Python FastEmbed performance with Ruby FastEmbed."""
+
+import time
+from fastembed import TextEmbedding
+
+TEXTS = [
+    "Machine learning is a subset of artificial intelligence that enables systems to learn from data.",
+    "Ruby on Rails is a server-side web application framework written in Ruby under the MIT License.",
+    "Vector databases store embeddings and enable fast similarity search across millions of documents.",
+    "Natural language processing helps computers understand, interpret, and generate human language.",
+    "The quick brown fox jumps over the lazy dog. This is a classic pangram used in typing tests."
+]
+
+def benchmark_python():
+    print("=" * 60)
+    print("PYTHON FASTEMBED BENCHMARK")
+    print("=" * 60)
+    print()
+
+    # Model loading time
+    start = time.time()
+    embedding = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
+    load_time = time.time() - start
+    print(f"Model load time: {load_time * 1000:.1f}ms")
+
+    # Warmup
+    list(embedding.embed(["warmup"]))
+
+    # Single document latency
+    print()
+    print("Single document latency:")
+    for i, text in enumerate(TEXTS[:3]):
+        times = []
+        for _ in range(10):
+            start = time.time()
+            list(embedding.embed([text]))
+            times.append(time.time() - start)
+        avg = sum(times) / len(times)
+        min_time = min(times)
+        print(f"  Text {i+1} ({len(text)} chars): avg {avg*1000:.2f}ms, min {min_time*1000:.2f}ms")
+
+    # Throughput
+    print()
+    print("Throughput:")
+    for count in [100, 500, 1000]:
+        texts = (TEXTS * (count // len(TEXTS) + 1))[:count]
+
+        start = time.time()
+        list(embedding.embed(texts, batch_size=64))
+        elapsed = time.time() - start
+
+        rate = count / elapsed
+        print(f"  {count} docs: {rate:.1f} docs/sec ({elapsed*1000:.1f}ms)")
+
+    print()
+    print("=" * 60)
+
+if __name__ == "__main__":
+    benchmark_python()
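
A small equivalence between the two corpus-repetition idioms: Ruby's `TEXTS.cycle.take(count)` and Python's `(TEXTS * (count // len(TEXTS) + 1))[:count]` build the same list. A quick check in Ruby:

    texts = %w[a b c]
    count = 7
    # Ruby's lazy-cycle idiom...
    ruby_way = texts.cycle.take(count)
    # ...matches Python's repeat-and-slice, transliterated here
    python_way = (texts * (count / texts.size + 1)).first(count)
    raise 'mismatch' unless ruby_way == python_way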
data/benchmark/memory_profile.rb
ADDED
@@ -0,0 +1,70 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'bundler/setup'
+require 'fastembed'
+
+def memory_mb
+  `ps -o rss= -p #{Process.pid}`.to_i / 1024.0
+end
+
+def print_memory(label)
+  puts "#{label}: #{memory_mb.round(1)} MB"
+end
+
+puts '=' * 60
+puts 'MEMORY PROFILING'
+puts '=' * 60
+puts
+
+print_memory('Initial')
+
+# Load model
+embedding = Fastembed::TextEmbedding.new
+print_memory('After model load')
+
+# Generate sample texts
+texts = Array.new(1000) { |i| "This is document number #{i} with some content for embedding." }
+
+# Process in batches
+GC.start
+print_memory('Before embedding 1000 docs')
+
+vectors = embedding.embed(texts, batch_size: 64).to_a
+print_memory('After embedding 1000 docs (holding results)')
+
+# Clear vectors
+vectors = nil
+GC.start
+sleep 0.1
+print_memory('After clearing vectors + GC')
+
+# Test lazy evaluation memory efficiency
+puts
+puts 'Testing lazy evaluation memory efficiency...'
+print_memory('Before lazy processing')
+
+count = 0
+embedding.embed(texts, batch_size: 64).each do |_vec|
+  count += 1
+  # Don't store vectors, just count them
+end
+puts "Processed #{count} vectors without storing"
+
+GC.start
+sleep 0.1
+print_memory('After lazy processing + GC')
+
+# Stress test - multiple rounds
+puts
+puts 'Stress test - 5 rounds of 1000 docs each...'
+5.times do |round|
+  embedding.embed(texts, batch_size: 64).to_a
+  GC.start
+  print_memory("After round #{round + 1}")
+end
+
+puts
+puts '=' * 60
+puts 'MEMORY PROFILE COMPLETE'
+puts '=' * 60
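
The "lazy evaluation" sections above depend on `embed` returning an enumerator that yields vectors batch by batch, so nothing forces all 1000 results into memory at once unless the caller ends with `.to_a`. A minimal consuming sketch under that assumption:

    require 'fastembed'

    texts = Array.new(1_000) { |i| "Document #{i}" }
    embedding = Fastembed::TextEmbedding.new

    # Consume vectors one at a time; nothing outside the block retains them,
    # so peak memory stays near a single 64-document batch.
    dims = nil
    embedding.embed(texts, batch_size: 64).each { |vec| dims ||= vec.length }
    puts "Embedding dimension: #{dims}"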
data/benchmark/profile.rb
ADDED
@@ -0,0 +1,198 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'bundler/setup'
+require 'fastembed'
+require 'benchmark'
+
+# Sample texts of varying lengths
+SHORT_TEXTS = [
+  'Hello world',
+  'Ruby is great',
+  'Machine learning',
+  'Vector embeddings',
+  'Semantic search'
+].freeze
+
+MEDIUM_TEXTS = [
+  'The quick brown fox jumps over the lazy dog. This is a classic pangram used in typing tests.',
+  'Machine learning is a subset of artificial intelligence that enables systems to learn from data.',
+  'Ruby on Rails is a server-side web application framework written in Ruby under the MIT License.',
+  'Vector databases store embeddings and enable fast similarity search across millions of documents.',
+  'Natural language processing helps computers understand, interpret, and generate human language.'
+].freeze
+
+LONG_TEXTS = [
+  'Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. AI research has been defined as the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals. The term artificial intelligence had previously been used to describe machines that mimic and display human cognitive skills that are associated with the human mind, such as learning and problem-solving.',
+  'Ruby is an interpreted, high-level, general-purpose programming language which supports multiple programming paradigms. It was designed with an emphasis on programming productivity and simplicity. In Ruby, everything is an object, including primitive data types. It was developed in the mid-1990s by Yukihiro Matsumoto in Japan. Ruby is dynamically typed and uses garbage collection and just-in-time compilation.',
+  'Text embeddings are dense vector representations of text that capture semantic meaning. They are produced by machine learning models trained on large corpora of text data. These embeddings enable semantic similarity calculations, clustering, and information retrieval tasks. Modern embedding models like BERT, Sentence Transformers, and OpenAI embeddings have revolutionized natural language processing applications.',
+  'Vector databases are specialized database systems designed to store and query high-dimensional vector data efficiently. They use approximate nearest neighbor algorithms like HNSW, IVF, and PQ to enable fast similarity search at scale. Popular vector databases include Pinecone, Weaviate, Qdrant, Milvus, and pgvector. They are essential infrastructure for semantic search, recommendation systems, and RAG applications.',
+  'The Transformer architecture, introduced in the paper "Attention Is All You Need", revolutionized natural language processing. It relies entirely on self-attention mechanisms, dispensing with recurrence and convolutions. This enables much more parallelization and has led to significant improvements in translation quality. Transformers are the foundation of modern language models like GPT, BERT, and T5.'
+].freeze
+
+def print_separator
+  puts '-' * 70
+end
+
+def format_rate(count, time)
+  rate = count / time
+  "#{rate.round(1)} docs/sec"
+end
+
+def profile_batch(embedding, texts, batch_size, iterations = 3)
+  times = []
+  iterations.times do
+    GC.start
+    time = Benchmark.realtime do
+      embedding.embed(texts, batch_size: batch_size).to_a
+    end
+    times << time
+  end
+  times.min # Return best time
+end
+
+puts '=' * 70
+puts 'FASTEMBED-RB PERFORMANCE PROFILE'
+puts '=' * 70
+puts
+puts "Ruby version: #{RUBY_VERSION}"
+puts "Platform: #{RUBY_PLATFORM}"
+puts "Fastembed version: #{Fastembed::VERSION}"
+puts
+
+# Model loading benchmark
+print_separator
+puts 'MODEL LOADING TIME'
+print_separator
+
+models = [
+  'BAAI/bge-small-en-v1.5',
+  'sentence-transformers/all-MiniLM-L6-v2'
+]
+
+models.each do |model_name|
+  # Ensure model is downloaded first
+  Fastembed::TextEmbedding.new(model_name: model_name)
+  GC.start
+
+  time = Benchmark.realtime do
+    Fastembed::TextEmbedding.new(model_name: model_name)
+  end
+  puts "#{model_name}: #{(time * 1000).round(1)}ms"
+end
+
+puts
+
+# Single document latency
+print_separator
+puts 'SINGLE DOCUMENT LATENCY (lower is better)'
+print_separator
+
+embedding = Fastembed::TextEmbedding.new
+warmup = embedding.embed(['warmup']).to_a # Warm up
+
+[SHORT_TEXTS.first, MEDIUM_TEXTS.first, LONG_TEXTS.first].each_with_index do |text, i|
+  label = %w[Short Medium Long][i]
+  times = []
+  10.times do
+    time = Benchmark.realtime { embedding.embed([text]).to_a }
+    times << time
+  end
+  avg = times.sum / times.length
+  min = times.min
+  puts "#{label} text (#{text.length} chars): avg #{(avg * 1000).round(2)}ms, min #{(min * 1000).round(2)}ms"
+end
+
+puts
+
+# Throughput benchmarks
+print_separator
+puts 'THROUGHPUT (higher is better)'
+print_separator
+
+[10, 100, 500, 1000].each do |count|
+  texts = MEDIUM_TEXTS.cycle.take(count)
+
+  [32, 64, 128, 256].each do |batch_size|
+    next if batch_size > count
+
+    time = profile_batch(embedding, texts, batch_size)
+    rate = format_rate(count, time)
+    puts "#{count} docs, batch #{batch_size}: #{rate} (#{(time * 1000).round(1)}ms total)"
+  end
+  puts
+end
+
+# Memory efficiency test
+print_separator
+puts 'LAZY EVALUATION TEST'
+print_separator
+
+texts = MEDIUM_TEXTS.cycle.take(1000)
+processed = 0
+
+time = Benchmark.realtime do
+  embedding.embed(texts, batch_size: 64).each do |_vec|
+    processed += 1
+    break if processed >= 100 # Only process first 100
+  end
+end
+
+puts "Processed #{processed}/1000 documents in #{(time * 1000).round(1)}ms"
+puts '(Lazy evaluation means we only computed embeddings for documents we needed)'
+
+puts
+
+# Embedding quality sanity check
+print_separator
+puts 'EMBEDDING QUALITY SANITY CHECK'
+print_separator
+
+test_pairs = [
+  ['dog', 'puppy', 'high'],
+  ['dog', 'cat', 'medium'],
+  ['dog', 'airplane', 'low'],
+  ['machine learning', 'artificial intelligence', 'high'],
+  ['machine learning', 'cooking recipes', 'low']
+]
+
+def cosine_similarity(a, b)
+  a.zip(b).sum { |x, y| x * y }
+end
+
+test_pairs.each do |text1, text2, expected|
+  vecs = embedding.embed([text1, text2]).to_a
+  sim = cosine_similarity(vecs[0], vecs[1])
+  status = case expected
+           when 'high' then sim > 0.7 ? 'PASS' : 'FAIL'
+           when 'medium' then sim > 0.4 && sim < 0.8 ? 'PASS' : 'FAIL'
+           when 'low' then sim < 0.5 ? 'PASS' : 'FAIL'
+           end
+  puts "#{status}: '#{text1}' vs '#{text2}' = #{sim.round(3)} (expected #{expected})"
+end
+
+puts
+
+# Compare with batch sizes
+print_separator
+puts 'OPTIMAL BATCH SIZE ANALYSIS'
+print_separator
+
+texts = MEDIUM_TEXTS.cycle.take(500)
+results = {}
+
+[1, 8, 16, 32, 64, 128, 256, 512].each do |batch_size|
+  time = profile_batch(embedding, texts, batch_size, 2)
+  rate = 500.0 / time
+  results[batch_size] = rate
+  puts "Batch #{batch_size.to_s.rjust(3)}: #{rate.round(1)} docs/sec"
+end
+
+optimal = results.max_by { |_, v| v }
+puts
+puts "Optimal batch size: #{optimal[0]} (#{optimal[1].round(1)} docs/sec)"
+
+puts
+puts '=' * 70
+puts 'PROFILE COMPLETE'
+puts '=' * 70
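
One caveat in the sanity check: `cosine_similarity` above is a plain dot product, which equals cosine similarity only when the model's vectors come back L2-normalized (the PASS/FAIL thresholds assume they do). A general version that drops that assumption:

    # Full cosine similarity; reduces to the script's dot product when
    # both inputs are already unit-length.
    def cosine_similarity(a, b)
      dot = a.zip(b).sum { |x, y| x * y }
      dot / (Math.sqrt(a.sum { |x| x * x }) * Math.sqrt(b.sum { |x| x * x }))
    end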
data/benchmark/reranker_benchmark.rb
ADDED
@@ -0,0 +1,158 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Benchmark script for TextCrossEncoder (reranker) performance
+
+require 'bundler/setup'
+require 'fastembed'
+require 'benchmark'
+
+QUERY = 'What is machine learning?'
+
+DOCUMENTS = [
+  'Machine learning is a subset of artificial intelligence that enables systems to learn from data.',
+  'Ruby on Rails is a server-side web application framework written in Ruby under the MIT License.',
+  'Deep learning uses neural networks with many layers to model complex patterns in data.',
+  'Vector databases store embeddings and enable fast similarity search across millions of documents.',
+  'Supervised learning requires labeled training data to learn the mapping from inputs to outputs.',
+  'Natural language processing helps computers understand, interpret, and generate human language.',
+  'Random forests are ensemble learning methods that construct multiple decision trees.',
+  'The quick brown fox jumps over the lazy dog. This is a classic pangram used in typing tests.',
+  'Gradient descent is an optimization algorithm used to minimize the loss function in ML models.',
+  'Transformers use self-attention mechanisms to process sequential data in parallel.'
+].freeze
+
+def print_separator
+  puts '-' * 70
+end
+
+puts '=' * 70
+puts 'RERANKER (CROSS-ENCODER) PERFORMANCE BENCHMARK'
+puts '=' * 70
+puts
+puts "Ruby version: #{RUBY_VERSION}"
+puts "Fastembed version: #{Fastembed::VERSION}"
+puts
+
+# Model loading benchmark
+print_separator
+puts 'MODEL LOADING TIME'
+print_separator
+
+Fastembed::SUPPORTED_RERANKER_MODELS.each_key do |model_name|
+  # Ensure model is downloaded first
+  begin
+    Fastembed::TextCrossEncoder.new(model_name: model_name)
+  rescue StandardError
+    puts "#{model_name}: (skipped - not available)"
+    next
+  end
+  GC.start
+
+  time = Benchmark.realtime do
+    Fastembed::TextCrossEncoder.new(model_name: model_name)
+  end
+  puts "#{model_name}: #{(time * 1000).round(1)}ms"
+end
+
+puts
+
+# Use default model for latency tests
+reranker = Fastembed::TextCrossEncoder.new
+
+# Single query latency
+print_separator
+puts 'SINGLE QUERY LATENCY (reranking against 10 documents)'
+print_separator
+
+times = []
+20.times do
+  GC.start
+  time = Benchmark.realtime do
+    reranker.rerank(query: QUERY, documents: DOCUMENTS)
+  end
+  times << time
+end
+
+avg = times.sum / times.length
+min = times.min
+max = times.max
+puts "Average: #{(avg * 1000).round(2)}ms"
+puts "Min: #{(min * 1000).round(2)}ms"
+puts "Max: #{(max * 1000).round(2)}ms"
+puts
+
+# Throughput with varying document counts
+print_separator
+puts 'THROUGHPUT VS DOCUMENT COUNT'
+print_separator
+
+[10, 50, 100, 200].each do |doc_count|
+  docs = DOCUMENTS.cycle.take(doc_count)
+
+  times = []
+  3.times do
+    GC.start
+    time = Benchmark.realtime do
+      reranker.rerank(query: QUERY, documents: docs, batch_size: 64)
+    end
+    times << time
+  end
+
+  min_time = times.min
+  rate = doc_count / min_time
+  puts "#{doc_count} documents: #{rate.round(1)} docs/sec (#{(min_time * 1000).round(1)}ms)"
+end
+
+puts
+
+# Batch size optimization
+print_separator
+puts 'BATCH SIZE OPTIMIZATION (100 documents)'
+print_separator
+
+docs = DOCUMENTS.cycle.take(100)
+results = {}
+
+[8, 16, 32, 64, 128].each do |batch_size|
+  times = []
+  3.times do
+    GC.start
+    time = Benchmark.realtime do
+      reranker.rerank(query: QUERY, documents: docs, batch_size: batch_size)
+    end
+    times << time
+  end
+
+  min_time = times.min
+  rate = 100.0 / min_time
+  results[batch_size] = rate
+  puts "Batch #{batch_size.to_s.rjust(3)}: #{rate.round(1)} docs/sec"
+end
+
+optimal = results.max_by { |_, v| v }
+puts
+puts "Optimal batch size: #{optimal[0]} (#{optimal[1].round(1)} docs/sec)"
+
+puts
+
+# Quality check
+print_separator
+puts 'RERANKING QUALITY CHECK'
+print_separator
+
+results = reranker.rerank_with_scores(query: QUERY, documents: DOCUMENTS, top_k: 5)
+
+puts "Query: '#{QUERY}'"
+puts
+puts 'Top 5 results:'
+results.each_with_index do |result, i|
+  score = result[:score]
+  doc = result[:document][0, 60]
+  puts "#{i + 1}. (#{score.round(3)}) #{doc}..."
+end
+
+puts
+puts '=' * 70
+puts 'BENCHMARK COMPLETE'
+puts '=' * 70
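
All three Ruby benchmark scripts share the same measurement discipline: force a GC before each run, repeat a few times, and keep the minimum, which filters out GC pauses and cold-cache outliers. The pattern in isolation, with a stand-in workload:

    require 'benchmark'

    # Best-of-N timing: GC before each run, keep the minimum so GC pauses
    # and cold-cache runs don't distort the result.
    def best_of(iterations = 3)
      Array.new(iterations) do
        GC.start
        Benchmark.realtime { yield }
      end.min
    end

    t = best_of { 100_000.times { Math.sqrt(2) } } # stand-in workload
    puts "#{(t * 1000).round(1)}ms"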
data/exe/fastembed
ADDED
data/fastembed.gemspec
CHANGED
@@ -32,8 +32,11 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'onnxruntime', '~> 0.9'
   spec.add_dependency 'tokenizers', '~> 0.5'
 
+  spec.add_development_dependency 'mini_magick', '~> 4.0'
   spec.add_development_dependency 'rake', '~> 13.0'
   spec.add_development_dependency 'rspec', '~> 3.0'
   spec.add_development_dependency 'rubocop', '~> 1.0'
   spec.add_development_dependency 'rubocop-rspec', '~> 3.0'
+  spec.add_development_dependency 'webmock', '~> 3.0'
+  spec.add_development_dependency 'yard', '~> 0.9'
 end