fastembed 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.mise.toml +2 -0
- data/.rspec +3 -0
- data/.rubocop.yml +73 -0
- data/BENCHMARKS.md +83 -0
- data/CHANGELOG.md +31 -0
- data/Gemfile +8 -0
- data/LICENSE +21 -0
- data/README.md +163 -0
- data/Rakefile +8 -0
- data/fastembed.gemspec +39 -0
- data/lib/fastembed/model_info.rb +151 -0
- data/lib/fastembed/model_management.rb +154 -0
- data/lib/fastembed/onnx_embedding_model.rb +118 -0
- data/lib/fastembed/pooling.rb +71 -0
- data/lib/fastembed/text_embedding.rb +118 -0
- data/lib/fastembed/version.rb +5 -0
- data/lib/fastembed.rb +14 -0
- metadata +148 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 1bc6e2a53d0c7a8c4679f7af6a23af74b903b0e227c401307e3d936d74719291
|
|
4
|
+
data.tar.gz: 5664c43f8d7f0632719324b42805abe3806274c18113232f82bbddce918647e5
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: f7f821a8a8ee49fdbbde65eaf394d8b406b1aecfc53fa572ecbbf3ccaf9fd120771fa5e5d444d339aa9ec5b729c63f283e564d05ea1ce1db1427580ced41cc0d
|
|
7
|
+
data.tar.gz: a99fda1200c29abf2ea7bed10ab5e7eee30fc5238e5d13b5608f7c41014e98e1e880c67ea1da1e56eef10864697ebca11ed7a7bf6463fb437f4dc4fe80267bef
|
data/.mise.toml
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
plugins:
|
|
2
|
+
- rubocop-rspec
|
|
3
|
+
|
|
4
|
+
AllCops:
|
|
5
|
+
TargetRubyVersion: 3.3
|
|
6
|
+
NewCops: enable
|
|
7
|
+
SuggestExtensions: false
|
|
8
|
+
Exclude:
|
|
9
|
+
- 'bin/*'
|
|
10
|
+
- 'vendor/**/*'
|
|
11
|
+
|
|
12
|
+
Style/Documentation:
|
|
13
|
+
Enabled: false
|
|
14
|
+
|
|
15
|
+
Style/FrozenStringLiteralComment:
|
|
16
|
+
Enabled: true
|
|
17
|
+
EnforcedStyle: always
|
|
18
|
+
|
|
19
|
+
Metrics/MethodLength:
|
|
20
|
+
Max: 25
|
|
21
|
+
|
|
22
|
+
Metrics/AbcSize:
|
|
23
|
+
Max: 30
|
|
24
|
+
|
|
25
|
+
Metrics/CyclomaticComplexity:
|
|
26
|
+
Max: 10
|
|
27
|
+
|
|
28
|
+
Metrics/ParameterLists:
|
|
29
|
+
Max: 9
|
|
30
|
+
|
|
31
|
+
Naming/MethodParameterName:
|
|
32
|
+
AllowedNames:
|
|
33
|
+
- a
|
|
34
|
+
- b
|
|
35
|
+
- x
|
|
36
|
+
- y
|
|
37
|
+
|
|
38
|
+
Lint/EmptyBlock:
|
|
39
|
+
Exclude:
|
|
40
|
+
- 'lib/fastembed/text_embedding.rb'
|
|
41
|
+
|
|
42
|
+
RSpec/MultipleDescribes:
|
|
43
|
+
Enabled: false
|
|
44
|
+
|
|
45
|
+
Gemspec/DevelopmentDependencies:
|
|
46
|
+
Enabled: false
|
|
47
|
+
|
|
48
|
+
Metrics/BlockLength:
|
|
49
|
+
Exclude:
|
|
50
|
+
- 'spec/**/*'
|
|
51
|
+
- '*.gemspec'
|
|
52
|
+
|
|
53
|
+
Metrics/ClassLength:
|
|
54
|
+
Max: 150
|
|
55
|
+
|
|
56
|
+
Metrics/ModuleLength:
|
|
57
|
+
Max: 150
|
|
58
|
+
|
|
59
|
+
Layout/LineLength:
|
|
60
|
+
Max: 120
|
|
61
|
+
|
|
62
|
+
RSpec/MultipleExpectations:
|
|
63
|
+
Max: 5
|
|
64
|
+
|
|
65
|
+
RSpec/ExampleLength:
|
|
66
|
+
Max: 15
|
|
67
|
+
|
|
68
|
+
RSpec/NestedGroups:
|
|
69
|
+
Max: 4
|
|
70
|
+
|
|
71
|
+
RSpec/DescribeClass:
|
|
72
|
+
Exclude:
|
|
73
|
+
- 'spec/integration_spec.rb'
|
data/BENCHMARKS.md
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Benchmarks
|
|
2
|
+
|
|
3
|
+
Performance benchmarks on Apple M1 Max, Ruby 3.3.10.
|
|
4
|
+
|
|
5
|
+
## Single Document Latency
|
|
6
|
+
|
|
7
|
+
Using the default model (BAAI/bge-small-en-v1.5):
|
|
8
|
+
|
|
9
|
+
| Text Length | Latency |
|
|
10
|
+
|-------------|---------|
|
|
11
|
+
| Short (~10 tokens) | ~6.5 ms |
|
|
12
|
+
| Medium (~30 tokens) | ~6.5 ms |
|
|
13
|
+
|
|
14
|
+
## Batch Throughput
|
|
15
|
+
|
|
16
|
+
### bge-small-en-v1.5 (default)
|
|
17
|
+
|
|
18
|
+
| Text Length | 100 docs | Throughput |
|
|
19
|
+
|-------------|----------|------------|
|
|
20
|
+
| Short sentences | 0.2s | **502 docs/sec** |
|
|
21
|
+
| Medium paragraphs | 0.5s | **197 docs/sec** |
|
|
22
|
+
| Long documents | 2.3s | **44 docs/sec** |
|
|
23
|
+
|
|
24
|
+
### Large Scale (1000 documents)
|
|
25
|
+
|
|
26
|
+
| Model | Time | Throughput |
|
|
27
|
+
|-------|------|------------|
|
|
28
|
+
| bge-small-en-v1.5 | 2.0s | **509 docs/sec** |
|
|
29
|
+
|
|
30
|
+
## Model Comparison
|
|
31
|
+
|
|
32
|
+
| Model | Dimensions | Size | Throughput |
|
|
33
|
+
|-------|-----------|------|------------|
|
|
34
|
+
| bge-small-en-v1.5 | 384 | 67 MB | **530 docs/sec** |
|
|
35
|
+
| bge-base-en-v1.5 | 768 | 210 MB | **169 docs/sec** |
|
|
36
|
+
| bge-large-en-v1.5 | 1024 | 1.2 GB | **50 docs/sec** |
|
|
37
|
+
|
|
38
|
+
## CPU vs CoreML (Apple Silicon)
|
|
39
|
+
|
|
40
|
+
We tested CoreML execution provider to see if GPU/Neural Engine acceleration helps.
|
|
41
|
+
|
|
42
|
+
### Results by Model
|
|
43
|
+
|
|
44
|
+
| Model | CPU | CoreML (best batch) | Ratio |
|
|
45
|
+
|-------|-----|---------------------|-------|
|
|
46
|
+
| bge-small | 418/s | 162/s (batch=64) | CPU 2.6x faster |
|
|
47
|
+
| bge-base | 134/s | 64/s (batch=32) | CPU 2.1x faster |
|
|
48
|
+
| bge-large | 41/s | 23/s (batch=16) | CPU 1.8x faster |
|
|
49
|
+
|
|
50
|
+
### Batch Size Impact on CoreML
|
|
51
|
+
|
|
52
|
+
| Batch Size | CPU (bge-small) | CoreML | Ratio |
|
|
53
|
+
|------------|-----------------|--------|-------|
|
|
54
|
+
| 1 | 209/s | 40/s | 0.19x |
|
|
55
|
+
| 8 | 351/s | 118/s | 0.34x |
|
|
56
|
+
| 16 | 382/s | 146/s | 0.38x |
|
|
57
|
+
| 32 | 410/s | 155/s | 0.38x |
|
|
58
|
+
| 64 | 418/s | 162/s | 0.39x |
|
|
59
|
+
| 128 | 391/s | 139/s | 0.35x |
|
|
60
|
+
| 256 | 412/s | 121/s | 0.29x |
|
|
61
|
+
|
|
62
|
+
### Conclusion
|
|
63
|
+
|
|
64
|
+
**CPU is faster than CoreML** for all embedding models on Apple Silicon. The gap narrows for larger models, but CPU still wins. This is because:
|
|
65
|
+
|
|
66
|
+
1. ONNX Runtime's CPU implementation is highly optimized for M1/M2
|
|
67
|
+
2. Data transfer overhead to Neural Engine outweighs compute benefits
|
|
68
|
+
3. Embedding models are relatively small compared to LLMs
|
|
69
|
+
|
|
70
|
+
**Recommendation:** Stick with the default CPU provider.
|
|
71
|
+
|
|
72
|
+
## Running Your Own Benchmarks
|
|
73
|
+
|
|
74
|
+
```ruby
|
|
75
|
+
require 'fastembed'
|
|
76
|
+
require 'benchmark'
|
|
77
|
+
|
|
78
|
+
embedding = Fastembed::TextEmbedding.new
|
|
79
|
+
texts = Array.new(1000) { "Sample text for benchmarking" }
|
|
80
|
+
|
|
81
|
+
result = Benchmark.measure { embedding.embed(texts).to_a }
|
|
82
|
+
puts "#{1000 / result.real} docs/sec"
|
|
83
|
+
```
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [1.0.0] - 2025-01-08
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Initial release
|
|
15
|
+
- `TextEmbedding` class for generating text embeddings
|
|
16
|
+
- Automatic model downloading and caching from HuggingFace
|
|
17
|
+
- Support for multiple embedding models:
|
|
18
|
+
- BAAI/bge-small-en-v1.5 (default)
|
|
19
|
+
- BAAI/bge-base-en-v1.5
|
|
20
|
+
- BAAI/bge-large-en-v1.5
|
|
21
|
+
- sentence-transformers/all-MiniLM-L6-v2
|
|
22
|
+
- intfloat/multilingual-e5-small
|
|
23
|
+
- intfloat/multilingual-e5-base
|
|
24
|
+
- Lazy evaluation with `Enumerator` for memory efficiency
|
|
25
|
+
- Query and passage embedding methods for retrieval tasks
|
|
26
|
+
- Configurable batch size, threading, and execution providers
|
|
27
|
+
- Mean pooling and L2 normalization
|
|
28
|
+
- CoreML execution provider support (experimental)
|
|
29
|
+
|
|
30
|
+
[Unreleased]: https://github.com/khasinski/fastembed-rb/compare/v1.0.0...HEAD
|
|
31
|
+
[1.0.0]: https://github.com/khasinski/fastembed-rb/releases/tag/v1.0.0
|
data/Gemfile
ADDED
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Chris Hasinski
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# FastEmbed Ruby
|
|
2
|
+
|
|
3
|
+
[](https://rubygems.org/gems/fastembed)
|
|
4
|
+
[](https://github.com/khasinski/fastembed-rb/actions/workflows/ci.yml)
|
|
5
|
+
|
|
6
|
+
Fast, lightweight text embeddings in Ruby. Convert text into vectors for semantic search, similarity matching, clustering, and RAG applications.
|
|
7
|
+
|
|
8
|
+
```ruby
|
|
9
|
+
embedding = Fastembed::TextEmbedding.new
|
|
10
|
+
vectors = embedding.embed(["Hello world", "Ruby is great"]).to_a
|
|
11
|
+
# => [[0.123, -0.456, ...], [0.789, 0.012, ...]] (384-dimensional vectors)
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## What are embeddings?
|
|
15
|
+
|
|
16
|
+
Embeddings convert text into numerical vectors that capture semantic meaning. Similar texts produce similar vectors, enabling:
|
|
17
|
+
|
|
18
|
+
- **Semantic search** - Find relevant documents by meaning, not just keywords
|
|
19
|
+
- **Similarity matching** - Compare texts to find duplicates or related content
|
|
20
|
+
- **RAG applications** - Retrieve context for LLMs like ChatGPT
|
|
21
|
+
- **Clustering** - Group similar documents together
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
gem 'fastembed'
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
```ruby
|
|
32
|
+
require 'fastembed'
|
|
33
|
+
|
|
34
|
+
# Create embedding model (downloads automatically on first use, ~67MB)
|
|
35
|
+
embedding = Fastembed::TextEmbedding.new
|
|
36
|
+
|
|
37
|
+
# Embed your texts
|
|
38
|
+
docs = ["Ruby is a programming language", "Python is also a programming language"]
|
|
39
|
+
vectors = embedding.embed(docs).to_a
|
|
40
|
+
|
|
41
|
+
# Find similarity between texts (cosine similarity via dot product)
|
|
42
|
+
similarity = vectors[0].zip(vectors[1]).sum { |a, b| a * b }
|
|
43
|
+
puts similarity # => 0.89 (high similarity!)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Semantic Search Example
|
|
47
|
+
|
|
48
|
+
```ruby
|
|
49
|
+
# Your document corpus
|
|
50
|
+
documents = [
|
|
51
|
+
"The quick brown fox jumps over the lazy dog",
|
|
52
|
+
"Machine learning is a subset of artificial intelligence",
|
|
53
|
+
"Ruby on Rails is a web application framework",
|
|
54
|
+
"Neural networks are inspired by biological brains"
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
# Create embeddings for all documents
|
|
58
|
+
embedding = Fastembed::TextEmbedding.new
|
|
59
|
+
doc_vectors = embedding.embed(documents).to_a
|
|
60
|
+
|
|
61
|
+
# Search query
|
|
62
|
+
query = "AI and deep learning"
|
|
63
|
+
query_vector = embedding.embed(query).first
|
|
64
|
+
|
|
65
|
+
# Find most similar document (highest dot product)
|
|
66
|
+
similarities = doc_vectors.map.with_index do |doc_vec, i|
|
|
67
|
+
score = query_vector.zip(doc_vec).sum { |a, b| a * b }
|
|
68
|
+
[i, score]
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
best_match = similarities.max_by { |_, score| score }
|
|
72
|
+
puts documents[best_match[0]] # => "Machine learning is a subset of artificial intelligence"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Usage
|
|
76
|
+
|
|
77
|
+
### Choose a Model
|
|
78
|
+
|
|
79
|
+
```ruby
|
|
80
|
+
# Default: Fast and accurate (384 dimensions, 67MB)
|
|
81
|
+
embedding = Fastembed::TextEmbedding.new
|
|
82
|
+
|
|
83
|
+
# Higher accuracy (768 dimensions, 210MB)
|
|
84
|
+
embedding = Fastembed::TextEmbedding.new(model_name: "BAAI/bge-base-en-v1.5")
|
|
85
|
+
|
|
86
|
+
# Multilingual support (100+ languages)
|
|
87
|
+
embedding = Fastembed::TextEmbedding.new(model_name: "intfloat/multilingual-e5-small")
|
|
88
|
+
|
|
89
|
+
# Long documents (8192 tokens vs default 512)
|
|
90
|
+
embedding = Fastembed::TextEmbedding.new(model_name: "nomic-ai/nomic-embed-text-v1.5")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Process Large Datasets
|
|
94
|
+
|
|
95
|
+
```ruby
|
|
96
|
+
# Lazy evaluation - memory efficient for large datasets
|
|
97
|
+
documents = File.readlines("corpus.txt")
|
|
98
|
+
|
|
99
|
+
embedding.embed(documents, batch_size: 64).each_slice(100) do |batch|
|
|
100
|
+
store_in_vector_database(batch)
|
|
101
|
+
end
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### List Available Models
|
|
105
|
+
|
|
106
|
+
```ruby
|
|
107
|
+
Fastembed::TextEmbedding.list_supported_models.each do |model|
|
|
108
|
+
puts "#{model[:model_name]} - #{model[:dim]}d - #{model[:description]}"
|
|
109
|
+
end
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Supported Models
|
|
113
|
+
|
|
114
|
+
| Model | Dim | Use Case |
|
|
115
|
+
|-------|-----|----------|
|
|
116
|
+
| `BAAI/bge-small-en-v1.5` | 384 | Default, fast English embeddings |
|
|
117
|
+
| `BAAI/bge-base-en-v1.5` | 768 | Higher accuracy English |
|
|
118
|
+
| `BAAI/bge-large-en-v1.5` | 1024 | Highest accuracy English |
|
|
119
|
+
| `sentence-transformers/all-MiniLM-L6-v2` | 384 | General purpose, lightweight |
|
|
120
|
+
| `sentence-transformers/all-mpnet-base-v2` | 768 | High quality general purpose |
|
|
121
|
+
| `intfloat/multilingual-e5-small` | 384 | 100+ languages |
|
|
122
|
+
| `intfloat/multilingual-e5-base` | 768 | 100+ languages, higher accuracy |
|
|
123
|
+
| `nomic-ai/nomic-embed-text-v1.5` | 768 | Long context (8192 tokens) |
|
|
124
|
+
| `jinaai/jina-embeddings-v2-base-en` | 768 | Long context (8192 tokens) |
|
|
125
|
+
|
|
126
|
+
## Performance
|
|
127
|
+
|
|
128
|
+
On Apple M1 Max with the default model:
|
|
129
|
+
|
|
130
|
+
| Batch Size | Throughput |
|
|
131
|
+
|------------|------------|
|
|
132
|
+
| 1 document | ~6.5ms |
|
|
133
|
+
| 100 documents | ~500 docs/sec |
|
|
134
|
+
| 1000 documents | ~500 docs/sec |
|
|
135
|
+
|
|
136
|
+
Larger models are slower but more accurate. See [benchmarks](BENCHMARKS.md) for details.
|
|
137
|
+
|
|
138
|
+
## Configuration
|
|
139
|
+
|
|
140
|
+
```ruby
|
|
141
|
+
Fastembed::TextEmbedding.new(
|
|
142
|
+
model_name: "BAAI/bge-small-en-v1.5", # Model to use
|
|
143
|
+
cache_dir: "~/.cache/fastembed", # Where to store models
|
|
144
|
+
threads: 4, # ONNX Runtime threads
|
|
145
|
+
providers: ["CUDAExecutionProvider"] # GPU acceleration (Linux/Windows)
|
|
146
|
+
)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**Environment variables:**
|
|
150
|
+
- `FASTEMBED_CACHE_PATH` - Custom model cache directory
|
|
151
|
+
|
|
152
|
+
## Requirements
|
|
153
|
+
|
|
154
|
+
- Ruby >= 3.3
|
|
155
|
+
- ~70MB disk space for default model (varies by model)
|
|
156
|
+
|
|
157
|
+
## Acknowledgments
|
|
158
|
+
|
|
159
|
+
Ruby port of [FastEmbed](https://github.com/qdrant/fastembed) by Qdrant. Built on [onnxruntime-ruby](https://github.com/ankane/onnxruntime-ruby) and [tokenizers-ruby](https://github.com/ankane/tokenizers-ruby) by Andrew Kane.
|
|
160
|
+
|
|
161
|
+
## License
|
|
162
|
+
|
|
163
|
+
MIT
|
data/Rakefile
ADDED
data/fastembed.gemspec
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lib/fastembed/version'
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |spec|
|
|
6
|
+
spec.name = 'fastembed'
|
|
7
|
+
spec.version = Fastembed::VERSION
|
|
8
|
+
spec.authors = ['Chris Hasinski']
|
|
9
|
+
spec.email = ['krzysztof.hasinski@gmail.com']
|
|
10
|
+
|
|
11
|
+
spec.summary = 'Fast, lightweight text embeddings for Ruby'
|
|
12
|
+
spec.description = 'A Ruby port of FastEmbed - fast text embeddings using ONNX Runtime'
|
|
13
|
+
spec.homepage = 'https://github.com/khasinski/fastembed-rb'
|
|
14
|
+
spec.license = 'MIT'
|
|
15
|
+
spec.required_ruby_version = '>= 3.3.0'
|
|
16
|
+
|
|
17
|
+
spec.metadata['homepage_uri'] = spec.homepage
|
|
18
|
+
spec.metadata['source_code_uri'] = spec.homepage
|
|
19
|
+
spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"
|
|
20
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
|
21
|
+
|
|
22
|
+
spec.files = Dir.chdir(__dir__) do
|
|
23
|
+
`git ls-files -z`.split("\x0").reject do |f|
|
|
24
|
+
(File.expand_path(f) == __FILE__) ||
|
|
25
|
+
f.start_with?(*%w[bin/ test/ spec/ features/ .git .github])
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
spec.bindir = 'exe'
|
|
29
|
+
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
30
|
+
spec.require_paths = ['lib']
|
|
31
|
+
|
|
32
|
+
spec.add_dependency 'onnxruntime', '~> 0.9'
|
|
33
|
+
spec.add_dependency 'tokenizers', '~> 0.5'
|
|
34
|
+
|
|
35
|
+
spec.add_development_dependency 'rake', '~> 13.0'
|
|
36
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
|
37
|
+
spec.add_development_dependency 'rubocop', '~> 1.0'
|
|
38
|
+
spec.add_development_dependency 'rubocop-rspec', '~> 3.0'
|
|
39
|
+
end
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Fastembed
|
|
4
|
+
# Model information structure
|
|
5
|
+
class ModelInfo
|
|
6
|
+
attr_reader :model_name, :dim, :description, :size_in_gb, :model_file,
|
|
7
|
+
:tokenizer_file, :sources, :pooling, :normalize
|
|
8
|
+
|
|
9
|
+
def initialize(
|
|
10
|
+
model_name:,
|
|
11
|
+
dim:,
|
|
12
|
+
description:,
|
|
13
|
+
size_in_gb:,
|
|
14
|
+
sources:,
|
|
15
|
+
model_file: 'model.onnx',
|
|
16
|
+
tokenizer_file: 'tokenizer.json',
|
|
17
|
+
pooling: :mean,
|
|
18
|
+
normalize: true
|
|
19
|
+
)
|
|
20
|
+
@model_name = model_name
|
|
21
|
+
@dim = dim
|
|
22
|
+
@description = description
|
|
23
|
+
@size_in_gb = size_in_gb
|
|
24
|
+
@sources = sources
|
|
25
|
+
@model_file = model_file
|
|
26
|
+
@tokenizer_file = tokenizer_file
|
|
27
|
+
@pooling = pooling
|
|
28
|
+
@normalize = normalize
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def hf_repo
|
|
32
|
+
sources[:hf]
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def to_h
|
|
36
|
+
{
|
|
37
|
+
model_name: model_name,
|
|
38
|
+
dim: dim,
|
|
39
|
+
description: description,
|
|
40
|
+
size_in_gb: size_in_gb,
|
|
41
|
+
sources: sources,
|
|
42
|
+
model_file: model_file,
|
|
43
|
+
tokenizer_file: tokenizer_file,
|
|
44
|
+
pooling: pooling,
|
|
45
|
+
normalize: normalize
|
|
46
|
+
}
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Registry of supported models
|
|
51
|
+
SUPPORTED_MODELS = {
|
|
52
|
+
'BAAI/bge-small-en-v1.5' => ModelInfo.new(
|
|
53
|
+
model_name: 'BAAI/bge-small-en-v1.5',
|
|
54
|
+
dim: 384,
|
|
55
|
+
description: 'Fast and accurate English embedding model',
|
|
56
|
+
size_in_gb: 0.067,
|
|
57
|
+
sources: { hf: 'Xenova/bge-small-en-v1.5' },
|
|
58
|
+
model_file: 'onnx/model.onnx'
|
|
59
|
+
),
|
|
60
|
+
'BAAI/bge-base-en-v1.5' => ModelInfo.new(
|
|
61
|
+
model_name: 'BAAI/bge-base-en-v1.5',
|
|
62
|
+
dim: 768,
|
|
63
|
+
description: 'Balanced English embedding model with higher accuracy',
|
|
64
|
+
size_in_gb: 0.210,
|
|
65
|
+
sources: { hf: 'Xenova/bge-base-en-v1.5' },
|
|
66
|
+
model_file: 'onnx/model.onnx'
|
|
67
|
+
),
|
|
68
|
+
'BAAI/bge-large-en-v1.5' => ModelInfo.new(
|
|
69
|
+
model_name: 'BAAI/bge-large-en-v1.5',
|
|
70
|
+
dim: 1024,
|
|
71
|
+
description: 'High accuracy English embedding model',
|
|
72
|
+
size_in_gb: 1.2,
|
|
73
|
+
sources: { hf: 'Xenova/bge-large-en-v1.5' },
|
|
74
|
+
model_file: 'onnx/model.onnx'
|
|
75
|
+
),
|
|
76
|
+
'sentence-transformers/all-MiniLM-L6-v2' => ModelInfo.new(
|
|
77
|
+
model_name: 'sentence-transformers/all-MiniLM-L6-v2',
|
|
78
|
+
dim: 384,
|
|
79
|
+
description: 'Lightweight general-purpose sentence embedding model',
|
|
80
|
+
size_in_gb: 0.09,
|
|
81
|
+
sources: { hf: 'Xenova/all-MiniLM-L6-v2' },
|
|
82
|
+
model_file: 'onnx/model.onnx'
|
|
83
|
+
),
|
|
84
|
+
'intfloat/multilingual-e5-small' => ModelInfo.new(
|
|
85
|
+
model_name: 'intfloat/multilingual-e5-small',
|
|
86
|
+
dim: 384,
|
|
87
|
+
description: 'Multilingual embedding model supporting 100+ languages',
|
|
88
|
+
size_in_gb: 0.45,
|
|
89
|
+
sources: { hf: 'Xenova/multilingual-e5-small' },
|
|
90
|
+
model_file: 'onnx/model.onnx'
|
|
91
|
+
),
|
|
92
|
+
'intfloat/multilingual-e5-base' => ModelInfo.new(
|
|
93
|
+
model_name: 'intfloat/multilingual-e5-base',
|
|
94
|
+
dim: 768,
|
|
95
|
+
description: 'Larger multilingual embedding model',
|
|
96
|
+
size_in_gb: 1.11,
|
|
97
|
+
sources: { hf: 'Xenova/multilingual-e5-base' },
|
|
98
|
+
model_file: 'onnx/model.onnx'
|
|
99
|
+
),
|
|
100
|
+
'nomic-ai/nomic-embed-text-v1' => ModelInfo.new(
|
|
101
|
+
model_name: 'nomic-ai/nomic-embed-text-v1',
|
|
102
|
+
dim: 768,
|
|
103
|
+
description: 'Long context (8192 tokens) English embedding model',
|
|
104
|
+
size_in_gb: 0.52,
|
|
105
|
+
sources: { hf: 'nomic-ai/nomic-embed-text-v1' },
|
|
106
|
+
model_file: 'onnx/model.onnx'
|
|
107
|
+
),
|
|
108
|
+
'nomic-ai/nomic-embed-text-v1.5' => ModelInfo.new(
|
|
109
|
+
model_name: 'nomic-ai/nomic-embed-text-v1.5',
|
|
110
|
+
dim: 768,
|
|
111
|
+
description: 'Improved long context embedding with Matryoshka support',
|
|
112
|
+
size_in_gb: 0.52,
|
|
113
|
+
sources: { hf: 'nomic-ai/nomic-embed-text-v1.5' },
|
|
114
|
+
model_file: 'onnx/model.onnx'
|
|
115
|
+
),
|
|
116
|
+
'jinaai/jina-embeddings-v2-small-en' => ModelInfo.new(
|
|
117
|
+
model_name: 'jinaai/jina-embeddings-v2-small-en',
|
|
118
|
+
dim: 512,
|
|
119
|
+
description: 'Small English embedding with 8192 token context',
|
|
120
|
+
size_in_gb: 0.06,
|
|
121
|
+
sources: { hf: 'Xenova/jina-embeddings-v2-small-en' },
|
|
122
|
+
model_file: 'onnx/model.onnx'
|
|
123
|
+
),
|
|
124
|
+
'jinaai/jina-embeddings-v2-base-en' => ModelInfo.new(
|
|
125
|
+
model_name: 'jinaai/jina-embeddings-v2-base-en',
|
|
126
|
+
dim: 768,
|
|
127
|
+
description: 'Base English embedding with 8192 token context',
|
|
128
|
+
size_in_gb: 0.52,
|
|
129
|
+
sources: { hf: 'Xenova/jina-embeddings-v2-base-en' },
|
|
130
|
+
model_file: 'onnx/model.onnx'
|
|
131
|
+
),
|
|
132
|
+
'sentence-transformers/paraphrase-MiniLM-L6-v2' => ModelInfo.new(
|
|
133
|
+
model_name: 'sentence-transformers/paraphrase-MiniLM-L6-v2',
|
|
134
|
+
dim: 384,
|
|
135
|
+
description: 'Optimized for paraphrase detection and semantic similarity',
|
|
136
|
+
size_in_gb: 0.09,
|
|
137
|
+
sources: { hf: 'Xenova/paraphrase-MiniLM-L6-v2' },
|
|
138
|
+
model_file: 'onnx/model.onnx'
|
|
139
|
+
),
|
|
140
|
+
'sentence-transformers/all-mpnet-base-v2' => ModelInfo.new(
|
|
141
|
+
model_name: 'sentence-transformers/all-mpnet-base-v2',
|
|
142
|
+
dim: 768,
|
|
143
|
+
description: 'High quality general-purpose sentence embeddings',
|
|
144
|
+
size_in_gb: 0.44,
|
|
145
|
+
sources: { hf: 'Xenova/all-mpnet-base-v2' },
|
|
146
|
+
model_file: 'onnx/model.onnx'
|
|
147
|
+
)
|
|
148
|
+
}.freeze
|
|
149
|
+
|
|
150
|
+
DEFAULT_MODEL = 'BAAI/bge-small-en-v1.5'
|
|
151
|
+
end
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'net/http'
|
|
4
|
+
require 'uri'
|
|
5
|
+
require 'json'
|
|
6
|
+
require 'fileutils'
|
|
7
|
+
|
|
8
|
+
module Fastembed
|
|
9
|
+
# Handles model downloading and caching
|
|
10
|
+
module ModelManagement
|
|
11
|
+
HF_API_BASE = 'https://huggingface.co'
|
|
12
|
+
REQUIRED_FILES = %w[
|
|
13
|
+
config.json
|
|
14
|
+
tokenizer.json
|
|
15
|
+
tokenizer_config.json
|
|
16
|
+
special_tokens_map.json
|
|
17
|
+
].freeze
|
|
18
|
+
|
|
19
|
+
class << self
|
|
20
|
+
# Returns the cache directory for storing models
|
|
21
|
+
# Priority: FASTEMBED_CACHE_PATH > XDG_CACHE_HOME > ~/.cache
|
|
22
|
+
def cache_dir
|
|
23
|
+
@cache_dir ||= begin
|
|
24
|
+
base = ENV['FASTEMBED_CACHE_PATH'] ||
|
|
25
|
+
ENV['XDG_CACHE_HOME'] ||
|
|
26
|
+
File.join(Dir.home, '.cache')
|
|
27
|
+
File.join(base, 'fastembed')
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Set a custom cache directory
|
|
32
|
+
attr_writer :cache_dir
|
|
33
|
+
|
|
34
|
+
# Returns the path to a cached model, downloading if necessary
|
|
35
|
+
def retrieve_model(model_name, show_progress: true)
|
|
36
|
+
model_info = resolve_model_info(model_name)
|
|
37
|
+
model_dir = model_directory(model_info)
|
|
38
|
+
|
|
39
|
+
# Check if model is already cached
|
|
40
|
+
return model_dir if model_cached?(model_dir, model_info)
|
|
41
|
+
|
|
42
|
+
# Download model
|
|
43
|
+
download_model(model_info, model_dir, show_progress: show_progress)
|
|
44
|
+
model_dir
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Check if a model exists in cache
|
|
48
|
+
def model_cached?(model_dir, model_info)
|
|
49
|
+
return false unless Dir.exist?(model_dir)
|
|
50
|
+
|
|
51
|
+
# Check for required files
|
|
52
|
+
model_path = File.join(model_dir, model_info.model_file)
|
|
53
|
+
tokenizer_path = File.join(model_dir, model_info.tokenizer_file)
|
|
54
|
+
|
|
55
|
+
File.exist?(model_path) && File.exist?(tokenizer_path)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Get the directory path for a model
|
|
59
|
+
def model_directory(model_info)
|
|
60
|
+
# Create a safe directory name from the model name
|
|
61
|
+
safe_name = model_info.model_name.gsub('/', '--')
|
|
62
|
+
File.join(cache_dir, 'models', safe_name)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
# Resolve model name to ModelInfo
|
|
66
|
+
def resolve_model_info(model_name)
|
|
67
|
+
model_info = SUPPORTED_MODELS[model_name]
|
|
68
|
+
unless model_info
|
|
69
|
+
raise ArgumentError,
|
|
70
|
+
"Unknown model: #{model_name}. Use TextEmbedding.list_supported_models to see available models."
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
model_info
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
private
|
|
77
|
+
|
|
78
|
+
def download_model(model_info, model_dir, show_progress: true)
|
|
79
|
+
FileUtils.mkdir_p(model_dir)
|
|
80
|
+
|
|
81
|
+
repo_id = model_info.hf_repo
|
|
82
|
+
puts "Downloading model #{model_info.model_name} from #{repo_id}..." if show_progress
|
|
83
|
+
|
|
84
|
+
# Download model file
|
|
85
|
+
download_file(repo_id, model_info.model_file, model_dir, show_progress: show_progress)
|
|
86
|
+
|
|
87
|
+
# Download tokenizer and config files
|
|
88
|
+
files_to_download = REQUIRED_FILES + [model_info.tokenizer_file]
|
|
89
|
+
files_to_download.uniq.each do |file|
|
|
90
|
+
download_file(repo_id, file, model_dir, show_progress: show_progress,
|
|
91
|
+
required: file == model_info.tokenizer_file)
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
puts "Model downloaded successfully to #{model_dir}" if show_progress
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def download_file(repo_id, file_path, model_dir, show_progress: true, required: true)
|
|
98
|
+
# Determine the correct local path
|
|
99
|
+
# If file_path contains directories (e.g., "onnx/model.onnx"), create them
|
|
100
|
+
local_path = File.join(model_dir, file_path)
|
|
101
|
+
FileUtils.mkdir_p(File.dirname(local_path))
|
|
102
|
+
|
|
103
|
+
# Skip if already exists
|
|
104
|
+
if File.exist?(local_path)
|
|
105
|
+
puts " #{file_path} (cached)" if show_progress
|
|
106
|
+
return
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
url = "#{HF_API_BASE}/#{repo_id}/resolve/main/#{file_path}"
|
|
110
|
+
puts " Downloading #{file_path}..." if show_progress
|
|
111
|
+
|
|
112
|
+
begin
|
|
113
|
+
download_with_redirect(url, local_path, show_progress: show_progress)
|
|
114
|
+
rescue StandardError => e
|
|
115
|
+
raise DownloadError, "Failed to download #{file_path}: #{e.message}" if required
|
|
116
|
+
|
|
117
|
+
puts " #{file_path} (skipped - not available)" if show_progress
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def download_with_redirect(url, local_path, show_progress: true, max_redirects: 10)
|
|
122
|
+
raise DownloadError, 'Too many redirects' if max_redirects <= 0
|
|
123
|
+
|
|
124
|
+
uri = URI.parse(url)
|
|
125
|
+
|
|
126
|
+
# Handle relative URLs by using https scheme
|
|
127
|
+
unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
|
128
|
+
# If it's a relative URL, we can't handle it
|
|
129
|
+
raise DownloadError, "Invalid URL scheme: #{url}"
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https', read_timeout: 300,
|
|
133
|
+
open_timeout: 30) do |http|
|
|
134
|
+
request = Net::HTTP::Get.new(uri)
|
|
135
|
+
request['User-Agent'] = "fastembed-ruby/#{VERSION}"
|
|
136
|
+
|
|
137
|
+
response = http.request(request)
|
|
138
|
+
|
|
139
|
+
case response
|
|
140
|
+
when Net::HTTPSuccess
|
|
141
|
+
File.binwrite(local_path, response.body)
|
|
142
|
+
when Net::HTTPRedirection
|
|
143
|
+
new_url = response['location']
|
|
144
|
+
# Handle relative redirects
|
|
145
|
+
new_url = "#{uri.scheme}://#{uri.host}#{new_url}" if new_url.start_with?('/')
|
|
146
|
+
download_with_redirect(new_url, local_path, show_progress: show_progress, max_redirects: max_redirects - 1)
|
|
147
|
+
else
|
|
148
|
+
raise DownloadError, "HTTP #{response.code}: #{response.message}"
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
end
|
|
154
|
+
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'onnxruntime'
|
|
4
|
+
require 'tokenizers'
|
|
5
|
+
|
|
6
|
+
module Fastembed
  # Wraps an ONNX Runtime inference session together with a HuggingFace
  # tokenizer, turning raw text into per-token hidden states that are then
  # pooled (and optionally normalized) into embedding vectors.
  class OnnxEmbeddingModel
    attr_reader :model_info, :model_dir

    # @param model_info [ModelInfo] metadata describing model/tokenizer files, pooling, normalization
    # @param model_dir [String] directory containing the downloaded model artifacts
    # @param threads [Integer, nil] thread count applied to both inter- and intra-op ONNX pools
    # @param providers [Array<String>, nil] ONNX execution providers to request
    # @raise [Error] if the model or tokenizer file is missing on disk
    def initialize(model_info, model_dir, threads: nil, providers: nil)
      @model_info = model_info
      @model_dir = model_dir
      @threads = threads
      @providers = providers

      load_model
      load_tokenizer
    end

    # Embed a batch of texts: tokenize, run the ONNX graph, then pool.
    #
    # @param texts [Array<String>, String] input text(s)
    # @return [Array<Array<Float>>] one embedding vector per input text
    def embed(texts)
      encoded = tokenize(texts)
      hidden_states = run_inference(encoded)

      Pooling.apply(
        model_info.pooling,
        hidden_states,
        encoded[:attention_mask],
        should_normalize: model_info.normalize
      )
    end

    private

    # Build the ONNX Runtime session from the model file on disk.
    def load_model
      model_path = File.join(model_dir, model_info.model_file)
      raise Error, "Model file not found: #{model_path}" unless File.exist?(model_path)

      options = {}
      if @threads
        options[:inter_op_num_threads] = @threads
        options[:intra_op_num_threads] = @threads
      end
      options[:providers] = @providers if @providers

      @session = OnnxRuntime::InferenceSession.new(model_path, **options)
    end

    # Load the tokenizer and configure it for batched encoding.
    def load_tokenizer
      tokenizer_path = File.join(model_dir, model_info.tokenizer_file)
      raise Error, "Tokenizer file not found: #{tokenizer_path}" unless File.exist?(tokenizer_path)

      @tokenizer = Tokenizers.from_file(tokenizer_path)

      # Pad to a uniform length within a batch and cap sequences at 512 tokens.
      # NOTE(review): pad token '[PAD]' and the 512 limit are assumed to suit
      # the supported models — confirm if new model families are added.
      @tokenizer.enable_padding(pad_id: 0, pad_token: '[PAD]')
      @tokenizer.enable_truncation(512)
    end

    # Encode input text(s) into the tensor arrays the ONNX model expects.
    #
    # @return [Hash] :input_ids, :attention_mask, :token_type_ids arrays
    def tokenize(texts)
      batch = texts.is_a?(String) ? [texts] : texts

      encodings = @tokenizer.encode_batch(batch)

      ids = []
      masks = []
      type_ids = []
      encodings.each do |enc|
        ids << enc.ids
        masks << enc.attention_mask
        # Fall back to all-zero segment ids when the tokenizer provides none.
        type_ids << (enc.type_ids || Array.new(enc.ids.length, 0))
      end

      { input_ids: ids, attention_mask: masks, token_type_ids: type_ids }
    end

    # Run the ONNX session and return the last hidden state
    # (shape [batch_size, seq_len, hidden_size]).
    def run_inference(encoded)
      feeds = {
        'input_ids' => encoded[:input_ids],
        'attention_mask' => encoded[:attention_mask]
      }
      # Only feed token_type_ids when the graph actually declares that input.
      feeds['token_type_ids'] = encoded[:token_type_ids] if input_names.include?('token_type_ids')

      extract_embeddings(@session.run(nil, feeds))
    end

    # Names of the graph's declared inputs, memoized after first lookup.
    def input_names
      @input_names ||= @session.inputs.map { |input| input[:name] }
    end

    # Names of the graph's declared outputs, memoized after first lookup.
    def output_names
      @output_names ||= @session.outputs.map { |output| output[:name] }
    end

    # Pick the token-embedding output from the session result, which may be
    # a Hash keyed by output name or a positional Array of outputs.
    def extract_embeddings(outputs)
      return outputs.first unless outputs.is_a?(Hash)

      preferred = %w[last_hidden_state token_embeddings].find { |key| outputs.key?(key) }
      preferred ? outputs[preferred] : outputs.values.first
    end
  end
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Fastembed
  # Pooling strategies that collapse per-token transformer outputs
  # ([batch_size, seq_len, hidden_size]) into one vector per input text.
  module Pooling
    class << self
      # Mean pooling - averages all token embeddings weighted by the
      # attention mask, so padding tokens do not contribute.
      #
      # @param token_embeddings [Array] shape [batch_size, seq_len, hidden_size]
      # @param attention_mask [Array] shape [batch_size, seq_len] of 0/1 weights
      # @return [Array<Array<Float>>] one pooled vector per batch entry
      def mean_pooling(token_embeddings, attention_mask)
        # Guard the empty batch: token_embeddings[0][0] below would raise.
        return [] if token_embeddings.empty?

        hidden_size = token_embeddings[0][0].length

        token_embeddings.each_index.map do |batch_idx|
          embeddings = token_embeddings[batch_idx]
          mask = attention_mask[batch_idx]

          summed = Array.new(hidden_size, 0.0)
          mask_sum = 0.0

          embeddings.each_with_index do |token_vec, seq_idx|
            weight = mask[seq_idx].to_f
            mask_sum += weight
            next if weight.zero?

            hidden_size.times do |hidden_idx|
              summed[hidden_idx] += token_vec[hidden_idx] * weight
            end
          end

          # A fully-masked sequence would divide by zero; treat it as mean-of-zeros.
          mask_sum = 1.0 if mask_sum.zero?

          summed.map { |value| value / mask_sum }
        end
      end

      # CLS pooling - uses the [CLS] token embedding (first token).
      #
      # @param token_embeddings [Array] shape [batch_size, seq_len, hidden_size]
      # @return [Array] the first token's vector for each batch entry
      def cls_pooling(token_embeddings, _attention_mask)
        token_embeddings.map { |batch| batch[0] }
      end

      # L2-normalize each vector to unit length.
      # Zero vectors are returned unchanged (norm treated as 1.0).
      #
      # @param vectors [Array<Array<Numeric>>]
      # @return [Array<Array<Float>>]
      def normalize(vectors)
        vectors.map do |vector|
          norm = Math.sqrt(vector.sum { |v| v * v })
          norm = 1.0 if norm.zero?
          vector.map { |v| v / norm }
        end
      end

      # Apply the named pooling strategy, optionally L2-normalizing the result.
      #
      # @param strategy [Symbol] :mean or :cls
      # @param should_normalize [Boolean] whether to L2-normalize pooled vectors
      # @return [Array<Array<Float>>]
      # @raise [ArgumentError] for an unknown strategy
      def apply(strategy, token_embeddings, attention_mask, should_normalize: true)
        pooled = case strategy
                 when :mean then mean_pooling(token_embeddings, attention_mask)
                 when :cls then cls_pooling(token_embeddings, attention_mask)
                 else raise ArgumentError, "Unknown pooling strategy: #{strategy}"
                 end

        should_normalize ? normalize(pooled) : pooled
      end
    end
  end
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Fastembed
  # Main class for generating text embeddings.
  #
  # @example Basic usage
  #   embedding = Fastembed::TextEmbedding.new
  #   vectors = embedding.embed(["Hello world", "Another text"]).to_a
  #
  # @example Custom model
  #   embedding = Fastembed::TextEmbedding.new(model_name: "BAAI/bge-base-en-v1.5")
  #
  # @example Lazy iteration for large datasets
  #   embedding.embed(documents).each do |vector|
  #     # Process each vector
  #   end
  #
  class TextEmbedding
    attr_reader :model_name, :model_info, :dim

    # Initialize a text embedding model: resolves model metadata, downloads
    # the artifacts if needed, and loads the ONNX session.
    #
    # @param model_name [String] Name of the model to use (default: "BAAI/bge-small-en-v1.5")
    # @param cache_dir [String, nil] Custom cache directory for models
    # @param threads [Integer, nil] Number of threads for ONNX Runtime
    # @param providers [Array<String>, nil] ONNX execution providers (e.g., ["CoreMLExecutionProvider"])
    # @param show_progress [Boolean] Whether to show download progress
    def initialize(
      model_name: DEFAULT_MODEL,
      cache_dir: nil,
      threads: nil,
      providers: nil,
      show_progress: true
    )
      @model_name = model_name
      @threads = threads
      @providers = providers
      @show_progress = show_progress

      ModelManagement.cache_dir = cache_dir if cache_dir

      @model_info = ModelManagement.resolve_model_info(model_name)
      @dim = @model_info.dim

      # Download (if necessary) and load the model.
      @model_dir = ModelManagement.retrieve_model(model_name, show_progress: show_progress)
      @model = OnnxEmbeddingModel.new(@model_info, @model_dir, threads: threads, providers: providers)
    end

    # Generate embeddings for documents.
    #
    # @param documents [Array<String>, String] Text document(s) to embed
    # @param batch_size [Integer] Number of documents to process at once
    # @return [Enumerator] Lazy enumerator yielding embedding vectors
    # @raise [ArgumentError] If documents is nil or contains nil values
    #
    # @example
    #   vectors = embedding.embed(["Hello", "World"]).to_a
    #   # => [[0.1, 0.2, ...], [0.3, 0.4, ...]]
    def embed(documents, batch_size: 256)
      raise ArgumentError, 'documents cannot be nil' if documents.nil?

      docs = documents.is_a?(String) ? [documents] : documents
      return Enumerator.new { |_| } if docs.empty?

      # Validate eagerly so bad input fails before any lazy iteration begins.
      docs.each_with_index do |doc, i|
        raise ArgumentError, "document at index #{i} cannot be nil" if doc.nil?
      end

      Enumerator.new do |yielder|
        docs.each_slice(batch_size) do |batch|
          @model.embed(batch).each { |vector| yielder << vector }
        end
      end
    end

    # Generate embeddings for query texts (with "query: " prefix for retrieval models).
    #
    # @param queries [Array<String>, String] Query text(s) to embed
    # @param batch_size [Integer] Number of queries to process at once
    # @return [Enumerator] Lazy enumerator yielding embedding vectors
    def query_embed(queries, batch_size: 256)
      list = queries.is_a?(String) ? [queries] : queries
      embed(list.map { |q| "query: #{q}" }, batch_size: batch_size)
    end

    # Generate embeddings for passage texts (with "passage: " prefix for retrieval models).
    #
    # @param passages [Array<String>, String] Passage text(s) to embed
    # @param batch_size [Integer] Number of passages to process at once
    # @return [Enumerator] Lazy enumerator yielding embedding vectors
    def passage_embed(passages, batch_size: 256)
      list = passages.is_a?(String) ? [passages] : passages
      embed(list.map { |p| "passage: #{p}" }, batch_size: batch_size)
    end

    # List all supported models.
    #
    # @return [Array<Hash>] Array of model information hashes
    def self.list_supported_models
      SUPPORTED_MODELS.values.map(&:to_h)
    end

    # Get information about a specific model.
    #
    # @param model_name [String] Name of the model
    # @return [Hash, nil] Model information or nil if not found
    def self.get_model_info(model_name)
      SUPPORTED_MODELS[model_name]&.to_h
    end
  end
end
|
data/lib/fastembed.rb
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'fastembed/version'
|
|
4
|
+
|
|
5
|
+
module Fastembed
  # Base class for every error raised by this gem, so callers can
  # rescue Fastembed::Error to catch them all.
  class Error < StandardError
  end

  # Raised when retrieving model artifacts fails.
  class DownloadError < Error
  end
end
|
|
9
|
+
|
|
10
|
+
require_relative 'fastembed/model_info'
|
|
11
|
+
require_relative 'fastembed/model_management'
|
|
12
|
+
require_relative 'fastembed/pooling'
|
|
13
|
+
require_relative 'fastembed/onnx_embedding_model'
|
|
14
|
+
require_relative 'fastembed/text_embedding'
|
metadata
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: fastembed
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Chris Hasinski
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-01-08 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: onnxruntime
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.9'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.9'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: tokenizers
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0.5'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0.5'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: rake
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - "~>"
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '13.0'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - "~>"
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '13.0'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: rspec
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - "~>"
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '3.0'
|
|
62
|
+
type: :development
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - "~>"
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '3.0'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: rubocop
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - "~>"
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '1.0'
|
|
76
|
+
type: :development
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - "~>"
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '1.0'
|
|
83
|
+
- !ruby/object:Gem::Dependency
|
|
84
|
+
name: rubocop-rspec
|
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
|
86
|
+
requirements:
|
|
87
|
+
- - "~>"
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: '3.0'
|
|
90
|
+
type: :development
|
|
91
|
+
prerelease: false
|
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
+
requirements:
|
|
94
|
+
- - "~>"
|
|
95
|
+
- !ruby/object:Gem::Version
|
|
96
|
+
version: '3.0'
|
|
97
|
+
description: A Ruby port of FastEmbed - fast text embeddings using ONNX Runtime
|
|
98
|
+
email:
|
|
99
|
+
- krzysztof.hasinski@gmail.com
|
|
100
|
+
executables: []
|
|
101
|
+
extensions: []
|
|
102
|
+
extra_rdoc_files: []
|
|
103
|
+
files:
|
|
104
|
+
- ".mise.toml"
|
|
105
|
+
- ".rspec"
|
|
106
|
+
- ".rubocop.yml"
|
|
107
|
+
- BENCHMARKS.md
|
|
108
|
+
- CHANGELOG.md
|
|
109
|
+
- Gemfile
|
|
110
|
+
- LICENSE
|
|
111
|
+
- README.md
|
|
112
|
+
- Rakefile
|
|
113
|
+
- fastembed.gemspec
|
|
114
|
+
- lib/fastembed.rb
|
|
115
|
+
- lib/fastembed/model_info.rb
|
|
116
|
+
- lib/fastembed/model_management.rb
|
|
117
|
+
- lib/fastembed/onnx_embedding_model.rb
|
|
118
|
+
- lib/fastembed/pooling.rb
|
|
119
|
+
- lib/fastembed/text_embedding.rb
|
|
120
|
+
- lib/fastembed/version.rb
|
|
121
|
+
homepage: https://github.com/khasinski/fastembed-rb
|
|
122
|
+
licenses:
|
|
123
|
+
- MIT
|
|
124
|
+
metadata:
|
|
125
|
+
homepage_uri: https://github.com/khasinski/fastembed-rb
|
|
126
|
+
source_code_uri: https://github.com/khasinski/fastembed-rb
|
|
127
|
+
changelog_uri: https://github.com/khasinski/fastembed-rb/blob/main/CHANGELOG.md
|
|
128
|
+
rubygems_mfa_required: 'true'
|
|
129
|
+
post_install_message:
|
|
130
|
+
rdoc_options: []
|
|
131
|
+
require_paths:
|
|
132
|
+
- lib
|
|
133
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
134
|
+
requirements:
|
|
135
|
+
- - ">="
|
|
136
|
+
- !ruby/object:Gem::Version
|
|
137
|
+
version: 3.3.0
|
|
138
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
139
|
+
requirements:
|
|
140
|
+
- - ">="
|
|
141
|
+
- !ruby/object:Gem::Version
|
|
142
|
+
version: '0'
|
|
143
|
+
requirements: []
|
|
144
|
+
rubygems_version: 3.5.22
|
|
145
|
+
signing_key:
|
|
146
|
+
specification_version: 4
|
|
147
|
+
summary: Fast, lightweight text embeddings for Ruby
|
|
148
|
+
test_files: []
|