leann 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +375 -0
- data/exe/leann +167 -0
- data/lib/generators/leann/install/install_generator.rb +51 -0
- data/lib/generators/leann/install/templates/migration.rb.erb +28 -0
- data/lib/leann/backend/base.rb +51 -0
- data/lib/leann/backend/leann_graph.rb +476 -0
- data/lib/leann/builder.rb +317 -0
- data/lib/leann/configuration.rb +148 -0
- data/lib/leann/embedding/base.rb +63 -0
- data/lib/leann/embedding/fastembed.rb +120 -0
- data/lib/leann/embedding/ollama.rb +194 -0
- data/lib/leann/embedding/openai.rb +149 -0
- data/lib/leann/embedding/ruby_llm.rb +57 -0
- data/lib/leann/errors.rb +71 -0
- data/lib/leann/index.rb +236 -0
- data/lib/leann/rails/active_record/index.rb +70 -0
- data/lib/leann/rails/active_record/passage.rb +56 -0
- data/lib/leann/rails/builder.rb +205 -0
- data/lib/leann/rails/railtie.rb +16 -0
- data/lib/leann/rails/searcher.rb +117 -0
- data/lib/leann/rails/storage/active_record_backend.rb +332 -0
- data/lib/leann/rails.rb +90 -0
- data/lib/leann/ruby_llm/search.rb +89 -0
- data/lib/leann/search_result.rb +195 -0
- data/lib/leann/searcher.rb +189 -0
- data/lib/leann/version.rb +3 -0
- data/lib/leann.rb +133 -0
- metadata +177 -0
|
# frozen_string_literal: true

require "json"
require "fileutils"
require "securerandom"
require "time"

module Leann
  # Builds a new Leann index
  #
  # @example DSL style
  #   Leann.build("my_index") do
  #     add "First document"
  #     add "Second document", source: "manual"
  #     add_file "README.md"
  #   end
  #
  # @example Programmatic style
  #   builder = Leann::Builder.new("my_index")
  #   builder.add("First document")
  #   builder.add("Second document")
  #   builder.save
  #
  class Builder
    # @return [String] Index name
    attr_reader :name

    # @return [String] Index path
    attr_reader :path

    # @return [Array<Hash>] Documents to be indexed
    attr_reader :documents

    # @param name [String] Index name
    # @param embedding [Symbol] Embedding provider (:ruby_llm, :openai, :ollama, :fastembed)
    # @param model [String, nil] Embedding model name
    # @param path [String, nil] Custom path for index
    # @param force [Boolean] Overwrite existing index
    def initialize(name, embedding: nil, model: nil, path: nil, force: false, **_options)
      @name = name
      @path = resolve_path(name, path)
      @embedding_provider = embedding || Leann.configuration.embedding_provider
      @embedding_model = model || Leann.configuration.embedding_model_for(@embedding_provider)
      @force = force
      @documents = []

      check_existing_index unless force
    end

    # Add a text document
    #
    # @param text [String] Document text
    # @param metadata [Hash] Additional metadata (passed as keyword arguments)
    # @return [self]
    #
    # @example
    #   builder.add("Hello world")
    #   builder.add("Document with metadata", source: "file.txt", chapter: 1)
    def add(text, **metadata)
      raise ArgumentError, "Text cannot be nil" if text.nil?
      raise ArgumentError, "Text cannot be empty" if text.to_s.strip.empty?

      doc = {
        # :id may be supplied by the caller; otherwise a UUID is generated.
        # `metadata` is a fresh hash built from **kwargs, so deleting from it
        # never mutates caller state.
        id: metadata.delete(:id) || generate_id,
        text: text.to_s.strip,
        metadata: metadata
      }

      @documents << doc
      self
    end

    # Add document (alias for add)
    alias << add

    # Add content from a file
    #
    # @param file_path [String] Path to file
    # @param metadata [Hash] Additional metadata
    # @return [self]
    #
    # @example
    #   builder.add_file("README.md")
    #   builder.add_file("docs/guide.txt", category: "documentation")
    def add_file(file_path, **metadata)
      raise ArgumentError, "File not found: #{file_path}" unless File.exist?(file_path)

      content = File.read(file_path)
      # Caller-supplied metadata wins over the derived file attributes.
      file_metadata = {
        source: file_path,
        filename: File.basename(file_path),
        extension: File.extname(file_path)
      }.merge(metadata)

      add(content, **file_metadata)
    end

    # Add all files from a directory
    #
    # @param directory [String] Directory path
    # @param pattern [String] Glob pattern (default: "**/*")
    # @param extensions [Array<String>, nil] Filter by extensions (e.g., [".md", ".txt"])
    # @param metadata [Hash] Additional metadata for all files
    # @return [self]
    #
    # @example
    #   builder.add_directory("docs/")
    #   builder.add_directory("src/", extensions: [".rb", ".py"])
    def add_directory(directory, pattern: "**/*", extensions: nil, **metadata)
      raise ArgumentError, "Directory not found: #{directory}" unless Dir.exist?(directory)

      full_pattern = File.join(directory, pattern)
      Dir.glob(full_pattern).each do |file_path|
        next unless File.file?(file_path)
        next if extensions && !extensions.include?(File.extname(file_path))

        add_file(file_path, **metadata)
      end

      self
    end

    # Add multiple documents at once
    #
    # @param docs [Array<String>, Array<Hash>] Documents to add
    # @return [self]
    #
    # @example
    #   builder.add_all(["Doc 1", "Doc 2", "Doc 3"])
    #   builder.add_all([
    #     { text: "Doc 1", source: "a" },
    #     { text: "Doc 2", source: "b" }
    #   ])
    def add_all(docs)
      docs.each do |doc|
        case doc
        when String
          add(doc)
        when Hash
          # Work on a symbolized copy so the caller's hash is never mutated
          # (the previous implementation deleted :text from the input hash).
          attrs = doc.transform_keys(&:to_sym)
          text = attrs.delete(:text)
          add(text, **attrs)
        else
          raise ArgumentError, "Invalid document type: #{doc.class}"
        end
      end

      self
    end

    # Get number of documents added
    # @return [Integer]
    def count
      @documents.size
    end
    alias size count

    # Check if any documents have been added
    # @return [Boolean]
    def empty?
      @documents.empty?
    end

    # Build and save the index
    # @return [Index] The built index
    def save
      raise EmptyIndexError if empty?

      puts "Building index '#{name}' with #{count} documents..."

      # Create directory if needed
      FileUtils.mkdir_p(File.dirname(path))

      # Delete existing if force mode
      Index.delete(path) if @force && Index.exists?(path)

      # Compute embeddings
      embeddings = compute_embeddings

      # Save passages
      save_passages

      # Build and save graph
      save_graph(embeddings)

      # Save metadata
      save_metadata(embeddings)

      puts "Index '#{name}' created successfully!"

      Index.open(path)
    end
    alias build save

    private

    # Resolve the on-disk index path, appending the standard extension
    # unless the caller already supplied it.
    def resolve_path(name, custom_path)
      if custom_path
        custom_path.end_with?(Index::INDEX_EXTENSION) ? custom_path : "#{custom_path}#{Index::INDEX_EXTENSION}"
      else
        "#{name}#{Index::INDEX_EXTENSION}"
      end
    end

    def check_existing_index
      raise IndexExistsError, name if Index.exists?(path)
    end

    def generate_id
      SecureRandom.uuid
    end

    def compute_embeddings
      texts = @documents.map { |d| d[:text] }
      embedding_provider.compute(texts)
    end

    def embedding_provider
      @_embedding_provider ||= load_embedding_provider
    end

    # Lazily require and instantiate the configured embedding backend.
    def load_embedding_provider
      require_relative "embedding/base"

      case @embedding_provider
      when :ruby_llm
        require_relative "embedding/ruby_llm"
        Embedding::RubyLLM.new(model: @embedding_model)
      when :openai
        require_relative "embedding/openai"
        Embedding::OpenAI.new(model: @embedding_model)
      when :ollama
        require_relative "embedding/ollama"
        Embedding::Ollama.new(model: @embedding_model)
      when :fastembed
        require_relative "embedding/fastembed"
        Embedding::FastEmbed.new(model: @embedding_model)
      else
        raise ConfigurationError, "Unknown embedding provider: #{@embedding_provider}"
      end
    end

    # Write documents as JSON lines and record each document's byte offset
    # so passages can be fetched without loading the whole file.
    def save_passages
      passages_file = "#{path}#{Index::PASSAGES_SUFFIX}"
      offsets_file = "#{path}#{Index::OFFSETS_SUFFIX}"

      offsets = {}

      File.open(passages_file, "w") do |f|
        @documents.each do |doc|
          offsets[doc[:id]] = f.pos
          f.puts(JSON.generate(doc))
        end
      end

      File.write(offsets_file, JSON.generate(offsets))
    end

    def save_graph(embeddings)
      ids = @documents.map { |d| d[:id] }

      require_relative "backend/leann_graph"

      graph = Backend::LeannGraph.new(
        dimensions: embeddings.first.size,
        m: Leann.configuration.hnsw_m,
        ef_construction: Leann.configuration.hnsw_ef_construction
      )

      graph.build(ids, embeddings)
      graph.save(path)

      report_storage_savings(embeddings)
    end

    def report_storage_savings(embeddings)
      embedding_size = embeddings.first.size * 4 # float32
      total_embedding_bytes = embeddings.length * embedding_size
      # Guard against zero-dimension embeddings: dividing by zero below would
      # produce NaN and `round` would raise FloatDomainError.
      return if total_embedding_bytes.zero?

      graph_file = "#{path}.graph.bin"
      actual_size = File.exist?(graph_file) ? File.size(graph_file) : 0

      savings = ((total_embedding_bytes - actual_size).to_f / total_embedding_bytes * 100).round(1)
      puts "Storage savings: #{savings}% (#{format_bytes(total_embedding_bytes)} → #{format_bytes(actual_size)})"
    end

    def format_bytes(bytes)
      if bytes < 1024
        "#{bytes} B"
      elsif bytes < 1024 * 1024
        "#{(bytes / 1024.0).round(1)} KB"
      else
        "#{(bytes / (1024.0 * 1024)).round(2)} MB"
      end
    end

    def save_metadata(embeddings)
      meta_file = "#{path}#{Index::META_SUFFIX}"

      metadata = {
        version: "1.0",
        name: name,
        backend: "leann",
        embedding_provider: @embedding_provider.to_s,
        embedding_model: @embedding_model,
        dimensions: embeddings.first&.size || 0,
        document_count: @documents.size,
        created_at: Time.now.utc.iso8601,
        config: {
          hnsw_m: Leann.configuration.hnsw_m,
          hnsw_ef_construction: Leann.configuration.hnsw_ef_construction
        }
      }

      File.write(meta_file, JSON.pretty_generate(metadata))
    end
  end
end
|
|
# frozen_string_literal: true

module Leann
  # Global configuration for Leann
  #
  # @example With RubyLLM (recommended)
  #   # If RubyLLM is present, LEANN uses it automatically
  #   # Just configure RubyLLM as usual:
  #   RubyLLM.configure do |config|
  #     config.openai_api_key = ENV["OPENAI_API_KEY"]
  #   end
  #
  # @example Manual configuration
  #   Leann.configure do |config|
  #     config.embedding_provider = :openai
  #     config.openai_api_key = ENV["OPENAI_API_KEY"]
  #   end
  #
  class Configuration
    # Embedding provider (:ruby_llm, :openai, :ollama, :fastembed)
    # Defaults to :ruby_llm if RubyLLM gem is available, otherwise :openai
    # @return [Symbol]
    attr_accessor :embedding_provider

    # OpenAI API key (only needed if not using RubyLLM)
    # @return [String, nil]
    attr_accessor :openai_api_key

    # OpenAI base URL (for custom endpoints)
    # @return [String, nil]
    attr_accessor :openai_base_url

    # Ollama host URL
    # @return [String]
    attr_accessor :ollama_host

    # Default embedding model
    # @return [String]
    attr_accessor :default_embedding_model

    # Index storage directory
    # @return [String]
    attr_accessor :index_directory

    # HNSW M parameter (graph connectivity)
    # @return [Integer]
    attr_accessor :hnsw_m

    # HNSW ef_construction parameter
    # @return [Integer]
    attr_accessor :hnsw_ef_construction

    # Default chunk size for text splitting
    # @return [Integer]
    attr_accessor :chunk_size

    # Default chunk overlap
    # @return [Integer]
    attr_accessor :chunk_overlap

    def initialize
      # Default to RubyLLM if available, otherwise OpenAI
      @embedding_provider = ruby_llm_available? ? :ruby_llm : :openai

      @openai_api_key = ENV["OPENAI_API_KEY"]
      @openai_base_url = ENV["OPENAI_BASE_URL"]
      @ollama_host = ENV.fetch("OLLAMA_HOST", "http://localhost:11434")
      @default_embedding_model = nil # Let provider choose default
      @custom_embedding_model = false

      @index_directory = ".leann"
      @hnsw_m = 32
      @hnsw_ef_construction = 200

      @chunk_size = 512
      @chunk_overlap = 64
    end

    # Check if RubyLLM gem is available
    # @return [Boolean]
    def ruby_llm_available?
      # `defined?` returns a String ("constant") or nil, so coerce to a real
      # Boolean to honor the documented return type.
      !!(defined?(::RubyLLM) || gem_available?("ruby_llm"))
    end

    # Check if FastEmbed gem is available
    # @return [Boolean]
    def fastembed_available?
      !!(defined?(::Fastembed) || gem_available?("fastembed"))
    end

    # Validate configuration
    # @raise [ConfigurationError] if configuration is invalid
    def validate!
      case embedding_provider
      when :ruby_llm
        unless ruby_llm_available?
          raise ConfigurationError, "RubyLLM gem is required. Add 'ruby_llm' to your Gemfile."
        end
      when :openai
        raise ConfigurationError, "OpenAI API key is required" if openai_api_key.nil? || openai_api_key.empty?
      when :ollama
        # Ollama doesn't require API key, just needs to be running
      when :fastembed
        unless fastembed_available?
          raise ConfigurationError, "FastEmbed gem is required. Add 'fastembed' to your Gemfile."
        end
      else
        raise ConfigurationError, "Unknown embedding provider: #{embedding_provider}"
      end

      true
    end

    # Get embedding model for a provider
    # @return [String, nil]
    def embedding_model_for(provider)
      # Return custom model if explicitly set
      return @default_embedding_model if @custom_embedding_model

      # Provider-specific defaults
      case provider
      when :ruby_llm
        nil # RubyLLM uses its own configured default
      when :openai
        "text-embedding-3-small"
      when :ollama
        "nomic-embed-text"
      when :fastembed
        "BAAI/bge-small-en-v1.5"
      else
        @default_embedding_model
      end
    end

    # Custom writer: remember that the model was explicitly set so
    # #embedding_model_for returns it instead of the provider default.
    def default_embedding_model=(value)
      @default_embedding_model = value
      @custom_embedding_model = true
    end

    private

    # @return [Boolean] whether a gem spec with the given name is installed
    def gem_available?(name)
      Gem::Specification.find_by_name(name)
      true
    rescue Gem::LoadError
      false
    end
  end
end
|
|
# frozen_string_literal: true

module Leann
  module Embedding
    # Abstract superclass for embedding providers.
    #
    # Concrete providers are expected to override:
    # - #compute(texts) -> Array<Array<Float>>
    #
    class Base
      # @return [String] Model name
      attr_reader :model

      # @return [Integer, nil] Embedding dimensions
      attr_reader :dimensions

      # @param model [String] Embedding model name
      def initialize(model:)
        @model = model
        @dimensions = nil
      end

      # Compute embeddings for a list of texts.
      #
      # @param texts [Array<String>] Texts to embed
      # @return [Array<Array<Float>>] Embeddings (one per text)
      # @raise [NotImplementedError] if not overridden
      def compute(texts)
        raise NotImplementedError, "Subclasses must implement #compute"
      end

      # Convenience wrapper: embed a single text.
      #
      # @param text [String]
      # @return [Array<Float>]
      def compute_one(text)
        compute([text]).first
      end

      protected

      # Scale an embedding to unit length (L2 normalization).
      # A zero vector is returned untouched to avoid dividing by zero.
      #
      # @param embedding [Array<Float>]
      # @return [Array<Float>]
      def normalize(embedding)
        magnitude = Math.sqrt(embedding.sum { |component| component * component })
        return embedding if magnitude.zero?

        embedding.map { |component| component / magnitude }
      end

      # Yield the items in fixed-size slices.
      #
      # @param items [Array]
      # @param batch_size [Integer]
      # @yield [Array] Each batch
      def in_batches(items, batch_size, &block)
        items.each_slice(batch_size, &block)
      end
    end
  end
end
|
|
# frozen_string_literal: true

require_relative "base"

module Leann
  module Embedding
    # FastEmbed provider for local embeddings
    #
    # Uses ONNX Runtime for fast, local embedding generation without
    # requiring an API key or external service.
    #
    # @example
    #   provider = Leann::Embedding::FastEmbed.new(model: "BAAI/bge-small-en-v1.5")
    #   embeddings = provider.compute(["Hello", "World"])
    #
    class FastEmbed < Base
      MAX_BATCH_SIZE = 64

      # Supported models with their dimensions
      MODELS = {
        "BAAI/bge-small-en-v1.5" => 384,
        "BAAI/bge-base-en-v1.5" => 768,
        "intfloat/multilingual-e5-small" => 384,
        "nomic-ai/nomic-embed-text-v1.5" => 768
      }.freeze

      DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"

      # @param model [String] FastEmbed model name
      # @param cache_dir [String, nil] Model cache directory
      # @param threads [Integer, nil] Number of ONNX threads
      def initialize(model: nil, cache_dir: nil, threads: nil)
        model ||= DEFAULT_MODEL
        super(model: model)

        @cache_dir = cache_dir || ENV["FASTEMBED_CACHE_PATH"]
        @threads = threads
        @client = nil

        check_gem!
      end

      # Compute embeddings for texts
      #
      # @param texts [Array<String>]
      # @return [Array<Array<Float>>]
      def compute(texts)
        return [] if texts.empty?

        all_embeddings = []

        in_batches(texts, MAX_BATCH_SIZE) do |batch|
          batch_embeddings = compute_batch(batch)
          all_embeddings.concat(batch_embeddings)
          print "." # Progress indicator
        end

        puts " Done! (#{all_embeddings.size} embeddings)" unless texts.size < MAX_BATCH_SIZE

        # FastEmbed returns normalized vectors by default
        all_embeddings
      end

      # Get dimensions for the configured model
      # @return [Integer]
      def dimensions
        @dimensions ||= MODELS[model] || detect_dimensions
      end

      private

      # Ensure the fastembed gem is loaded, attempting to require it before
      # declaring it missing (previously an installed-but-unloaded gem was
      # incorrectly reported as absent).
      def check_gem!
        return if defined?(::Fastembed)

        begin
          require "fastembed"
          return if defined?(::Fastembed)
        rescue LoadError
          # Gem genuinely unavailable; fall through to the error below.
        end

        raise ConfigurationError, <<~MSG
          FastEmbed gem is required for local embeddings.

          Add to your Gemfile:
            gem 'fastembed'

          Or install directly:
            gem install fastembed
        MSG
      end

      # Lazily construct the FastEmbed client with the configured options.
      def client
        @client ||= begin
          options = { model_name: model }
          options[:cache_dir] = @cache_dir if @cache_dir
          options[:threads] = @threads if @threads

          ::Fastembed::TextEmbedding.new(**options)
        end
      end

      # Embed one batch, wrapping any failure in EmbeddingError.
      # Both gem-specific and generic errors are listed explicitly in case
      # ::Fastembed::Error does not descend from StandardError.
      def compute_batch(texts)
        # FastEmbed returns a lazy enumerator, convert to array
        client.embed(texts, batch_size: texts.size).to_a
      rescue ::Fastembed::Error, StandardError => e
        raise EmbeddingError.new(
          "FastEmbed error: #{e.message}",
          provider: :fastembed,
          original_error: e
        )
      end

      # Compute a single embedding to detect dimensions for unknown models.
      def detect_dimensions
        sample = client.embed(["test"], batch_size: 1).first
        sample.size
      end
    end
  end
end
|