leann 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,317 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "fileutils"
5
+ require "securerandom"
6
+ require "time"
7
+
8
+ module Leann
9
+ # Builds a new Leann index
10
+ #
11
+ # @example DSL style
12
+ # Leann.build("my_index") do
13
+ # add "First document"
14
+ # add "Second document", source: "manual"
15
+ # add_file "README.md"
16
+ # end
17
+ #
18
+ # @example Programmatic style
19
+ # builder = Leann::Builder.new("my_index")
20
+ # builder.add("First document")
21
+ # builder.add("Second document")
22
+ # builder.save
23
+ #
24
class Builder
  # @return [String] Index name
  attr_reader :name

  # @return [String] Index path (always carries Index::INDEX_EXTENSION)
  attr_reader :path

  # @return [Array<Hash>] Documents queued for indexing (:id, :text, :metadata)
  attr_reader :documents

  # @param name [String] Index name
  # @param embedding [Symbol, nil] Embedding provider (:ruby_llm, :openai, :ollama, :fastembed);
  #   defaults to the globally configured provider
  # @param model [String, nil] Embedding model name; defaults per provider
  # @param path [String, nil] Custom path for index
  # @param force [Boolean] Overwrite existing index
  # @raise [IndexExistsError] if the index already exists and +force+ is false
  def initialize(name, embedding: nil, model: nil, path: nil, force: false, **_options)
    @name = name
    @path = resolve_path(name, path)
    @embedding_provider = embedding || Leann.configuration.embedding_provider
    @embedding_model = model || Leann.configuration.embedding_model_for(@embedding_provider)
    @force = force
    @documents = []

    check_existing_index unless force
  end

  # Add a text document
  #
  # @param text [String] Document text (leading/trailing whitespace is stripped)
  # @param metadata [Hash] Additional metadata; an :id key overrides the generated id
  # @return [self]
  # @raise [ArgumentError] if text is nil or blank
  #
  # @example
  #   builder.add("Hello world")
  #   builder.add("Document with metadata", source: "file.txt", chapter: 1)
  def add(text, **metadata)
    raise ArgumentError, "Text cannot be nil" if text.nil?
    raise ArgumentError, "Text cannot be empty" if text.to_s.strip.empty?

    doc = {
      id: metadata.delete(:id) || generate_id,
      text: text.to_s.strip,
      metadata: metadata
    }

    @documents << doc
    self
  end

  # Add document (alias for add)
  alias << add

  # Add content from a file
  #
  # Records :source, :filename and :extension metadata; caller-supplied
  # keys win on conflict via the merge below.
  #
  # @param file_path [String] Path to file
  # @param metadata [Hash] Additional metadata
  # @return [self]
  # @raise [ArgumentError] if the file does not exist
  def add_file(file_path, **metadata)
    raise ArgumentError, "File not found: #{file_path}" unless File.exist?(file_path)

    content = File.read(file_path)
    file_metadata = {
      source: file_path,
      filename: File.basename(file_path),
      extension: File.extname(file_path)
    }.merge(metadata)

    add(content, **file_metadata)
  end

  # Add all files from a directory
  #
  # @param directory [String] Directory path
  # @param pattern [String] Glob pattern (default: "**/*")
  # @param extensions [Array<String>, nil] Filter by extensions (e.g., [".md", ".txt"])
  # @param metadata [Hash] Additional metadata for all files
  # @return [self]
  # @raise [ArgumentError] if the directory does not exist
  def add_directory(directory, pattern: "**/*", extensions: nil, **metadata)
    raise ArgumentError, "Directory not found: #{directory}" unless Dir.exist?(directory)

    full_pattern = File.join(directory, pattern)
    Dir.glob(full_pattern).each do |file_path|
      next unless File.file?(file_path)
      next if extensions && !extensions.include?(File.extname(file_path))

      add_file(file_path, **metadata)
    end

    self
  end

  # Add multiple documents at once
  #
  # @param docs [Array<String>, Array<Hash>] Strings, or hashes with a
  #   :text/"text" key plus arbitrary metadata keys
  # @return [self]
  # @raise [ArgumentError] on unsupported element types or missing text
  def add_all(docs)
    docs.each do |doc|
      case doc
      when String
        add(doc)
      when Hash
        # Operate on a symbolized copy so the caller's hash is not mutated
        # (previously this destructively deleted the hash's text key).
        attrs = doc.transform_keys(&:to_sym)
        add(attrs.delete(:text), **attrs)
      else
        raise ArgumentError, "Invalid document type: #{doc.class}"
      end
    end

    self
  end

  # Get number of documents added
  # @return [Integer]
  def count
    @documents.size
  end
  alias size count

  # Check if any documents have been added
  # @return [Boolean]
  def empty?
    @documents.empty?
  end

  # Build and save the index: embeddings, passages, graph and metadata.
  #
  # @return [Index] The built index, opened from disk
  # @raise [EmptyIndexError] if no documents were added
  def save
    raise EmptyIndexError if empty?

    puts "Building index '#{name}' with #{count} documents..."

    # Create directory if needed
    FileUtils.mkdir_p(File.dirname(path))

    # Delete existing if force mode
    Index.delete(path) if @force && Index.exists?(path)

    embeddings = compute_embeddings
    save_passages
    save_graph(embeddings)
    save_metadata(embeddings)

    puts "Index '#{name}' created successfully!"

    Index.open(path)
  end
  alias build save

  private

  # Append the index extension unless the custom path already carries it.
  def resolve_path(name, custom_path)
    if custom_path
      custom_path.end_with?(Index::INDEX_EXTENSION) ? custom_path : "#{custom_path}#{Index::INDEX_EXTENSION}"
    else
      "#{name}#{Index::INDEX_EXTENSION}"
    end
  end

  def check_existing_index
    raise IndexExistsError, name if Index.exists?(path)
  end

  def generate_id
    SecureRandom.uuid
  end

  def compute_embeddings
    texts = @documents.map { |d| d[:text] }
    embedding_provider.compute(texts)
  end

  # Lazily instantiated provider object (see #load_embedding_provider).
  def embedding_provider
    @_embedding_provider ||= load_embedding_provider
  end

  # Require and build the provider matching @embedding_provider.
  # @raise [ConfigurationError] for unknown providers
  def load_embedding_provider
    require_relative "embedding/base"

    case @embedding_provider
    when :ruby_llm
      require_relative "embedding/ruby_llm"
      Embedding::RubyLLM.new(model: @embedding_model)
    when :openai
      require_relative "embedding/openai"
      Embedding::OpenAI.new(model: @embedding_model)
    when :ollama
      require_relative "embedding/ollama"
      Embedding::Ollama.new(model: @embedding_model)
    when :fastembed
      require_relative "embedding/fastembed"
      Embedding::FastEmbed.new(model: @embedding_model)
    else
      raise ConfigurationError, "Unknown embedding provider: #{@embedding_provider}"
    end
  end

  # Write one JSON document per line, recording each document's byte offset
  # so passages can later be fetched without reading the whole file.
  def save_passages
    passages_file = "#{path}#{Index::PASSAGES_SUFFIX}"
    offsets_file = "#{path}#{Index::OFFSETS_SUFFIX}"

    offsets = {}

    File.open(passages_file, "w") do |f|
      @documents.each do |doc|
        offsets[doc[:id]] = f.pos
        f.puts(JSON.generate(doc))
      end
    end

    File.write(offsets_file, JSON.generate(offsets))
  end

  # Build the HNSW graph over the embeddings and persist it next to the index.
  def save_graph(embeddings)
    ids = @documents.map { |d| d[:id] }

    require_relative "backend/leann_graph"

    graph = Backend::LeannGraph.new(
      dimensions: embeddings.first.size,
      m: Leann.configuration.hnsw_m,
      ef_construction: Leann.configuration.hnsw_ef_construction
    )

    graph.build(ids, embeddings)
    graph.save(path)

    report_storage_savings(embeddings)
  end

  # Print how much smaller the on-disk graph is than raw float32 embeddings.
  def report_storage_savings(embeddings)
    embedding_size = embeddings.first.size * 4 # float32
    total_embedding_bytes = embeddings.length * embedding_size
    # Guard against zero-length vectors: 0/0 would round a NaN and raise.
    return if total_embedding_bytes.zero?

    graph_file = "#{path}.graph.bin"
    actual_size = File.exist?(graph_file) ? File.size(graph_file) : 0

    savings = ((total_embedding_bytes - actual_size).to_f / total_embedding_bytes * 100).round(1)
    puts "Storage savings: #{savings}% (#{format_bytes(total_embedding_bytes)} → #{format_bytes(actual_size)})"
  end

  # Human-readable byte count (B / KB / MB).
  def format_bytes(bytes)
    if bytes < 1024
      "#{bytes} B"
    elsif bytes < 1024 * 1024
      "#{(bytes / 1024.0).round(1)} KB"
    else
      "#{(bytes / (1024.0 * 1024)).round(2)} MB"
    end
  end

  # Persist index-level metadata as pretty-printed JSON.
  def save_metadata(embeddings)
    meta_file = "#{path}#{Index::META_SUFFIX}"

    metadata = {
      version: "1.0",
      name: name,
      backend: "leann",
      embedding_provider: @embedding_provider.to_s,
      embedding_model: @embedding_model,
      dimensions: embeddings.first&.size || 0,
      document_count: @documents.size,
      created_at: Time.now.utc.iso8601,
      config: {
        hnsw_m: Leann.configuration.hnsw_m,
        hnsw_ef_construction: Leann.configuration.hnsw_ef_construction
      }
    }

    File.write(meta_file, JSON.pretty_generate(metadata))
  end
end
317
+ end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Leann
4
+ # Global configuration for Leann
5
+ #
6
+ # @example With RubyLLM (recommended)
7
+ # # If RubyLLM is present, LEANN uses it automatically
8
+ # # Just configure RubyLLM as usual:
9
+ # RubyLLM.configure do |config|
10
+ # config.openai_api_key = ENV["OPENAI_API_KEY"]
11
+ # end
12
+ #
13
+ # @example Manual configuration
14
+ # Leann.configure do |config|
15
+ # config.embedding_provider = :openai
16
+ # config.openai_api_key = ENV["OPENAI_API_KEY"]
17
+ # end
18
+ #
19
class Configuration
  # Embedding provider (:ruby_llm, :openai, :ollama, :fastembed)
  # Defaults to :ruby_llm if RubyLLM gem is available, otherwise :openai
  # @return [Symbol]
  attr_accessor :embedding_provider

  # OpenAI API key (only needed if not using RubyLLM)
  # @return [String, nil]
  attr_accessor :openai_api_key

  # OpenAI base URL (for custom endpoints)
  # @return [String, nil]
  attr_accessor :openai_base_url

  # Ollama host URL
  # @return [String]
  attr_accessor :ollama_host

  # Default embedding model
  # @return [String]
  attr_accessor :default_embedding_model

  # Index storage directory
  # @return [String]
  attr_accessor :index_directory

  # HNSW M parameter (graph connectivity)
  # @return [Integer]
  attr_accessor :hnsw_m

  # HNSW ef_construction parameter
  # @return [Integer]
  attr_accessor :hnsw_ef_construction

  # Default chunk size for text splitting
  # @return [Integer]
  attr_accessor :chunk_size

  # Default chunk overlap
  # @return [Integer]
  attr_accessor :chunk_overlap

  def initialize
    # Default to RubyLLM if available, otherwise OpenAI
    @embedding_provider = ruby_llm_available? ? :ruby_llm : :openai

    @openai_api_key = ENV["OPENAI_API_KEY"]
    @openai_base_url = ENV["OPENAI_BASE_URL"]
    @ollama_host = ENV.fetch("OLLAMA_HOST", "http://localhost:11434")
    @default_embedding_model = nil # Let provider choose default
    @custom_embedding_model = false # flips to true when a model is set explicitly

    @index_directory = ".leann"
    @hnsw_m = 32
    @hnsw_ef_construction = 200

    @chunk_size = 512
    @chunk_overlap = 64
  end

  # Check if RubyLLM gem is available
  # @return [Boolean]
  def ruby_llm_available?
    # defined? yields a String (or nil), so coerce to a true Boolean
    # to honor the documented return type.
    !defined?(::RubyLLM).nil? || gem_available?("ruby_llm")
  end

  # Check if FastEmbed gem is available
  # @return [Boolean]
  def fastembed_available?
    !defined?(::Fastembed).nil? || gem_available?("fastembed")
  end

  # Validate configuration
  # @return [true] when the configuration is usable
  # @raise [ConfigurationError] if configuration is invalid
  def validate!
    case embedding_provider
    when :ruby_llm
      unless ruby_llm_available?
        raise ConfigurationError, "RubyLLM gem is required. Add 'ruby_llm' to your Gemfile."
      end
    when :openai
      raise ConfigurationError, "OpenAI API key is required" if openai_api_key.nil? || openai_api_key.empty?
    when :ollama
      # Ollama doesn't require API key, just needs to be running
    when :fastembed
      unless fastembed_available?
        raise ConfigurationError, "FastEmbed gem is required. Add 'fastembed' to your Gemfile."
      end
    else
      raise ConfigurationError, "Unknown embedding provider: #{embedding_provider}"
    end

    true
  end

  # Get embedding model for a provider
  #
  # An explicitly assigned default_embedding_model always wins;
  # otherwise a per-provider default is returned.
  # @return [String, nil]
  def embedding_model_for(provider)
    # Return custom model if explicitly set
    return @default_embedding_model if @custom_embedding_model

    # Provider-specific defaults
    case provider
    when :ruby_llm
      nil # RubyLLM uses its own configured default
    when :openai
      "text-embedding-3-small"
    when :ollama
      "nomic-embed-text"
    when :fastembed
      "BAAI/bge-small-en-v1.5"
    else
      @default_embedding_model
    end
  end

  # Custom writer: remembers that the user set a model explicitly so
  # #embedding_model_for stops falling back to provider defaults.
  def default_embedding_model=(value)
    @default_embedding_model = value
    @custom_embedding_model = true
  end

  private

  # True when the named gem is installed (not necessarily loaded).
  def gem_available?(name)
    Gem::Specification.find_by_name(name)
    true
  rescue Gem::LoadError
    false
  end
end
148
+ end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Leann
4
+ module Embedding
5
+ # Base class for embedding providers
6
+ #
7
+ # Subclasses must implement:
8
+ # - #compute(texts) -> Array<Array<Float>>
9
+ #
10
class Base
  # Name of the embedding model in use.
  # @return [String]
  attr_reader :model

  # Dimensionality of produced vectors, when known.
  # @return [Integer, nil]
  attr_reader :dimensions

  # @param model [String] embedding model identifier
  def initialize(model:)
    @model = model
    @dimensions = nil
  end

  # Abstract hook: turn a list of texts into one vector per text.
  #
  # @param texts [Array<String>] texts to embed
  # @return [Array<Array<Float>>] one embedding per input text
  # @raise [NotImplementedError] unless a subclass overrides this
  def compute(texts)
    raise NotImplementedError, "Subclasses must implement #compute"
  end

  # Convenience wrapper that embeds exactly one text.
  #
  # @param text [String]
  # @return [Array<Float>]
  def compute_one(text)
    vector, = compute([text])
    vector
  end

  protected

  # L2-normalize a vector to unit length; zero vectors pass through untouched.
  #
  # @param embedding [Array<Float>]
  # @return [Array<Float>]
  def normalize(embedding)
    length = Math.sqrt(embedding.sum { |component| component * component })
    return embedding if length.zero?

    embedding.map { |component| component / length }
  end

  # Yield +items+ in consecutive slices of at most +batch_size+ elements.
  #
  # @param items [Array]
  # @param batch_size [Integer]
  # @yield [Array] each slice in order
  def in_batches(items, batch_size, &block)
    items.each_slice(batch_size, &block)
  end
end
62
+ end
63
+ end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module Leann
6
+ module Embedding
7
+ # FastEmbed provider for local embeddings
8
+ #
9
+ # Uses ONNX Runtime for fast, local embedding generation without
10
+ # requiring an API key or external service.
11
+ #
12
+ # @example
13
+ # provider = Leann::Embedding::FastEmbed.new(model: "BAAI/bge-small-en-v1.5")
14
+ # embeddings = provider.compute(["Hello", "World"])
15
+ #
16
class FastEmbed < Base
  # Largest number of texts sent to the runtime per call.
  MAX_BATCH_SIZE = 64

  # Supported models with their dimensions
  MODELS = {
    "BAAI/bge-small-en-v1.5" => 384,
    "BAAI/bge-base-en-v1.5" => 768,
    "intfloat/multilingual-e5-small" => 384,
    "nomic-ai/nomic-embed-text-v1.5" => 768
  }.freeze

  DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"

  # @param model [String, nil] FastEmbed model name (defaults to DEFAULT_MODEL)
  # @param cache_dir [String, nil] Model cache directory (falls back to FASTEMBED_CACHE_PATH env var)
  # @param threads [Integer, nil] Number of ONNX threads
  # @raise [ConfigurationError] if the fastembed gem is not loaded
  def initialize(model: nil, cache_dir: nil, threads: nil)
    model ||= DEFAULT_MODEL
    super(model: model)

    @cache_dir = cache_dir || ENV["FASTEMBED_CACHE_PATH"]
    @threads = threads
    @client = nil

    check_gem!
  end

  # Compute embeddings for texts, batching to MAX_BATCH_SIZE and printing
  # one "." per batch as a progress indicator.
  #
  # @param texts [Array<String>]
  # @return [Array<Array<Float>>]
  def compute(texts)
    return [] if texts.empty?

    all_embeddings = []

    in_batches(texts, MAX_BATCH_SIZE) do |batch|
      batch_embeddings = compute_batch(batch)
      all_embeddings.concat(batch_embeddings)
      print "." # Progress indicator
    end

    # Only finish the progress line for inputs large enough to span batches.
    puts " Done! (#{all_embeddings.size} embeddings)" unless texts.size < MAX_BATCH_SIZE

    # FastEmbed returns normalized vectors by default
    all_embeddings
  end

  # Get dimensions for the configured model; unknown models are probed once.
  # @return [Integer]
  def dimensions
    @dimensions ||= MODELS[model] || detect_dimensions
  end

  private

  # Fail fast with installation instructions when the gem is missing.
  def check_gem!
    unless defined?(::Fastembed)
      raise ConfigurationError, <<~MSG
        FastEmbed gem is required for local embeddings.

        Add to your Gemfile:
          gem 'fastembed'

        Or install directly:
          gem install fastembed
      MSG
    end
  end

  # Lazily construct the Fastembed client with the configured options.
  def client
    @client ||= begin
      options = { model_name: model }
      options[:cache_dir] = @cache_dir if @cache_dir
      options[:threads] = @threads if @threads

      ::Fastembed::TextEmbedding.new(**options)
    end
  end

  # Embed one batch, wrapping any failure in EmbeddingError.
  def compute_batch(texts)
    # FastEmbed returns a lazy enumerator, convert to array
    client.embed(texts, batch_size: texts.size).to_a
  rescue ::Fastembed::Error, StandardError => e
    # The original rescued ::Fastembed::Error and StandardError in two
    # byte-identical branches; a single clause listing both classes gives
    # exactly the same handling without the duplication.
    raise EmbeddingError.new(
      "FastEmbed error: #{e.message}",
      provider: :fastembed,
      original_error: e
    )
  end

  # Compute a single embedding to detect dimensions for unknown models.
  def detect_dimensions
    sample = client.embed(["test"], batch_size: 1).first
    sample.size
  end
end
+ end
119
+ end
120
+ end