leann 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,332 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Leann
4
+ module Rails
5
+ # ActiveRecord-based storage backend for LEANN graphs
6
+ #
7
+ # Stores graph structure (neighbor lists) in the passages table,
8
+ # avoiding the need for separate binary files.
9
+ #
10
+ class ActiveRecordBackend
11
+ attr_reader :index, :dimensions, :m, :ef_construction
12
+
13
# Wrap an index record and resolve the HNSW parameters used for building.
#
# Values saved in the index's config take precedence; anything missing
# falls back to the global Leann configuration.
#
# @param index [Leann::Rails::Index] The index record
def initialize(index)
  @index = index
  @dimensions = index.dimensions

  stored = index.config || {}
  @m = stored["hnsw_m"] || Leann.configuration.hnsw_m
  @ef_construction = stored["hnsw_ef_construction"] || Leann.configuration.hnsw_ef_construction

  # Graph bookkeeping starts out empty.
  @entry_point_id = nil
  @max_level = 0
end
23
+
24
# Build the graph from documents and embeddings and persist it.
#
# Builds the in-memory graph, stores the passages together with their
# neighbor lists, then records the graph metadata on the index record.
# Does nothing when +documents+ is empty.
#
# @param documents [Array<Hash>] Documents with :id, :text, :metadata
# @param embeddings [Array<Array<Float>>] Embedding vectors; embeddings[i]
#   belongs to documents[i]
# @raise [ArgumentError] if the number of embeddings differs from the
#   number of documents
# @return [nil]
def build(documents, embeddings)
  return if documents.empty?

  # Documents and embeddings are paired by position. Without this check a
  # short embeddings array only fails later, inside the distance
  # computation, with an unrelated-looking nil error.
  unless embeddings.size == documents.size
    raise ArgumentError,
          "expected #{documents.size} embeddings (one per document), got #{embeddings.size}"
  end

  puts "Building LEANN graph with #{documents.size} nodes (M=#{@m})..."

  # Build in-memory graph first using the core algorithm
  graph = build_graph(documents, embeddings)

  # Store passages with neighbor information
  store_passages(documents, graph)

  # Store graph metadata in the index
  update_index_metadata(graph)

  puts "Graph built and stored in database: #{documents.size} passages"
end
44
+
45
# Search the stored graph for the passages closest to a query vector.
#
# Returns an empty array when the index has no passages or when no entry
# point has been recorded in the index config.
#
# @param query_embedding [Array<Float>] Query vector
# @param embedding_provider [Leann::Embedding::Base] Provider for recomputing embeddings
# @param passages [Hash] Passage texts by ID (for embedding recomputation)
# @param limit [Integer] Number of results
# @return [Array<Array>] Array of [id, score] pairs
def search(query_embedding, embedding_provider:, passages:, limit:)
  return [] if @index.passages.empty?

  # Graph metadata written at build time
  metadata = @index.config || {}
  start_id = metadata["entry_point_id"]
  top_level = metadata["max_level"] || 0
  return [] unless start_id

  # Candidate pool: twice the requested results, but never fewer than 50
  pool_size = limit * 2
  pool_size = 50 if pool_size < 50

  search_hnsw(query_embedding, start_id, top_level, pool_size, embedding_provider, passages, limit)
end
66
+
67
+ private
68
+
69
# Build the in-memory HNSW-like graph by inserting documents one at a time.
#
# Every node gets a randomly drawn top level and one (initially empty)
# neighbor list per level. The first document becomes the entry point; a
# later node takes over whenever its level exceeds the current maximum.
#
# @param documents [Array<Hash>] Documents; only :id is read here
# @param embeddings [Array<Array<Float>>] embeddings[i] belongs to documents[i]
# @return [Hash] { graph:, entry_point:, max_level: } where graph maps a
#   document id to { embedding:, neighbors: [one id list per level] }
def build_graph(documents, embeddings)
  graph = {}
  entry_point = nil
  max_level = 0

  documents.each_with_index do |doc, idx|
    id = doc[:id]
    embedding = embeddings[idx]
    level = random_level

    graph[id] = { embedding: embedding, neighbors: Array.new(level + 1) { [] } }

    # First node: nothing to connect to yet, it simply seeds the graph.
    if entry_point.nil?
      entry_point = id
      max_level = level
      next
    end

    insert_node(graph, id, embedding, entry_point, max_level, level)

    # Promote the new node only after insertion, so insertion still
    # descends from the previous entry point.
    next unless level > max_level

    max_level = level
    entry_point = id
  end

  { graph: graph, entry_point: entry_point, max_level: max_level }
end
100
+
101
# Draw the top layer for a new node.
#
# Uses the standard HNSW level assignment floor(-ln(U) * mL) with
# mL = 1 / ln(M), so P(level >= k) = M**-k: most nodes stay on layer 0
# and each higher layer is M times sparser. The result is capped at 16.
#
# The previous loop compared rand against exp(0) = 1.0 on its first pass,
# which is always true, so every node was placed on at least layer 1.
#
# @return [Integer] Level in 0..16
def random_level
  # log(M) is zero or negative for M <= 1, which makes the normalization
  # factor meaningless; fall back to a flat, single-layer graph.
  return 0 if @m <= 1

  ml = 1.0 / Math.log(@m)
  # rand is in [0, 1), so 1.0 - rand is in (0, 1] and the log is finite.
  level = (-Math.log(1.0 - rand) * ml).floor
  [level, 16].min
end
109
+
110
# Insert a node (already present in +graph+ with empty neighbor lists)
# into the layered graph.
#
# Phase 1 walks greedily from the entry point down to the layer just above
# the node's own top layer. Phase 2 connects the node on every layer it
# shares with the existing graph, from the top shared layer down to 0.
#
# @param graph [Hash] id => { embedding:, neighbors: [ids per level] }
# @param new_id [Object] Id of the node being inserted
# @param new_embedding [Array<Float>] Its embedding
# @param entry_point [Object] Current entry point id
# @param max_level [Integer] Current top layer of the graph
# @param node_level [Integer] Top layer assigned to the new node
def insert_node(graph, new_id, new_embedding, entry_point, max_level, node_level)
  current = entry_point

  # Traverse from top to node's level
  max_level.downto(node_level + 1) do |level|
    current = greedy_search_level(graph, new_embedding, current, level)
  end

  # Layers above max_level contain no other nodes yet. Linking there would
  # write neighbor lists onto nodes at layers they do not belong to, so
  # insertion starts at the highest layer both sides actually share.
  top_shared_level = [node_level, max_level].min

  top_shared_level.downto(0) do |level|
    candidates = search_level(graph, new_embedding, current, level, @ef_construction)

    # Select M best neighbors
    selected = select_neighbors(graph, new_embedding, candidates, @m)
    graph[new_id][:neighbors][level] = selected

    # Add bidirectional connections, pruning lists that grow past 2 * M
    selected.each do |neighbor_id|
      links = graph[neighbor_id][:neighbors][level] || []
      links << new_id

      if links.size > @m * 2
        links = select_neighbors(graph, graph[neighbor_id][:embedding], links, @m * 2)
      end

      graph[neighbor_id][:neighbors][level] = links
    end

    # Continue the descent from the closest selected neighbor
    current = selected.first if selected.any?
  end
end
146
+
147
# Hill-climb on one layer: repeatedly scan the current node's neighbors
# and move to a closer one until a full scan finds no improvement.
#
# @param graph [Hash] id => { embedding:, neighbors: [ids per level] }
# @param query [Array<Float>] Vector to approach
# @param entry [Object] Id to start from
# @param level [Integer] Layer to walk on
# @return [Object] Id of the node where the walk stopped
def greedy_search_level(graph, query, entry, level)
  best_id = entry
  best_dist = cosine_distance(query, graph[best_id][:embedding])

  improved = true
  while improved
    improved = false

    # The list is taken from the node we stood on when the scan began,
    # even if best_id changes part-way through it.
    (graph[best_id][:neighbors][level] || []).each do |candidate_id|
      node = graph[candidate_id]
      next unless node

      candidate_dist = cosine_distance(query, node[:embedding])
      next unless candidate_dist < best_dist

      best_id = candidate_id
      best_dist = candidate_dist
      improved = true
    end
  end

  best_id
end
171
+
172
# Best-first search on a single layer that keeps the +ef+ closest nodes.
#
# @param graph [Hash] id => { embedding:, neighbors: [ids per level] }
# @param query [Array<Float>] Vector to search for
# @param entry [Object] Id to start from
# @param level [Integer] Layer to search on
# @param ef [Integer] Maximum number of results kept
# @return [Array] Ids of the kept nodes, closest first
def search_level(graph, query, entry, level, ef)
  seen = Set.new([entry])
  frontier = [[cosine_distance(query, graph[entry][:embedding]), entry]]
  kept = frontier.dup

  until frontier.empty?
    # Always expand the closest unexpanded candidate next
    frontier.sort_by!(&:first)
    node_dist, node_id = frontier.shift

    # Stop once the closest candidate is farther than the worst kept result
    break if !kept.empty? && node_dist > kept.last.first

    (graph[node_id][:neighbors][level] || []).each do |neighbor_id|
      next if seen.include?(neighbor_id)
      next unless graph[neighbor_id]

      seen << neighbor_id
      neighbor_dist = cosine_distance(query, graph[neighbor_id][:embedding])
      next unless kept.size < ef || neighbor_dist < kept.last.first

      frontier << [neighbor_dist, neighbor_id]
      kept << [neighbor_dist, neighbor_id]
      kept.sort_by!(&:first)
      kept.pop if kept.size > ef
    end
  end

  kept.map(&:last)
end
202
+
203
# Keep the +m+ candidates closest to +query+.
#
# When there are at most +m+ candidates the input array itself is
# returned, unsorted and uncopied.
#
# @param graph [Hash] id => { embedding:, ... }
# @param query [Array<Float>] Reference vector
# @param candidates [Array] Candidate ids
# @param m [Integer] Maximum number of ids to keep
# @return [Array] Selected ids (closest first when trimming happened)
def select_neighbors(graph, query, candidates, m)
  return candidates unless candidates.size > m

  by_distance = candidates.sort_by { |id| cosine_distance(query, graph[id][:embedding]) }
  by_distance.take(m)
end
212
+
213
# Cosine distance (1 - cosine similarity) between two vectors.
#
# Iterates over the indices of +a+ and reads the same positions of +b+.
# A small epsilon in the denominator avoids division by zero for
# zero-length or all-zero vectors.
#
# @param a [Array<Float>]
# @param b [Array<Float>]
# @return [Float]
def cosine_distance(a, b)
  dot = 0.0
  sq_a = 0.0
  sq_b = 0.0

  a.each.with_index do |x, i|
    y = b[i]
    dot += x * y
    sq_a += x * x
    sq_b += y * y
  end

  denominator = Math.sqrt(sq_a) * Math.sqrt(sq_b) + 1e-10
  1.0 - dot / denominator
end
227
+
228
# Bulk-insert one passage row per document, including its layer-0
# neighbor list from the built graph.
#
# @param documents [Array<Hash>] Documents with :id, :text, :metadata
# @param graph_data [Hash] Result of build_graph; only :graph is read
# @return [Object] Whatever Passage.insert_all returns
def store_passages(documents, graph_data)
  graph = graph_data[:graph]

  # One timestamp for the whole batch, so created_at and updated_at of a
  # freshly inserted row are identical.
  now = Time.current

  passage_records = documents.map do |doc|
    node = graph[doc[:id]]

    {
      leann_index_id: @index.id,
      external_id: doc[:id],
      text: doc[:text],
      metadata: doc[:metadata] || {},
      # Store only level-0 neighbors (most important for search)
      neighbors: node[:neighbors][0] || [],
      created_at: now,
      updated_at: now
    }
  end

  Passage.insert_all(passage_records)
end
250
+
251
# Merge the graph's entry point, top level and build parameters into the
# index record's config and save it.
#
# @param graph_data [Hash] Result of build_graph (:entry_point, :max_level)
def update_index_metadata(graph_data)
  graph_settings = {
    "entry_point_id" => graph_data[:entry_point],
    "max_level" => graph_data[:max_level],
    "hnsw_m" => @m,
    "hnsw_ef_construction" => @ef_construction
  }
  existing = @index.config || {}

  @index.update!(config: existing.merge(graph_settings))
end
261
+
262
# Best-first search over the stored layer-0 neighbor lists, recomputing
# embeddings from passage text on demand.
#
# @param query_embedding [Array<Float>] Query vector
# @param entry_point_id [Object] External id of the passage to start from
# @param max_level [Integer] Top graph level (accepted but not used here)
# @param ef [Integer] Size of the candidate pool to keep
# @param embedding_provider [#compute] Called with a one-element array of
#   text; the first element of its result is used as the embedding
# @param passages [Hash] Passage texts by id, preferred over stored text
# @param limit [Integer] Number of results to return
# @return [Array<Array>] [id, score] pairs, score = 1.0 - distance
def search_hnsw(query_embedding, entry_point_id, max_level, ef, embedding_provider, passages, limit)
  # Load passages with neighbors
  passage_map = @index.passages.index_by(&:external_id)
  return [] if passage_map.empty?
  return [] unless passage_map[entry_point_id]

  # Embeddings are computed at most once per passage that has text
  embedding_cache = {}
  embed = lambda do |id|
    cached = embedding_cache[id]
    next cached if cached

    text = passages[id] || passage_map[id]&.text
    next nil unless text

    embedding_cache[id] = embedding_provider.compute([text]).first
  end

  entry_embedding = embed.call(entry_point_id)
  return [] unless entry_embedding

  # Simple greedy search at level 0 (most passages only have level 0)
  visited = Set.new([entry_point_id])
  frontier = [[cosine_distance(query_embedding, entry_embedding), entry_point_id]]
  best = frontier.dup

  until frontier.empty?
    frontier.sort_by!(&:first)
    dist, current_id = frontier.shift

    # Pool is full and the closest candidate cannot improve it: done
    break if best.size >= ef && dist > best.last.first

    current_passage = passage_map[current_id]
    next unless current_passage

    current_passage.neighbor_ids.each do |neighbor_id|
      next if visited.include?(neighbor_id)

      visited << neighbor_id

      neighbor_embedding = embed.call(neighbor_id)
      next unless neighbor_embedding

      neighbor_dist = cosine_distance(query_embedding, neighbor_embedding)
      next unless best.size < ef || neighbor_dist < best.last.first

      frontier << [neighbor_dist, neighbor_id]
      best << [neighbor_dist, neighbor_id]
      best.sort_by!(&:first)
      best.pop if best.size > ef
    end
  end

  # Convert distances to similarity scores
  best.first(limit).map { |distance, id| [id, 1.0 - distance] }
end
330
+ end
331
+ end
332
+ end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../leann"
4
+
5
+ module Leann
6
+ module Rails
7
+ autoload :Index, "leann/rails/active_record/index"
8
+ autoload :Passage, "leann/rails/active_record/passage"
9
+ autoload :ActiveRecordBackend, "leann/rails/storage/active_record_backend"
10
+ autoload :Builder, "leann/rails/builder"
11
+ autoload :Searcher, "leann/rails/searcher"
12
+
13
class << self
  # Build a new index stored in the database
  #
  # The optional block is evaluated in the context of the builder, so
  # builder methods such as +add+ can be called without a receiver.
  #
  # @param name [String] Index name (unique identifier)
  # @param options [Hash] Options for building
  # @option options [Symbol] :embedding (:openai) Embedding provider
  # @option options [String] :model Embedding model name
  #
  # @example
  #   Leann::Rails.build("products") do
  #     add "Red running shoes for athletes", category: "shoes"
  #     add "Blue denim jeans, slim fit", category: "pants"
  #   end
  #
  # @return [Leann::Rails::Index] The built index record
  def build(name, **options, &block)
    Builder.new(name, **options).tap do |builder|
      builder.instance_eval(&block) if block
    end.save
  end

  # Search an existing database index
  #
  # @param name [String] Index name
  # @param query [String] Search query
  # @param limit [Integer] Maximum results
  # @param threshold [Float] Minimum similarity score
  # @param filters [Hash] Metadata filters
  #
  # @example
  #   results = Leann::Rails.search("products", "comfortable shoes")
  #
  # @return [Leann::SearchResults]
  def search(name, query, limit: 5, threshold: nil, filters: nil)
    record = Index.find_by!(name: name)
    Searcher.new(record).search(query, limit: limit, threshold: threshold, filters: filters)
  end

  # Open an existing index
  #
  # @param name [String] Index name
  # @return [Leann::Rails::Index]
  def open(name)
    Index.find_by!(name: name)
  end

  # Check if an index exists
  #
  # @param name [String] Index name
  # @return [Boolean]
  def exists?(name)
    Index.exists?(name: name)
  end

  # Delete an index and all its passages
  #
  # @param name [String] Index name
  # @return [Boolean] false when no index has that name, true otherwise
  def delete(name)
    record = Index.find_by(name: name)

    if record
      record.destroy
      true
    else
      false
    end
  end

  # List all indexes
  #
  # @return [Array<String>] Index names, sorted
  def list
    names = Index.pluck(:name)
    names.sort
  end
end
87
+ end
88
+ end
89
+
90
+ require "leann/rails/railtie" if defined?(::Rails::Railtie)
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../../leann"
4
+
5
+ # Only require ruby_llm if not already defined (allows mocking in tests)
6
+ unless defined?(::RubyLLM::Tool)
7
+ begin
8
+ require "ruby_llm"
9
+ rescue LoadError
10
+ raise LoadError, "RubyLLM is required for Leann::RubyLLM::Search. Add 'ruby_llm' to your Gemfile."
11
+ end
12
+ end
13
+
14
+ module Leann
15
+ module RubyLLM
16
# A RubyLLM tool for searching LEANN indexes
#
# @example Basic usage
#   chat = ::RubyLLM.chat(model: "gpt-4o")
#     .with_tool(Leann::RubyLLM::Search.new("my_index"))
#
#   chat.ask("What does LEANN do?")
#   # => LLM searches the index and generates an answer
#
# @example Multiple indexes
#   docs_search = Leann::RubyLLM::Search.new("docs", name: "search_docs")
#   code_search = Leann::RubyLLM::Search.new("codebase", name: "search_code")
#
#   chat = ::RubyLLM.chat(model: "gpt-4o")
#     .with_tools(docs_search, code_search)
#
class Search < ::RubyLLM::Tool
  # @param index_name [String] Name of the LEANN index to search
  # @param name [String] Tool name (defaults to "leann_search")
  # @param limit [Integer] Default number of results (default: 5)
  def initialize(index_name, name: "leann_search", limit: 5)
    @index_name = index_name
    @default_limit = limit
    @tool_name = name
    super()
  end

  # @return [String] Tool name given at construction
  def name
    @tool_name
  end

  # @return [String] Description of the tool, naming the index it searches
  def description
    "Searches the '#{@index_name}' knowledge base for relevant documents. " \
      "Use this to find information before answering questions."
  end

  # Parameter schema: a required +query+ string and an optional +limit+.
  #
  # @return [::RubyLLM::Schema]
  def params
    # Read the ivar before entering the DSL block. NOTE(review): if
    # Schema.new evaluates the block with a different self (instance_eval
    # style), @default_limit inside it would resolve against that object
    # and interpolate as empty. A local is captured by the closure either way.
    default_limit = @default_limit

    ::RubyLLM::Schema.new do
      string :query,
             description: "The search query to find relevant documents",
             required: true
      integer :limit,
              description: "Maximum number of results to return (default: #{default_limit})",
              required: false
    end
  end

  # Run the search and shape the results into a plain hash.
  #
  # Errors are reported in the returned hash instead of being raised.
  #
  # @param query [String] Search query
  # @param limit [Integer, nil] Result count; the constructor default when nil
  # @return [Hash] { found:, count:, documents: }, { found: false, message: },
  #   or { error: }
  def execute(query:, limit: nil)
    limit ||= @default_limit
    results = Leann.search(@index_name, query, limit: limit)

    if results.empty?
      { found: false, message: "No relevant documents found for: #{query}" }
    else
      {
        found: true,
        count: results.size,
        documents: results.map do |r|
          {
            text: r.text,
            score: r.score.round(3),
            metadata: r.metadata
          }
        end
      }
    end
  rescue Leann::IndexNotFoundError
    { error: "Index '#{@index_name}' not found" }
  rescue StandardError => e
    { error: e.message }
  end
end
88
+ end
89
+ end
@@ -0,0 +1,195 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Leann
4
# Represents a single search result
#
# @example
#   results = Leann.search("my_index", "query")
#   results.each do |result|
#     puts result.text
#     puts result.score
#     puts result.metadata[:source]
#   end
#
class SearchResult
  # @return [String] Document ID
  attr_reader :id

  # @return [String] Document text
  attr_reader :text

  # @return [Float] Similarity score (higher is better)
  attr_reader :score

  # @return [Hash] Document metadata (keys converted to symbols)
  attr_reader :metadata

  # @param id [String]
  # @param text [String]
  # @param score [Float]
  # @param metadata [Hash, nil] nil is treated as an empty hash
  def initialize(id:, text:, score:, metadata: {})
    @id = id
    @text = text
    @score = score.to_f
    @metadata = (metadata || {}).transform_keys(&:to_sym)
  end

  # Truncate text to a maximum length
  #
  # When +max_length+ is shorter than the omission itself, no text is kept
  # and the omission alone is returned.
  #
  # @param max_length [Integer]
  # @param omission [String]
  # @return [String]
  def truncated_text(max_length = 100, omission: "...")
    return text if text.length <= max_length

    # A negative length would make String#[] return nil
    keep = [max_length - omission.length, 0].max
    text[0, keep] + omission
  end

  # Human-readable string representation
  # @return [String]
  def to_s
    "[#{format("%.3f", score)}] #{truncated_text(80)}"
  end

  # Detailed inspection
  # @return [String]
  def inspect
    "#<Leann::SearchResult id=#{id.inspect} score=#{format("%.4f", score)} text=#{truncated_text(50).inspect}>"
  end

  # Convert to hash
  # @return [Hash]
  def to_h
    {
      id: id,
      text: text,
      score: score,
      metadata: metadata
    }
  end

  # Compare by score (for sorting)
  # @param other [SearchResult]
  # @return [Integer, nil] nil when +other+ has no score to compare with
  def <=>(other)
    return nil unless other.respond_to?(:score)

    other.score <=> score # Descending order
  end

  # Check equality
  # @param other [SearchResult]
  # @return [Boolean]
  def ==(other)
    return false unless other.is_a?(SearchResult)

    id == other.id && text == other.text && score == other.score
  end
  alias eql? ==

  # Hash code for use as hash key
  # @return [Integer]
  def hash
    [id, text, score].hash
  end
end
94
+
95
# Collection of search results with utility methods
class SearchResults
  include Enumerable

  # @return [Array<SearchResult>]
  attr_reader :results

  # @return [String] Original query
  attr_reader :query

  # @return [Float] Search duration in seconds
  attr_reader :duration

  # @param results [Array<SearchResult>]
  # @param query [String]
  # @param duration [Float]
  def initialize(results, query: nil, duration: nil)
    @results = results
    @query = query
    @duration = duration
  end

  # Iterate over results
  def each(&block)
    results.each(&block)
  end

  # Number of results
  # @return [Integer]
  def size
    results.size
  end
  alias length size

  # Count results. Without arguments this is the number of results; an
  # item or a block is honoured as in Enumerable#count. (Aliasing count to
  # size would silently ignore a block and reject an argument.)
  #
  # @return [Integer]
  def count(*args, &block)
    results.count(*args, &block)
  end

  # Check if empty
  # @return [Boolean]
  def empty?
    results.empty?
  end

  # Get the first result, or the first +n+ results when +n+ is given
  # @param n [Integer] optional number of results
  # @return [SearchResult, nil, Array<SearchResult>]
  def first(*n)
    results.first(*n)
  end

  # Get top n results
  # @param n [Integer]
  # @return [Array<SearchResult>]
  def top(n)
    results.first(n)
  end

  # Get result by index
  # @param index [Integer]
  # @return [SearchResult, nil]
  def [](index)
    results[index]
  end

  # Filter results by minimum score
  # @param min_score [Float]
  # @return [SearchResults]
  def above(min_score)
    filtered = results.select { |r| r.score >= min_score }
    SearchResults.new(filtered, query: query, duration: duration)
  end

  # Get all texts
  # @return [Array<String>]
  def texts
    results.map(&:text)
  end

  # Join all texts
  # @param separator [String]
  # @return [String]
  def combined_text(separator: "\n\n")
    texts.join(separator)
  end

  # Pretty print results
  # @return [String]
  def to_s
    lines = ["Search results for: #{query.inspect}"]
    lines << "Found #{size} results in #{format("%.3f", duration || 0)}s"
    lines << "-" * 60
    results.each_with_index do |r, i|
      lines << "#{i + 1}. #{r}"
    end
    lines.join("\n")
  end

  # Convert to array of hashes
  # @return [Array<Hash>]
  def to_a
    results.map(&:to_h)
  end
end
195
+ end