leann 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +375 -0
- data/exe/leann +167 -0
- data/lib/generators/leann/install/install_generator.rb +51 -0
- data/lib/generators/leann/install/templates/migration.rb.erb +28 -0
- data/lib/leann/backend/base.rb +51 -0
- data/lib/leann/backend/leann_graph.rb +476 -0
- data/lib/leann/builder.rb +317 -0
- data/lib/leann/configuration.rb +148 -0
- data/lib/leann/embedding/base.rb +63 -0
- data/lib/leann/embedding/fastembed.rb +120 -0
- data/lib/leann/embedding/ollama.rb +194 -0
- data/lib/leann/embedding/openai.rb +149 -0
- data/lib/leann/embedding/ruby_llm.rb +57 -0
- data/lib/leann/errors.rb +71 -0
- data/lib/leann/index.rb +236 -0
- data/lib/leann/rails/active_record/index.rb +70 -0
- data/lib/leann/rails/active_record/passage.rb +56 -0
- data/lib/leann/rails/builder.rb +205 -0
- data/lib/leann/rails/railtie.rb +16 -0
- data/lib/leann/rails/searcher.rb +117 -0
- data/lib/leann/rails/storage/active_record_backend.rb +332 -0
- data/lib/leann/rails.rb +90 -0
- data/lib/leann/ruby_llm/search.rb +89 -0
- data/lib/leann/search_result.rb +195 -0
- data/lib/leann/searcher.rb +189 -0
- data/lib/leann/version.rb +3 -0
- data/lib/leann.rb +133 -0
- metadata +177 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Leann
  module Rails
    # ActiveRecord model backing a LEANN index (table "leann_indexes").
    #
    # @example
    #   index = Leann::Rails::Index.find_by(name: "products")
    #   index.search("running shoes")
    #
    class Index < ::ActiveRecord::Base
      self.table_name = "leann_indexes"

      # Passages reference their index through the leann_index_id column.
      has_many :passages,
               class_name: "Leann::Rails::Passage",
               foreign_key: :leann_index_id,
               dependent: :delete_all

      validates :name, presence: true, uniqueness: true
      validates :embedding_provider, presence: true
      validates :dimensions, presence: true, numericality: { greater_than: 0 }

      # The config attribute is stored through the JSON coder.
      serialize :config, coder: JSON

      # Run a search against this index by delegating to a fresh Searcher.
      #
      # @param query [String] Search query
      # @param limit [Integer] Maximum results
      # @param threshold [Float] Minimum similarity score
      # @param filters [Hash] Metadata filters
      # @return [Leann::SearchResults]
      def search(query, limit: 5, threshold: nil, filters: nil)
        Searcher.new(self).search(query, limit: limit, threshold: threshold, filters: filters)
      end

      # Number of passages attached to this index.
      #
      # @return [Integer]
      def document_count
        passages.count
      end

      # The stored embedding provider name converted to a symbol.
      #
      # @return [Symbol]
      def embedding_provider_sym
        embedding_provider.to_sym
      end

      # Multi-line, human-readable summary of the index.
      #
      # @return [String]
      def to_s
        <<~INFO.chomp
          Index: #{name}
            Documents: #{document_count}
            Embedding: #{embedding_provider}/#{embedding_model}
            Dimensions: #{dimensions}
            Backend: active_record
            Created: #{created_at&.strftime('%Y-%m-%d %H:%M:%S') || 'unknown'}
        INFO
      end

      # Compact one-line representation showing id, name and document count.
      #
      # @return [String]
      def inspect
        format("#<Leann::Rails::Index id=%s name=%s documents=%s>", id, name.inspect, document_count)
      end
    end
  end
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Leann
  module Rails
    # ActiveRecord model for a single passage/document stored within an index
    # (table "leann_passages").
    #
    # @example
    #   passage = Leann::Rails::Passage.find(123)
    #   passage.text      # => "Document content..."
    #   passage.metadata  # => { category: "docs" }
    #
    class Passage < ::ActiveRecord::Base
      self.table_name = "leann_passages"

      belongs_to :index,
                 class_name: "Leann::Rails::Index",
                 foreign_key: :leann_index_id

      validates :external_id, presence: true
      validates :text, presence: true
      validates :leann_index_id, presence: true

      # Both attributes are stored through the JSON coder.
      serialize :metadata, coder: JSON
      serialize :neighbors, coder: JSON

      # Metadata with symbolized keys; a nil metadata value yields an empty hash.
      #
      # @return [Hash]
      def metadata_sym
        (metadata || {}).transform_keys(&:to_sym)
      end

      # Stored neighbor IDs, or an empty array when none are stored.
      #
      # @return [Array<String>]
      def neighbor_ids
        neighbors || []
      end

      # Hash form used for search results; +id+ is the external_id, not the
      # database primary key.
      #
      # @return [Hash]
      def to_h
        {
          id: external_id,
          text: text,
          metadata: metadata_sym
        }
      end

      # One-line representation with the text shortened for readability.
      #
      # Texts longer than 50 characters are cut to their first 48 characters
      # plus "...". A nil text (e.g. a record that has not been populated yet)
      # is rendered as +nil+ rather than raising, so the record can always be
      # inspected in a console or an error message.
      #
      # @return [String]
      def inspect
        text_preview = text && text.length > 50 ? "#{text[0..47]}..." : text
        "#<Leann::Rails::Passage id=#{id} external_id=#{external_id.inspect} text=#{text_preview.inspect}>"
      end
    end
  end
end
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "securerandom"
|
|
4
|
+
|
|
5
|
+
module Leann
  module Rails
    # Builds a new LEANN index stored in the database
    #
    # @example DSL style
    #   Leann::Rails.build("products") do
    #     add "Red running shoes", category: "shoes"
    #     add "Blue denim jeans", category: "pants"
    #   end
    #
    # @example Programmatic style
    #   builder = Leann::Rails::Builder.new("products")
    #   builder.add("Red running shoes", category: "shoes")
    #   builder.save
    #
    class Builder
      # @return [String] Index name
      attr_reader :name

      # @return [Array<Hash>] Documents to be indexed
      attr_reader :documents

      # @param name [String] Index name (must be unique)
      # @param embedding [Symbol, String, nil] Embedding provider
      #   (:ruby_llm, :openai, :ollama, :fastembed); falls back to
      #   Leann.configuration.embedding_provider when nil
      # @param model [String, nil] Embedding model name; falls back to the
      #   configured model for the provider when nil
      # @param force [Boolean] Overwrite existing index
      # @raise [Leann::IndexExistsError] if the name is taken and force is false
      def initialize(name, embedding: nil, model: nil, force: false)
        @name = name
        provider = embedding || Leann.configuration.embedding_provider
        # Provider dispatch below matches on symbols, while Index persists the
        # provider as a string; normalise so "openai" and :openai both work.
        @embedding_provider = provider.respond_to?(:to_sym) ? provider.to_sym : provider
        @embedding_model = model || Leann.configuration.embedding_model_for(@embedding_provider)
        @force = force
        @documents = []

        check_existing_index unless force
      end

      # Add a text document
      #
      # The text is stripped of surrounding whitespace. An +:id+ entry in the
      # metadata becomes the document id; otherwise a random UUID is generated.
      #
      # @param text [String] Document text
      # @param metadata [Hash] Additional metadata
      # @return [self]
      # @raise [ArgumentError] if text is nil or blank
      def add(text, **metadata)
        raise ArgumentError, "Text cannot be nil" if text.nil?
        raise ArgumentError, "Text cannot be empty" if text.to_s.strip.empty?

        doc = {
          id: metadata.delete(:id) || SecureRandom.uuid,
          text: text.to_s.strip,
          metadata: metadata
        }

        @documents << doc
        self
      end

      # Add document (alias for add)
      alias << add

      # Add content from a file
      #
      # The file's path, basename and extension are recorded as :source,
      # :filename and :extension metadata; caller-supplied keys win on clash.
      #
      # @param file_path [String] Path to file
      # @param metadata [Hash] Additional metadata
      # @return [self]
      # @raise [ArgumentError] if the path is missing or is not a regular file
      def add_file(file_path, **metadata)
        # File.file? (rather than File.exist?) also rejects directories, which
        # would otherwise make File.read fail with Errno::EISDIR.
        raise ArgumentError, "File not found: #{file_path}" unless File.file?(file_path)

        content = File.read(file_path)
        file_metadata = {
          source: file_path,
          filename: File.basename(file_path),
          extension: File.extname(file_path)
        }.merge(metadata)

        add(content, **file_metadata)
      end

      # Add all files from a directory
      #
      # @param directory [String] Directory path
      # @param pattern [String] Glob pattern
      # @param extensions [Array<String>, nil] Filter by extensions; entries
      #   may be given with or without the leading dot ("md" or ".md")
      # @param metadata [Hash] Additional metadata for all files
      # @return [self]
      # @raise [ArgumentError] if the directory does not exist
      def add_directory(directory, pattern: "**/*", extensions: nil, **metadata)
        raise ArgumentError, "Directory not found: #{directory}" unless Dir.exist?(directory)

        # File.extname returns ".md"-style values, so bring the filter into
        # that form. An empty string is kept as-is: it matches extensionless files.
        wanted = extensions&.map do |ext|
          ext = ext.to_s
          ext.empty? || ext.start_with?(".") ? ext : ".#{ext}"
        end

        Dir.glob(File.join(directory, pattern)).each do |file_path|
          next unless File.file?(file_path)
          next if wanted && !wanted.include?(File.extname(file_path))

          add_file(file_path, **metadata)
        end

        self
      end

      # Add multiple documents at once
      #
      # Strings are added as plain text. Hashes must carry the text under
      # :text or "text"; every other entry is passed on as metadata.
      # The given hashes are not modified.
      #
      # @param docs [Array<String>, Array<Hash>] Documents to add
      # @return [self]
      # @raise [ArgumentError] for elements that are neither String nor Hash
      def add_all(docs)
        docs.each do |doc|
          case doc
          when String
            add(doc)
          when Hash
            # transform_keys returns a new hash, so removing :text here leaves
            # the caller's hash untouched.
            attributes = doc.transform_keys(&:to_sym)
            text = attributes.delete(:text)
            add(text, **attributes)
          else
            raise ArgumentError, "Invalid document type: #{doc.class}"
          end
        end

        self
      end

      # Get number of documents added
      # @return [Integer]
      def count
        @documents.size
      end
      alias size count

      # Check if any documents have been added
      # @return [Boolean]
      def empty?
        @documents.empty?
      end

      # Build and save the index to the database
      #
      # Embeddings are computed before anything is written, and all writes
      # (removal of a previous index in force mode, the new index row, and the
      # graph/passages) happen inside one transaction. A failing embedding
      # call or graph build therefore neither deletes an existing index nor
      # leaves a half-built one behind.
      #
      # @return [Leann::Rails::Index] The built index record
      # @raise [Leann::EmptyIndexError] if no documents were added
      def save
        raise Leann::EmptyIndexError if empty?

        puts "Building index '#{name}' with #{count} documents..."

        # Compute embeddings first: this is the step most likely to fail and
        # it needs no database access.
        embeddings = compute_embeddings

        index = Index.transaction do
          # Delete existing if force mode
          Index.find_by(name: name)&.destroy if @force

          record = Index.create!(
            name: name,
            embedding_provider: @embedding_provider.to_s,
            embedding_model: @embedding_model,
            dimensions: embeddings.first&.size || 0,
            config: {
              hnsw_m: Leann.configuration.hnsw_m,
              hnsw_ef_construction: Leann.configuration.hnsw_ef_construction
            }
          )

          # Build and store graph
          ActiveRecordBackend.new(record).build(@documents, embeddings)
          record
        end

        puts "Index '#{name}' created successfully!"

        index
      end
      alias build save

      private

      # Refuse to shadow an index that already exists under this name.
      def check_existing_index
        raise Leann::IndexExistsError, name if Index.exists?(name: name)
      end

      # Embed the text of every queued document, in insertion order.
      def compute_embeddings
        texts = @documents.map { |d| d[:text] }
        embedding_provider.compute(texts)
      end

      # Lazily instantiated provider object (memoized).
      def embedding_provider
        @_embedding_provider ||= load_embedding_provider
      end

      # Require and instantiate the provider class on demand, so only the
      # selected provider's file is loaded.
      def load_embedding_provider
        case @embedding_provider
        when :ruby_llm
          require "leann/embedding/ruby_llm"
          Leann::Embedding::RubyLLM.new(model: @embedding_model)
        when :openai
          require "leann/embedding/openai"
          Leann::Embedding::OpenAI.new(model: @embedding_model)
        when :ollama
          require "leann/embedding/ollama"
          Leann::Embedding::Ollama.new(model: @embedding_model)
        when :fastembed
          require "leann/embedding/fastembed"
          Leann::Embedding::FastEmbed.new(model: @embedding_model)
        else
          raise Leann::ConfigurationError, "Unknown embedding provider: #{@embedding_provider}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Leann
  module Rails
    # Railtie that registers the gem's initializer and generators with Rails.
    class Railtie < ::Rails::Railtie
      # Registered as "leann.configure_rails"; the block body is currently empty.
      initializer("leann.configure_rails") do
        # Auto-configure based on Rails environment
      end

      # Expose generators: the install generator is required only when Rails
      # evaluates this generators block.
      generators { require "generators/leann/install/install_generator" }
    end
  end
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Leann
  module Rails
    # Handles search operations on a database-stored index
    class Searcher
      # @return [Leann::Rails::Index]
      attr_reader :index

      # @param index [Leann::Rails::Index]
      def initialize(index)
        @index = index
        @embedding_provider = nil
      end

      # Search the index
      #
      # The backend is asked for twice +limit+ candidates so that threshold and
      # metadata filtering still have enough hits to fill +limit+ results.
      #
      # @param query [String] Search query
      # @param limit [Integer] Maximum results
      # @param threshold [Float, nil] Minimum score threshold
      # @param filters [Hash, nil] Metadata filters
      # @return [Leann::SearchResults]
      def search(query, limit: 5, threshold: nil, filters: nil)
        # Monotonic clock: unaffected by wall-clock adjustments during the search.
        started = Process.clock_gettime(Process::CLOCK_MONOTONIC)

        # Compute query embedding
        query_embedding = embedding_provider.compute([query]).first

        # Search with on-the-fly embedding recomputation; the backend receives
        # the external_id => text map of every passage in the index.
        backend = ActiveRecordBackend.new(index)
        raw_results = backend.search(
          query_embedding,
          embedding_provider: embedding_provider,
          passages: load_all_passages,
          limit: limit * 2
        )

        results = build_results(raw_results)

        # Apply threshold filter
        results = results.select { |r| r.score >= threshold } if threshold

        # Apply metadata filters
        results = apply_filters(results, filters) if filters

        # Limit and sort results
        results = results.sort.first(limit)

        duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started

        Leann::SearchResults.new(results, query: query, duration: duration)
      end

      private

      # Turn backend hits (id/score pairs) into SearchResult objects.
      # All matching passages are fetched with a single query instead of one
      # query per hit; hits without a matching passage row are dropped.
      def build_results(raw_results)
        ids = raw_results.map { |id, _score| id }
        records = index.passages.where(external_id: ids).to_h { |passage| [passage.external_id, passage] }

        raw_results.filter_map do |id, score|
          passage = records[id]
          next unless passage

          Leann::SearchResult.new(
            id: id,
            text: passage.text,
            score: score,
            metadata: passage.metadata_sym
          )
        end
      end

      # Lazily instantiated provider object (memoized).
      def embedding_provider
        @embedding_provider ||= load_embedding_provider
      end

      # Require and instantiate the provider recorded on the index, so only
      # the selected provider's file is loaded.
      def load_embedding_provider
        case index.embedding_provider_sym
        when :ruby_llm
          require "leann/embedding/ruby_llm"
          Leann::Embedding::RubyLLM.new(model: index.embedding_model)
        when :openai
          require "leann/embedding/openai"
          Leann::Embedding::OpenAI.new(model: index.embedding_model)
        when :ollama
          require "leann/embedding/ollama"
          Leann::Embedding::Ollama.new(model: index.embedding_model)
        when :fastembed
          require "leann/embedding/fastembed"
          Leann::Embedding::FastEmbed.new(model: index.embedding_model)
        else
          raise Leann::ConfigurationError, "Unknown embedding provider: #{index.embedding_provider}"
        end
      end

      # Map of external_id => text for every passage in the index.
      def load_all_passages
        index.passages.pluck(:external_id, :text).to_h
      end

      # Keep only results whose metadata satisfies every filter.
      # Filter keys are looked up as symbols. A Range filter uses cover?,
      # an Array uses include?, a Regexp is matched against the value's string
      # form, and anything else is compared with ==.
      def apply_filters(results, filters)
        results.select do |result|
          filters.all? do |key, value|
            metadata_value = result.metadata[key.to_sym]

            case value
            when Range
              value.cover?(metadata_value)
            when Array
              value.include?(metadata_value)
            when Regexp
              value.match?(metadata_value.to_s)
            else
              metadata_value == value
            end
          end
        end
      end
    end
  end
end
|