leann 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +375 -0
- data/exe/leann +167 -0
- data/lib/generators/leann/install/install_generator.rb +51 -0
- data/lib/generators/leann/install/templates/migration.rb.erb +28 -0
- data/lib/leann/backend/base.rb +51 -0
- data/lib/leann/backend/leann_graph.rb +476 -0
- data/lib/leann/builder.rb +317 -0
- data/lib/leann/configuration.rb +148 -0
- data/lib/leann/embedding/base.rb +63 -0
- data/lib/leann/embedding/fastembed.rb +120 -0
- data/lib/leann/embedding/ollama.rb +194 -0
- data/lib/leann/embedding/openai.rb +149 -0
- data/lib/leann/embedding/ruby_llm.rb +57 -0
- data/lib/leann/errors.rb +71 -0
- data/lib/leann/index.rb +236 -0
- data/lib/leann/rails/active_record/index.rb +70 -0
- data/lib/leann/rails/active_record/passage.rb +56 -0
- data/lib/leann/rails/builder.rb +205 -0
- data/lib/leann/rails/railtie.rb +16 -0
- data/lib/leann/rails/searcher.rb +117 -0
- data/lib/leann/rails/storage/active_record_backend.rb +332 -0
- data/lib/leann/rails.rb +90 -0
- data/lib/leann/ruby_llm/search.rb +89 -0
- data/lib/leann/search_result.rb +195 -0
- data/lib/leann/searcher.rb +189 -0
- data/lib/leann/version.rb +3 -0
- data/lib/leann.rb +133 -0
- metadata +177 -0
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "json"
require "net/http"
require "uri"
require_relative "base"

module Leann
  module Embedding
    # Ollama Embeddings API provider.
    #
    # Uses a local Ollama server for computing embeddings.
    # Requires Ollama to be running: https://ollama.com
    #
    # @example
    #   provider = Leann::Embedding::Ollama.new(model: "nomic-embed-text")
    #   embeddings = provider.compute(["Hello", "World"])
    #
    class Ollama < Base
      DEFAULT_HOST = "http://localhost:11434"
      EMBED_PATH = "/api/embed"
      MAX_BATCH_SIZE = 32
      TIMEOUT = 60

      # Popular embedding models (suggested in the model-not-found message)
      POPULAR_MODELS = %w[
        nomic-embed-text
        mxbai-embed-large
        bge-m3
        all-minilm
        snowflake-arctic-embed
      ].freeze

      # @param model [String] Ollama embedding model name
      # @param host [String, nil] Ollama server URL; falls back to
      #   Leann.configuration.ollama_host, then ENV["OLLAMA_HOST"], then DEFAULT_HOST
      # @raise [EmbeddingError] if the Ollama server is unreachable
      def initialize(model: "nomic-embed-text", host: nil)
        super(model: model)

        @host = host || Leann.configuration.ollama_host || ENV["OLLAMA_HOST"] || DEFAULT_HOST
        @dimensions = nil

        check_connection!
      end

      # Compute embeddings for texts.
      #
      # @param texts [Array<String>]
      # @return [Array<Array<Float>>] one normalized vector per input text
      def compute(texts)
        return [] if texts.empty?

        all_embeddings = []

        in_batches(texts, MAX_BATCH_SIZE) do |batch|
          batch_embeddings = compute_batch(batch)
          all_embeddings.concat(batch_embeddings)
          print "." # Progress indicator
        end

        puts " Done! (#{all_embeddings.size} embeddings)" unless texts.size < MAX_BATCH_SIZE

        # Normalize embeddings (Ollama may not normalize by default)
        all_embeddings.map { |emb| normalize(emb) }
      end

      private

      # Probe GET /api/version so misconfiguration fails fast at construction.
      # @raise [EmbeddingError] when the server is down, unreachable, or hung
      def check_connection!
        uri = URI.parse("#{@host}/api/version")

        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        http.open_timeout = 5
        http.read_timeout = 5

        response = http.get(uri.request_uri)

        return if response.code.to_i == 200

        raise EmbeddingError.new(
          "Cannot connect to Ollama at #{@host}. Is Ollama running?",
          provider: :ollama
        )
      rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout => e
        # Net::ReadTimeout included: a server that accepts the connection but
        # never answers (read_timeout = 5 above) must surface the same
        # friendly EmbeddingError as a refused connection, not a raw timeout.
        raise EmbeddingError.new(
          connection_error_message,
          provider: :ollama,
          original_error: e
        )
      end

      # POST one batch of texts to /api/embed and return the raw vectors.
      def compute_batch(texts)
        uri = URI.parse("#{@host}#{EMBED_PATH}")

        body = {
          model: model,
          input: texts
        }

        response = make_request(uri, body)
        parse_response(response, texts.size)
      end

      # Perform the HTTP POST; map non-200 responses to EmbeddingError.
      def make_request(uri, body)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        http.read_timeout = TIMEOUT
        http.open_timeout = 10

        request = Net::HTTP::Post.new(uri.request_uri)
        request["Content-Type"] = "application/json"
        request.body = JSON.generate(body)

        response = http.request(request)

        case response.code.to_i
        when 200
          response
        when 404
          # Ollama answers 404 when the requested model has not been pulled
          raise EmbeddingError.new(
            model_not_found_message,
            provider: :ollama
          )
        else
          error_message = parse_error(response)
          raise EmbeddingError.new(
            "Ollama API error: #{error_message}",
            provider: :ollama
          )
        end
      end

      # Validate and extract the embeddings array from the JSON response.
      # Raises if the payload is malformed or the count does not match input.
      def parse_response(response, expected_count)
        data = JSON.parse(response.body)
        embeddings = data["embeddings"]

        # is_a?(Array) alone suffices — nil is not an Array, so the former
        # `embeddings && embeddings.is_a?(Array)` guard was redundant.
        unless embeddings.is_a?(Array)
          raise EmbeddingError.new(
            "Invalid response from Ollama: missing embeddings",
            provider: :ollama
          )
        end

        unless embeddings.size == expected_count
          raise EmbeddingError.new(
            "Ollama returned #{embeddings.size} embeddings, expected #{expected_count}",
            provider: :ollama
          )
        end

        embeddings
      rescue JSON::ParserError => e
        raise EmbeddingError.new(
          "Failed to parse Ollama response: #{e.message}",
          provider: :ollama,
          original_error: e
        )
      end

      # Best-effort extraction of an error message from a failed response.
      def parse_error(response)
        data = JSON.parse(response.body)
        data["error"] || response.body
      rescue JSON::ParserError
        response.body
      end

      # Actionable message for connection failures.
      def connection_error_message
        <<~MSG
          Cannot connect to Ollama at #{@host}.

          Please ensure Ollama is running:
            macOS/Linux: ollama serve
            Windows: Make sure Ollama is running in the system tray

          Installation: https://ollama.com/download
        MSG
      end

      # Actionable message for a 404 (model not pulled), with suggestions.
      def model_not_found_message
        <<~MSG
          Model '#{model}' not found in Ollama.

          To install:
            ollama pull #{model}

          Popular embedding models:
            #{POPULAR_MODELS.map { |m| "ollama pull #{m}" }.join("\n  ")}

          Browse more: https://ollama.com/library
        MSG
      end
    end
  end
end
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "json"
require "net/http"
require "uri"
require_relative "base"

module Leann
  module Embedding
    # OpenAI Embeddings API provider.
    #
    # @example
    #   provider = Leann::Embedding::OpenAI.new(model: "text-embedding-3-small")
    #   embeddings = provider.compute(["Hello", "World"])
    #
    class OpenAI < Base
      BASE_URL = "https://api.openai.com/v1/embeddings"
      MAX_BATCH_SIZE = 2048
      MAX_RETRIES = 3
      RETRY_DELAY = 1.0

      # Model dimensions lookup
      DIMENSIONS = {
        "text-embedding-3-small" => 1536,
        "text-embedding-3-large" => 3072,
        "text-embedding-ada-002" => 1536
      }.freeze

      # @param model [String] OpenAI embedding model name
      # @param api_key [String, nil] API key (defaults to config or ENV["OPENAI_API_KEY"])
      # @param base_url [String, nil] Custom base URL (e.g. a proxy endpoint)
      # @raise [ConfigurationError] if no API key is available
      def initialize(model: "text-embedding-3-small", api_key: nil, base_url: nil)
        super(model: model)

        @api_key = api_key || Leann.configuration.openai_api_key || ENV["OPENAI_API_KEY"]
        @base_url = base_url || Leann.configuration.openai_base_url || BASE_URL
        @dimensions = DIMENSIONS[model]

        validate_configuration!
      end

      # Compute embeddings for texts.
      #
      # @param texts [Array<String>]
      # @return [Array<Array<Float>>]
      def compute(texts)
        return [] if texts.empty?

        all_embeddings = []

        in_batches(texts, MAX_BATCH_SIZE) do |batch|
          batch_embeddings = compute_batch(batch)
          all_embeddings.concat(batch_embeddings)
          print "." # Progress indicator
        end

        puts " Done! (#{all_embeddings.size} embeddings)" unless texts.size < MAX_BATCH_SIZE

        all_embeddings
      end

      private

      # Fail fast with setup instructions when no API key is configured.
      def validate_configuration!
        return if @api_key && !@api_key.empty?

        raise ConfigurationError, <<~MSG
          OpenAI API key is required.

          Set it via:
          - Environment: OPENAI_API_KEY=your-key
          - Configuration: Leann.configure { |c| c.openai_api_key = "your-key" }
          - Builder option: Leann.build("index", embedding: :openai, api_key: "your-key")
        MSG
      end

      # POST one batch to the embeddings endpoint and return its vectors.
      def compute_batch(texts)
        uri = URI.parse(@base_url)

        body = {
          model: model,
          input: texts,
          encoding_format: "float"
        }

        response = make_request(uri, body)
        parse_response(response)
      end

      # Perform the HTTP POST with exponential-backoff retries on transient
      # failures (rate limits and 5xx gateway/server errors).
      def make_request(uri, body, retries = 0)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        http.read_timeout = 60
        http.open_timeout = 10

        request = Net::HTTP::Post.new(uri.request_uri)
        request["Content-Type"] = "application/json"
        request["Authorization"] = "Bearer #{@api_key}"
        request.body = JSON.generate(body)

        response = http.request(request)

        case response.code.to_i
        when 200
          response
        when 429, 500, 502, 503, 504
          # Rate limit or transient server error - retry with backoff.
          # 504 (gateway timeout) added: it is transient like 502/503.
          if retries < MAX_RETRIES
            delay = RETRY_DELAY * (2**retries)
            sleep(delay)
            make_request(uri, body, retries + 1)
          else
            raise EmbeddingError.new(
              "OpenAI API error after #{MAX_RETRIES} retries: #{response.code} #{response.body}",
              provider: :openai
            )
          end
        else
          error_message = parse_error(response)
          raise EmbeddingError.new(
            "OpenAI API error: #{error_message}",
            provider: :openai
          )
        end
      end

      # Extract vectors from the response, re-ordered to match the input.
      # @raise [EmbeddingError] on malformed JSON or a missing "data" array
      def parse_response(response)
        data = JSON.parse(response.body)
        items = data["data"]

        # Guard the payload shape: without this, a well-formed JSON body that
        # lacks "data" crashed with NoMethodError instead of EmbeddingError.
        unless items.is_a?(Array)
          raise EmbeddingError.new(
            "Invalid response from OpenAI: missing data",
            provider: :openai
          )
        end

        # Sort by index to ensure order matches input
        embeddings = items.sort_by { |e| e["index"] }
        embeddings.map { |e| e["embedding"] }
      rescue JSON::ParserError => e
        raise EmbeddingError.new(
          "Failed to parse OpenAI response: #{e.message}",
          provider: :openai,
          original_error: e
        )
      end

      # Best-effort extraction of the API error message from a failed response.
      def parse_error(response)
        data = JSON.parse(response.body)
        data.dig("error", "message") || response.body
      rescue JSON::ParserError
        response.body
      end
    end
  end
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true

require_relative "base"

module Leann
  module Embedding
    # RubyLLM embedding provider.
    #
    # Delegates to RubyLLM's unified embedding API, which fronts several
    # providers (OpenAI, Ollama, etc.) behind a single interface.
    #
    # @example
    #   provider = Leann::Embedding::RubyLLM.new
    #   vectors = provider.compute(["Hello world", "Another text"])
    #
    class RubyLLM < Base
      # @param model [String, nil] Embedding model (uses RubyLLM default if nil)
      # @raise [ConfigurationError] when the ruby_llm gem is not loaded
      def initialize(model: nil)
        super(model: model)

        return if defined?(::RubyLLM)

        raise ConfigurationError, "RubyLLM gem is required. Add 'ruby_llm' to your Gemfile."
      end

      # Compute embeddings for texts.
      #
      # @param texts [Array<String>] Texts to embed (a single string is wrapped)
      # @return [Array<Array<Float>>] Embedding vectors
      # @raise [EmbeddingError] when the underlying RubyLLM call fails
      def compute(texts)
        texts = Array(texts)
        return [] if texts.empty?

        kwargs = @model ? { model: @model } : {}
        ::RubyLLM.embed(texts, **kwargs).vectors
      rescue ::RubyLLM::Error => e
        raise EmbeddingError, "RubyLLM embedding failed: #{e.message}"
      end

      # @return [Integer] Embedding dimensions (model-dependent)
      def dimensions
        # Probe with one throwaway embedding; fall back to 1536 if empty.
        @dimensions ||= compute(["test"]).first&.size || 1536
      end

      # @return [Symbol] Provider identifier
      def provider_name
        :ruby_llm
      end
    end
  end
end
|
data/lib/leann/errors.rb
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Leann
  # Base error class; every Leann-specific error descends from it so callers
  # can `rescue Leann::Error` to catch anything raised by this gem.
  class Error < StandardError; end

  # Raised when configuration is invalid.
  class ConfigurationError < Error; end

  # Raised when an index cannot be located on disk.
  class IndexNotFoundError < Error
    # @return [String] name of the index that was requested
    attr_reader :index_name

    def initialize(index_name)
      @index_name = index_name
      super("Index not found: #{index_name}")
    end
  end

  # Raised when attempting to create an index that already exists.
  class IndexExistsError < Error
    # @return [String] name of the conflicting index
    attr_reader :index_name

    def initialize(index_name)
      @index_name = index_name
      super("Index already exists: #{index_name}. Use force: true to overwrite.")
    end
  end

  # Raised when embedding computation fails.
  class EmbeddingError < Error
    # @return [Symbol, nil] provider that failed (e.g. :openai, :ollama)
    # @return [Exception, nil] underlying exception, when one was caught
    attr_reader :provider, :original_error

    def initialize(message, provider: nil, original_error: nil)
      super(message)
      @provider = provider
      @original_error = original_error
    end
  end

  # Raised when an LLM request fails.
  class LLMError < Error
    # @return [Symbol, nil] provider that failed
    # @return [Exception, nil] underlying exception, when one was caught
    attr_reader :provider, :original_error

    def initialize(message, provider: nil, original_error: nil)
      super(message)
      @provider = provider
      @original_error = original_error
    end
  end

  # Raised when an index on disk is corrupted or structurally invalid.
  class CorruptedIndexError < Error
    # @return [String] name of the damaged index
    # @return [String, nil] optional human-readable cause
    attr_reader :index_name, :reason

    def initialize(index_name, reason = nil)
      @index_name = index_name
      @reason = reason
      detail = reason ? " (#{reason})" : ""
      super("Corrupted index: #{index_name}#{detail}")
    end
  end

  # Raised when a build is attempted with no documents.
  class EmptyIndexError < Error
    def initialize
      super("Cannot build an empty index. Add at least one document.")
    end
  end
end
|
data/lib/leann/index.rb
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "json"
require "fileutils"
require "time" # needed for Time.parse in #created_at (was missing)

module Leann
  # Represents a Leann index on disk.
  #
  # @example Open and search
  #   index = Leann::Index.open("my_index")
  #   results = index.search("query")
  #
  # @example Get info
  #   index = Leann::Index.open("my_index")
  #   puts index.document_count
  #   puts index.embedding_model
  #
  class Index
    # @return [String] Index name
    attr_reader :name

    # @return [String] Index path
    attr_reader :path

    # @return [Hash] Index metadata
    attr_reader :metadata

    INDEX_EXTENSION = ".leann"
    META_SUFFIX = ".meta.json"
    PASSAGES_SUFFIX = ".passages.jsonl"
    OFFSETS_SUFFIX = ".passages.offsets"
    VECTORS_SUFFIX = ".vectors"
    IDS_SUFFIX = ".ids"

    class << self
      # Open an existing index.
      # @param name [String] Index name or path
      # @return [Index]
      # @raise [IndexNotFoundError] if index doesn't exist
      def open(name)
        path = resolve_path(name)
        raise IndexNotFoundError, name unless exists_at?(path)

        new(path)
      end

      # Check if an index exists.
      # @param name [String] Index name or path
      # @return [Boolean]
      def exists?(name)
        path = resolve_path(name)
        exists_at?(path)
      end

      # List all indexes in a directory (recursively, by their .meta.json files).
      # @param directory [String]
      # @return [Array<String>] sorted, de-duplicated index names
      def list(directory = ".")
        pattern = File.join(directory, "**", "*#{META_SUFFIX}")
        Dir.glob(pattern).map do |meta_file|
          # Extract index name from path. delete_suffix replaces the former
          # /#{INDEX_EXTENSION}$/ regex, whose unescaped "." matched any
          # character and so wrongly truncated names merely ending in "leann".
          File.basename(meta_file, META_SUFFIX).delete_suffix(INDEX_EXTENSION)
        end.uniq.sort
      end

      # Delete an index and all of its companion files.
      # @param name [String] Index name or path
      # @return [Boolean] true if the index existed and was removed
      def delete(name)
        path = resolve_path(name)
        return false unless exists_at?(path)

        # Delete all index files
        files_to_delete = [
          "#{path}#{META_SUFFIX}",
          "#{path}#{PASSAGES_SUFFIX}",
          "#{path}#{OFFSETS_SUFFIX}",
          "#{path}#{VECTORS_SUFFIX}",
          "#{path}#{IDS_SUFFIX}",
          "#{path}.graph.bin", # LEANN graph file
          "#{path}.graph.meta.json" # LEANN graph metadata
        ]

        files_to_delete.each do |file|
          FileUtils.rm_f(file)
        end

        true
      end

      private

      # Turn a bare index name into a concrete path, checking the current
      # directory first and then the configured index directory.
      def resolve_path(name)
        # If it's already a full path with extension, use it
        return name if name.end_with?(INDEX_EXTENSION)

        # Check in current directory
        local_path = "#{name}#{INDEX_EXTENSION}"
        return local_path if exists_at?(local_path)

        # Check in configured index directory
        config_dir = Leann.configuration.index_directory
        if config_dir && Dir.exist?(config_dir)
          configured_path = File.join(config_dir, "#{name}#{INDEX_EXTENSION}")
          return configured_path if exists_at?(configured_path)
        end

        # Return local path as default
        local_path
      end

      # An index "exists" iff its metadata sidecar file is present.
      def exists_at?(path)
        meta_file = "#{path}#{META_SUFFIX}"
        File.exist?(meta_file)
      end
    end

    # @param path [String] Full path to index (including INDEX_EXTENSION)
    def initialize(path)
      @path = path
      @name = File.basename(path, INDEX_EXTENSION)
      @metadata = load_metadata
      @searcher = nil
    end

    # Search the index.
    # @param query [String] Search query
    # @param limit [Integer] Maximum results
    # @param threshold [Float, nil] Minimum score threshold
    # @param filters [Hash, nil] Metadata filters
    # @return [SearchResults]
    def search(query, limit: 5, threshold: nil, filters: nil)
      searcher.search(query, limit: limit, threshold: threshold, filters: filters)
    end

    # Get number of documents in the index (metadata value, or counted
    # from the passages file when metadata lacks it).
    # @return [Integer]
    def document_count
      metadata["document_count"] || count_documents
    end

    # Get embedding model used.
    # @return [String, nil]
    def embedding_model
      metadata["embedding_model"]
    end

    # Get embedding provider (defaults to :openai when unrecorded).
    # @return [Symbol]
    def embedding_provider
      (metadata["embedding_provider"] || "openai").to_sym
    end

    # Get embedding dimensions.
    # @return [Integer, nil]
    def dimensions
      metadata["dimensions"]
    end

    # Get creation timestamp.
    # @return [Time, nil]
    def created_at
      return nil unless metadata["created_at"]

      Time.parse(metadata["created_at"])
    end

    # Get backend type (defaults to :leann when unrecorded).
    # @return [Symbol]
    def backend
      (metadata["backend"] || "leann").to_sym
    end

    # Index info as a multi-line human-readable string.
    # @return [String]
    def to_s
      lines = [
        "Index: #{name}",
        "  Documents: #{document_count}",
        "  Embedding: #{embedding_provider}/#{embedding_model}",
        "  Dimensions: #{dimensions}",
        "  Backend: #{backend}",
        "  Created: #{created_at&.strftime("%Y-%m-%d %H:%M:%S") || "unknown"}"
      ]
      lines.join("\n")
    end

    # Detailed inspection.
    # @return [String]
    def inspect
      "#<Leann::Index name=#{name.inspect} documents=#{document_count} model=#{embedding_model.inspect}>"
    end

    # Stream all passages from the JSONL sidecar (lazy loaded).
    # @yield [Hash] each passage with symbolized keys
    # @return [Enumerator] when no block is given
    def each_passage
      return enum_for(:each_passage) unless block_given?

      passages_file = "#{path}#{PASSAGES_SUFFIX}"
      return unless File.exist?(passages_file)

      File.foreach(passages_file) do |line|
        yield JSON.parse(line.strip, symbolize_names: true)
      end
    end

    # Get passage by ID (linear scan over the passages file).
    # @param id [String]
    # @return [Hash, nil]
    def get_passage(id)
      each_passage.find { |p| p[:id] == id }
    end

    private

    # Read and parse the metadata sidecar, mapping low-level failures
    # to the gem's own error types.
    def load_metadata
      meta_file = "#{path}#{META_SUFFIX}"
      JSON.parse(File.read(meta_file))
    rescue JSON::ParserError => e
      raise CorruptedIndexError.new(name, "Invalid metadata JSON: #{e.message}")
    rescue Errno::ENOENT
      raise IndexNotFoundError, name
    end

    # Count lines in the passages JSONL file (one passage per line).
    def count_documents
      passages_file = "#{path}#{PASSAGES_SUFFIX}"
      return 0 unless File.exist?(passages_file)

      File.foreach(passages_file).count
    end

    # Lazily constructed Searcher bound to this index.
    def searcher
      @searcher ||= Searcher.new(self)
    end
  end
end
|