ragnar-cli 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +439 -0
- data/exe/ragnar +6 -0
- data/lib/ragnar/chunker.rb +97 -0
- data/lib/ragnar/cli.rb +542 -0
- data/lib/ragnar/context_repacker.rb +121 -0
- data/lib/ragnar/database.rb +267 -0
- data/lib/ragnar/embedder.rb +137 -0
- data/lib/ragnar/indexer.rb +234 -0
- data/lib/ragnar/llm_manager.rb +43 -0
- data/lib/ragnar/query_processor.rb +398 -0
- data/lib/ragnar/query_rewriter.rb +75 -0
- data/lib/ragnar/topic_modeling/engine.rb +221 -0
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +300 -0
- data/lib/ragnar/topic_modeling/llm_adapter.rb +131 -0
- data/lib/ragnar/topic_modeling/metrics.rb +186 -0
- data/lib/ragnar/topic_modeling/term_extractor.rb +170 -0
- data/lib/ragnar/topic_modeling/topic.rb +117 -0
- data/lib/ragnar/topic_modeling/topic_labeler.rb +61 -0
- data/lib/ragnar/topic_modeling.rb +24 -0
- data/lib/ragnar/umap_processor.rb +228 -0
- data/lib/ragnar/umap_transform_service.rb +124 -0
- data/lib/ragnar/version.rb +5 -0
- data/lib/ragnar.rb +36 -0
- data/lib/ragnar_cli.rb +2 -0
- metadata +234 -0
data/lib/ragnar/database.rb
@@ -0,0 +1,267 @@
module Ragnar
  class Database
    attr_reader :db_path, :table_name

    def initialize(db_path, table_name: "documents")
      @db_path = db_path
      @table_name = table_name
      ensure_database_exists
    end

    def add_documents(documents)
      return if documents.empty?

      # Convert documents to Lance-compatible format
      data = documents.map do |doc|
        {
          id: doc[:id],
          chunk_text: doc[:chunk_text],
          file_path: doc[:file_path],
          chunk_index: doc[:chunk_index],
          embedding: doc[:embedding],
          metadata: doc[:metadata].to_json
        }
      end

      # Define schema for the table with vector type
      embedding_size = documents.first[:embedding].size
      schema = {
        id: :string,
        chunk_text: :string,
        file_path: :string,
        chunk_index: :int64,
        embedding: { type: "vector", dimension: embedding_size },
        metadata: :string
      }

      # Use the new open_or_create method from Lancelot
      # This automatically handles both creating new and opening existing datasets
      dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
      dataset.add_documents(data)
    end

    def get_embeddings(limit: nil, offset: 0)
      return [] unless dataset_exists?

      dataset = Lancelot::Dataset.open(@db_path)

      # Get all documents or a subset
      docs = if limit && offset > 0
        # Get limit + offset items, then drop offset
        dataset.first(limit + offset).drop(offset)
      elsif limit
        dataset.first(limit)
      else
        dataset.to_a.drop(offset)
      end

      docs.map do |doc|
        {
          id: doc[:id],
          embedding: doc[:embedding],
          reduced_embedding: doc[:reduced_embedding]
        }
      end
    end

    def update_reduced_embeddings(updates)
      return if updates.empty?

      dataset = Lancelot::Dataset.open(@db_path)

      # Get all existing documents and safely extract their data
      all_docs = dataset.to_a.map do |doc|
        # Safely extract fields we know about
        {
          id: doc[:id],
          content: doc[:content],
          chunk_text: doc[:chunk_text],
          file_path: doc[:file_path],
          chunk_index: doc[:chunk_index],
          embedding: doc[:embedding],
          metadata: doc[:metadata],
          reduced_embedding: doc[:reduced_embedding]
        }
      end

      # Create a map for quick lookup
      update_map = updates.each_with_object({}) do |update, map|
        map[update[:id]] = update[:reduced_embedding]
      end

      # Update documents with reduced embeddings
      updated_docs = all_docs.map do |doc|
        if update_map[doc[:id]]
          doc.merge(reduced_embedding: update_map[doc[:id]])
        else
          doc
        end
      end

      # Need to recreate the dataset with updated data
      # First, backup the schema including the new reduced_embedding field
      embedding_size = all_docs.first[:embedding].size
      reduced_size = updates.first[:reduced_embedding].size

      schema = {
        id: :string,
        chunk_text: :string,
        file_path: :string,
        chunk_index: :int64,
        embedding: { type: "vector", dimension: embedding_size },
        reduced_embedding: { type: "vector", dimension: reduced_size },
        metadata: :string
      }

      # Remove old dataset and create new one with updated data
      FileUtils.rm_rf(@db_path)
      # Use open_or_create which will create since we just deleted the path
      dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
      dataset.add_documents(updated_docs)
    end

    def search_similar(embedding, k: 10, use_reduced: false)
      return [] unless dataset_exists?

      dataset = Lancelot::Dataset.open(@db_path)

      embedding_field = use_reduced ? :reduced_embedding : :embedding

      # Perform vector search
      results = dataset.vector_search(
        embedding.to_a,
        column: embedding_field,
        limit: k
      )

      results.map do |row|
        {
          id: row[:id],
          chunk_text: row[:chunk_text],
          file_path: row[:file_path],
          chunk_index: row[:chunk_index],
          distance: row[:_distance],
          metadata: JSON.parse(row[:metadata] || "{}")
        }
      end
    end

    def count
      return 0 unless dataset_exists?

      dataset = Lancelot::Dataset.open(@db_path)
      dataset.to_a.size
    end

    def get_stats
      unless dataset_exists?
        return {
          document_count: 0,
          total_documents: 0,
          unique_files: 0,
          total_chunks: 0,
          with_embeddings: 0,
          with_reduced_embeddings: 0,
          total_size_mb: 0.0
        }
      end

      dataset = Lancelot::Dataset.open(@db_path)

      # Get all documents
      all_docs = dataset.to_a

      stats = {
        document_count: all_docs.size, # Add for compatibility with specs
        total_documents: all_docs.size,
        total_chunks: all_docs.size,
        unique_files: all_docs.map { |d| d[:file_path] }.uniq.size,
        with_embeddings: 0,
        with_reduced_embeddings: 0,
        avg_chunk_size: 0,
        total_size_mb: 0, # Add for CLI stats command
        embedding_dims: nil,
        reduced_dims: nil
      }

      chunk_sizes = []
      total_bytes = 0

      all_docs.each do |doc|
        if doc[:embedding] && !doc[:embedding].empty?
          stats[:with_embeddings] += 1
          stats[:embedding_dims] ||= doc[:embedding].size
        end

        if doc[:reduced_embedding] && !doc[:reduced_embedding].empty?
          stats[:with_reduced_embeddings] += 1
          stats[:reduced_dims] ||= doc[:reduced_embedding].size
        end

        if doc[:chunk_text]
          chunk_size = doc[:chunk_text].size
          chunk_sizes << chunk_size
          total_bytes += chunk_size
        end
      end

      stats[:avg_chunk_size] = (chunk_sizes.sum.to_f / chunk_sizes.size).round if chunk_sizes.any?
      stats[:total_size_mb] = (total_bytes / 1024.0 / 1024.0).round(2)

      stats
    end

    def get_all_documents_with_embeddings(limit: nil)
      return [] unless dataset_exists?

      dataset = Lancelot::Dataset.open(@db_path)
      all_docs = limit ? dataset.first(limit) : dataset.to_a

      all_docs.select { |doc| doc[:embedding] && !doc[:embedding].empty? }
    end

    def full_text_search(query, limit: 10)
      return [] unless dataset_exists?

      dataset = Lancelot::Dataset.open(@db_path)

      # Use Lancelot's full-text search
      results = dataset.full_text_search(
        query,
        columns: [:chunk_text],
        limit: limit
      )

      results.map do |row|
        {
          id: row[:id],
          chunk_text: row[:chunk_text],
          file_path: row[:file_path],
          chunk_index: row[:chunk_index],
          metadata: JSON.parse(row[:metadata] || "{}")
        }
      end
    end

    def dataset_exists?
      return false unless File.exist?(@db_path)

      begin
        Lancelot::Dataset.open(@db_path)
        true
      rescue
        false
      end
    end

    private

    def ensure_database_exists
      # Don't create directory - Lance will handle this
    end

    def table_exists?
      dataset_exists?
    end
  end
end
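For orientation, a minimal usage sketch of the Database class above. This is not part of the package; the path, document IDs, and the 3-element embedding are made-up illustrative values, and in practice the vectors come from Ragnar::Embedder.

# Illustrative sketch only; values below are placeholders, not gem defaults.
require "ragnar"

db = Ragnar::Database.new("./ragnar_index")
db.add_documents([
  {
    id: "doc-1-chunk-0",
    chunk_text: "Ragnar is a RAG pipeline for Ruby.",
    file_path: "README.md",
    chunk_index: 0,
    embedding: [0.1, 0.2, 0.3],          # normally produced by Ragnar::Embedder
    metadata: { file_name: "README.md" } # serialized to JSON by add_documents
  }
])

hits = db.search_similar([0.1, 0.2, 0.3], k: 5)
hits.each { |h| puts "#{h[:file_path]} (distance #{h[:distance]})" }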
data/lib/ragnar/embedder.rb
@@ -0,0 +1,137 @@
module Ragnar
  class Embedder
    attr_reader :model, :model_name

    def initialize(model_name: Ragnar::DEFAULT_EMBEDDING_MODEL)
      @model_name = model_name
      @model = load_model(model_name)
    end

    def embed_text(text)
      return nil if text.nil? || text.empty? || (text.respond_to?(:strip) && text.strip.empty?)

      # Use Candle to generate embeddings
      # The embedding method returns a tensor, we need to convert to array
      embedding = @model.embedding(text)

      # Convert tensor to array - Candle tensors need double to_a
      # First to_a gives [tensor], second to_a on the tensor gives the float array
      if embedding.respond_to?(:to_a)
        result = embedding.to_a
        if result.is_a?(Array) && result.first.respond_to?(:to_a)
          result.first.to_a
        else
          result
        end
      else
        embedding
      end
    rescue => e
      puts "Error generating embedding: #{e.message}"
      nil
    end

    def embed_batch(texts, show_progress: true)
      embeddings = []

      if show_progress
        progressbar = TTY::ProgressBar.new(
          "Generating embeddings [:bar] :percent :current/:total",
          total: texts.size,
          bar_format: :block,
          width: 30
        )
      end

      texts.each do |text|
        embedding = embed_text(text)
        embeddings << embedding
        progressbar.advance if show_progress
      end

      embeddings
    end

    def embed_chunks(chunks, show_progress: true)
      texts = chunks.map do |chunk|
        if chunk.is_a?(Hash)
          chunk[:text] || chunk["text"]
        else
          chunk.to_s
        end
      end

      embed_batch(texts, show_progress: show_progress)
    end

    private

    def load_model(model_name)
      # Initialize Candle embedding model using the new standardized from_pretrained method
      begin
        # Try to load the model using from_pretrained
        Candle::EmbeddingModel.from_pretrained(model_name)
      rescue => e
        puts "Warning: Could not load model #{model_name}, falling back to default"
        puts "Error: #{e.message}"

        # Fall back to default model
        begin
          Candle::EmbeddingModel.from_pretrained("jinaai/jina-embeddings-v2-base-en")
        rescue => fallback_error
          puts "Error loading fallback model: #{fallback_error.message}"
          # Last resort: try the old initialization method for backwards compatibility
          Candle::EmbeddingModel.new
        end
      end
    end

    def self.available_models
      # List of commonly used embedding models
      # This could be expanded or made dynamic
      [
        "BAAI/bge-small-en-v1.5",
        "BAAI/bge-base-en-v1.5",
        "BAAI/bge-large-en-v1.5",
        "sentence-transformers/all-MiniLM-L6-v2",
        "sentence-transformers/all-mpnet-base-v2",
        "thenlper/gte-small",
        "thenlper/gte-base",
        "thenlper/gte-large"
      ]
    end

    def self.model_info(model_name)
      # Provide information about embedding models
      info = {
        "BAAI/bge-small-en-v1.5" => {
          dimensions: 384,
          max_tokens: 512,
          description: "Small, fast, good quality embeddings"
        },
        "BAAI/bge-base-en-v1.5" => {
          dimensions: 768,
          max_tokens: 512,
          description: "Balanced size and quality"
        },
        "BAAI/bge-large-en-v1.5" => {
          dimensions: 1024,
          max_tokens: 512,
          description: "Large, highest quality embeddings"
        },
        "sentence-transformers/all-MiniLM-L6-v2" => {
          dimensions: 384,
          max_tokens: 256,
          description: "Fast, lightweight model"
        },
        "sentence-transformers/all-mpnet-base-v2" => {
          dimensions: 768,
          max_tokens: 384,
          description: "High quality general purpose embeddings"
        }
      }

      info[model_name] || { description: "Model information not available" }
    end
  end
end
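A rough sketch of driving the Embedder directly. The model name is taken from the list above but is an assumption for illustration; loading it downloads weights through Candle, and the returned vector size depends on the chosen model (e.g. 384 for bge-small-en-v1.5).

# Sketch only; model choice and expected dimensions are illustrative.
require "ragnar"

embedder = Ragnar::Embedder.new(model_name: "BAAI/bge-small-en-v1.5")

vector = embedder.embed_text("Vector databases store embeddings.")
puts vector&.size # nil if embedding failed, otherwise the model's dimension

vectors = embedder.embed_batch(["first chunk", "second chunk"], show_progress: false)
puts vectors.compact.size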
data/lib/ragnar/indexer.rb
@@ -0,0 +1,234 @@
require 'parsekit'

module Ragnar
  class Indexer
    attr_reader :database, :chunker, :embedder

    def initialize(db_path: Ragnar::DEFAULT_DB_PATH,
                   chunk_size: Ragnar::DEFAULT_CHUNK_SIZE,
                   chunk_overlap: Ragnar::DEFAULT_CHUNK_OVERLAP,
                   embedding_model: Ragnar::DEFAULT_EMBEDDING_MODEL,
                   show_progress: true)
      @database = Database.new(db_path)
      @chunker = Chunker.new(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
      @embedder = Embedder.new(model_name: embedding_model)
      @show_progress = show_progress
    end

    def index_path(path)
      stats = {
        files_processed: 0,
        chunks_created: 0,
        errors: 0
      }

      files = collect_files(path)

      if files.empty?
        puts "No text files found at path: #{path}"
        return stats
      end

      puts "Found #{files.size} file(s) to process" if @show_progress

      file_progress = if @show_progress
        TTY::ProgressBar.new(
          "Processing [:bar] :percent :current/:total - :filename",
          total: files.size,
          bar_format: :block,
          width: 30,
          clear: true
        )
      else
        nil
      end

      files.each do |file_path|
        begin
          if file_progress
            # Update the progress bar with current filename
            filename = File.basename(file_path)
            filename = filename[0..27] + "..." if filename.length > 30
            file_progress.advance(0, filename: filename)
          end

          process_file(file_path, stats, file_progress)
          stats[:files_processed] += 1
        rescue => e
          if file_progress
            file_progress.log "Error: #{File.basename(file_path)} - #{e.message}"
          else
            puts "Error processing #{File.basename(file_path)}: #{e.message}" if @show_progress
          end
          stats[:errors] += 1
        ensure
          file_progress&.advance
        end
      end

      stats
    end

    def index_text(text, metadata = {})
      chunks = @chunker.chunk_text(text, metadata)
      process_chunks(chunks, metadata[:file_path] || "inline_text")
    end

    # Convenience methods for compatibility
    def index_files(files)
      stats = {
        files_processed: 0,
        chunks_created: 0,
        errors: 0
      }

      files.each do |file|
        next unless File.exist?(file)
        process_file(file, stats)
        stats[:files_processed] += 1
      end

      stats
    end

    def index_directory(dir_path)
      index_path(dir_path)
    end

    private

    def collect_files(path)
      if File.file?(path)
        [path]
      elsif File.directory?(path)
        # Now we support many more file types through parser-core
        pattern = "*.{txt,md,markdown,text,pdf,docx,doc,xlsx,xls,pptx,ppt,csv,json,xml,html,htm,rb,py,js,rs,go,java,cpp,c,h}"
        Dir.glob(File.join(path, "**", pattern))
      else
        []
      end
    end

    def process_file(file_path, stats, progress_bar = nil)
      # Extract text using parser-core
      begin
        text = extract_text_from_file(file_path)

        if text.nil? || text.strip.empty?
          progress_bar.log("  Skipped: #{File.basename(file_path)} (empty or unsupported)") if progress_bar
          return
        end

        # Create metadata
        metadata = {
          file_path: file_path,
          file_name: File.basename(file_path),
          file_type: File.extname(file_path).downcase[1..-1] || 'unknown'
        }

        # Chunk the extracted text
        chunks = @chunker.chunk_text(text, metadata)

        if chunks.empty?
          progress_bar.log("  Skipped: #{File.basename(file_path)} (text too short)") if progress_bar
          return
        end

        # Process chunks and create documents
        chunk_count = process_chunks(chunks, file_path, progress_bar)
        stats[:chunks_created] += chunk_count
      rescue => e
        if progress_bar
          progress_bar.log("  Error processing file: #{e.message}")
          progress_bar.log("  Backtrace: #{e.backtrace.first}")
        end
        raise e
      end
    end

    def process_chunks(chunks, file_path, progress_bar = nil)
      return 0 if chunks.empty?

      # Extract texts for embedding
      texts = chunks.map { |c| c[:text] }

      # Generate embeddings (silently)
      embeddings = @embedder.embed_batch(texts, show_progress: false)

      # Prepare documents for database
      documents = []
      chunks.each_with_index do |chunk, idx|
        embedding = embeddings[idx]
        next unless embedding # Skip if embedding failed

        doc = {
          id: SecureRandom.uuid,
          chunk_text: chunk[:text],
          file_path: file_path,
          chunk_index: chunk[:index],
          embedding: embedding,
          metadata: chunk[:metadata] || {}
        }

        # Note: No need to add reduced_embedding field anymore!
        # Lancelot now supports optional fields after our fix

        documents << doc
      end

      # Store in database
      if documents.any?
        @database.add_documents(documents)
        # Successfully stored chunks (silent to preserve progress bar)
      end

      documents.size
    end

    def extract_text_from_file(file_path)
      # Use parser-core to extract text from various file formats
      begin
        ParseKit.parse_file(file_path)
      rescue => e
        # If parser-core fails, try reading as plain text for known text formats
        ext = File.extname(file_path).downcase
        if %w[.txt .md .markdown .text .log .rb .py .js .rs .go .java .cpp .c .h].include?(ext)
          File.read(file_path, encoding: 'UTF-8')
        else
          raise e
        end
      end
    end

    def self.supported_extensions
      # Extended list of supported formats through parser-core
      %w[.txt .md .markdown .text .log .csv .json .xml .html .htm
         .pdf .docx .doc .xlsx .xls .pptx .ppt
         .rb .py .js .rs .go .java .cpp .c .h]
    end

    def self.is_text_file?(file_path)
      # Check by extension
      ext = File.extname(file_path).downcase
      return true if supported_extensions.include?(ext)

      # Check if file appears to be text
      begin
        # Read first 8KB to check if it's text
        sample = File.read(file_path, 8192, mode: 'rb')
        return false if sample.nil?

        # Check for binary content
        null_count = sample.count("\x00")
        return false if null_count > 0

        # Check if mostly printable ASCII
        printable = sample.count("\t\n\r\x20-\x7E")
        ratio = printable.to_f / sample.size
        ratio > 0.9
      rescue
        false
      end
    end
  end
end
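A short sketch of the typical indexing flow with the Indexer above. The paths are placeholders; chunking and embedding defaults come from Ragnar's constants, and the whole chunk-embed-store pipeline runs inside index_path.

# Sketch only; "./docs" and "./ragnar_index" are placeholder paths.
require "ragnar"

indexer = Ragnar::Indexer.new(db_path: "./ragnar_index", show_progress: false)
stats = indexer.index_path("./docs")
puts "files: #{stats[:files_processed]}, chunks: #{stats[:chunks_created]}, errors: #{stats[:errors]}"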
data/lib/ragnar/llm_manager.rb
@@ -0,0 +1,43 @@
module Ragnar
  # Singleton manager for LLM instances to avoid reloading models
  class LLMManager
    include Singleton

    def initialize
      @llms = {}
      @mutex = Mutex.new
    end

    # Get or create an LLM instance
    # @param model_id [String] The model identifier
    # @param gguf_file [String, nil] Optional GGUF file for quantized models
    # @return [Candle::LLM] The LLM instance
    def get_llm(model_id: "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                gguf_file: "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")
      cache_key = "#{model_id}:#{gguf_file}"

      @mutex.synchronize do
        @llms[cache_key] ||= begin
          puts "Loading LLM: #{model_id}..." unless @llms.key?(cache_key)
          if gguf_file
            Candle::LLM.from_pretrained(model_id, gguf_file: gguf_file)
          else
            Candle::LLM.from_pretrained(model_id)
          end
        end
      end
    end

    # Clear all cached models (useful for memory management)
    def clear_cache
      @mutex.synchronize do
        @llms.clear
      end
    end

    # Get the default LLM for the application
    def default_llm
      get_llm
    end
  end
end
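Finally, a sketch of how the singleton LLM cache is meant to be used. Because the class includes Singleton, it is accessed through .instance; the first call downloads the default TinyLlama GGUF model via Candle, and repeated calls with the same model return the cached instance.

# Sketch only; first call is slow (model download), later calls hit the cache.
require "ragnar"

llm  = Ragnar::LLMManager.instance.default_llm
same = Ragnar::LLMManager.instance.get_llm # same cached Candle::LLM, no reload
Ragnar::LLMManager.instance.clear_cache    # drop cached models to free memory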