ragnar-cli 0.1.0.pre.1

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -0,0 +1,267 @@
+ module Ragnar
+   class Database
+     attr_reader :db_path, :table_name
+
+     def initialize(db_path, table_name: "documents")
+       @db_path = db_path
+       @table_name = table_name
+       ensure_database_exists
+     end
+
+     def add_documents(documents)
+       return if documents.empty?
+
+       # Convert documents to a Lance-compatible format
+       data = documents.map do |doc|
+         {
+           id: doc[:id],
+           chunk_text: doc[:chunk_text],
+           file_path: doc[:file_path],
+           chunk_index: doc[:chunk_index],
+           embedding: doc[:embedding],
+           metadata: doc[:metadata].to_json
+         }
+       end
+
+       # Define the table schema, including the vector type
+       embedding_size = documents.first[:embedding].size
+       schema = {
+         id: :string,
+         chunk_text: :string,
+         file_path: :string,
+         chunk_index: :int64,
+         embedding: { type: "vector", dimension: embedding_size },
+         metadata: :string
+       }
+
+       # Use Lancelot's open_or_create, which handles both creating a new
+       # dataset and opening an existing one
+       dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
+       dataset.add_documents(data)
+     end
+
+     def get_embeddings(limit: nil, offset: 0)
+       return [] unless dataset_exists?
+
+       dataset = Lancelot::Dataset.open(@db_path)
+
+       # Get all documents or a subset
+       docs = if limit && offset > 0
+                # Fetch limit + offset items, then drop the offset
+                dataset.first(limit + offset).drop(offset)
+              elsif limit
+                dataset.first(limit)
+              else
+                dataset.to_a.drop(offset)
+              end
+
+       docs.map do |doc|
+         {
+           id: doc[:id],
+           embedding: doc[:embedding],
+           reduced_embedding: doc[:reduced_embedding]
+         }
+       end
+     end
+
+     def update_reduced_embeddings(updates)
+       return if updates.empty?
+
+       dataset = Lancelot::Dataset.open(@db_path)
+
+       # Get all existing documents and safely extract their data
+       all_docs = dataset.to_a.map do |doc|
+         # Safely extract the fields we know about
+         {
+           id: doc[:id],
+           content: doc[:content],
+           chunk_text: doc[:chunk_text],
+           file_path: doc[:file_path],
+           chunk_index: doc[:chunk_index],
+           embedding: doc[:embedding],
+           metadata: doc[:metadata],
+           reduced_embedding: doc[:reduced_embedding]
+         }
+       end
+
+       # Create a map for quick lookup
+       update_map = updates.each_with_object({}) do |update, map|
+         map[update[:id]] = update[:reduced_embedding]
+       end
+
+       # Update documents with reduced embeddings
+       updated_docs = all_docs.map do |doc|
+         if update_map[doc[:id]]
+           doc.merge(reduced_embedding: update_map[doc[:id]])
+         else
+           doc
+         end
+       end
+
+       # We need to recreate the dataset with the updated data.
+       # First, rebuild the schema, now including the reduced_embedding field.
+       embedding_size = all_docs.first[:embedding].size
+       reduced_size = updates.first[:reduced_embedding].size
+
+       schema = {
+         id: :string,
+         chunk_text: :string,
+         file_path: :string,
+         chunk_index: :int64,
+         embedding: { type: "vector", dimension: embedding_size },
+         reduced_embedding: { type: "vector", dimension: reduced_size },
+         metadata: :string
+       }
+
+       # Remove the old dataset and create a new one with the updated data
+       FileUtils.rm_rf(@db_path)
+       # open_or_create will create the dataset, since we just deleted the path
+       dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
+       dataset.add_documents(updated_docs)
+     end
+
+     def search_similar(embedding, k: 10, use_reduced: false)
+       return [] unless dataset_exists?
+
+       dataset = Lancelot::Dataset.open(@db_path)
+
+       embedding_field = use_reduced ? :reduced_embedding : :embedding
+
+       # Perform the vector search
+       results = dataset.vector_search(
+         embedding.to_a,
+         column: embedding_field,
+         limit: k
+       )
+
+       results.map do |row|
+         {
+           id: row[:id],
+           chunk_text: row[:chunk_text],
+           file_path: row[:file_path],
+           chunk_index: row[:chunk_index],
+           distance: row[:_distance],
+           metadata: JSON.parse(row[:metadata] || "{}")
+         }
+       end
+     end
+
+     def count
+       return 0 unless dataset_exists?
+
+       dataset = Lancelot::Dataset.open(@db_path)
+       dataset.to_a.size
+     end
+
+     def get_stats
+       unless dataset_exists?
+         return {
+           document_count: 0,
+           total_documents: 0,
+           unique_files: 0,
+           total_chunks: 0,
+           with_embeddings: 0,
+           with_reduced_embeddings: 0,
+           total_size_mb: 0.0
+         }
+       end
+
+       dataset = Lancelot::Dataset.open(@db_path)
+
+       # Get all documents
+       all_docs = dataset.to_a
+
+       stats = {
+         document_count: all_docs.size, # kept for compatibility with the specs
+         total_documents: all_docs.size,
+         total_chunks: all_docs.size,
+         unique_files: all_docs.map { |d| d[:file_path] }.uniq.size,
+         with_embeddings: 0,
+         with_reduced_embeddings: 0,
+         avg_chunk_size: 0,
+         total_size_mb: 0, # used by the CLI stats command
+         embedding_dims: nil,
+         reduced_dims: nil
+       }
+
+       chunk_sizes = []
+       total_bytes = 0
+
+       all_docs.each do |doc|
+         if doc[:embedding] && !doc[:embedding].empty?
+           stats[:with_embeddings] += 1
+           stats[:embedding_dims] ||= doc[:embedding].size
+         end
+
+         if doc[:reduced_embedding] && !doc[:reduced_embedding].empty?
+           stats[:with_reduced_embeddings] += 1
+           stats[:reduced_dims] ||= doc[:reduced_embedding].size
+         end
+
+         if doc[:chunk_text]
+           chunk_size = doc[:chunk_text].size
+           chunk_sizes << chunk_size
+           total_bytes += chunk_size
+         end
+       end
+
+       stats[:avg_chunk_size] = (chunk_sizes.sum.to_f / chunk_sizes.size).round if chunk_sizes.any?
+       stats[:total_size_mb] = (total_bytes / 1024.0 / 1024.0).round(2)
+
+       stats
+     end
+
+     def get_all_documents_with_embeddings(limit: nil)
+       return [] unless dataset_exists?
+
+       dataset = Lancelot::Dataset.open(@db_path)
+       all_docs = limit ? dataset.first(limit) : dataset.to_a
+
+       all_docs.select { |doc| doc[:embedding] && !doc[:embedding].empty? }
+     end
+
+     def full_text_search(query, limit: 10)
+       return [] unless dataset_exists?
+
+       dataset = Lancelot::Dataset.open(@db_path)
+
+       # Use Lancelot's full-text search
+       results = dataset.full_text_search(
+         query,
+         columns: [:chunk_text],
+         limit: limit
+       )
+
+       results.map do |row|
+         {
+           id: row[:id],
+           chunk_text: row[:chunk_text],
+           file_path: row[:file_path],
+           chunk_index: row[:chunk_index],
+           metadata: JSON.parse(row[:metadata] || "{}")
+         }
+       end
+     end
+
+     def dataset_exists?
+       return false unless File.exist?(@db_path)
+
+       begin
+         Lancelot::Dataset.open(@db_path)
+         true
+       rescue
+         false
+       end
+     end
+
+     private
+
+     def ensure_database_exists
+       # No need to create the directory here; Lance handles it
+     end
+
+     def table_exists?
+       dataset_exists?
+     end
+   end
+ end
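
For orientation, here is a minimal usage sketch of the Database class above. It is illustrative only, not part of the package: the require name, path, ids, and the tiny 3-dimensional embeddings are placeholders, and a real embedding model produces far larger vectors.

require "ragnar"  # assumed entry point for the ragnar-cli gem

db = Ragnar::Database.new("tmp/ragnar.lance")  # hypothetical path

# Each document supplies the fields named in the schema that add_documents
# builds; the toy embedding stands in for real model output.
db.add_documents([
  {
    id: "doc-1",
    chunk_text: "Ruby bindings for Lance datasets.",
    file_path: "README.md",
    chunk_index: 0,
    embedding: [0.1, 0.2, 0.3],
    metadata: { source: "readme" }
  }
])

# Nearest neighbours by vector distance (the metric is whatever Lancelot defaults to)
hits = db.search_similar([0.1, 0.2, 0.25], k: 5)
hits.each { |h| puts "#{h[:file_path]}##{h[:chunk_index]} (distance #{h[:distance]})" }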
@@ -0,0 +1,137 @@
+ module Ragnar
+   class Embedder
+     attr_reader :model, :model_name
+
+     def initialize(model_name: Ragnar::DEFAULT_EMBEDDING_MODEL)
+       @model_name = model_name
+       @model = load_model(model_name)
+     end
+
+     def embed_text(text)
+       return nil if text.nil? || text.empty? || (text.respond_to?(:strip) && text.strip.empty?)
+
+       # Use Candle to generate the embedding.
+       # The embedding method returns a tensor, which we convert to an array.
+       embedding = @model.embedding(text)
+
+       # Convert the tensor to an array. Candle tensors need to_a twice:
+       # the first to_a gives [tensor], the second (on that tensor) gives the float array.
+       if embedding.respond_to?(:to_a)
+         result = embedding.to_a
+         if result.is_a?(Array) && result.first.respond_to?(:to_a)
+           result.first.to_a
+         else
+           result
+         end
+       else
+         embedding
+       end
+     rescue => e
+       puts "Error generating embedding: #{e.message}"
+       nil
+     end
+
+     def embed_batch(texts, show_progress: true)
+       embeddings = []
+
+       if show_progress
+         progressbar = TTY::ProgressBar.new(
+           "Generating embeddings [:bar] :percent :current/:total",
+           total: texts.size,
+           bar_format: :block,
+           width: 30
+         )
+       end
+
+       texts.each do |text|
+         embedding = embed_text(text)
+         embeddings << embedding
+         progressbar.advance if show_progress
+       end
+
+       embeddings
+     end
+
+     def embed_chunks(chunks, show_progress: true)
+       texts = chunks.map do |chunk|
+         if chunk.is_a?(Hash)
+           chunk[:text] || chunk["text"]
+         else
+           chunk.to_s
+         end
+       end
+
+       embed_batch(texts, show_progress: show_progress)
+     end
+
+     private
+
+     def load_model(model_name)
+       # Initialize the Candle embedding model via the standardized from_pretrained method
+       begin
+         Candle::EmbeddingModel.from_pretrained(model_name)
+       rescue => e
+         puts "Warning: Could not load model #{model_name}, falling back to default"
+         puts "Error: #{e.message}"
+
+         # Fall back to the default model
+         begin
+           Candle::EmbeddingModel.from_pretrained("jinaai/jina-embeddings-v2-base-en")
+         rescue => fallback_error
+           puts "Error loading fallback model: #{fallback_error.message}"
+           # Last resort: try the old initialization method for backwards compatibility
+           Candle::EmbeddingModel.new
+         end
+       end
+     end
+
+     def self.available_models
+       # List of commonly used embedding models;
+       # this could be expanded or made dynamic
+       [
+         "BAAI/bge-small-en-v1.5",
+         "BAAI/bge-base-en-v1.5",
+         "BAAI/bge-large-en-v1.5",
+         "sentence-transformers/all-MiniLM-L6-v2",
+         "sentence-transformers/all-mpnet-base-v2",
+         "thenlper/gte-small",
+         "thenlper/gte-base",
+         "thenlper/gte-large"
+       ]
+     end
+
+     def self.model_info(model_name)
+       # Provide information about the embedding models
+       info = {
+         "BAAI/bge-small-en-v1.5" => {
+           dimensions: 384,
+           max_tokens: 512,
+           description: "Small, fast, good quality embeddings"
+         },
+         "BAAI/bge-base-en-v1.5" => {
+           dimensions: 768,
+           max_tokens: 512,
+           description: "Balanced size and quality"
+         },
+         "BAAI/bge-large-en-v1.5" => {
+           dimensions: 1024,
+           max_tokens: 512,
+           description: "Large, highest quality embeddings"
+         },
+         "sentence-transformers/all-MiniLM-L6-v2" => {
+           dimensions: 384,
+           max_tokens: 256,
+           description: "Fast, lightweight model"
+         },
+         "sentence-transformers/all-mpnet-base-v2" => {
+           dimensions: 768,
+           max_tokens: 384,
+           description: "High quality general purpose embeddings"
+         }
+       }
+
+       info[model_name] || { description: "Model information not available" }
+     end
+   end
+ end
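
A similar illustrative sketch for the Embedder (again not part of the package diff). The first call downloads model weights via Candle, and the expected dimension comes from the model_info table above.

embedder = Ragnar::Embedder.new(model_name: "BAAI/bge-small-en-v1.5")

vector = embedder.embed_text("What is a vector database?")
puts vector.size  # expected 384 for bge-small-en-v1.5, per model_info above

# Batch embedding without the progress bar
vectors = embedder.embed_batch(["first chunk", "second chunk"], show_progress: false)
puts Ragnar::Embedder.model_info("BAAI/bge-small-en-v1.5")[:description]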
@@ -0,0 +1,234 @@
+ require 'parsekit'
+
+ module Ragnar
+   class Indexer
+     attr_reader :database, :chunker, :embedder
+
+     def initialize(db_path: Ragnar::DEFAULT_DB_PATH,
+                    chunk_size: Ragnar::DEFAULT_CHUNK_SIZE,
+                    chunk_overlap: Ragnar::DEFAULT_CHUNK_OVERLAP,
+                    embedding_model: Ragnar::DEFAULT_EMBEDDING_MODEL,
+                    show_progress: true)
+       @database = Database.new(db_path)
+       @chunker = Chunker.new(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
+       @embedder = Embedder.new(model_name: embedding_model)
+       @show_progress = show_progress
+     end
+
+     def index_path(path)
+       stats = {
+         files_processed: 0,
+         chunks_created: 0,
+         errors: 0
+       }
+
+       files = collect_files(path)
+
+       if files.empty?
+         puts "No text files found at path: #{path}"
+         return stats
+       end
+
+       puts "Found #{files.size} file(s) to process" if @show_progress
+
+       file_progress = if @show_progress
+                         TTY::ProgressBar.new(
+                           "Processing [:bar] :percent :current/:total - :filename",
+                           total: files.size,
+                           bar_format: :block,
+                           width: 30,
+                           clear: true
+                         )
+                       else
+                         nil
+                       end
+
+       files.each do |file_path|
+         begin
+           if file_progress
+             # Update the progress bar with the current filename
+             filename = File.basename(file_path)
+             filename = filename[0..27] + "..." if filename.length > 30
+             file_progress.advance(0, filename: filename)
+           end
+
+           process_file(file_path, stats, file_progress)
+           stats[:files_processed] += 1
+         rescue => e
+           if file_progress
+             file_progress.log "Error: #{File.basename(file_path)} - #{e.message}"
+           else
+             puts "Error processing #{File.basename(file_path)}: #{e.message}" if @show_progress
+           end
+           stats[:errors] += 1
+         ensure
+           file_progress&.advance
+         end
+       end
+
+       stats
+     end
+
+     def index_text(text, metadata = {})
+       chunks = @chunker.chunk_text(text, metadata)
+       process_chunks(chunks, metadata[:file_path] || "inline_text")
+     end
+
+     # Convenience methods for compatibility
+     def index_files(files)
+       stats = {
+         files_processed: 0,
+         chunks_created: 0,
+         errors: 0
+       }
+
+       files.each do |file|
+         next unless File.exist?(file)
+         process_file(file, stats)
+         stats[:files_processed] += 1
+       end
+
+       stats
+     end
+
+     def index_directory(dir_path)
+       index_path(dir_path)
+     end
+
+     private
+
+     def collect_files(path)
+       if File.file?(path)
+         [path]
+       elsif File.directory?(path)
+         # Many more file types are supported through ParseKit
+         pattern = "*.{txt,md,markdown,text,pdf,docx,doc,xlsx,xls,pptx,ppt,csv,json,xml,html,htm,rb,py,js,rs,go,java,cpp,c,h}"
+         Dir.glob(File.join(path, "**", pattern))
+       else
+         []
+       end
+     end
+
+     def process_file(file_path, stats, progress_bar = nil)
+       # Extract the text using ParseKit
+       begin
+         text = extract_text_from_file(file_path)
+
+         if text.nil? || text.strip.empty?
+           progress_bar.log(" Skipped: #{File.basename(file_path)} (empty or unsupported)") if progress_bar
+           return
+         end
+
+         # Create metadata
+         metadata = {
+           file_path: file_path,
+           file_name: File.basename(file_path),
+           file_type: File.extname(file_path).downcase[1..-1] || 'unknown'
+         }
+
+         # Chunk the extracted text
+         chunks = @chunker.chunk_text(text, metadata)
+
+         if chunks.empty?
+           progress_bar.log(" Skipped: #{File.basename(file_path)} (text too short)") if progress_bar
+           return
+         end
+
+         # Process the chunks and create documents
+         chunk_count = process_chunks(chunks, file_path, progress_bar)
+         stats[:chunks_created] += chunk_count
+       rescue => e
+         if progress_bar
+           progress_bar.log(" Error processing file: #{e.message}")
+           progress_bar.log(" Backtrace: #{e.backtrace.first}")
+         end
+         raise e
+       end
+     end
+
+     def process_chunks(chunks, file_path, progress_bar = nil)
+       return 0 if chunks.empty?
+
+       # Extract the texts for embedding
+       texts = chunks.map { |c| c[:text] }
+
+       # Generate embeddings (silently)
+       embeddings = @embedder.embed_batch(texts, show_progress: false)
+
+       # Prepare documents for the database
+       documents = []
+       chunks.each_with_index do |chunk, idx|
+         embedding = embeddings[idx]
+         next unless embedding # Skip if embedding failed
+
+         doc = {
+           id: SecureRandom.uuid,
+           chunk_text: chunk[:text],
+           file_path: file_path,
+           chunk_index: chunk[:index],
+           embedding: embedding,
+           metadata: chunk[:metadata] || {}
+         }
+
+         # Note: there is no need to add a reduced_embedding field here;
+         # Lancelot now supports optional fields
+
+         documents << doc
+       end
+
+       # Store in the database
+       if documents.any?
+         @database.add_documents(documents)
+         # Successfully stored chunks (silent, to preserve the progress bar)
+       end
+
+       documents.size
+     end
+
+     def extract_text_from_file(file_path)
+       # Use ParseKit to extract text from various file formats
+       begin
+         ParseKit.parse_file(file_path)
+       rescue => e
+         # If ParseKit fails, try reading as plain text for known text formats
+         ext = File.extname(file_path).downcase
+         if %w[.txt .md .markdown .text .log .rb .py .js .rs .go .java .cpp .c .h].include?(ext)
+           File.read(file_path, encoding: 'UTF-8')
+         else
+           raise e
+         end
+       end
+     end
+
+     def self.supported_extensions
+       # Extended list of formats supported through ParseKit
+       %w[.txt .md .markdown .text .log .csv .json .xml .html .htm
+          .pdf .docx .doc .xlsx .xls .pptx .ppt
+          .rb .py .js .rs .go .java .cpp .c .h]
+     end
+
+     def self.is_text_file?(file_path)
+       # Check by extension
+       ext = File.extname(file_path).downcase
+       return true if supported_extensions.include?(ext)
+
+       # Check whether the file appears to be text
+       begin
+         # Read the first 8KB
+         sample = File.read(file_path, 8192, mode: 'rb')
+         return false if sample.nil?
+
+         # Check for binary content
+         null_count = sample.count("\x00")
+         return false if null_count > 0
+
+         # Check whether the sample is mostly printable ASCII
+         printable = sample.count("\t\n\r\x20-\x7E")
+         ratio = printable.to_f / sample.size
+         ratio > 0.9
+       rescue
+         false
+       end
+     end
+   end
+ end
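
And a sketch of driving the Indexer end to end, with hypothetical paths and sizes; chunking, embedding, and storage all happen inside index_path, and the returned stats hash matches the keys built above.

indexer = Ragnar::Indexer.new(
  db_path: "tmp/ragnar.lance",  # hypothetical path
  chunk_size: 512,
  chunk_overlap: 64,
  show_progress: false
)

stats = indexer.index_path("docs/")  # a single file or a directory tree
puts "#{stats[:files_processed]} files, #{stats[:chunks_created]} chunks, #{stats[:errors]} errors"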
@@ -0,0 +1,43 @@
+ module Ragnar
+   # Singleton manager for LLM instances to avoid reloading models
+   class LLMManager
+     include Singleton
+
+     def initialize
+       @llms = {}
+       @mutex = Mutex.new
+     end
+
+     # Get or create an LLM instance
+     # @param model_id [String] The model identifier
+     # @param gguf_file [String, nil] Optional GGUF file for quantized models
+     # @return [Candle::LLM] The LLM instance
+     def get_llm(model_id: "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+                 gguf_file: "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")
+       cache_key = "#{model_id}:#{gguf_file}"
+
+       @mutex.synchronize do
+         @llms[cache_key] ||= begin
+           puts "Loading LLM: #{model_id}..." unless @llms.key?(cache_key)
+           if gguf_file
+             Candle::LLM.from_pretrained(model_id, gguf_file: gguf_file)
+           else
+             Candle::LLM.from_pretrained(model_id)
+           end
+         end
+       end
+     end
+
+     # Clear all cached models (useful for memory management)
+     def clear_cache
+       @mutex.synchronize do
+         @llms.clear
+       end
+     end
+
+     # Get the default LLM for the application
+     def default_llm
+       get_llm
+     end
+   end
+ end
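
Finally, an illustrative sketch of the LLMManager (not part of the diff). Because the class includes Singleton, callers go through the instance method, so repeated lookups share one in-memory cache keyed by model_id and gguf_file.

manager = Ragnar::LLMManager.instance

llm = manager.default_llm  # loads the TinyLlama GGUF on first call
same = manager.get_llm     # same cache key, so the cached instance is returned
manager.clear_cache        # drop cached models to free memory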