ragnar-cli 0.1.0.pre.1 → 0.1.0.pre.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c0b9db7d48838621cadf2a90bff6fc4afca333ecd7fdc2364666196fce437474
4
- data.tar.gz: '0468b4cdb2893fb80b7b52ad9c9dc4857bb6a90e0105c0e4356ba0612a5bcfae'
3
+ metadata.gz: 6b9a7fdbf0345f1c111f8028f8b881d8014a55226cfe3d02f6a76fd6cd9b213c
4
+ data.tar.gz: 2341e27f16b442c0631876303e0da5141438559ef6685e2c15514cd18416d99c
5
5
  SHA512:
6
- metadata.gz: c398c02f5019e86476a59ebe64639ec42b009c4699d3704a39492950d00e3be0e252c0cdd84101b24121f359f12c8652c3129643376fc4b84b03c9ab99843f13
7
- data.tar.gz: c295cfa1ec329d954d7d86e026b29536d83194ed47ed3a28c96527cfc3c8ae7c8d652a09d4b4e771599e1671a16523bcffa79abb01feddb5592edb48561b19ff
6
+ metadata.gz: a87f39a5dfd246732be4e24b19aba8b49a7a735f78d825d0241f04b0b776fc8b23c15f0b3488416dedfba37d9027ff6442f38ab2aad43bb79395e0c769247275
7
+ data.tar.gz: 00d7533c2e16b57da59786a840f1b653cfe472c63168a6e003abb31a79b3f6e0f056fdabe32f4b4566ea682d229e3d0710167358ab7c78fd18c497690d9a3675
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Ragnar
1
+ <img src="/docs/assets/ragnar-wide.png" alt="ragnar" height="80px">
2
2
 
3
3
  A complete Ruby implementation of Retrieval-Augmented Generation (RAG) pipeline using native Ruby ML/NLP gems.
4
4
 
@@ -419,11 +419,11 @@ MIT License - see LICENSE file for details
419
419
  ## Acknowledgments
420
420
 
421
421
  This project integrates several excellent Ruby gems:
422
- - [red-candle](https://github.com/red-candle) - Ruby ML/LLM toolkit
423
- - [lancelot](https://github.com/lancelot) - Lance database bindings
422
+ - [red-candle](https://github.com/assaydepot/red-candle) - Ruby ML/LLM toolkit
423
+ - [lancelot](https://github.com/cpetersen/lancelot) - Lance database bindings
424
424
  - [clusterkit](https://github.com/cpetersen/clusterkit) - UMAP and clustering implementation
425
425
  - [parsekit](https://github.com/cpetersen/parsekit) - Content extraction
426
- - [baran](https://github.com/baran) - Text splitting utilities
426
+ - [baran](https://github.com/moeki0/baran) - Text splitting utilities
427
427
 
428
428
  ## Roadmap
429
429
 
@@ -436,4 +436,4 @@ This project integrates several excellent Ruby gems:
436
436
  - [ ] Performance benchmarking suite
437
437
  - [ ] Support for multiple embedding models simultaneously
438
438
  - [ ] Query result caching
439
- - [ ] Automatic index optimization
439
+ - [ ] Automatic index optimization
data/lib/ragnar/cli.rb CHANGED
@@ -127,7 +127,22 @@ module Ragnar
127
127
  exit 1
128
128
  end
129
129
 
130
- embeddings = docs_with_embeddings.map { |d| d[:embedding] }
130
+ # Check if we have reduced embeddings available
131
+ first_doc = docs_with_embeddings.first
132
+ has_reduced = first_doc[:reduced_embedding] && !first_doc[:reduced_embedding].empty?
133
+
134
+ if has_reduced
135
+ embeddings = docs_with_embeddings.map { |d| d[:reduced_embedding] }
136
+ say "Using reduced embeddings (#{embeddings.first.size} dimensions)", :yellow if options[:verbose]
137
+ # Already reduced, so don't reduce again in the engine
138
+ reduce_dims = false
139
+ else
140
+ embeddings = docs_with_embeddings.map { |d| d[:embedding] }
141
+ say "Using original embeddings (#{embeddings.first.size} dimensions)", :yellow if options[:verbose]
142
+ # Let the engine handle dimensionality reduction if needed
143
+ reduce_dims = true
144
+ end
145
+
131
146
  documents = docs_with_embeddings.map { |d| d[:chunk_text] }
132
147
  metadata = docs_with_embeddings.map { |d| { file_path: d[:file_path], chunk_index: d[:chunk_index] } }
133
148
 
@@ -137,7 +152,8 @@ module Ragnar
137
152
  engine = Ragnar::TopicModeling::Engine.new(
138
153
  min_cluster_size: options[:min_cluster_size],
139
154
  labeling_method: options[:method].to_sym,
140
- verbose: options[:verbose]
155
+ verbose: options[:verbose],
156
+ reduce_dimensions: reduce_dims
141
157
  )
142
158
 
143
159
  # Extract topics
@@ -128,19 +128,99 @@ module Ragnar
128
128
  def reduce_dimensions(embeddings)
129
129
  require 'clusterkit'
130
130
 
131
- umap = ClusterKit::Dimensionality::UMAP.new(
132
- n_components: @n_components,
133
- n_neighbors: 15,
134
- random_seed: 42 # For reproducibility
135
- )
136
-
137
- # Convert to format UMAP expects
138
- umap.fit_transform(embeddings)
131
+ # Validate embeddings before UMAP
132
+ valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
133
+
134
+ if valid_embeddings.empty?
135
+ raise "No valid embeddings for dimensionality reduction.\n\n" \
136
+ "All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
137
+ "Try running without dimensionality reduction:\n" \
138
+ " ragnar topics --reduce-dimensions false"
139
+ end
140
+
141
+ if invalid_indices.any? && @verbose
142
+ puts " ⚠️ Warning: #{invalid_indices.size} embeddings with invalid values removed"
143
+ end
144
+
145
+ begin
146
+ # Adjust parameters based on data size
147
+ n_samples = valid_embeddings.size
148
+ n_components = [@n_components, n_samples - 1, 50].min
149
+ n_neighbors = [15, n_samples - 1].min
150
+
151
+ if @verbose && n_components != @n_components
152
+ puts " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
153
+ end
154
+
155
+ umap = ClusterKit::Dimensionality::UMAP.new(
156
+ n_components: n_components,
157
+ n_neighbors: n_neighbors,
158
+ random_seed: 42 # For reproducibility
159
+ )
160
+
161
+ # Convert to format UMAP expects
162
+ reduced = umap.fit_transform(valid_embeddings)
163
+
164
+ # If we had to remove invalid embeddings, reconstruct the full array
165
+ if invalid_indices.any?
166
+ full_reduced = []
167
+ valid_idx = 0
168
+ embeddings.size.times do |i|
169
+ if invalid_indices.include?(i)
170
+ # Use zeros for invalid embeddings (they'll be outliers anyway)
171
+ full_reduced << Array.new(n_components, 0.0)
172
+ else
173
+ full_reduced << reduced[valid_idx]
174
+ valid_idx += 1
175
+ end
176
+ end
177
+ full_reduced
178
+ else
179
+ reduced
180
+ end
181
+ rescue => e
182
+ if e.message.include?("index out of bounds")
183
+ error_msg = "\n❌ Dimensionality reduction failed\n\n"
184
+ error_msg += "The UMAP algorithm encountered an error with your data.\n\n"
185
+ error_msg += "This typically happens with:\n"
186
+ error_msg += " • Embeddings containing invalid values\n"
187
+ error_msg += " • Too few samples (#{valid_embeddings.size} valid embeddings)\n"
188
+ error_msg += " • Incompatible parameters\n\n"
189
+ error_msg += "Solutions:\n"
190
+ error_msg += " 1. Run without dimensionality reduction:\n"
191
+ error_msg += " ragnar topics --reduce-dimensions false\n\n"
192
+ error_msg += " 2. Use fewer dimensions:\n"
193
+ error_msg += " ragnar topics --n-components 2\n\n"
194
+ error_msg += " 3. Re-index your documents:\n"
195
+ error_msg += " ragnar index <path> --force\n"
196
+ raise error_msg
197
+ else
198
+ raise
199
+ end
200
+ end
139
201
  rescue LoadError
140
202
  puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
141
203
  embeddings
142
204
  end
143
205
 
206
+ private
207
+
208
+ def validate_embeddings_for_umap(embeddings)
209
+ valid = []
210
+ invalid_indices = []
211
+
212
+ embeddings.each_with_index do |embedding, idx|
213
+ if embedding.is_a?(Array) &&
214
+ embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
215
+ valid << embedding
216
+ else
217
+ invalid_indices << idx
218
+ end
219
+ end
220
+
221
+ [valid, invalid_indices]
222
+ end
223
+
144
224
  def build_topics(cluster_ids)
145
225
  @cluster_ids = cluster_ids
146
226
 
@@ -28,6 +28,67 @@ module Ragnar
28
28
 
29
29
  puts "Found #{embeddings.size} embeddings"
30
30
 
31
+ # Validate embeddings
32
+ embedding_dims = embeddings.map(&:size).uniq
33
+ if embedding_dims.size > 1
34
+ puts " ⚠️ Warning: Inconsistent embedding dimensions found: #{embedding_dims.inspect}"
35
+ puts " This may cause errors during UMAP training."
36
+ # Filter to only embeddings with the most common dimension
37
+ most_common_dim = embedding_dims.max_by { |dim| embeddings.count { |e| e.size == dim } }
38
+ embeddings = embeddings.select { |e| e.size == most_common_dim }
39
+ puts " Using only embeddings with #{most_common_dim} dimensions (#{embeddings.size} embeddings)"
40
+ end
41
+
42
+ # Check for nil or invalid values
43
+ invalid_count = 0
44
+ nan_count = 0
45
+ inf_count = 0
46
+
47
+ valid_embeddings = embeddings.select do |embedding|
48
+ if !embedding.is_a?(Array)
49
+ invalid_count += 1
50
+ false
51
+ elsif embedding.any? { |v| !v.is_a?(Numeric) }
52
+ invalid_count += 1
53
+ false
54
+ elsif embedding.any?(&:nan?)
55
+ nan_count += 1
56
+ false
57
+ elsif embedding.any? { |v| !v.finite? }
58
+ inf_count += 1
59
+ false
60
+ else
61
+ true
62
+ end
63
+ end
64
+
65
+ if valid_embeddings.size < embeddings.size
66
+ puts "\n ⚠️ Data quality issues detected:"
67
+ puts " • Invalid embeddings: #{invalid_count}" if invalid_count > 0
68
+ puts " • Embeddings with NaN: #{nan_count}" if nan_count > 0
69
+ puts " • Embeddings with Infinity: #{inf_count}" if inf_count > 0
70
+ puts " • Total removed: #{embeddings.size - valid_embeddings.size}"
71
+ puts " • Remaining valid: #{valid_embeddings.size}"
72
+
73
+ embeddings = valid_embeddings
74
+ end
75
+
76
+ if embeddings.empty?
77
+ raise "No valid embeddings found after validation.\n\n" \
78
+ "All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
79
+ "This suggests a problem with the embedding model or indexing process.\n\n" \
80
+ "Please try:\n" \
81
+ " 1. Re-indexing your documents: ragnar index <path> --force\n" \
82
+ " 2. Using a different embedding model\n" \
83
+ " 3. Checking your document content for unusual characters"
84
+ end
85
+
86
+ if embeddings.size < 10
87
+ raise "Too few valid embeddings (#{embeddings.size}) for UMAP training.\n\n" \
88
+ "UMAP requires at least 10 samples to work effectively.\n" \
89
+ "Please index more documents or check for data quality issues."
90
+ end
91
+
31
92
  # Adjust parameters based on the number of samples
32
93
  # UMAP requires n_neighbors < n_samples
33
94
  # Also, n_components should be less than n_samples for stability
@@ -55,41 +116,68 @@ module Ragnar
55
116
  embedding_matrix = embeddings
56
117
  original_dims = embeddings.first.size
57
118
 
119
+ # Ensure n_components is reasonable
120
+ if n_components >= original_dims
121
+ puts " ⚠️ Warning: n_components (#{n_components}) >= original dimensions (#{original_dims})"
122
+ n_components = [original_dims / 2, 50].min
123
+ puts " Reducing n_components to #{n_components}"
124
+ end
125
+
126
+ # For very high dimensional data, be more conservative
127
+ if original_dims > 500 && n_components > 50
128
+ puts " ⚠️ Note: High dimensional data (#{original_dims}D) being reduced to #{n_components}D"
129
+ puts " Consider using n_components <= 50 for stability"
130
+ end
131
+
58
132
  puts "\nTraining UMAP model..."
59
133
  puts " Original dimensions: #{original_dims}"
60
134
  puts " Target dimensions: #{n_components}"
61
135
  puts " Neighbors: #{n_neighbors}"
62
136
  puts " Min distance: #{min_dist}"
63
137
 
64
- # Use the simple ClusterKit.umap method
65
- progressbar = TTY::ProgressBar.new(
66
- "Training UMAP [:bar] :percent",
67
- total: 100,
68
- bar_format: :block,
69
- width: 30
70
- )
71
-
72
- # Start progress in background (ClusterKit doesn't provide callbacks)
73
- progress_thread = Thread.new do
74
- 100.times do
75
- sleep(0.05)
76
- progressbar.advance
77
- break if @training_complete
138
+ # Perform the actual training using the class-based API
139
+ puts " Training UMAP model (this may take a moment)..."
140
+
141
+ begin
142
+ @umap_instance = ClusterKit::Dimensionality::UMAP.new(
143
+ n_components: n_components,
144
+ n_neighbors: n_neighbors
145
+ )
146
+
147
+ @reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
148
+
149
+ puts " ✓ UMAP training complete"
150
+ rescue => e
151
+ # Provide helpful error message without exposing internal stack trace
152
+ error_msg = "\n❌ UMAP training failed\n\n"
153
+
154
+ if e.message.include?("index out of bounds")
155
+ error_msg += "The UMAP algorithm encountered an index out of bounds error.\n\n"
156
+ error_msg += "This typically happens when:\n"
157
+ error_msg += " • The embedding data contains invalid values (NaN, Infinity)\n"
158
+ error_msg += " • The parameters are incompatible with your data\n"
159
+ error_msg += " • There are duplicate or corrupted embeddings\n\n"
160
+ error_msg += "Suggested solutions:\n"
161
+ error_msg += " 1. Try with more conservative parameters:\n"
162
+ error_msg += " ragnar train-umap --n-components 10 --n-neighbors 5\n\n"
163
+ error_msg += " 2. Re-index your documents to regenerate embeddings:\n"
164
+ error_msg += " ragnar index <path> --force\n\n"
165
+ error_msg += " 3. Check your embedding model configuration\n\n"
166
+ error_msg += "Current parameters:\n"
167
+ error_msg += " • n_components: #{n_components}\n"
168
+ error_msg += " • n_neighbors: #{n_neighbors}\n"
169
+ error_msg += " • embeddings: #{embeddings.size} samples\n"
170
+ error_msg += " • dimensions: #{original_dims}\n"
171
+ else
172
+ error_msg += "Error: #{e.message}\n\n"
173
+ error_msg += "This may be due to incompatible parameters or data issues.\n"
174
+ error_msg += "Try using more conservative parameters:\n"
175
+ error_msg += " ragnar train-umap --n-components 10 --n-neighbors 5\n"
78
176
  end
177
+
178
+ raise RuntimeError, error_msg
79
179
  end
80
180
 
81
- # Perform the actual training using the class-based API
82
- @umap_instance = ClusterKit::Dimensionality::UMAP.new(
83
- n_components: n_components,
84
- n_neighbors: n_neighbors
85
- )
86
-
87
- @reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
88
-
89
- @training_complete = true
90
- progress_thread.join
91
- progressbar.finish
92
-
93
181
  # Store the parameters for saving
94
182
  @model_params = {
95
183
  n_components: n_components,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ragnar
4
- VERSION = "0.1.0.pre.1"
4
+ VERSION = "0.1.0.pre.3"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ragnar-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.pre.1
4
+ version: 0.1.0.pre.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-08-22 00:00:00.000000000 Z
11
+ date: 2025-08-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.0'
33
+ version: '1.2'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.0'
40
+ version: '1.2'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: lancelot
43
43
  requirement: !ruby/object:Gem::Requirement