ragnar-cli 0.1.0.pre.1 → 0.1.0.pre.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/lib/ragnar/cli.rb +18 -2
- data/lib/ragnar/topic_modeling/engine.rb +88 -8
- data/lib/ragnar/umap_processor.rb +114 -26
- data/lib/ragnar/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b9a7fdbf0345f1c111f8028f8b881d8014a55226cfe3d02f6a76fd6cd9b213c
|
4
|
+
data.tar.gz: 2341e27f16b442c0631876303e0da5141438559ef6685e2c15514cd18416d99c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a87f39a5dfd246732be4e24b19aba8b49a7a735f78d825d0241f04b0b776fc8b23c15f0b3488416dedfba37d9027ff6442f38ab2aad43bb79395e0c769247275
|
7
|
+
data.tar.gz: 00d7533c2e16b57da59786a840f1b653cfe472c63168a6e003abb31a79b3f6e0f056fdabe32f4b4566ea682d229e3d0710167358ab7c78fd18c497690d9a3675
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
<img src="/docs/assets/ragnar-wide.png" alt="ragnar" height="80px">
|
2
2
|
|
3
3
|
A complete Ruby implementation of Retrieval-Augmented Generation (RAG) pipeline using native Ruby ML/NLP gems.
|
4
4
|
|
@@ -419,11 +419,11 @@ MIT License - see LICENSE file for details
|
|
419
419
|
## Acknowledgments
|
420
420
|
|
421
421
|
This project integrates several excellent Ruby gems:
|
422
|
-
- [red-candle](https://github.com/red-candle) - Ruby ML/LLM toolkit
|
423
|
-
- [lancelot](https://github.com/lancelot) - Lance database bindings
|
422
|
+
- [red-candle](https://github.com/assaydepot/red-candle) - Ruby ML/LLM toolkit
|
423
|
+
- [lancelot](https://github.com/cpetersen/lancelot) - Lance database bindings
|
424
424
|
- [clusterkit](https://github.com/cpetersen/clusterkit) - UMAP and clustering implementation
|
425
425
|
- [parsekit](https://github.com/cpetersen/parsekit) - Content extraction
|
426
|
-
- [baran](https://github.com/baran) - Text splitting utilities
|
426
|
+
- [baran](https://github.com/moeki0/baran) - Text splitting utilities
|
427
427
|
|
428
428
|
## Roadmap
|
429
429
|
|
@@ -436,4 +436,4 @@ This project integrates several excellent Ruby gems:
|
|
436
436
|
- [ ] Performance benchmarking suite
|
437
437
|
- [ ] Support for multiple embedding models simultaneously
|
438
438
|
- [ ] Query result caching
|
439
|
-
- [ ] Automatic index optimization
|
439
|
+
- [ ] Automatic index optimization
|
data/lib/ragnar/cli.rb
CHANGED
@@ -127,7 +127,22 @@ module Ragnar
|
|
127
127
|
exit 1
|
128
128
|
end
|
129
129
|
|
130
|
-
|
130
|
+
# Check if we have reduced embeddings available
|
131
|
+
first_doc = docs_with_embeddings.first
|
132
|
+
has_reduced = first_doc[:reduced_embedding] && !first_doc[:reduced_embedding].empty?
|
133
|
+
|
134
|
+
if has_reduced
|
135
|
+
embeddings = docs_with_embeddings.map { |d| d[:reduced_embedding] }
|
136
|
+
say "Using reduced embeddings (#{embeddings.first.size} dimensions)", :yellow if options[:verbose]
|
137
|
+
# Already reduced, so don't reduce again in the engine
|
138
|
+
reduce_dims = false
|
139
|
+
else
|
140
|
+
embeddings = docs_with_embeddings.map { |d| d[:embedding] }
|
141
|
+
say "Using original embeddings (#{embeddings.first.size} dimensions)", :yellow if options[:verbose]
|
142
|
+
# Let the engine handle dimensionality reduction if needed
|
143
|
+
reduce_dims = true
|
144
|
+
end
|
145
|
+
|
131
146
|
documents = docs_with_embeddings.map { |d| d[:chunk_text] }
|
132
147
|
metadata = docs_with_embeddings.map { |d| { file_path: d[:file_path], chunk_index: d[:chunk_index] } }
|
133
148
|
|
@@ -137,7 +152,8 @@ module Ragnar
|
|
137
152
|
engine = Ragnar::TopicModeling::Engine.new(
|
138
153
|
min_cluster_size: options[:min_cluster_size],
|
139
154
|
labeling_method: options[:method].to_sym,
|
140
|
-
verbose: options[:verbose]
|
155
|
+
verbose: options[:verbose],
|
156
|
+
reduce_dimensions: reduce_dims
|
141
157
|
)
|
142
158
|
|
143
159
|
# Extract topics
|
@@ -128,19 +128,99 @@ module Ragnar
|
|
128
128
|
def reduce_dimensions(embeddings)
|
129
129
|
require 'clusterkit'
|
130
130
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
131
|
+
# Validate embeddings before UMAP
|
132
|
+
valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
|
133
|
+
|
134
|
+
if valid_embeddings.empty?
|
135
|
+
raise "No valid embeddings for dimensionality reduction.\n\n" \
|
136
|
+
"All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
|
137
|
+
"Try running without dimensionality reduction:\n" \
|
138
|
+
" ragnar topics --reduce-dimensions false"
|
139
|
+
end
|
140
|
+
|
141
|
+
if invalid_indices.any? && @verbose
|
142
|
+
puts " ⚠️ Warning: #{invalid_indices.size} embeddings with invalid values removed"
|
143
|
+
end
|
144
|
+
|
145
|
+
begin
|
146
|
+
# Adjust parameters based on data size
|
147
|
+
n_samples = valid_embeddings.size
|
148
|
+
n_components = [@n_components, n_samples - 1, 50].min
|
149
|
+
n_neighbors = [15, n_samples - 1].min
|
150
|
+
|
151
|
+
if @verbose && n_components != @n_components
|
152
|
+
puts " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
|
153
|
+
end
|
154
|
+
|
155
|
+
umap = ClusterKit::Dimensionality::UMAP.new(
|
156
|
+
n_components: n_components,
|
157
|
+
n_neighbors: n_neighbors,
|
158
|
+
random_seed: 42 # For reproducibility
|
159
|
+
)
|
160
|
+
|
161
|
+
# Convert to format UMAP expects
|
162
|
+
reduced = umap.fit_transform(valid_embeddings)
|
163
|
+
|
164
|
+
# If we had to remove invalid embeddings, reconstruct the full array
|
165
|
+
if invalid_indices.any?
|
166
|
+
full_reduced = []
|
167
|
+
valid_idx = 0
|
168
|
+
embeddings.size.times do |i|
|
169
|
+
if invalid_indices.include?(i)
|
170
|
+
# Use zeros for invalid embeddings (they'll be outliers anyway)
|
171
|
+
full_reduced << Array.new(n_components, 0.0)
|
172
|
+
else
|
173
|
+
full_reduced << reduced[valid_idx]
|
174
|
+
valid_idx += 1
|
175
|
+
end
|
176
|
+
end
|
177
|
+
full_reduced
|
178
|
+
else
|
179
|
+
reduced
|
180
|
+
end
|
181
|
+
rescue => e
|
182
|
+
if e.message.include?("index out of bounds")
|
183
|
+
error_msg = "\n❌ Dimensionality reduction failed\n\n"
|
184
|
+
error_msg += "The UMAP algorithm encountered an error with your data.\n\n"
|
185
|
+
error_msg += "This typically happens with:\n"
|
186
|
+
error_msg += " • Embeddings containing invalid values\n"
|
187
|
+
error_msg += " • Too few samples (#{valid_embeddings.size} valid embeddings)\n"
|
188
|
+
error_msg += " • Incompatible parameters\n\n"
|
189
|
+
error_msg += "Solutions:\n"
|
190
|
+
error_msg += " 1. Run without dimensionality reduction:\n"
|
191
|
+
error_msg += " ragnar topics --reduce-dimensions false\n\n"
|
192
|
+
error_msg += " 2. Use fewer dimensions:\n"
|
193
|
+
error_msg += " ragnar topics --n-components 2\n\n"
|
194
|
+
error_msg += " 3. Re-index your documents:\n"
|
195
|
+
error_msg += " ragnar index <path> --force\n"
|
196
|
+
raise error_msg
|
197
|
+
else
|
198
|
+
raise
|
199
|
+
end
|
200
|
+
end
|
139
201
|
rescue LoadError
|
140
202
|
puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
|
141
203
|
embeddings
|
142
204
|
end
|
143
205
|
|
206
|
+
private
|
207
|
+
|
208
|
+
def validate_embeddings_for_umap(embeddings)
|
209
|
+
valid = []
|
210
|
+
invalid_indices = []
|
211
|
+
|
212
|
+
embeddings.each_with_index do |embedding, idx|
|
213
|
+
if embedding.is_a?(Array) &&
|
214
|
+
embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
|
215
|
+
valid << embedding
|
216
|
+
else
|
217
|
+
invalid_indices << idx
|
218
|
+
end
|
219
|
+
end
|
220
|
+
|
221
|
+
[valid, invalid_indices]
|
222
|
+
end
|
223
|
+
|
144
224
|
def build_topics(cluster_ids)
|
145
225
|
@cluster_ids = cluster_ids
|
146
226
|
|
@@ -28,6 +28,67 @@ module Ragnar
|
|
28
28
|
|
29
29
|
puts "Found #{embeddings.size} embeddings"
|
30
30
|
|
31
|
+
# Validate embeddings
|
32
|
+
embedding_dims = embeddings.map(&:size).uniq
|
33
|
+
if embedding_dims.size > 1
|
34
|
+
puts " ⚠️ Warning: Inconsistent embedding dimensions found: #{embedding_dims.inspect}"
|
35
|
+
puts " This may cause errors during UMAP training."
|
36
|
+
# Filter to only embeddings with the most common dimension
|
37
|
+
most_common_dim = embedding_dims.max_by { |dim| embeddings.count { |e| e.size == dim } }
|
38
|
+
embeddings = embeddings.select { |e| e.size == most_common_dim }
|
39
|
+
puts " Using only embeddings with #{most_common_dim} dimensions (#{embeddings.size} embeddings)"
|
40
|
+
end
|
41
|
+
|
42
|
+
# Check for nil or invalid values
|
43
|
+
invalid_count = 0
|
44
|
+
nan_count = 0
|
45
|
+
inf_count = 0
|
46
|
+
|
47
|
+
valid_embeddings = embeddings.select do |embedding|
|
48
|
+
if !embedding.is_a?(Array)
|
49
|
+
invalid_count += 1
|
50
|
+
false
|
51
|
+
elsif embedding.any? { |v| !v.is_a?(Numeric) }
|
52
|
+
invalid_count += 1
|
53
|
+
false
|
54
|
+
elsif embedding.any?(&:nan?)
|
55
|
+
nan_count += 1
|
56
|
+
false
|
57
|
+
elsif embedding.any? { |v| !v.finite? }
|
58
|
+
inf_count += 1
|
59
|
+
false
|
60
|
+
else
|
61
|
+
true
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
if valid_embeddings.size < embeddings.size
|
66
|
+
puts "\n ⚠️ Data quality issues detected:"
|
67
|
+
puts " • Invalid embeddings: #{invalid_count}" if invalid_count > 0
|
68
|
+
puts " • Embeddings with NaN: #{nan_count}" if nan_count > 0
|
69
|
+
puts " • Embeddings with Infinity: #{inf_count}" if inf_count > 0
|
70
|
+
puts " • Total removed: #{embeddings.size - valid_embeddings.size}"
|
71
|
+
puts " • Remaining valid: #{valid_embeddings.size}"
|
72
|
+
|
73
|
+
embeddings = valid_embeddings
|
74
|
+
end
|
75
|
+
|
76
|
+
if embeddings.empty?
|
77
|
+
raise "No valid embeddings found after validation.\n\n" \
|
78
|
+
"All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
|
79
|
+
"This suggests a problem with the embedding model or indexing process.\n\n" \
|
80
|
+
"Please try:\n" \
|
81
|
+
" 1. Re-indexing your documents: ragnar index <path> --force\n" \
|
82
|
+
" 2. Using a different embedding model\n" \
|
83
|
+
" 3. Checking your document content for unusual characters"
|
84
|
+
end
|
85
|
+
|
86
|
+
if embeddings.size < 10
|
87
|
+
raise "Too few valid embeddings (#{embeddings.size}) for UMAP training.\n\n" \
|
88
|
+
"UMAP requires at least 10 samples to work effectively.\n" \
|
89
|
+
"Please index more documents or check for data quality issues."
|
90
|
+
end
|
91
|
+
|
31
92
|
# Adjust parameters based on the number of samples
|
32
93
|
# UMAP requires n_neighbors < n_samples
|
33
94
|
# Also, n_components should be less than n_samples for stability
|
@@ -55,41 +116,68 @@ module Ragnar
|
|
55
116
|
embedding_matrix = embeddings
|
56
117
|
original_dims = embeddings.first.size
|
57
118
|
|
119
|
+
# Ensure n_components is reasonable
|
120
|
+
if n_components >= original_dims
|
121
|
+
puts " ⚠️ Warning: n_components (#{n_components}) >= original dimensions (#{original_dims})"
|
122
|
+
n_components = [original_dims / 2, 50].min
|
123
|
+
puts " Reducing n_components to #{n_components}"
|
124
|
+
end
|
125
|
+
|
126
|
+
# For very high dimensional data, be more conservative
|
127
|
+
if original_dims > 500 && n_components > 50
|
128
|
+
puts " ⚠️ Note: High dimensional data (#{original_dims}D) being reduced to #{n_components}D"
|
129
|
+
puts " Consider using n_components <= 50 for stability"
|
130
|
+
end
|
131
|
+
|
58
132
|
puts "\nTraining UMAP model..."
|
59
133
|
puts " Original dimensions: #{original_dims}"
|
60
134
|
puts " Target dimensions: #{n_components}"
|
61
135
|
puts " Neighbors: #{n_neighbors}"
|
62
136
|
puts " Min distance: #{min_dist}"
|
63
137
|
|
64
|
-
#
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
138
|
+
# Perform the actual training using the class-based API
|
139
|
+
puts " Training UMAP model (this may take a moment)..."
|
140
|
+
|
141
|
+
begin
|
142
|
+
@umap_instance = ClusterKit::Dimensionality::UMAP.new(
|
143
|
+
n_components: n_components,
|
144
|
+
n_neighbors: n_neighbors
|
145
|
+
)
|
146
|
+
|
147
|
+
@reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
|
148
|
+
|
149
|
+
puts " ✓ UMAP training complete"
|
150
|
+
rescue => e
|
151
|
+
# Provide helpful error message without exposing internal stack trace
|
152
|
+
error_msg = "\n❌ UMAP training failed\n\n"
|
153
|
+
|
154
|
+
if e.message.include?("index out of bounds")
|
155
|
+
error_msg += "The UMAP algorithm encountered an index out of bounds error.\n\n"
|
156
|
+
error_msg += "This typically happens when:\n"
|
157
|
+
error_msg += " • The embedding data contains invalid values (NaN, Infinity)\n"
|
158
|
+
error_msg += " • The parameters are incompatible with your data\n"
|
159
|
+
error_msg += " • There are duplicate or corrupted embeddings\n\n"
|
160
|
+
error_msg += "Suggested solutions:\n"
|
161
|
+
error_msg += " 1. Try with more conservative parameters:\n"
|
162
|
+
error_msg += " ragnar train-umap --n-components 10 --n-neighbors 5\n\n"
|
163
|
+
error_msg += " 2. Re-index your documents to regenerate embeddings:\n"
|
164
|
+
error_msg += " ragnar index <path> --force\n\n"
|
165
|
+
error_msg += " 3. Check your embedding model configuration\n\n"
|
166
|
+
error_msg += "Current parameters:\n"
|
167
|
+
error_msg += " • n_components: #{n_components}\n"
|
168
|
+
error_msg += " • n_neighbors: #{n_neighbors}\n"
|
169
|
+
error_msg += " • embeddings: #{embeddings.size} samples\n"
|
170
|
+
error_msg += " • dimensions: #{original_dims}\n"
|
171
|
+
else
|
172
|
+
error_msg += "Error: #{e.message}\n\n"
|
173
|
+
error_msg += "This may be due to incompatible parameters or data issues.\n"
|
174
|
+
error_msg += "Try using more conservative parameters:\n"
|
175
|
+
error_msg += " ragnar train-umap --n-components 10 --n-neighbors 5\n"
|
78
176
|
end
|
177
|
+
|
178
|
+
raise RuntimeError, error_msg
|
79
179
|
end
|
80
180
|
|
81
|
-
# Perform the actual training using the class-based API
|
82
|
-
@umap_instance = ClusterKit::Dimensionality::UMAP.new(
|
83
|
-
n_components: n_components,
|
84
|
-
n_neighbors: n_neighbors
|
85
|
-
)
|
86
|
-
|
87
|
-
@reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
|
88
|
-
|
89
|
-
@training_complete = true
|
90
|
-
progress_thread.join
|
91
|
-
progressbar.finish
|
92
|
-
|
93
181
|
# Store the parameters for saving
|
94
182
|
@model_params = {
|
95
183
|
n_components: n_components,
|
data/lib/ragnar/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ragnar-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.pre.1
|
4
|
+
version: 0.1.0.pre.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Petersen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-08-
|
11
|
+
date: 2025-08-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '1.
|
33
|
+
version: '1.2'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '1.
|
40
|
+
version: '1.2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: lancelot
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|