ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ragnar-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.pre.3
4
+ version: 0.1.0.pre.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2025-08-23 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: thor
@@ -28,16 +27,44 @@ dependencies:
28
27
  name: red-candle
29
28
  requirement: !ruby/object:Gem::Requirement
30
29
  requirements:
31
- - - "~>"
30
+ - - ">="
32
31
  - !ruby/object:Gem::Version
33
- version: '1.2'
32
+ version: 1.2.3
34
33
  type: :runtime
35
34
  prerelease: false
36
35
  version_requirements: !ruby/object:Gem::Requirement
37
36
  requirements:
38
- - - "~>"
37
+ - - ">="
39
38
  - !ruby/object:Gem::Version
40
- version: '1.2'
39
+ version: 1.2.3
40
+ - !ruby/object:Gem::Dependency
41
+ name: ruby_llm
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '1.14'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '1.14'
54
+ - !ruby/object:Gem::Dependency
55
+ name: ruby_llm-red_candle
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0.1'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0.1'
41
68
  - !ruby/object:Gem::Dependency
42
69
  name: lancelot
43
70
  requirement: !ruby/object:Gem::Requirement
@@ -47,7 +74,7 @@ dependencies:
47
74
  version: '0.3'
48
75
  - - ">="
49
76
  - !ruby/object:Gem::Version
50
- version: 0.3.2
77
+ version: 0.3.3
51
78
  type: :runtime
52
79
  prerelease: false
53
80
  version_requirements: !ruby/object:Gem::Requirement
@@ -57,21 +84,21 @@ dependencies:
57
84
  version: '0.3'
58
85
  - - ">="
59
86
  - !ruby/object:Gem::Version
60
- version: 0.3.2
87
+ version: 0.3.3
61
88
  - !ruby/object:Gem::Dependency
62
- name: clusterkit
89
+ name: topical
63
90
  requirement: !ruby/object:Gem::Requirement
64
91
  requirements:
65
- - - "~>"
92
+ - - ">="
66
93
  - !ruby/object:Gem::Version
67
- version: 0.1.0.pre.2
94
+ version: 0.1.2
68
95
  type: :runtime
69
96
  prerelease: false
70
97
  version_requirements: !ruby/object:Gem::Requirement
71
98
  requirements:
72
- - - "~>"
99
+ - - ">="
73
100
  - !ruby/object:Gem::Version
74
- version: 0.1.0.pre.2
101
+ version: 0.1.2
75
102
  - !ruby/object:Gem::Dependency
76
103
  name: baran
77
104
  requirement: !ruby/object:Gem::Requirement
@@ -92,14 +119,20 @@ dependencies:
92
119
  requirements:
93
120
  - - "~>"
94
121
  - !ruby/object:Gem::Version
95
- version: 0.1.0.pre.1
122
+ version: '0.1'
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: 0.1.2
96
126
  type: :runtime
97
127
  prerelease: false
98
128
  version_requirements: !ruby/object:Gem::Requirement
99
129
  requirements:
100
130
  - - "~>"
101
131
  - !ruby/object:Gem::Version
102
- version: 0.1.0.pre.1
132
+ version: '0.1'
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: 0.1.2
103
136
  - !ruby/object:Gem::Dependency
104
137
  name: tty-progressbar
105
138
  requirement: !ruby/object:Gem::Requirement
@@ -114,6 +147,20 @@ dependencies:
114
147
  - - "~>"
115
148
  - !ruby/object:Gem::Version
116
149
  version: '0.18'
150
+ - !ruby/object:Gem::Dependency
151
+ name: thor-interactive
152
+ requirement: !ruby/object:Gem::Requirement
153
+ requirements:
154
+ - - "~>"
155
+ - !ruby/object:Gem::Version
156
+ version: 0.1.0.pre.3
157
+ type: :runtime
158
+ prerelease: false
159
+ version_requirements: !ruby/object:Gem::Requirement
160
+ requirements:
161
+ - - "~>"
162
+ - !ruby/object:Gem::Version
163
+ version: 0.1.0.pre.3
117
164
  - !ruby/object:Gem::Dependency
118
165
  name: rake
119
166
  requirement: !ruby/object:Gem::Requirement
@@ -186,6 +233,9 @@ files:
186
233
  - lib/ragnar.rb
187
234
  - lib/ragnar/chunker.rb
188
235
  - lib/ragnar/cli.rb
236
+ - lib/ragnar/cli_umap.rb
237
+ - lib/ragnar/cli_visualization.rb
238
+ - lib/ragnar/config.rb
189
239
  - lib/ragnar/context_repacker.rb
190
240
  - lib/ragnar/database.rb
191
241
  - lib/ragnar/embedder.rb
@@ -194,25 +244,17 @@ files:
194
244
  - lib/ragnar/query_processor.rb
195
245
  - lib/ragnar/query_rewriter.rb
196
246
  - lib/ragnar/topic_modeling.rb
197
- - lib/ragnar/topic_modeling/engine.rb
198
- - lib/ragnar/topic_modeling/labeling_strategies.rb
199
- - lib/ragnar/topic_modeling/llm_adapter.rb
200
- - lib/ragnar/topic_modeling/metrics.rb
201
- - lib/ragnar/topic_modeling/term_extractor.rb
202
- - lib/ragnar/topic_modeling/topic.rb
203
- - lib/ragnar/topic_modeling/topic_labeler.rb
204
247
  - lib/ragnar/umap_processor.rb
205
248
  - lib/ragnar/umap_transform_service.rb
206
249
  - lib/ragnar/version.rb
207
250
  - lib/ragnar_cli.rb
208
- homepage: https://github.com/cpetersen/ragnar
251
+ homepage: https://github.com/scientist-labs/ragnar
209
252
  licenses:
210
253
  - MIT
211
254
  metadata:
212
- homepage_uri: https://github.com/cpetersen/ragnar
213
- source_code_uri: https://github.com/cpetersen/ragnar
214
- changelog_uri: https://github.com/cpetersen/ragnar/blob/main/CHANGELOG.md
215
- post_install_message:
255
+ homepage_uri: https://github.com/scientist-labs/ragnar
256
+ source_code_uri: https://github.com/scientist-labs/ragnar
257
+ changelog_uri: https://github.com/scientist-labs/ragnar/blob/main/CHANGELOG.md
216
258
  rdoc_options: []
217
259
  require_paths:
218
260
  - lib
@@ -227,8 +269,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
227
269
  - !ruby/object:Gem::Version
228
270
  version: '0'
229
271
  requirements: []
230
- rubygems_version: 3.5.3
231
- signing_key:
272
+ rubygems_version: 3.6.9
232
273
  specification_version: 4
233
274
  summary: A Ruby + Rust powered RAG (Retrieval-Augmented Generation) system
234
275
  test_files: []
@@ -1,301 +0,0 @@
1
- require 'json'
2
-
3
- module Ragnar
4
- module TopicModeling
5
- class Engine
6
- attr_reader :topics, :clusterer, :term_extractor
7
-
8
- def initialize(
9
- min_cluster_size: 5,
10
- min_samples: 3,
11
- clustering_backend: nil,
12
- reduce_dimensions: true,
13
- n_components: 50,
14
- labeling_method: :hybrid,
15
- llm_client: nil,
16
- verbose: false
17
- )
18
- @min_cluster_size = min_cluster_size
19
- @min_samples = min_samples
20
- @reduce_dimensions = reduce_dimensions
21
- @n_components = n_components
22
- @labeling_method = labeling_method
23
- @verbose = verbose
24
-
25
- @clusterer = clustering_backend || build_default_clusterer
26
- @term_extractor = TermExtractor.new
27
- @labeler = TopicLabeler.new(method: labeling_method, llm_client: llm_client)
28
- @topics = []
29
- end
30
-
31
- def fit(embeddings:, documents:, metadata: nil)
32
- raise ArgumentError, "Embeddings and documents must have same length" unless embeddings.length == documents.length
33
-
34
- @embeddings = embeddings
35
- @documents = documents
36
- @metadata = metadata || Array.new(documents.length) { {} }
37
-
38
- puts "Starting topic extraction..." if @verbose
39
-
40
- # Step 1: Optionally reduce dimensions for better clustering
41
- working_embeddings = @embeddings
42
- if @reduce_dimensions && @embeddings.first.length > @n_components
43
- puts " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
44
- working_embeddings = reduce_dimensions(@embeddings)
45
- end
46
-
47
- # Step 2: Cluster embeddings
48
- puts " Clustering #{working_embeddings.length} documents..." if @verbose
49
- cluster_ids = @clusterer.fit_predict(working_embeddings)
50
-
51
- # Step 3: Build topics from clusters
52
- puts " Building topics..." if @verbose
53
- @topics = build_topics(cluster_ids)
54
-
55
- # Step 4: Extract terms for each topic
56
- puts " Extracting distinctive terms..." if @verbose
57
- extract_topic_terms
58
-
59
- # Step 5: Generate labels
60
- puts " Generating topic labels..." if @verbose
61
- generate_topic_labels
62
-
63
- puts "Found #{@topics.length} topics (plus #{count_outliers(cluster_ids)} outliers)" if @verbose
64
-
65
- @topics
66
- end
67
-
68
- def transform(embeddings:, documents: nil)
69
- # Assign new documents to existing topics
70
- raise "Must call fit before transform" if @topics.empty?
71
-
72
- # Use approximate prediction if available
73
- if @clusterer.respond_to?(:approximate_predict)
74
- @clusterer.approximate_predict(embeddings)
75
- else
76
- # Fallback: assign to nearest topic centroid
77
- assign_to_nearest_topic(embeddings)
78
- end
79
- end
80
-
81
- def get_topic(topic_id)
82
- @topics.find { |t| t.id == topic_id }
83
- end
84
-
85
- def outliers
86
- @outliers ||= @documents.each_with_index.select { |_, idx|
87
- @cluster_ids && @cluster_ids[idx] == -1
88
- }.map(&:first)
89
- end
90
-
91
- def save(path)
92
- data = {
93
- topics: @topics.map(&:to_h),
94
- config: {
95
- min_cluster_size: @min_cluster_size,
96
- min_samples: @min_samples,
97
- reduce_dimensions: @reduce_dimensions,
98
- n_components: @n_components,
99
- labeling_method: @labeling_method
100
- }
101
- }
102
- File.write(path, JSON.pretty_generate(data))
103
- end
104
-
105
- def self.load(path)
106
- data = JSON.parse(File.read(path), symbolize_names: true)
107
- engine = new(**data[:config])
108
- # Reconstruct topics
109
- engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
110
- engine
111
- end
112
-
113
- private
114
-
115
- def build_default_clusterer
116
- begin
117
- require 'clusterkit'
118
- ClusterKit::Clustering::HDBSCAN.new(
119
- min_cluster_size: @min_cluster_size,
120
- min_samples: @min_samples,
121
- metric: 'euclidean'
122
- )
123
- rescue LoadError
124
- raise "ClusterKit required for topic modeling. Add 'gem \"clusterkit\"' to your Gemfile."
125
- end
126
- end
127
-
128
- def reduce_dimensions(embeddings)
129
- require 'clusterkit'
130
-
131
- # Validate embeddings before UMAP
132
- valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
133
-
134
- if valid_embeddings.empty?
135
- raise "No valid embeddings for dimensionality reduction.\n\n" \
136
- "All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
137
- "Try running without dimensionality reduction:\n" \
138
- " ragnar topics --reduce-dimensions false"
139
- end
140
-
141
- if invalid_indices.any? && @verbose
142
- puts " ⚠️ Warning: #{invalid_indices.size} embeddings with invalid values removed"
143
- end
144
-
145
- begin
146
- # Adjust parameters based on data size
147
- n_samples = valid_embeddings.size
148
- n_components = [@n_components, n_samples - 1, 50].min
149
- n_neighbors = [15, n_samples - 1].min
150
-
151
- if @verbose && n_components != @n_components
152
- puts " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
153
- end
154
-
155
- umap = ClusterKit::Dimensionality::UMAP.new(
156
- n_components: n_components,
157
- n_neighbors: n_neighbors,
158
- random_seed: 42 # For reproducibility
159
- )
160
-
161
- # Convert to format UMAP expects
162
- reduced = umap.fit_transform(valid_embeddings)
163
-
164
- # If we had to remove invalid embeddings, reconstruct the full array
165
- if invalid_indices.any?
166
- full_reduced = []
167
- valid_idx = 0
168
- embeddings.size.times do |i|
169
- if invalid_indices.include?(i)
170
- # Use zeros for invalid embeddings (they'll be outliers anyway)
171
- full_reduced << Array.new(n_components, 0.0)
172
- else
173
- full_reduced << reduced[valid_idx]
174
- valid_idx += 1
175
- end
176
- end
177
- full_reduced
178
- else
179
- reduced
180
- end
181
- rescue => e
182
- if e.message.include?("index out of bounds")
183
- error_msg = "\n❌ Dimensionality reduction failed\n\n"
184
- error_msg += "The UMAP algorithm encountered an error with your data.\n\n"
185
- error_msg += "This typically happens with:\n"
186
- error_msg += " • Embeddings containing invalid values\n"
187
- error_msg += " • Too few samples (#{valid_embeddings.size} valid embeddings)\n"
188
- error_msg += " • Incompatible parameters\n\n"
189
- error_msg += "Solutions:\n"
190
- error_msg += " 1. Run without dimensionality reduction:\n"
191
- error_msg += " ragnar topics --reduce-dimensions false\n\n"
192
- error_msg += " 2. Use fewer dimensions:\n"
193
- error_msg += " ragnar topics --n-components 2\n\n"
194
- error_msg += " 3. Re-index your documents:\n"
195
- error_msg += " ragnar index <path> --force\n"
196
- raise error_msg
197
- else
198
- raise
199
- end
200
- end
201
- rescue LoadError
202
- puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
203
- embeddings
204
- end
205
-
206
- private
207
-
208
- def validate_embeddings_for_umap(embeddings)
209
- valid = []
210
- invalid_indices = []
211
-
212
- embeddings.each_with_index do |embedding, idx|
213
- if embedding.is_a?(Array) &&
214
- embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
215
- valid << embedding
216
- else
217
- invalid_indices << idx
218
- end
219
- end
220
-
221
- [valid, invalid_indices]
222
- end
223
-
224
- def build_topics(cluster_ids)
225
- @cluster_ids = cluster_ids
226
-
227
- # Group documents by cluster
228
- clusters = {}
229
- cluster_ids.each_with_index do |cluster_id, doc_idx|
230
- next if cluster_id == -1 # Skip outliers
231
- clusters[cluster_id] ||= []
232
- clusters[cluster_id] << doc_idx
233
- end
234
-
235
- # Create Topic objects
236
- clusters.map do |cluster_id, doc_indices|
237
- Topic.new(
238
- id: cluster_id,
239
- document_indices: doc_indices,
240
- documents: doc_indices.map { |i| @documents[i] },
241
- embeddings: doc_indices.map { |i| @embeddings[i] },
242
- metadata: doc_indices.map { |i| @metadata[i] }
243
- )
244
- end.sort_by(&:id)
245
- end
246
-
247
- def extract_topic_terms
248
- # Extract distinctive terms for each topic
249
- all_docs_text = @documents.join(" ")
250
-
251
- @topics.each do |topic|
252
- topic_docs_text = topic.documents.join(" ")
253
-
254
- # Use c-TF-IDF to find distinctive terms
255
- terms = @term_extractor.extract_distinctive_terms(
256
- topic_docs: topic.documents,
257
- all_docs: @documents,
258
- top_n: 20
259
- )
260
-
261
- topic.set_terms(terms)
262
- end
263
- end
264
-
265
- def generate_topic_labels
266
- @topics.each do |topic|
267
- result = @labeler.generate_label(
268
- topic: topic,
269
- terms: topic.terms,
270
- documents: topic.documents.first(3) # Use top 3 representative docs
271
- )
272
-
273
- # Set both label and description if available
274
- topic.set_label(result[:label])
275
- topic.instance_variable_set(:@description, result[:description]) if result[:description]
276
- topic.instance_variable_set(:@label_confidence, result[:confidence])
277
- topic.instance_variable_set(:@themes, result[:themes]) if result[:themes]
278
- end
279
- end
280
-
281
- def count_outliers(cluster_ids)
282
- cluster_ids.count { |id| id == -1 }
283
- end
284
-
285
- def assign_to_nearest_topic(embeddings)
286
- # Simple nearest centroid assignment
287
- topic_centroids = @topics.map(&:centroid)
288
-
289
- embeddings.map do |embedding|
290
- distances = topic_centroids.map do |centroid|
291
- # Euclidean distance
292
- Math.sqrt(embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum)
293
- end
294
-
295
- min_idx = distances.index(distances.min)
296
- @topics[min_idx].id
297
- end
298
- end
299
- end
300
- end
301
- end