ragnar-cli 0.1.0.pre.2 → 0.1.0.pre.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ragnar-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.pre.2
4
+ version: 0.1.0.pre.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-08-23 00:00:00.000000000 Z
11
+ date: 2025-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.2'
33
+ version: 1.2.3
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.2'
40
+ version: 1.2.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: lancelot
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -47,7 +47,7 @@ dependencies:
47
47
  version: '0.3'
48
48
  - - ">="
49
49
  - !ruby/object:Gem::Version
50
- version: 0.3.2
50
+ version: 0.3.3
51
51
  type: :runtime
52
52
  prerelease: false
53
53
  version_requirements: !ruby/object:Gem::Requirement
@@ -57,21 +57,27 @@ dependencies:
57
57
  version: '0.3'
58
58
  - - ">="
59
59
  - !ruby/object:Gem::Version
60
- version: 0.3.2
60
+ version: 0.3.3
61
61
  - !ruby/object:Gem::Dependency
62
- name: clusterkit
62
+ name: topical
63
63
  requirement: !ruby/object:Gem::Requirement
64
64
  requirements:
65
65
  - - "~>"
66
66
  - !ruby/object:Gem::Version
67
- version: 0.1.0.pre.2
67
+ version: 0.1.0
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: 0.1.1
68
71
  type: :runtime
69
72
  prerelease: false
70
73
  version_requirements: !ruby/object:Gem::Requirement
71
74
  requirements:
72
75
  - - "~>"
73
76
  - !ruby/object:Gem::Version
74
- version: 0.1.0.pre.2
77
+ version: 0.1.0
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: 0.1.1
75
81
  - !ruby/object:Gem::Dependency
76
82
  name: baran
77
83
  requirement: !ruby/object:Gem::Requirement
@@ -92,14 +98,20 @@ dependencies:
92
98
  requirements:
93
99
  - - "~>"
94
100
  - !ruby/object:Gem::Version
95
- version: 0.1.0.pre.1
101
+ version: '0.1'
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ version: 0.1.2
96
105
  type: :runtime
97
106
  prerelease: false
98
107
  version_requirements: !ruby/object:Gem::Requirement
99
108
  requirements:
100
109
  - - "~>"
101
110
  - !ruby/object:Gem::Version
102
- version: 0.1.0.pre.1
111
+ version: '0.1'
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: 0.1.2
103
115
  - !ruby/object:Gem::Dependency
104
116
  name: tty-progressbar
105
117
  requirement: !ruby/object:Gem::Requirement
@@ -114,6 +126,20 @@ dependencies:
114
126
  - - "~>"
115
127
  - !ruby/object:Gem::Version
116
128
  version: '0.18'
129
+ - !ruby/object:Gem::Dependency
130
+ name: thor-interactive
131
+ requirement: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - "~>"
134
+ - !ruby/object:Gem::Version
135
+ version: 0.1.0.pre.3
136
+ type: :runtime
137
+ prerelease: false
138
+ version_requirements: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - "~>"
141
+ - !ruby/object:Gem::Version
142
+ version: 0.1.0.pre.3
117
143
  - !ruby/object:Gem::Dependency
118
144
  name: rake
119
145
  requirement: !ruby/object:Gem::Requirement
@@ -186,6 +212,8 @@ files:
186
212
  - lib/ragnar.rb
187
213
  - lib/ragnar/chunker.rb
188
214
  - lib/ragnar/cli.rb
215
+ - lib/ragnar/cli_visualization.rb
216
+ - lib/ragnar/config.rb
189
217
  - lib/ragnar/context_repacker.rb
190
218
  - lib/ragnar/database.rb
191
219
  - lib/ragnar/embedder.rb
@@ -194,24 +222,17 @@ files:
194
222
  - lib/ragnar/query_processor.rb
195
223
  - lib/ragnar/query_rewriter.rb
196
224
  - lib/ragnar/topic_modeling.rb
197
- - lib/ragnar/topic_modeling/engine.rb
198
- - lib/ragnar/topic_modeling/labeling_strategies.rb
199
- - lib/ragnar/topic_modeling/llm_adapter.rb
200
- - lib/ragnar/topic_modeling/metrics.rb
201
- - lib/ragnar/topic_modeling/term_extractor.rb
202
- - lib/ragnar/topic_modeling/topic.rb
203
- - lib/ragnar/topic_modeling/topic_labeler.rb
204
225
  - lib/ragnar/umap_processor.rb
205
226
  - lib/ragnar/umap_transform_service.rb
206
227
  - lib/ragnar/version.rb
207
228
  - lib/ragnar_cli.rb
208
- homepage: https://github.com/cpetersen/ragnar
229
+ homepage: https://github.com/scientist-labs/ragnar
209
230
  licenses:
210
231
  - MIT
211
232
  metadata:
212
- homepage_uri: https://github.com/cpetersen/ragnar
213
- source_code_uri: https://github.com/cpetersen/ragnar
214
- changelog_uri: https://github.com/cpetersen/ragnar/blob/main/CHANGELOG.md
233
+ homepage_uri: https://github.com/scientist-labs/ragnar
234
+ source_code_uri: https://github.com/scientist-labs/ragnar
235
+ changelog_uri: https://github.com/scientist-labs/ragnar/blob/main/CHANGELOG.md
215
236
  post_install_message:
216
237
  rdoc_options: []
217
238
  require_paths:
@@ -1,221 +0,0 @@
1
- require 'json'
2
-
3
- module Ragnar
4
- module TopicModeling
5
- class Engine
6
- attr_reader :topics, :clusterer, :term_extractor
7
-
8
- def initialize(
9
- min_cluster_size: 5,
10
- min_samples: 3,
11
- clustering_backend: nil,
12
- reduce_dimensions: true,
13
- n_components: 50,
14
- labeling_method: :hybrid,
15
- llm_client: nil,
16
- verbose: false
17
- )
18
- @min_cluster_size = min_cluster_size
19
- @min_samples = min_samples
20
- @reduce_dimensions = reduce_dimensions
21
- @n_components = n_components
22
- @labeling_method = labeling_method
23
- @verbose = verbose
24
-
25
- @clusterer = clustering_backend || build_default_clusterer
26
- @term_extractor = TermExtractor.new
27
- @labeler = TopicLabeler.new(method: labeling_method, llm_client: llm_client)
28
- @topics = []
29
- end
30
-
31
- def fit(embeddings:, documents:, metadata: nil)
32
- raise ArgumentError, "Embeddings and documents must have same length" unless embeddings.length == documents.length
33
-
34
- @embeddings = embeddings
35
- @documents = documents
36
- @metadata = metadata || Array.new(documents.length) { {} }
37
-
38
- puts "Starting topic extraction..." if @verbose
39
-
40
- # Step 1: Optionally reduce dimensions for better clustering
41
- working_embeddings = @embeddings
42
- if @reduce_dimensions && @embeddings.first.length > @n_components
43
- puts " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
44
- working_embeddings = reduce_dimensions(@embeddings)
45
- end
46
-
47
- # Step 2: Cluster embeddings
48
- puts " Clustering #{working_embeddings.length} documents..." if @verbose
49
- cluster_ids = @clusterer.fit_predict(working_embeddings)
50
-
51
- # Step 3: Build topics from clusters
52
- puts " Building topics..." if @verbose
53
- @topics = build_topics(cluster_ids)
54
-
55
- # Step 4: Extract terms for each topic
56
- puts " Extracting distinctive terms..." if @verbose
57
- extract_topic_terms
58
-
59
- # Step 5: Generate labels
60
- puts " Generating topic labels..." if @verbose
61
- generate_topic_labels
62
-
63
- puts "Found #{@topics.length} topics (plus #{count_outliers(cluster_ids)} outliers)" if @verbose
64
-
65
- @topics
66
- end
67
-
68
- def transform(embeddings:, documents: nil)
69
- # Assign new documents to existing topics
70
- raise "Must call fit before transform" if @topics.empty?
71
-
72
- # Use approximate prediction if available
73
- if @clusterer.respond_to?(:approximate_predict)
74
- @clusterer.approximate_predict(embeddings)
75
- else
76
- # Fallback: assign to nearest topic centroid
77
- assign_to_nearest_topic(embeddings)
78
- end
79
- end
80
-
81
- def get_topic(topic_id)
82
- @topics.find { |t| t.id == topic_id }
83
- end
84
-
85
- def outliers
86
- @outliers ||= @documents.each_with_index.select { |_, idx|
87
- @cluster_ids && @cluster_ids[idx] == -1
88
- }.map(&:first)
89
- end
90
-
91
- def save(path)
92
- data = {
93
- topics: @topics.map(&:to_h),
94
- config: {
95
- min_cluster_size: @min_cluster_size,
96
- min_samples: @min_samples,
97
- reduce_dimensions: @reduce_dimensions,
98
- n_components: @n_components,
99
- labeling_method: @labeling_method
100
- }
101
- }
102
- File.write(path, JSON.pretty_generate(data))
103
- end
104
-
105
- def self.load(path)
106
- data = JSON.parse(File.read(path), symbolize_names: true)
107
- engine = new(**data[:config])
108
- # Reconstruct topics
109
- engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
110
- engine
111
- end
112
-
113
- private
114
-
115
- def build_default_clusterer
116
- begin
117
- require 'clusterkit'
118
- ClusterKit::Clustering::HDBSCAN.new(
119
- min_cluster_size: @min_cluster_size,
120
- min_samples: @min_samples,
121
- metric: 'euclidean'
122
- )
123
- rescue LoadError
124
- raise "ClusterKit required for topic modeling. Add 'gem \"clusterkit\"' to your Gemfile."
125
- end
126
- end
127
-
128
- def reduce_dimensions(embeddings)
129
- require 'clusterkit'
130
-
131
- umap = ClusterKit::Dimensionality::UMAP.new(
132
- n_components: @n_components,
133
- n_neighbors: 15,
134
- random_seed: 42 # For reproducibility
135
- )
136
-
137
- # Convert to format UMAP expects
138
- umap.fit_transform(embeddings)
139
- rescue LoadError
140
- puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
141
- embeddings
142
- end
143
-
144
- def build_topics(cluster_ids)
145
- @cluster_ids = cluster_ids
146
-
147
- # Group documents by cluster
148
- clusters = {}
149
- cluster_ids.each_with_index do |cluster_id, doc_idx|
150
- next if cluster_id == -1 # Skip outliers
151
- clusters[cluster_id] ||= []
152
- clusters[cluster_id] << doc_idx
153
- end
154
-
155
- # Create Topic objects
156
- clusters.map do |cluster_id, doc_indices|
157
- Topic.new(
158
- id: cluster_id,
159
- document_indices: doc_indices,
160
- documents: doc_indices.map { |i| @documents[i] },
161
- embeddings: doc_indices.map { |i| @embeddings[i] },
162
- metadata: doc_indices.map { |i| @metadata[i] }
163
- )
164
- end.sort_by(&:id)
165
- end
166
-
167
- def extract_topic_terms
168
- # Extract distinctive terms for each topic
169
- all_docs_text = @documents.join(" ")
170
-
171
- @topics.each do |topic|
172
- topic_docs_text = topic.documents.join(" ")
173
-
174
- # Use c-TF-IDF to find distinctive terms
175
- terms = @term_extractor.extract_distinctive_terms(
176
- topic_docs: topic.documents,
177
- all_docs: @documents,
178
- top_n: 20
179
- )
180
-
181
- topic.set_terms(terms)
182
- end
183
- end
184
-
185
- def generate_topic_labels
186
- @topics.each do |topic|
187
- result = @labeler.generate_label(
188
- topic: topic,
189
- terms: topic.terms,
190
- documents: topic.documents.first(3) # Use top 3 representative docs
191
- )
192
-
193
- # Set both label and description if available
194
- topic.set_label(result[:label])
195
- topic.instance_variable_set(:@description, result[:description]) if result[:description]
196
- topic.instance_variable_set(:@label_confidence, result[:confidence])
197
- topic.instance_variable_set(:@themes, result[:themes]) if result[:themes]
198
- end
199
- end
200
-
201
- def count_outliers(cluster_ids)
202
- cluster_ids.count { |id| id == -1 }
203
- end
204
-
205
- def assign_to_nearest_topic(embeddings)
206
- # Simple nearest centroid assignment
207
- topic_centroids = @topics.map(&:centroid)
208
-
209
- embeddings.map do |embedding|
210
- distances = topic_centroids.map do |centroid|
211
- # Euclidean distance
212
- Math.sqrt(embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum)
213
- end
214
-
215
- min_idx = distances.index(distances.min)
216
- @topics[min_idx].id
217
- end
218
- end
219
- end
220
- end
221
- end
@@ -1,300 +0,0 @@
1
- # Separate strategy classes for different labeling approaches
2
- module Ragnar
3
- module TopicModeling
4
- module LabelingStrategies
5
-
6
- # Base strategy class
7
- class Base
8
- def generate_label(topic:, terms:, documents:)
9
- raise NotImplementedError, "Subclasses must implement generate_label"
10
- end
11
-
12
- protected
13
-
14
- def select_representative_docs(documents, k: 3)
15
- return documents if documents.length <= k
16
-
17
- # For now, just take first k
18
- # Could be improved to select most central docs
19
- documents.first(k)
20
- end
21
-
22
- def capitalize_phrase(phrase)
23
- phrase.split(/[\s_-]/).map(&:capitalize).join(' ')
24
- end
25
- end
26
-
27
- # Fast term-based labeling using c-TF-IDF terms
28
- class TermBased < Base
29
- def generate_label(topic:, terms:, documents:)
30
- return { label: "Empty Topic", description: "No terms found" } if terms.empty?
31
-
32
- # Take top distinctive terms
33
- label_terms = terms.first(3).select { |t| t.length > 3 }
34
-
35
- label = if label_terms.length >= 2
36
- "#{capitalize_phrase(label_terms[0])} & #{capitalize_phrase(label_terms[1])}"
37
- else
38
- capitalize_phrase(label_terms.first || terms.first)
39
- end
40
-
41
- {
42
- label: label,
43
- description: "Documents about #{terms.first(5).join(', ')}",
44
- method: :term_based,
45
- confidence: calculate_confidence(terms)
46
- }
47
- end
48
-
49
- private
50
-
51
- def calculate_confidence(terms)
52
- # Simple heuristic: more distinctive terms = higher confidence
53
- return 0.0 if terms.empty?
54
-
55
- # Assume terms come with scores if available
56
- if terms.is_a?(Array) && terms.first.is_a?(Array)
57
- # Terms are [word, score] pairs
58
- avg_score = terms.first(5).map(&:last).sum / 5.0
59
- [avg_score, 1.0].min
60
- else
61
- # Just have terms, use count as proxy
62
- [terms.length / 20.0, 1.0].min
63
- end
64
- end
65
- end
66
-
67
- # Quality LLM-based labeling
68
- class LLMBased < Base
69
- def initialize(llm_client: nil)
70
- @llm_client = llm_client
71
- end
72
-
73
- def generate_label(topic:, terms:, documents:)
74
- unless llm_available?
75
- # Fallback to term-based if LLM not available
76
- return TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
77
- end
78
-
79
- # Select best documents to send to LLM
80
- sample_docs = select_representative_docs(documents, k: 3)
81
-
82
- # Generate comprehensive analysis
83
- response = analyze_with_llm(sample_docs, terms)
84
-
85
- {
86
- label: response[:label],
87
- description: response[:description],
88
- themes: response[:themes],
89
- method: :llm_based,
90
- confidence: response[:confidence] || 0.8
91
- }
92
- rescue => e
93
- # Fallback on error
94
- puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
95
- TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
96
- end
97
-
98
- private
99
-
100
- def llm_available?
101
- return true if @llm_client
102
-
103
- # Try to create LLM adapter
104
- begin
105
- require_relative 'llm_adapter'
106
- @llm_client = LLMAdapter.create(type: :auto)
107
- @llm_client && @llm_client.available?
108
- rescue LoadError, StandardError => e
109
- puts "LLM not available: #{e.message}" if ENV['DEBUG']
110
- false
111
- end
112
- end
113
-
114
- def analyze_with_llm(documents, terms)
115
- prompt = build_analysis_prompt(documents, terms)
116
-
117
- response = @llm_client.generate(
118
- prompt: prompt,
119
- max_tokens: 150,
120
- temperature: 0.3,
121
- response_format: { type: "json_object" }
122
- )
123
-
124
- # Parse JSON response
125
- result = JSON.parse(response, symbolize_names: true)
126
-
127
- # Validate and clean
128
- {
129
- label: clean_label(result[:label]),
130
- description: result[:description] || "Topic about #{result[:label]}",
131
- themes: result[:themes] || [],
132
- confidence: result[:confidence] || 0.8
133
- }
134
- end
135
-
136
- def build_analysis_prompt(documents, terms)
137
- doc_samples = documents.map.with_index do |doc, i|
138
- preview = doc.length > 300 ? "#{doc[0..300]}..." : doc
139
- "Document #{i + 1}:\n#{preview}"
140
- end.join("\n\n")
141
-
142
- <<~PROMPT
143
- Analyze this cluster of related documents and provide a structured summary.
144
-
145
- Distinctive terms found: #{terms.first(10).join(', ')}
146
-
147
- Sample documents:
148
- #{doc_samples}
149
-
150
- Provide a JSON response with:
151
- {
152
- "label": "A 2-4 word topic label",
153
- "description": "One sentence describing what connects these documents",
154
- "themes": ["theme1", "theme2", "theme3"],
155
- "confidence": 0.0-1.0 score of how coherent this topic is
156
- }
157
-
158
- Focus on what meaningfully connects these documents, not just common words.
159
- PROMPT
160
- end
161
-
162
- def clean_label(label)
163
- return "Unknown Topic" unless label
164
-
165
- # Remove quotes, trim, limit length
166
- cleaned = label.to_s.strip.gsub(/^["']|["']$/, '')
167
- cleaned = cleaned.split("\n").first if cleaned.include?("\n")
168
-
169
- # Limit to reasonable length
170
- if cleaned.length > 50
171
- cleaned[0..47] + "..."
172
- else
173
- cleaned
174
- end
175
- end
176
- end
177
-
178
- # Hybrid approach - uses terms to guide LLM for efficiency
179
- class Hybrid < Base
180
- def initialize(llm_client: nil)
181
- @llm_client = llm_client
182
- @term_strategy = TermBased.new
183
- end
184
-
185
- def generate_label(topic:, terms:, documents:)
186
- # Start with term-based analysis
187
- term_result = @term_strategy.generate_label(
188
- topic: topic,
189
- terms: terms,
190
- documents: documents
191
- )
192
-
193
- # If no LLM available, return term-based result
194
- unless llm_available?
195
- return term_result.merge(method: :hybrid_fallback)
196
- end
197
-
198
- # Enhance with focused LLM call
199
- enhanced = enhance_with_llm(term_result, terms, documents)
200
-
201
- {
202
- label: enhanced[:label] || term_result[:label],
203
- description: enhanced[:description] || term_result[:description],
204
- method: :hybrid,
205
- confidence: (term_result[:confidence] + (enhanced[:confidence] || 0.5)) / 2,
206
- term_label: term_result[:label], # Keep original for comparison
207
- themes: enhanced[:themes]
208
- }
209
- rescue => e
210
- # Fallback to term-based
211
- puts "Hybrid enhancement failed: #{e.message}" if ENV['DEBUG']
212
- term_result.merge(method: :hybrid_fallback)
213
- end
214
-
215
- private
216
-
217
- def llm_available?
218
- return true if @llm_client
219
-
220
- begin
221
- require_relative 'llm_adapter'
222
- @llm_client = LLMAdapter.create(type: :auto)
223
- @llm_client && @llm_client.available?
224
- rescue LoadError, StandardError => e
225
- puts "LLM not available for hybrid: #{e.message}" if ENV['DEBUG']
226
- false
227
- end
228
- end
229
-
230
- def enhance_with_llm(term_result, terms, documents)
231
- # Lighter-weight prompt using term analysis as starting point
232
- prompt = build_enhancement_prompt(term_result[:label], terms, documents.first)
233
-
234
- response = @llm_client.generate(
235
- prompt: prompt,
236
- max_tokens: 100,
237
- temperature: 0.3
238
- )
239
-
240
- # Parse response (simpler format for speed)
241
- parse_enhancement_response(response)
242
- end
243
-
244
- def build_enhancement_prompt(term_label, terms, sample_doc)
245
- doc_preview = sample_doc.length > 200 ? "#{sample_doc[0..200]}..." : sample_doc
246
-
247
- <<~PROMPT
248
- Current topic label based on terms: "#{term_label}"
249
- Key terms: #{terms.first(8).join(', ')}
250
-
251
- Sample document:
252
- #{doc_preview}
253
-
254
- Provide a better topic label if possible (2-4 words), or confirm the current one.
255
- Also provide a one-sentence description.
256
-
257
- Format:
258
- Label: [your label]
259
- Description: [one sentence]
260
- Themes: [comma-separated list]
261
- PROMPT
262
- end
263
-
264
- def parse_enhancement_response(response)
265
- result = {}
266
-
267
- # Simple line-based parsing
268
- response.lines.each do |line|
269
- if line.start_with?("Label:")
270
- result[:label] = line.sub("Label:", "").strip
271
- elsif line.start_with?("Description:")
272
- result[:description] = line.sub("Description:", "").strip
273
- elsif line.start_with?("Themes:")
274
- themes_str = line.sub("Themes:", "").strip
275
- result[:themes] = themes_str.split(",").map(&:strip)
276
- end
277
- end
278
-
279
- result[:confidence] = result[:label] ? 0.7 : 0.3
280
- result
281
- end
282
- end
283
-
284
- # Factory method to get appropriate strategy
285
- def self.create(method, llm_client: nil)
286
- case method.to_sym
287
- when :fast, :term_based, :terms
288
- TermBased.new
289
- when :quality, :llm_based, :llm
290
- LLMBased.new(llm_client: llm_client)
291
- when :hybrid, :auto, :smart
292
- Hybrid.new(llm_client: llm_client)
293
- else
294
- # Default to hybrid
295
- Hybrid.new(llm_client: llm_client)
296
- end
297
- end
298
- end
299
- end
300
- end