ragnar-cli 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +439 -0
- data/exe/ragnar +6 -0
- data/lib/ragnar/chunker.rb +97 -0
- data/lib/ragnar/cli.rb +542 -0
- data/lib/ragnar/context_repacker.rb +121 -0
- data/lib/ragnar/database.rb +267 -0
- data/lib/ragnar/embedder.rb +137 -0
- data/lib/ragnar/indexer.rb +234 -0
- data/lib/ragnar/llm_manager.rb +43 -0
- data/lib/ragnar/query_processor.rb +398 -0
- data/lib/ragnar/query_rewriter.rb +75 -0
- data/lib/ragnar/topic_modeling/engine.rb +221 -0
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +300 -0
- data/lib/ragnar/topic_modeling/llm_adapter.rb +131 -0
- data/lib/ragnar/topic_modeling/metrics.rb +186 -0
- data/lib/ragnar/topic_modeling/term_extractor.rb +170 -0
- data/lib/ragnar/topic_modeling/topic.rb +117 -0
- data/lib/ragnar/topic_modeling/topic_labeler.rb +61 -0
- data/lib/ragnar/topic_modeling.rb +24 -0
- data/lib/ragnar/umap_processor.rb +228 -0
- data/lib/ragnar/umap_transform_service.rb +124 -0
- data/lib/ragnar/version.rb +5 -0
- data/lib/ragnar.rb +36 -0
- data/lib/ragnar_cli.rb +2 -0
- metadata +234 -0

data/lib/ragnar/query_processor.rb
@@ -0,0 +1,398 @@
+require 'json'
+require 'singleton'
+require 'set'
+require 'digest'
+
+module Ragnar
+  class QueryProcessor
+    attr_reader :database, :embedder, :rewriter, :reranker
+
+    def initialize(db_path: Ragnar::DEFAULT_DB_PATH)
+      @database = Database.new(db_path)
+      @embedder = Embedder.new
+      @llm_manager = LLMManager.instance
+      @umap_service = UmapTransformService.instance
+      @rewriter = QueryRewriter.new(llm_manager: @llm_manager)
+      @reranker = nil # Will initialize when needed
+    end
+
+    def query(user_query, top_k: 3, verbose: false)
+      puts "Processing query: #{user_query}" if verbose
+
+      # Step 1: Rewrite and analyze the query
+      puts "\n#{'-'*60}" if verbose
+      puts "STEP 1: Query Analysis & Rewriting" if verbose
+      puts "-"*60 if verbose
+
+      rewritten = @rewriter.rewrite(user_query)
+
+      if verbose
+        puts "\nOriginal Query: #{user_query}"
+        puts "\nRewritten Query Analysis:"
+        puts " Clarified Intent: #{rewritten['clarified_intent']}"
+        puts " Query Type: #{rewritten['query_type']}"
+        puts " Context Needed: #{rewritten['context_needed']}"
+        puts "\nGenerated Sub-queries (#{rewritten['sub_queries'].length}):"
+        rewritten['sub_queries'].each_with_index do |sq, idx|
+          puts " #{idx + 1}. #{sq}"
+        end
+        if rewritten['key_terms'] && !rewritten['key_terms'].empty?
+          puts "\nKey Terms Identified:"
+          puts " #{rewritten['key_terms'].join(', ')}"
+        end
+      end
+
+      # Step 2: Retrieve candidates using RRF
+      if verbose
+        puts "\n#{'-'*60}"
+        puts "STEP 2: Document Retrieval with RRF"
+        puts "-"*60
+      end
+
+      candidates = retrieve_with_rrf(
+        rewritten['sub_queries'],
+        k: 20,
+        verbose: verbose
+      )
+
+      if candidates.empty?
+        return {
+          query: user_query,
+          clarified: rewritten['clarified_intent'],
+          answer: "No relevant documents found in the database.",
+          sources: []
+        }
+      end
+
+      if verbose
+        puts "\nRetrieval Summary:"
+        puts " Total candidates found: #{candidates.size}"
+        puts " Unique sources: #{candidates.map { |c| c[:file_path] }.uniq.size}"
+      end
+
+      # Step 3: Rerank candidates
+      if verbose
+        puts "\n#{'-'*60}"
+        puts "STEP 3: Document Reranking"
+        puts "-"*60
+      end
+
+      reranked = rerank_documents(
+        query: rewritten['clarified_intent'],
+        documents: candidates,
+        top_k: top_k * 2 # Get more than we need for context
+      )
+
+      if verbose && reranked.any?
+        puts "\nTop Reranked Documents:"
+        reranked[0..2].each_with_index do |doc, idx|
+          full_text = (doc[:chunk_text] || doc[:text] || "").gsub(/\s+/, ' ')
+          puts " #{idx + 1}. [#{File.basename(doc[:file_path] || 'unknown')}]"
+          puts " Score: #{doc[:score]&.round(4) if doc[:score]}"
+          puts " Full chunk (#{full_text.length} chars):"
+          puts " \"#{full_text}\""
+          puts ""
+        end
+      end
+
+      # Step 4: Prepare context with neighboring chunks
+      if verbose
+        puts "\n#{'-'*60}"
+        puts "STEP 4: Context Preparation"
+        puts "-"*60
+      end
+
+      context_docs = prepare_context(reranked[0...top_k], rewritten['context_needed'])
+
+      if verbose
+        puts "\nContext Documents Selected: #{context_docs.length}"
+        puts "Context strategy: #{rewritten['context_needed']}"
+      end
+
+      # Step 5: Repack context for optimal LLM consumption
+      if verbose
+        puts "\n#{'-'*60}"
+        puts "STEP 5: Context Repacking"
+        puts "-"*60
+      end
+
+      repacked_context = ContextRepacker.repack(
+        context_docs,
+        rewritten['clarified_intent']
+      )
+
+      if verbose
+        original_size = context_docs.sum { |d| (d[:chunk_text] || "").length }
+        puts "\nContext Optimization:"
+        puts " Original size: #{original_size} chars"
+        puts " Repacked size: #{repacked_context.length} chars"
+        puts " Compression ratio: #{(100.0 * repacked_context.length / original_size).round(1)}%"
+        puts "\nFull Repacked Context:"
+        puts "-" * 40
+        puts repacked_context
+        puts "-" * 40
+      end
+
+      # Step 6: Generate response
+      if verbose
+        puts "\n#{'-'*60}"
+        puts "STEP 6: Response Generation"
+        puts "-"*60
+      end
+      response = generate_response(
+        query: rewritten['clarified_intent'],
+        repacked_context: repacked_context,
+        query_type: rewritten['query_type']
+      )
+
+      if verbose
+        puts "\nGenerated Response:"
+        puts "-" * 40
+        puts response
+        puts "-" * 40
+      end
+
+      result = {
+        query: user_query,
+        clarified: rewritten['clarified_intent'],
+        answer: response,
+        sources: context_docs.map { |d|
+          {
+            source_file: d[:file_path] || d[:source_file],
+            chunk_index: d[:chunk_index]
+          }
+        },
+        sub_queries: rewritten['sub_queries'],
+        confidence: calculate_confidence(reranked[0...top_k])
+      }
+
+      if verbose
+        puts "\n#{'-'*60}"
+        puts "FINAL RESULTS"
+        puts "-"*60
+        puts "\nConfidence Score: #{result[:confidence]}%"
+        puts "\nSources Used:"
+        result[:sources].each_with_index do |source, idx|
+          puts " #{idx + 1}. #{source[:source_file]} (chunk #{source[:chunk_index]})"
+        end
+      end
+
+      result
+    end
+
+    private
+
+    def retrieve_with_rrf(queries, k: 20, verbose: false)
+      all_results = []
+
+      queries.each_with_index do |query, idx|
+        if verbose
+          puts "\nSub-query #{idx + 1}: \"#{query}\""
+          puts " Generating embedding..."
+        end
+
+        # Generate embedding for the query
+        query_embedding = @embedder.embed_text(query)
+
+        if verbose
+          puts " Embedding dimensions: #{query_embedding.length}"
+          puts " Searching vector database..."
+        end
+
+        # Check if we have reduced embeddings available for more efficient search
+        stats = @database.get_stats
+        use_reduced = stats[:with_reduced_embeddings] > 0
+
+        # Prepare the search embedding (either full or reduced)
+        search_embedding = query_embedding
+
+        if use_reduced
+          # Check if UMAP model is available
+          model_path = "./umap_model.bin"
+
+          if @umap_service.model_available?(model_path)
+            if verbose
+              puts " Transforming query to reduced space (#{stats[:reduced_dims]}D)"
+            end
+
+            begin
+              # Transform the query embedding to reduced space
+              search_embedding = @umap_service.transform_query(query_embedding, model_path)
+
+              if verbose
+                puts " ✓ Query transformed to #{search_embedding.size}D"
+                puts " Searching with reduced embeddings..."
+              end
+            rescue => e
+              puts " ⚠️ Failed to transform query: #{e.message}" if verbose
+              puts " Falling back to full embeddings" if verbose
+              use_reduced = false
+            end
+          else
+            if verbose
+              puts " Note: Reduced embeddings available but UMAP model not found"
+              puts " Falling back to full embeddings"
+            end
+            use_reduced = false
+          end
+        end
+
+        vector_results = @database.search_similar(
+          search_embedding,
+          k: k,
+          use_reduced: use_reduced
+        )
+
+        if verbose
+          puts " Found #{vector_results.length} matches"
+          if vector_results.any?
+            best = vector_results.first
+            puts " Best match: [#{File.basename(best[:file_path] || 'unknown')}] (distance: #{best[:distance]&.round(3)})"
+          end
+        end
+
+        # Add query index for RRF
+        vector_results.each do |result|
+          result[:query_idx] = idx
+          result[:retrieval_method] = :vector
+        end
+
+        all_results.concat(vector_results)
+      end
+
+      if verbose
+        puts "\nApplying Reciprocal Rank Fusion..."
+        puts " Total results before fusion: #{all_results.length}"
+      end
+
+      # Apply Reciprocal Rank Fusion
+      fused = apply_rrf(all_results, k: k)
+
+      if verbose
+        puts " Results after RRF: #{fused.length}"
+      end
+
+      fused
+    end
+
+    def apply_rrf(results, k: 60)
+      # Group by document ID
+      doc_scores = {}
+
+      results.each do |result|
+        doc_id = result[:id]
+        doc_scores[doc_id] ||= {
+          score: 0.0,
+          document: result
+        }
+
+        # RRF formula: 1 / (k + rank)
+        # Using distance as a proxy for rank (lower distance = better rank)
+        rank = result[:distance] * 100 # Scale distance to rank-like values
+        doc_scores[doc_id][:score] += 1.0 / (k + rank)
+      end
+
+      # Sort by RRF score and return documents
+      doc_scores.values
+        .sort_by { |item| -item[:score] }
+        .map { |item| item[:document] }
+    end
+
+    def rerank_documents(query:, documents:, top_k:)
+      # Deduplicate documents based on chunk_text before reranking
+      seen_texts = Set.new
+      unique_docs = []
+
+      documents.each do |doc|
+        text = doc[:chunk_text] || doc[:text] || ""
+        text_hash = Digest::SHA256.hexdigest(text)
+
+        unless seen_texts.include?(text_hash)
+          seen_texts.add(text_hash)
+          unique_docs << doc
+        end
+      end
+
+      if documents.length > unique_docs.length && @verbose
+        puts " Deduplicated: #{documents.length} -> #{unique_docs.length} documents"
+      end
+
+      # Initialize reranker if not already done
+      @reranker ||= Candle::Reranker.from_pretrained(
+        "cross-encoder/ms-marco-MiniLM-L-12-v2"
+      )
+
+      # Prepare document texts - use chunk_text field
+      texts = unique_docs.map { |doc| doc[:chunk_text] || doc[:text] || "" }
+
+      # Rerank - returns array of {doc_id:, score:, text:}
+      reranked = @reranker.rerank(query, texts)
+
+      # Map back to original documents with scores
+      reranked.map do |result|
+        doc_idx = result[:doc_id]
+        unique_docs[doc_idx].merge(score: result[:score])
+      end.sort_by { |doc| -doc[:score] }.first(top_k)
+    rescue => e
+      puts "Warning: Reranking failed (#{e.message}), using original order"
+      unique_docs.first(top_k)
+    end
+
+    def prepare_context(documents, context_needed)
+      # For now, just return the documents
+      # In the future, we could fetch neighboring chunks for more context
+      context_size = case context_needed
+                     when "extensive" then 5
+                     when "moderate" then 3
+                     else 2
+                     end
+
+      documents.first(context_size)
+    end
+
+    def generate_response(query:, repacked_context:, query_type:)
+      # Get cached LLM from manager
+      llm = @llm_manager.default_llm
+
+      # Create prompt with repacked context
+      prompt = build_prompt(query, repacked_context, query_type)
+
+      # Generate response using default config
+      llm.generate(prompt)
+    rescue => e
+      # Fallback to returning the repacked context
+      puts "Warning: LLM generation failed (#{e.message})"
+      "Based on the retrieved information:\n\n#{repacked_context[0..500]}..."
+    end
+
+    def build_prompt(query, context, query_type)
+      base_prompt = <<~PROMPT
+        <|system|>
+        You are a helpful assistant. Answer questions based ONLY on the provided context.
+        If the answer is not in the context, say "I don't have enough information to answer that question."
+        </s>
+        <|user|>
+        Context:
+        #{context}
+
+        Question: #{query}
+        </s>
+        <|assistant|>
+      PROMPT
+
+      base_prompt
+    end
+
+    def calculate_confidence(documents)
+      return 0.0 if documents.empty?
+
+      # Simple confidence based on average similarity
+      avg_distance = documents.map { |d| d[:distance] }.sum / documents.size
+
+      # Convert distance to confidence (0-1 scale)
+      # Assuming distances are typically 0-2
+      confidence = [1.0 - (avg_distance / 2.0), 0.0].max
+      (confidence * 100).round(1)
+    end
+  end
+end
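
A note on `apply_rrf` above: it departs from textbook Reciprocal Rank Fusion, which sums `1 / (k + rank)` over a document's ordinal position in each result list. Here the scaled distance (`distance * 100`) stands in for the rank, so the fusion still rewards chunks that surface for several sub-queries while preferring smaller distances. A minimal standalone sketch of that arithmetic, with made-up ids and distances:

```ruby
# Toy illustration of the fusion arithmetic in apply_rrf.
# :id and :distance mirror the fields search_similar returns;
# the values here are invented for the example.
results = [
  { id: "chunk-a", distance: 0.10 }, # found by sub-query 1
  { id: "chunk-b", distance: 1.00 }, # found by sub-query 1
  { id: "chunk-a", distance: 0.25 }  # same chunk, found by sub-query 2
]

k = 60
scores = Hash.new(0.0)
results.each do |r|
  rank = r[:distance] * 100          # distance scaled into a rank-like value
  scores[r[:id]] += 1.0 / (k + rank) # repeated hits compound the score
end

scores.sort_by { |_, s| -s }.each { |id, s| puts "#{id}: #{s.round(4)}" }
# chunk-a: 0.0261  (1/70 + 1/85 -- two hits compound)
# chunk-b: 0.0063  (1/160)
```

Also worth noting: `retrieve_with_rrf` calls `apply_rrf(all_results, k: k)` with its own `k` of 20, so the method's `k: 60` default never applies on this code path; the smaller constant sharpens the spread between close and distant matches.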

data/lib/ragnar/query_rewriter.rb
@@ -0,0 +1,75 @@
+module Ragnar
+  class QueryRewriter
+    def initialize(llm_manager: nil)
+      @llm_manager = llm_manager || LLMManager.instance
+    end
+
+    def rewrite(query)
+      # Get the cached LLM
+      model = @llm_manager.default_llm
+
+      # Define the JSON schema for structured output
+      schema = {
+        type: "object",
+        properties: {
+          clarified_intent: {
+            type: "string",
+            description: "A clear, specific statement of what the user is looking for"
+          },
+          query_type: {
+            type: "string",
+            enum: ["factual", "conceptual", "procedural", "comparative", "analytical"],
+            description: "The type of query"
+          },
+          sub_queries: {
+            type: "array",
+            items: { type: "string" },
+            minItems: 2,
+            maxItems: 5,
+            description: "Simpler, focused queries that together answer the main query"
+          },
+          key_terms: {
+            type: "array",
+            items: { type: "string" },
+            description: "Important terms and their synonyms for searching"
+          },
+          context_needed: {
+            type: "string",
+            enum: ["minimal", "moderate", "extensive"],
+            description: "How much context is likely needed to answer this query"
+          }
+        },
+        required: ["clarified_intent", "query_type", "sub_queries", "key_terms", "context_needed"]
+      }
+
+      prompt = <<~PROMPT
+        Analyze the following user query and break it down for retrieval-augmented generation.
+        Focus on understanding the user's intent and creating effective sub-queries for searching.
+
+        User Query: #{query}
+
+        Provide a structured analysis that will help retrieve the most relevant documents.
+      PROMPT
+
+      begin
+        # Use structured generation with schema
+        result = model.generate_structured(
+          prompt,
+          schema: schema
+        )
+
+        # The result should already be a JSON string
+        JSON.parse(result)
+      rescue => e
+        # Fallback to simple rewriting if structured generation fails
+        {
+          "clarified_intent" => query,
+          "query_type" => "general",
+          "sub_queries" => [query],
+          "key_terms" => query.split(/\s+/).select { |w| w.length > 3 },
+          "context_needed" => "moderate"
+        }
+      end
+    end
+  end
+end
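
One detail worth flagging in the rescue branch above: the fallback sets `"query_type"` to `"general"`, which is not one of the five values the schema's enum allows, so downstream consumers should not assume the enum always holds. For reference, a sketch of what the fallback path returns for a hypothetical query (the values follow mechanically from the rescue block):

```ruby
rewriter = Ragnar::QueryRewriter.new
rewriter.rewrite("How does UMAP reduce embedding dimensions?")
# If generate_structured raises, the rescue branch yields:
# {
#   "clarified_intent" => "How does UMAP reduce embedding dimensions?",
#   "query_type"       => "general",   # outside the schema's enum
#   "sub_queries"      => ["How does UMAP reduce embedding dimensions?"],
#   "key_terms"        => ["does", "UMAP", "reduce", "embedding", "dimensions?"],
#   "context_needed"   => "moderate"
# }
```

Note the trailing `"?"` kept in `key_terms`: `split(/\s+/)` splits on whitespace only and leaves punctuation attached to words.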

data/lib/ragnar/topic_modeling/engine.rb
@@ -0,0 +1,221 @@
+require 'json'
+
+module Ragnar
+  module TopicModeling
+    class Engine
+      attr_reader :topics, :clusterer, :term_extractor
+
+      def initialize(
+        min_cluster_size: 5,
+        min_samples: 3,
+        clustering_backend: nil,
+        reduce_dimensions: true,
+        n_components: 50,
+        labeling_method: :hybrid,
+        llm_client: nil,
+        verbose: false
+      )
+        @min_cluster_size = min_cluster_size
+        @min_samples = min_samples
+        @reduce_dimensions = reduce_dimensions
+        @n_components = n_components
+        @labeling_method = labeling_method
+        @verbose = verbose
+
+        @clusterer = clustering_backend || build_default_clusterer
+        @term_extractor = TermExtractor.new
+        @labeler = TopicLabeler.new(method: labeling_method, llm_client: llm_client)
+        @topics = []
+      end
+
+      def fit(embeddings:, documents:, metadata: nil)
+        raise ArgumentError, "Embeddings and documents must have same length" unless embeddings.length == documents.length
+
+        @embeddings = embeddings
+        @documents = documents
+        @metadata = metadata || Array.new(documents.length) { {} }
+
+        puts "Starting topic extraction..." if @verbose
+
+        # Step 1: Optionally reduce dimensions for better clustering
+        working_embeddings = @embeddings
+        if @reduce_dimensions && @embeddings.first.length > @n_components
+          puts " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
+          working_embeddings = reduce_dimensions(@embeddings)
+        end
+
+        # Step 2: Cluster embeddings
+        puts " Clustering #{working_embeddings.length} documents..." if @verbose
+        cluster_ids = @clusterer.fit_predict(working_embeddings)
+
+        # Step 3: Build topics from clusters
+        puts " Building topics..." if @verbose
+        @topics = build_topics(cluster_ids)
+
+        # Step 4: Extract terms for each topic
+        puts " Extracting distinctive terms..." if @verbose
+        extract_topic_terms
+
+        # Step 5: Generate labels
+        puts " Generating topic labels..." if @verbose
+        generate_topic_labels
+
+        puts "Found #{@topics.length} topics (plus #{count_outliers(cluster_ids)} outliers)" if @verbose
+
+        @topics
+      end
+
+      def transform(embeddings:, documents: nil)
+        # Assign new documents to existing topics
+        raise "Must call fit before transform" if @topics.empty?
+
+        # Use approximate prediction if available
+        if @clusterer.respond_to?(:approximate_predict)
+          @clusterer.approximate_predict(embeddings)
+        else
+          # Fallback: assign to nearest topic centroid
+          assign_to_nearest_topic(embeddings)
+        end
+      end
+
+      def get_topic(topic_id)
+        @topics.find { |t| t.id == topic_id }
+      end
+
+      def outliers
+        @outliers ||= @documents.each_with_index.select { |_, idx|
+          @cluster_ids && @cluster_ids[idx] == -1
+        }.map(&:first)
+      end
+
+      def save(path)
+        data = {
+          topics: @topics.map(&:to_h),
+          config: {
+            min_cluster_size: @min_cluster_size,
+            min_samples: @min_samples,
+            reduce_dimensions: @reduce_dimensions,
+            n_components: @n_components,
+            labeling_method: @labeling_method
+          }
+        }
+        File.write(path, JSON.pretty_generate(data))
+      end
+
+      def self.load(path)
+        data = JSON.parse(File.read(path), symbolize_names: true)
+        engine = new(**data[:config])
+        # Reconstruct topics
+        engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
+        engine
+      end
+
+      private
+
+      def build_default_clusterer
+        begin
+          require 'clusterkit'
+          ClusterKit::Clustering::HDBSCAN.new(
+            min_cluster_size: @min_cluster_size,
+            min_samples: @min_samples,
+            metric: 'euclidean'
+          )
+        rescue LoadError
+          raise "ClusterKit required for topic modeling. Add 'gem \"clusterkit\"' to your Gemfile."
+        end
+      end
+
+      def reduce_dimensions(embeddings)
+        require 'clusterkit'
+
+        umap = ClusterKit::Dimensionality::UMAP.new(
+          n_components: @n_components,
+          n_neighbors: 15,
+          random_seed: 42 # For reproducibility
+        )
+
+        # Convert to format UMAP expects
+        umap.fit_transform(embeddings)
+      rescue LoadError
+        puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
+        embeddings
+      end
+
+      def build_topics(cluster_ids)
+        @cluster_ids = cluster_ids
+
+        # Group documents by cluster
+        clusters = {}
+        cluster_ids.each_with_index do |cluster_id, doc_idx|
+          next if cluster_id == -1 # Skip outliers
+          clusters[cluster_id] ||= []
+          clusters[cluster_id] << doc_idx
+        end
+
+        # Create Topic objects
+        clusters.map do |cluster_id, doc_indices|
+          Topic.new(
+            id: cluster_id,
+            document_indices: doc_indices,
+            documents: doc_indices.map { |i| @documents[i] },
+            embeddings: doc_indices.map { |i| @embeddings[i] },
+            metadata: doc_indices.map { |i| @metadata[i] }
+          )
+        end.sort_by(&:id)
+      end
+
+      def extract_topic_terms
+        # Extract distinctive terms for each topic
+        all_docs_text = @documents.join(" ")
+
+        @topics.each do |topic|
+          topic_docs_text = topic.documents.join(" ")
+
+          # Use c-TF-IDF to find distinctive terms
+          terms = @term_extractor.extract_distinctive_terms(
+            topic_docs: topic.documents,
+            all_docs: @documents,
+            top_n: 20
+          )
+
+          topic.set_terms(terms)
+        end
+      end
+
+      def generate_topic_labels
+        @topics.each do |topic|
+          result = @labeler.generate_label(
+            topic: topic,
+            terms: topic.terms,
+            documents: topic.documents.first(3) # Use top 3 representative docs
+          )
+
+          # Set both label and description if available
+          topic.set_label(result[:label])
+          topic.instance_variable_set(:@description, result[:description]) if result[:description]
+          topic.instance_variable_set(:@label_confidence, result[:confidence])
+          topic.instance_variable_set(:@themes, result[:themes]) if result[:themes]
+        end
+      end
+
+      def count_outliers(cluster_ids)
+        cluster_ids.count { |id| id == -1 }
+      end
+
+      def assign_to_nearest_topic(embeddings)
+        # Simple nearest centroid assignment
+        topic_centroids = @topics.map(&:centroid)
+
+        embeddings.map do |embedding|
+          distances = topic_centroids.map do |centroid|
+            # Euclidean distance
+            Math.sqrt(embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum)
+          end
+
+          min_idx = distances.index(distances.min)
+          @topics[min_idx].id
+        end
+      end
+    end
+  end
+end
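
To tie the Engine API together, a minimal usage sketch. It uses only methods declared in the hunk above (`fit`, the `topics` reader, `save`, `Engine.load`) plus the `id` and `terms` readers on `Topic` that the Engine itself relies on. The random vectors are placeholders; in practice the embeddings would come from `Ragnar::Embedder`, and real text is needed for HDBSCAN to find meaningful clusters.

```ruby
require 'ragnar'

documents = [
  "UMAP preserves local structure when reducing dimensions",
  "HDBSCAN finds clusters of varying density",
  # ... more documents
]
# Placeholder embeddings for illustration only.
embeddings = documents.map { Array.new(384) { rand } }

engine = Ragnar::TopicModeling::Engine.new(
  min_cluster_size: 5, # smallest group HDBSCAN will call a topic
  verbose: true
)
topics = engine.fit(embeddings: embeddings, documents: documents)

topics.each do |topic|
  puts "Topic #{topic.id}: #{topic.terms.first(5).join(', ')}"
end

engine.save("topics.json") # persists topics plus the config hash
engine = Ragnar::TopicModeling::Engine.load("topics.json")
```

Note that `save` persists topic summaries and configuration but not the fitted clusterer, so a loaded engine can report topics yet `transform` on new embeddings will fall back to nearest-centroid assignment only if the reconstructed topics expose centroids.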