ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +249 -41
- data/lib/ragnar/cli.rb +563 -219
- data/lib/ragnar/cli_umap.rb +86 -0
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +320 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/embedder.rb +1 -1
- data/lib/ragnar/indexer.rb +4 -2
- data/lib/ragnar/llm_manager.rb +31 -27
- data/lib/ragnar/query_processor.rb +123 -70
- data/lib/ragnar/query_rewriter.rb +21 -18
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +131 -95
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- data/lib/ragnar.rb +3 -1
- metadata +71 -30
- data/lib/ragnar/topic_modeling/engine.rb +0 -301
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ragnar-cli
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0.pre.
|
|
4
|
+
version: 0.1.0.pre.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chris Petersen
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: exe
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: thor
|
|
@@ -28,16 +27,44 @@ dependencies:
|
|
|
28
27
|
name: red-candle
|
|
29
28
|
requirement: !ruby/object:Gem::Requirement
|
|
30
29
|
requirements:
|
|
31
|
-
- - "
|
|
30
|
+
- - ">="
|
|
32
31
|
- !ruby/object:Gem::Version
|
|
33
|
-
version:
|
|
32
|
+
version: 1.2.3
|
|
34
33
|
type: :runtime
|
|
35
34
|
prerelease: false
|
|
36
35
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
36
|
requirements:
|
|
38
|
-
- - "
|
|
37
|
+
- - ">="
|
|
39
38
|
- !ruby/object:Gem::Version
|
|
40
|
-
version:
|
|
39
|
+
version: 1.2.3
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: ruby_llm
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '1.14'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '1.14'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: ruby_llm-red_candle
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '0.1'
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - ">="
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '0.1'
|
|
41
68
|
- !ruby/object:Gem::Dependency
|
|
42
69
|
name: lancelot
|
|
43
70
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -47,7 +74,7 @@ dependencies:
|
|
|
47
74
|
version: '0.3'
|
|
48
75
|
- - ">="
|
|
49
76
|
- !ruby/object:Gem::Version
|
|
50
|
-
version: 0.3.
|
|
77
|
+
version: 0.3.3
|
|
51
78
|
type: :runtime
|
|
52
79
|
prerelease: false
|
|
53
80
|
version_requirements: !ruby/object:Gem::Requirement
|
|
@@ -57,21 +84,21 @@ dependencies:
|
|
|
57
84
|
version: '0.3'
|
|
58
85
|
- - ">="
|
|
59
86
|
- !ruby/object:Gem::Version
|
|
60
|
-
version: 0.3.
|
|
87
|
+
version: 0.3.3
|
|
61
88
|
- !ruby/object:Gem::Dependency
|
|
62
|
-
name:
|
|
89
|
+
name: topical
|
|
63
90
|
requirement: !ruby/object:Gem::Requirement
|
|
64
91
|
requirements:
|
|
65
|
-
- - "
|
|
92
|
+
- - ">="
|
|
66
93
|
- !ruby/object:Gem::Version
|
|
67
|
-
version: 0.1.
|
|
94
|
+
version: 0.1.2
|
|
68
95
|
type: :runtime
|
|
69
96
|
prerelease: false
|
|
70
97
|
version_requirements: !ruby/object:Gem::Requirement
|
|
71
98
|
requirements:
|
|
72
|
-
- - "
|
|
99
|
+
- - ">="
|
|
73
100
|
- !ruby/object:Gem::Version
|
|
74
|
-
version: 0.1.
|
|
101
|
+
version: 0.1.2
|
|
75
102
|
- !ruby/object:Gem::Dependency
|
|
76
103
|
name: baran
|
|
77
104
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -92,14 +119,20 @@ dependencies:
|
|
|
92
119
|
requirements:
|
|
93
120
|
- - "~>"
|
|
94
121
|
- !ruby/object:Gem::Version
|
|
95
|
-
version: 0.1
|
|
122
|
+
version: '0.1'
|
|
123
|
+
- - ">="
|
|
124
|
+
- !ruby/object:Gem::Version
|
|
125
|
+
version: 0.1.2
|
|
96
126
|
type: :runtime
|
|
97
127
|
prerelease: false
|
|
98
128
|
version_requirements: !ruby/object:Gem::Requirement
|
|
99
129
|
requirements:
|
|
100
130
|
- - "~>"
|
|
101
131
|
- !ruby/object:Gem::Version
|
|
102
|
-
version: 0.1
|
|
132
|
+
version: '0.1'
|
|
133
|
+
- - ">="
|
|
134
|
+
- !ruby/object:Gem::Version
|
|
135
|
+
version: 0.1.2
|
|
103
136
|
- !ruby/object:Gem::Dependency
|
|
104
137
|
name: tty-progressbar
|
|
105
138
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -114,6 +147,20 @@ dependencies:
|
|
|
114
147
|
- - "~>"
|
|
115
148
|
- !ruby/object:Gem::Version
|
|
116
149
|
version: '0.18'
|
|
150
|
+
- !ruby/object:Gem::Dependency
|
|
151
|
+
name: thor-interactive
|
|
152
|
+
requirement: !ruby/object:Gem::Requirement
|
|
153
|
+
requirements:
|
|
154
|
+
- - "~>"
|
|
155
|
+
- !ruby/object:Gem::Version
|
|
156
|
+
version: 0.1.0.pre.3
|
|
157
|
+
type: :runtime
|
|
158
|
+
prerelease: false
|
|
159
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
160
|
+
requirements:
|
|
161
|
+
- - "~>"
|
|
162
|
+
- !ruby/object:Gem::Version
|
|
163
|
+
version: 0.1.0.pre.3
|
|
117
164
|
- !ruby/object:Gem::Dependency
|
|
118
165
|
name: rake
|
|
119
166
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -186,6 +233,9 @@ files:
|
|
|
186
233
|
- lib/ragnar.rb
|
|
187
234
|
- lib/ragnar/chunker.rb
|
|
188
235
|
- lib/ragnar/cli.rb
|
|
236
|
+
- lib/ragnar/cli_umap.rb
|
|
237
|
+
- lib/ragnar/cli_visualization.rb
|
|
238
|
+
- lib/ragnar/config.rb
|
|
189
239
|
- lib/ragnar/context_repacker.rb
|
|
190
240
|
- lib/ragnar/database.rb
|
|
191
241
|
- lib/ragnar/embedder.rb
|
|
@@ -194,25 +244,17 @@ files:
|
|
|
194
244
|
- lib/ragnar/query_processor.rb
|
|
195
245
|
- lib/ragnar/query_rewriter.rb
|
|
196
246
|
- lib/ragnar/topic_modeling.rb
|
|
197
|
-
- lib/ragnar/topic_modeling/engine.rb
|
|
198
|
-
- lib/ragnar/topic_modeling/labeling_strategies.rb
|
|
199
|
-
- lib/ragnar/topic_modeling/llm_adapter.rb
|
|
200
|
-
- lib/ragnar/topic_modeling/metrics.rb
|
|
201
|
-
- lib/ragnar/topic_modeling/term_extractor.rb
|
|
202
|
-
- lib/ragnar/topic_modeling/topic.rb
|
|
203
|
-
- lib/ragnar/topic_modeling/topic_labeler.rb
|
|
204
247
|
- lib/ragnar/umap_processor.rb
|
|
205
248
|
- lib/ragnar/umap_transform_service.rb
|
|
206
249
|
- lib/ragnar/version.rb
|
|
207
250
|
- lib/ragnar_cli.rb
|
|
208
|
-
homepage: https://github.com/
|
|
251
|
+
homepage: https://github.com/scientist-labs/ragnar
|
|
209
252
|
licenses:
|
|
210
253
|
- MIT
|
|
211
254
|
metadata:
|
|
212
|
-
homepage_uri: https://github.com/
|
|
213
|
-
source_code_uri: https://github.com/
|
|
214
|
-
changelog_uri: https://github.com/
|
|
215
|
-
post_install_message:
|
|
255
|
+
homepage_uri: https://github.com/scientist-labs/ragnar
|
|
256
|
+
source_code_uri: https://github.com/scientist-labs/ragnar
|
|
257
|
+
changelog_uri: https://github.com/scientist-labs/ragnar/blob/main/CHANGELOG.md
|
|
216
258
|
rdoc_options: []
|
|
217
259
|
require_paths:
|
|
218
260
|
- lib
|
|
@@ -227,8 +269,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
227
269
|
- !ruby/object:Gem::Version
|
|
228
270
|
version: '0'
|
|
229
271
|
requirements: []
|
|
230
|
-
rubygems_version: 3.
|
|
231
|
-
signing_key:
|
|
272
|
+
rubygems_version: 3.6.9
|
|
232
273
|
specification_version: 4
|
|
233
274
|
summary: A Ruby + Rust powered RAG (Retrieval-Augmented Generation) system
|
|
234
275
|
test_files: []
|
|
@@ -1,301 +0,0 @@
|
|
|
1
|
-
require 'json'
|
|
2
|
-
|
|
3
|
-
module Ragnar
|
|
4
|
-
module TopicModeling
|
|
5
|
-
class Engine
|
|
6
|
-
attr_reader :topics, :clusterer, :term_extractor
|
|
7
|
-
|
|
8
|
-
def initialize(
|
|
9
|
-
min_cluster_size: 5,
|
|
10
|
-
min_samples: 3,
|
|
11
|
-
clustering_backend: nil,
|
|
12
|
-
reduce_dimensions: true,
|
|
13
|
-
n_components: 50,
|
|
14
|
-
labeling_method: :hybrid,
|
|
15
|
-
llm_client: nil,
|
|
16
|
-
verbose: false
|
|
17
|
-
)
|
|
18
|
-
@min_cluster_size = min_cluster_size
|
|
19
|
-
@min_samples = min_samples
|
|
20
|
-
@reduce_dimensions = reduce_dimensions
|
|
21
|
-
@n_components = n_components
|
|
22
|
-
@labeling_method = labeling_method
|
|
23
|
-
@verbose = verbose
|
|
24
|
-
|
|
25
|
-
@clusterer = clustering_backend || build_default_clusterer
|
|
26
|
-
@term_extractor = TermExtractor.new
|
|
27
|
-
@labeler = TopicLabeler.new(method: labeling_method, llm_client: llm_client)
|
|
28
|
-
@topics = []
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
def fit(embeddings:, documents:, metadata: nil)
|
|
32
|
-
raise ArgumentError, "Embeddings and documents must have same length" unless embeddings.length == documents.length
|
|
33
|
-
|
|
34
|
-
@embeddings = embeddings
|
|
35
|
-
@documents = documents
|
|
36
|
-
@metadata = metadata || Array.new(documents.length) { {} }
|
|
37
|
-
|
|
38
|
-
puts "Starting topic extraction..." if @verbose
|
|
39
|
-
|
|
40
|
-
# Step 1: Optionally reduce dimensions for better clustering
|
|
41
|
-
working_embeddings = @embeddings
|
|
42
|
-
if @reduce_dimensions && @embeddings.first.length > @n_components
|
|
43
|
-
puts " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
|
|
44
|
-
working_embeddings = reduce_dimensions(@embeddings)
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# Step 2: Cluster embeddings
|
|
48
|
-
puts " Clustering #{working_embeddings.length} documents..." if @verbose
|
|
49
|
-
cluster_ids = @clusterer.fit_predict(working_embeddings)
|
|
50
|
-
|
|
51
|
-
# Step 3: Build topics from clusters
|
|
52
|
-
puts " Building topics..." if @verbose
|
|
53
|
-
@topics = build_topics(cluster_ids)
|
|
54
|
-
|
|
55
|
-
# Step 4: Extract terms for each topic
|
|
56
|
-
puts " Extracting distinctive terms..." if @verbose
|
|
57
|
-
extract_topic_terms
|
|
58
|
-
|
|
59
|
-
# Step 5: Generate labels
|
|
60
|
-
puts " Generating topic labels..." if @verbose
|
|
61
|
-
generate_topic_labels
|
|
62
|
-
|
|
63
|
-
puts "Found #{@topics.length} topics (plus #{count_outliers(cluster_ids)} outliers)" if @verbose
|
|
64
|
-
|
|
65
|
-
@topics
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
def transform(embeddings:, documents: nil)
|
|
69
|
-
# Assign new documents to existing topics
|
|
70
|
-
raise "Must call fit before transform" if @topics.empty?
|
|
71
|
-
|
|
72
|
-
# Use approximate prediction if available
|
|
73
|
-
if @clusterer.respond_to?(:approximate_predict)
|
|
74
|
-
@clusterer.approximate_predict(embeddings)
|
|
75
|
-
else
|
|
76
|
-
# Fallback: assign to nearest topic centroid
|
|
77
|
-
assign_to_nearest_topic(embeddings)
|
|
78
|
-
end
|
|
79
|
-
end
|
|
80
|
-
|
|
81
|
-
def get_topic(topic_id)
|
|
82
|
-
@topics.find { |t| t.id == topic_id }
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
def outliers
|
|
86
|
-
@outliers ||= @documents.each_with_index.select { |_, idx|
|
|
87
|
-
@cluster_ids && @cluster_ids[idx] == -1
|
|
88
|
-
}.map(&:first)
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
def save(path)
|
|
92
|
-
data = {
|
|
93
|
-
topics: @topics.map(&:to_h),
|
|
94
|
-
config: {
|
|
95
|
-
min_cluster_size: @min_cluster_size,
|
|
96
|
-
min_samples: @min_samples,
|
|
97
|
-
reduce_dimensions: @reduce_dimensions,
|
|
98
|
-
n_components: @n_components,
|
|
99
|
-
labeling_method: @labeling_method
|
|
100
|
-
}
|
|
101
|
-
}
|
|
102
|
-
File.write(path, JSON.pretty_generate(data))
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
def self.load(path)
|
|
106
|
-
data = JSON.parse(File.read(path), symbolize_names: true)
|
|
107
|
-
engine = new(**data[:config])
|
|
108
|
-
# Reconstruct topics
|
|
109
|
-
engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
|
|
110
|
-
engine
|
|
111
|
-
end
|
|
112
|
-
|
|
113
|
-
private
|
|
114
|
-
|
|
115
|
-
def build_default_clusterer
|
|
116
|
-
begin
|
|
117
|
-
require 'clusterkit'
|
|
118
|
-
ClusterKit::Clustering::HDBSCAN.new(
|
|
119
|
-
min_cluster_size: @min_cluster_size,
|
|
120
|
-
min_samples: @min_samples,
|
|
121
|
-
metric: 'euclidean'
|
|
122
|
-
)
|
|
123
|
-
rescue LoadError
|
|
124
|
-
raise "ClusterKit required for topic modeling. Add 'gem \"clusterkit\"' to your Gemfile."
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
def reduce_dimensions(embeddings)
|
|
129
|
-
require 'clusterkit'
|
|
130
|
-
|
|
131
|
-
# Validate embeddings before UMAP
|
|
132
|
-
valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
|
|
133
|
-
|
|
134
|
-
if valid_embeddings.empty?
|
|
135
|
-
raise "No valid embeddings for dimensionality reduction.\n\n" \
|
|
136
|
-
"All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
|
|
137
|
-
"Try running without dimensionality reduction:\n" \
|
|
138
|
-
" ragnar topics --reduce-dimensions false"
|
|
139
|
-
end
|
|
140
|
-
|
|
141
|
-
if invalid_indices.any? && @verbose
|
|
142
|
-
puts " ⚠️ Warning: #{invalid_indices.size} embeddings with invalid values removed"
|
|
143
|
-
end
|
|
144
|
-
|
|
145
|
-
begin
|
|
146
|
-
# Adjust parameters based on data size
|
|
147
|
-
n_samples = valid_embeddings.size
|
|
148
|
-
n_components = [@n_components, n_samples - 1, 50].min
|
|
149
|
-
n_neighbors = [15, n_samples - 1].min
|
|
150
|
-
|
|
151
|
-
if @verbose && n_components != @n_components
|
|
152
|
-
puts " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
umap = ClusterKit::Dimensionality::UMAP.new(
|
|
156
|
-
n_components: n_components,
|
|
157
|
-
n_neighbors: n_neighbors,
|
|
158
|
-
random_seed: 42 # For reproducibility
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
# Convert to format UMAP expects
|
|
162
|
-
reduced = umap.fit_transform(valid_embeddings)
|
|
163
|
-
|
|
164
|
-
# If we had to remove invalid embeddings, reconstruct the full array
|
|
165
|
-
if invalid_indices.any?
|
|
166
|
-
full_reduced = []
|
|
167
|
-
valid_idx = 0
|
|
168
|
-
embeddings.size.times do |i|
|
|
169
|
-
if invalid_indices.include?(i)
|
|
170
|
-
# Use zeros for invalid embeddings (they'll be outliers anyway)
|
|
171
|
-
full_reduced << Array.new(n_components, 0.0)
|
|
172
|
-
else
|
|
173
|
-
full_reduced << reduced[valid_idx]
|
|
174
|
-
valid_idx += 1
|
|
175
|
-
end
|
|
176
|
-
end
|
|
177
|
-
full_reduced
|
|
178
|
-
else
|
|
179
|
-
reduced
|
|
180
|
-
end
|
|
181
|
-
rescue => e
|
|
182
|
-
if e.message.include?("index out of bounds")
|
|
183
|
-
error_msg = "\n❌ Dimensionality reduction failed\n\n"
|
|
184
|
-
error_msg += "The UMAP algorithm encountered an error with your data.\n\n"
|
|
185
|
-
error_msg += "This typically happens with:\n"
|
|
186
|
-
error_msg += " • Embeddings containing invalid values\n"
|
|
187
|
-
error_msg += " • Too few samples (#{valid_embeddings.size} valid embeddings)\n"
|
|
188
|
-
error_msg += " • Incompatible parameters\n\n"
|
|
189
|
-
error_msg += "Solutions:\n"
|
|
190
|
-
error_msg += " 1. Run without dimensionality reduction:\n"
|
|
191
|
-
error_msg += " ragnar topics --reduce-dimensions false\n\n"
|
|
192
|
-
error_msg += " 2. Use fewer dimensions:\n"
|
|
193
|
-
error_msg += " ragnar topics --n-components 2\n\n"
|
|
194
|
-
error_msg += " 3. Re-index your documents:\n"
|
|
195
|
-
error_msg += " ragnar index <path> --force\n"
|
|
196
|
-
raise error_msg
|
|
197
|
-
else
|
|
198
|
-
raise
|
|
199
|
-
end
|
|
200
|
-
end
|
|
201
|
-
rescue LoadError
|
|
202
|
-
puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
|
|
203
|
-
embeddings
|
|
204
|
-
end
|
|
205
|
-
|
|
206
|
-
private
|
|
207
|
-
|
|
208
|
-
def validate_embeddings_for_umap(embeddings)
|
|
209
|
-
valid = []
|
|
210
|
-
invalid_indices = []
|
|
211
|
-
|
|
212
|
-
embeddings.each_with_index do |embedding, idx|
|
|
213
|
-
if embedding.is_a?(Array) &&
|
|
214
|
-
embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
|
|
215
|
-
valid << embedding
|
|
216
|
-
else
|
|
217
|
-
invalid_indices << idx
|
|
218
|
-
end
|
|
219
|
-
end
|
|
220
|
-
|
|
221
|
-
[valid, invalid_indices]
|
|
222
|
-
end
|
|
223
|
-
|
|
224
|
-
def build_topics(cluster_ids)
|
|
225
|
-
@cluster_ids = cluster_ids
|
|
226
|
-
|
|
227
|
-
# Group documents by cluster
|
|
228
|
-
clusters = {}
|
|
229
|
-
cluster_ids.each_with_index do |cluster_id, doc_idx|
|
|
230
|
-
next if cluster_id == -1 # Skip outliers
|
|
231
|
-
clusters[cluster_id] ||= []
|
|
232
|
-
clusters[cluster_id] << doc_idx
|
|
233
|
-
end
|
|
234
|
-
|
|
235
|
-
# Create Topic objects
|
|
236
|
-
clusters.map do |cluster_id, doc_indices|
|
|
237
|
-
Topic.new(
|
|
238
|
-
id: cluster_id,
|
|
239
|
-
document_indices: doc_indices,
|
|
240
|
-
documents: doc_indices.map { |i| @documents[i] },
|
|
241
|
-
embeddings: doc_indices.map { |i| @embeddings[i] },
|
|
242
|
-
metadata: doc_indices.map { |i| @metadata[i] }
|
|
243
|
-
)
|
|
244
|
-
end.sort_by(&:id)
|
|
245
|
-
end
|
|
246
|
-
|
|
247
|
-
def extract_topic_terms
|
|
248
|
-
# Extract distinctive terms for each topic
|
|
249
|
-
all_docs_text = @documents.join(" ")
|
|
250
|
-
|
|
251
|
-
@topics.each do |topic|
|
|
252
|
-
topic_docs_text = topic.documents.join(" ")
|
|
253
|
-
|
|
254
|
-
# Use c-TF-IDF to find distinctive terms
|
|
255
|
-
terms = @term_extractor.extract_distinctive_terms(
|
|
256
|
-
topic_docs: topic.documents,
|
|
257
|
-
all_docs: @documents,
|
|
258
|
-
top_n: 20
|
|
259
|
-
)
|
|
260
|
-
|
|
261
|
-
topic.set_terms(terms)
|
|
262
|
-
end
|
|
263
|
-
end
|
|
264
|
-
|
|
265
|
-
def generate_topic_labels
|
|
266
|
-
@topics.each do |topic|
|
|
267
|
-
result = @labeler.generate_label(
|
|
268
|
-
topic: topic,
|
|
269
|
-
terms: topic.terms,
|
|
270
|
-
documents: topic.documents.first(3) # Use top 3 representative docs
|
|
271
|
-
)
|
|
272
|
-
|
|
273
|
-
# Set both label and description if available
|
|
274
|
-
topic.set_label(result[:label])
|
|
275
|
-
topic.instance_variable_set(:@description, result[:description]) if result[:description]
|
|
276
|
-
topic.instance_variable_set(:@label_confidence, result[:confidence])
|
|
277
|
-
topic.instance_variable_set(:@themes, result[:themes]) if result[:themes]
|
|
278
|
-
end
|
|
279
|
-
end
|
|
280
|
-
|
|
281
|
-
def count_outliers(cluster_ids)
|
|
282
|
-
cluster_ids.count { |id| id == -1 }
|
|
283
|
-
end
|
|
284
|
-
|
|
285
|
-
def assign_to_nearest_topic(embeddings)
|
|
286
|
-
# Simple nearest centroid assignment
|
|
287
|
-
topic_centroids = @topics.map(&:centroid)
|
|
288
|
-
|
|
289
|
-
embeddings.map do |embedding|
|
|
290
|
-
distances = topic_centroids.map do |centroid|
|
|
291
|
-
# Euclidean distance
|
|
292
|
-
Math.sqrt(embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum)
|
|
293
|
-
end
|
|
294
|
-
|
|
295
|
-
min_idx = distances.index(distances.min)
|
|
296
|
-
@topics[min_idx].id
|
|
297
|
-
end
|
|
298
|
-
end
|
|
299
|
-
end
|
|
300
|
-
end
|
|
301
|
-
end
|