topical 0.0.1.pre.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +159 -107
- data/docs/assets/topical-wide.png +0 -0
- data/examples/detect_new_topics.rb +190 -0
- data/examples/quick_demo.rb +1 -1
- data/examples/topic_summaries_with_llm.rb +128 -0
- data/examples/verify_migration.rb +1 -1
- data/lib/topical/clustering/adapter.rb +1 -1
- data/lib/topical/clustering/hdbscan_adapter.rb +1 -1
- data/lib/topical/clustering/kmeans_adapter.rb +1 -1
- data/lib/topical/dimensionality_reducer.rb +96 -0
- data/lib/topical/engine.rb +31 -126
- data/lib/topical/extractors/term_extractor.rb +1 -1
- data/lib/topical/labelers/base.rb +1 -1
- data/lib/topical/labelers/term_based.rb +1 -1
- data/lib/topical/metrics.rb +1 -1
- data/lib/topical/model_serializer.rb +59 -0
- data/lib/topical/topic.rb +1 -1
- data/lib/topical/version.rb +1 -1
- data/lib/topical.rb +6 -11
- metadata +29 -13
- data/lib/topical/labelers/hybrid.rb +0 -24
- data/lib/topical/labelers/llm_adapter.rb +0 -126
- data/lib/topical/labelers/llm_based.rb +0 -111
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: topical
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1.pre.1
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Petersen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: clusterkit
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.2.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.2.2
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: red-candle
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '1.
|
33
|
+
version: '1.2'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '1.
|
40
|
+
version: '1.2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '1.3'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: simplecov
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.22'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.22'
|
83
97
|
description: Extract topics from document embeddings using HDBSCAN clustering and
|
84
98
|
c-TF-IDF term extraction. Provides automatic topic labeling, quality metrics, and
|
85
99
|
support for various clustering algorithms.
|
@@ -95,30 +109,32 @@ files:
|
|
95
109
|
- LICENSE.txt
|
96
110
|
- README.md
|
97
111
|
- Rakefile
|
112
|
+
- docs/assets/topical-wide.png
|
113
|
+
- examples/detect_new_topics.rb
|
98
114
|
- examples/quick_demo.rb
|
115
|
+
- examples/topic_summaries_with_llm.rb
|
99
116
|
- examples/verify_migration.rb
|
100
117
|
- lib/topical.rb
|
101
118
|
- lib/topical/clustering/adapter.rb
|
102
119
|
- lib/topical/clustering/hdbscan_adapter.rb
|
103
120
|
- lib/topical/clustering/kmeans_adapter.rb
|
121
|
+
- lib/topical/dimensionality_reducer.rb
|
104
122
|
- lib/topical/engine.rb
|
105
123
|
- lib/topical/extractors/term_extractor.rb
|
106
124
|
- lib/topical/labelers/base.rb
|
107
|
-
- lib/topical/labelers/hybrid.rb
|
108
|
-
- lib/topical/labelers/llm_adapter.rb
|
109
|
-
- lib/topical/labelers/llm_based.rb
|
110
125
|
- lib/topical/labelers/term_based.rb
|
111
126
|
- lib/topical/metrics.rb
|
127
|
+
- lib/topical/model_serializer.rb
|
112
128
|
- lib/topical/topic.rb
|
113
129
|
- lib/topical/version.rb
|
114
130
|
- sig/topical.rbs
|
115
|
-
homepage: https://github.com/
|
131
|
+
homepage: https://github.com/scientist-labs/topical
|
116
132
|
licenses:
|
117
133
|
- MIT
|
118
134
|
metadata:
|
119
|
-
homepage_uri: https://github.com/
|
120
|
-
source_code_uri: https://github.com/
|
121
|
-
changelog_uri: https://github.com/
|
135
|
+
homepage_uri: https://github.com/scientist-labs/topical
|
136
|
+
source_code_uri: https://github.com/scientist-labs/topical
|
137
|
+
changelog_uri: https://github.com/scientist-labs/topical/blob/main/CHANGELOG.md
|
122
138
|
documentation_uri: https://rubydoc.info/gems/topical
|
123
139
|
post_install_message:
|
124
140
|
rdoc_options: []
|
@@ -1,24 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Topical
|
4
|
-
module Labelers
|
5
|
-
# Hybrid labeling that combines term-based and LLM approaches
|
6
|
-
class Hybrid < Base
|
7
|
-
def initialize(provider: nil)
|
8
|
-
@term_labeler = TermBased.new
|
9
|
-
@llm_labeler = LLMBased.new(provider: provider)
|
10
|
-
end
|
11
|
-
|
12
|
-
def generate_label(topic)
|
13
|
-
# Start with term-based label
|
14
|
-
term_label = @term_labeler.generate_label(topic)
|
15
|
-
|
16
|
-
# Try to enhance with LLM if available
|
17
|
-
llm_label = @llm_labeler.generate_label(topic)
|
18
|
-
|
19
|
-
# For now, just return the LLM label if different, otherwise term label
|
20
|
-
llm_label != "LLM Topic #{topic.id}" ? llm_label : term_label
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
@@ -1,126 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Topical
|
4
|
-
module Labelers
|
5
|
-
# Adapter to allow different LLM backends (red-candle, remote APIs, etc.)
|
6
|
-
class LLMAdapter
|
7
|
-
# Factory method to create appropriate LLM client
|
8
|
-
def self.create(type: :auto, **options)
|
9
|
-
case type
|
10
|
-
when :red_candle
|
11
|
-
RedCandleAdapter.new(**options)
|
12
|
-
when :openai
|
13
|
-
# Future: OpenAIAdapter.new(**options)
|
14
|
-
raise NotImplementedError, "OpenAI adapter not yet implemented"
|
15
|
-
when :anthropic
|
16
|
-
# Future: AnthropicAdapter.new(**options)
|
17
|
-
raise NotImplementedError, "Anthropic adapter not yet implemented"
|
18
|
-
when :auto
|
19
|
-
# Try red-candle first, then fall back to others
|
20
|
-
begin
|
21
|
-
RedCandleAdapter.new(**options)
|
22
|
-
rescue LoadError
|
23
|
-
nil # No LLM available
|
24
|
-
end
|
25
|
-
else
|
26
|
-
raise ArgumentError, "Unknown LLM type: #{type}"
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
# Adapter for red-candle (local LLMs)
|
32
|
-
class RedCandleAdapter
|
33
|
-
def initialize(model: nil, **options)
|
34
|
-
require 'red-candle'
|
35
|
-
|
36
|
-
@model = model || default_model
|
37
|
-
@options = options
|
38
|
-
@llm = load_or_create_llm
|
39
|
-
end
|
40
|
-
|
41
|
-
def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
|
42
|
-
# Red-candle specific generation
|
43
|
-
response = @llm.generate(
|
44
|
-
prompt,
|
45
|
-
max_length: max_tokens,
|
46
|
-
temperature: temperature,
|
47
|
-
do_sample: temperature > 0
|
48
|
-
)
|
49
|
-
|
50
|
-
# Handle JSON response format if requested
|
51
|
-
if response_format && response_format[:type] == "json_object"
|
52
|
-
ensure_json_response(response)
|
53
|
-
else
|
54
|
-
response
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def available?
|
59
|
-
true
|
60
|
-
end
|
61
|
-
|
62
|
-
private
|
63
|
-
|
64
|
-
def default_model
|
65
|
-
# Use a small, fast model by default for topic labeling
|
66
|
-
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
|
67
|
-
end
|
68
|
-
|
69
|
-
def load_or_create_llm
|
70
|
-
# Create new LLM instance with red-candle
|
71
|
-
RedCandle::Model.new(
|
72
|
-
model_id: @model,
|
73
|
-
model_type: :llama,
|
74
|
-
quantized: true
|
75
|
-
)
|
76
|
-
end
|
77
|
-
|
78
|
-
def ensure_json_response(response)
|
79
|
-
# Try to extract JSON from response
|
80
|
-
begin
|
81
|
-
require 'json'
|
82
|
-
# Look for JSON-like content
|
83
|
-
json_match = response.match(/\{.*\}/m)
|
84
|
-
if json_match
|
85
|
-
JSON.parse(json_match[0])
|
86
|
-
json_match[0] # Return the JSON string if valid
|
87
|
-
else
|
88
|
-
# Generate a basic JSON response
|
89
|
-
generate_fallback_json(response)
|
90
|
-
end
|
91
|
-
rescue JSON::ParserError
|
92
|
-
generate_fallback_json(response)
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
def generate_fallback_json(text)
|
97
|
-
# Create a simple JSON from text response
|
98
|
-
require 'json'
|
99
|
-
label = text.lines.first&.strip || "Unknown"
|
100
|
-
{
|
101
|
-
label: label,
|
102
|
-
description: text,
|
103
|
-
confidence: 0.5
|
104
|
-
}.to_json
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
# Future adapter for remote LLMs
|
109
|
-
class RemoteAdapter
|
110
|
-
def initialize(api_key:, endpoint:, **options)
|
111
|
-
@api_key = api_key
|
112
|
-
@endpoint = endpoint
|
113
|
-
@options = options
|
114
|
-
end
|
115
|
-
|
116
|
-
def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
|
117
|
-
# Make API call
|
118
|
-
raise NotImplementedError, "Remote LLM adapter coming soon"
|
119
|
-
end
|
120
|
-
|
121
|
-
def available?
|
122
|
-
!@api_key.nil?
|
123
|
-
end
|
124
|
-
end
|
125
|
-
end
|
126
|
-
end
|
@@ -1,111 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Topical
|
4
|
-
module Labelers
|
5
|
-
# LLM-powered topic labeling (requires red-candle or other LLM provider)
|
6
|
-
class LLMBased < Base
|
7
|
-
def initialize(provider: nil)
|
8
|
-
@provider = provider
|
9
|
-
end
|
10
|
-
|
11
|
-
def generate_label(topic)
|
12
|
-
unless llm_available?
|
13
|
-
# Fallback to term-based if LLM not available
|
14
|
-
return TermBased.new.generate_label(topic)
|
15
|
-
end
|
16
|
-
|
17
|
-
# Select best documents to send to LLM
|
18
|
-
sample_docs = topic.representative_docs(k: 3)
|
19
|
-
|
20
|
-
# Generate comprehensive analysis
|
21
|
-
response = analyze_with_llm(sample_docs, topic.terms)
|
22
|
-
|
23
|
-
response[:label]
|
24
|
-
rescue => e
|
25
|
-
# Fallback on error
|
26
|
-
puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
|
27
|
-
TermBased.new.generate_label(topic)
|
28
|
-
end
|
29
|
-
|
30
|
-
private
|
31
|
-
|
32
|
-
def llm_available?
|
33
|
-
return true if @provider
|
34
|
-
|
35
|
-
# Try to create LLM adapter
|
36
|
-
begin
|
37
|
-
require_relative 'llm_adapter'
|
38
|
-
@provider = LLMAdapter.create(type: :auto)
|
39
|
-
@provider && @provider.available?
|
40
|
-
rescue LoadError, StandardError => e
|
41
|
-
puts "LLM not available: #{e.message}" if ENV['DEBUG']
|
42
|
-
false
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def analyze_with_llm(documents, terms)
|
47
|
-
prompt = build_analysis_prompt(documents, terms)
|
48
|
-
|
49
|
-
response = @provider.generate(
|
50
|
-
prompt: prompt,
|
51
|
-
max_tokens: 150,
|
52
|
-
temperature: 0.3,
|
53
|
-
response_format: { type: "json_object" }
|
54
|
-
)
|
55
|
-
|
56
|
-
# Parse JSON response
|
57
|
-
require 'json'
|
58
|
-
result = JSON.parse(response, symbolize_names: true)
|
59
|
-
|
60
|
-
# Validate and clean
|
61
|
-
{
|
62
|
-
label: clean_label(result[:label]),
|
63
|
-
description: result[:description] || "Topic about #{result[:label]}",
|
64
|
-
themes: result[:themes] || [],
|
65
|
-
confidence: result[:confidence] || 0.8
|
66
|
-
}
|
67
|
-
end
|
68
|
-
|
69
|
-
def build_analysis_prompt(documents, terms)
|
70
|
-
doc_samples = documents.map.with_index do |doc, i|
|
71
|
-
preview = doc.length > 300 ? "#{doc[0..300]}..." : doc
|
72
|
-
"Document #{i + 1}:\n#{preview}"
|
73
|
-
end.join("\n\n")
|
74
|
-
|
75
|
-
<<~PROMPT
|
76
|
-
Analyze this cluster of related documents and provide a structured summary.
|
77
|
-
|
78
|
-
Distinctive terms found: #{terms.first(10).join(', ')}
|
79
|
-
|
80
|
-
Sample documents:
|
81
|
-
#{doc_samples}
|
82
|
-
|
83
|
-
Provide a JSON response with:
|
84
|
-
{
|
85
|
-
"label": "A 2-4 word topic label",
|
86
|
-
"description": "One sentence describing what connects these documents",
|
87
|
-
"themes": ["theme1", "theme2", "theme3"],
|
88
|
-
"confidence": 0.0-1.0 score of how coherent this topic is
|
89
|
-
}
|
90
|
-
|
91
|
-
Focus on what meaningfully connects these documents, not just common words.
|
92
|
-
PROMPT
|
93
|
-
end
|
94
|
-
|
95
|
-
def clean_label(label)
|
96
|
-
return "Unknown Topic" unless label
|
97
|
-
|
98
|
-
# Remove quotes, trim, limit length
|
99
|
-
cleaned = label.to_s.strip.gsub(/^["']|["']$/, '')
|
100
|
-
cleaned = cleaned.split("\n").first if cleaned.include?("\n")
|
101
|
-
|
102
|
-
# Limit to reasonable length
|
103
|
-
if cleaned.length > 50
|
104
|
-
cleaned[0..47] + "..."
|
105
|
-
else
|
106
|
-
cleaned
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|