tritopic 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tritopic/__init__.py +46 -0
- tritopic/core/__init__.py +17 -0
- tritopic/core/clustering.py +331 -0
- tritopic/core/embeddings.py +222 -0
- tritopic/core/graph_builder.py +493 -0
- tritopic/core/keywords.py +337 -0
- tritopic/core/model.py +810 -0
- tritopic/labeling/__init__.py +5 -0
- tritopic/labeling/llm_labeler.py +279 -0
- tritopic/utils/__init__.py +13 -0
- tritopic/utils/metrics.py +254 -0
- tritopic/visualization/__init__.py +5 -0
- tritopic/visualization/plotter.py +523 -0
- tritopic-0.1.0.dist-info/METADATA +400 -0
- tritopic-0.1.0.dist-info/RECORD +18 -0
- tritopic-0.1.0.dist-info/WHEEL +5 -0
- tritopic-0.1.0.dist-info/licenses/LICENSE +21 -0
- tritopic-0.1.0.dist-info/top_level.txt +1 -0
tritopic/core/keywords.py

@@ -0,0 +1,337 @@

"""
Keyword Extraction for TriTopic
================================

Extract representative keywords for topics using:
- c-TF-IDF (class-based TF-IDF, like BERTopic)
- BM25 scoring
- KeyBERT (embedding-based)
"""

from __future__ import annotations

from typing import Literal
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


class KeywordExtractor:
    """
    Extract keywords for topics.

    Supports multiple extraction methods for flexibility.

    Parameters
    ----------
    method : str
        Extraction method: "ctfidf", "bm25", or "keybert"
    n_keywords : int
        Number of keywords to extract per topic. Default: 10
    ngram_range : tuple
        N-gram range for keyword extraction. Default: (1, 2)
    """

    def __init__(
        self,
        method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf",
        n_keywords: int = 10,
        ngram_range: tuple[int, int] = (1, 2),
        min_df: int = 2,
        max_df: float = 0.95,
    ):
        self.method = method
        self.n_keywords = n_keywords
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df

        self._vectorizer = None
        self._vocabulary = None

    def extract(
        self,
        topic_docs: list[str],
        all_docs: list[str] | None = None,
        n_keywords: int | None = None,
    ) -> tuple[list[str], list[float]]:
        """
        Extract keywords from topic documents.

        Parameters
        ----------
        topic_docs : list[str]
            Documents belonging to the topic.
        all_docs : list[str], optional
            All documents in corpus (needed for c-TF-IDF).
        n_keywords : int, optional
            Override default n_keywords.

        Returns
        -------
        keywords : list[str]
            Top keywords for the topic.
        scores : list[float]
            Keyword scores.
        """
        n = n_keywords or self.n_keywords

        if self.method == "ctfidf":
            return self._extract_ctfidf(topic_docs, all_docs or topic_docs, n)
        elif self.method == "bm25":
            return self._extract_bm25(topic_docs, all_docs or topic_docs, n)
        elif self.method == "keybert":
            return self._extract_keybert(topic_docs, n)
        else:
            raise ValueError(f"Unknown method: {self.method}")

    def _extract_ctfidf(
        self,
        topic_docs: list[str],
        all_docs: list[str],
        n_keywords: int,
    ) -> tuple[list[str], list[float]]:
        """
        Extract keywords using class-based TF-IDF (c-TF-IDF).

        c-TF-IDF treats all documents in a topic as a single "class document"
        and computes TF-IDF against the corpus. This highlights words that
        are distinctive for the topic.
        """
        # Fit vectorizer on all docs if not already fitted
        if self._vectorizer is None:
            self._vectorizer = CountVectorizer(
                ngram_range=self.ngram_range,
                stop_words="english",
                min_df=self.min_df,
                max_df=self.max_df,
            )
            self._vectorizer.fit(all_docs)
            self._vocabulary = self._vectorizer.get_feature_names_out()

        # Concatenate topic docs into a single "class document"
        topic_text = " ".join(topic_docs)

        # Get term frequencies for topic
        topic_tf = self._vectorizer.transform([topic_text]).toarray()[0]

        # Get term frequencies across all docs
        all_tf = self._vectorizer.transform(all_docs).toarray()

        # Compute IDF: log(N / (1 + df))
        doc_freq = np.sum(all_tf > 0, axis=0)
        idf = np.log(len(all_docs) / (1 + doc_freq))

        # c-TF-IDF = TF * IDF (with smoothing)
        topic_tf_normalized = topic_tf / (topic_tf.sum() + 1e-10)
        ctfidf_scores = topic_tf_normalized * idf
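        # Equivalently, for term t in topic c over a corpus of N documents:
        #   score(t, c) = tf(t, c) / (sum_t' tf(t', c) + 1e-10) * log(N / (1 + df(t)))
        # where df(t) is the number of documents containing t.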

        # Get top keywords
        top_indices = np.argsort(ctfidf_scores)[::-1][:n_keywords]

        keywords = [self._vocabulary[i] for i in top_indices]
        scores = [float(ctfidf_scores[i]) for i in top_indices]

        return keywords, scores

    def _extract_bm25(
        self,
        topic_docs: list[str],
        all_docs: list[str],
        n_keywords: int,
    ) -> tuple[list[str], list[float]]:
        """
        Extract keywords using BM25 scoring.

        BM25 is more robust to document length variations than TF-IDF.
        """
        from rank_bm25 import BM25Okapi

        # Tokenize
        def tokenize(text):
            # Simple tokenization
            import re
            tokens = re.findall(r'\b\w+\b', text.lower())
            # Remove stopwords
            stopwords = {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
                'for', 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were',
                'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
                'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall',
                'this', 'that', 'these', 'those', 'it', 'its', 'as', 'if', 'then',
            }
            return [t for t in tokens if t not in stopwords and len(t) > 2]

        # Tokenize all docs
        tokenized_all = [tokenize(doc) for doc in all_docs]
        tokenized_topic = [tokenize(doc) for doc in topic_docs]

        # Build vocabulary from topic docs
        topic_vocab = Counter()
        for tokens in tokenized_topic:
            topic_vocab.update(tokens)

        # Fit BM25 on all docs
        bm25 = BM25Okapi(tokenized_all)

        # Score each word in topic vocabulary
        word_scores = {}
        for word, freq in topic_vocab.items():
            # Use word as query
            scores = bm25.get_scores([word])

            # Average score weighted by frequency in topic
            avg_score = np.mean(scores)
            word_scores[word] = avg_score * np.log1p(freq)

        # Sort by score
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)

        keywords = [w for w, s in sorted_words[:n_keywords]]
        scores = [s for w, s in sorted_words[:n_keywords]]

        # Normalize scores
        max_score = max(scores) if scores else 1
        scores = [s / max_score for s in scores]

        return keywords, scores

    def _extract_keybert(
        self,
        topic_docs: list[str],
        n_keywords: int,
    ) -> tuple[list[str], list[float]]:
        """
        Extract keywords using KeyBERT (embedding-based).

        KeyBERT finds keywords by comparing candidate embeddings
        to the document embedding.
        """
        from keybert import KeyBERT

        # Concatenate topic docs
        topic_text = " ".join(topic_docs)

        # Initialize KeyBERT
        kw_model = KeyBERT()

        # Extract keywords
        keywords_with_scores = kw_model.extract_keywords(
            topic_text,
            keyphrase_ngram_range=self.ngram_range,
            stop_words="english",
            top_n=n_keywords,
            use_mmr=True,  # Maximal Marginal Relevance for diversity
            diversity=0.5,
        )

        keywords = [kw for kw, score in keywords_with_scores]
        scores = [float(score) for kw, score in keywords_with_scores]

        return keywords, scores

    def extract_all_topics(
        self,
        documents: list[str],
        labels: np.ndarray,
        n_keywords: int | None = None,
    ) -> dict[int, tuple[list[str], list[float]]]:
        """
        Extract keywords for all topics at once.

        Parameters
        ----------
        documents : list[str]
            All documents.
        labels : np.ndarray
            Topic assignments.
        n_keywords : int, optional
            Override default n_keywords.

        Returns
        -------
        topic_keywords : dict
            Mapping from topic_id to (keywords, scores).
        """
        result = {}

        for topic_id in np.unique(labels):
            if topic_id == -1:
                continue

            mask = labels == topic_id
            topic_docs = [documents[i] for i in np.where(mask)[0]]

            keywords, scores = self.extract(topic_docs, documents, n_keywords)
            result[int(topic_id)] = (keywords, scores)

        return result
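A minimal usage sketch for the class above (illustrative only, not part of the packaged file; the corpus and labels are invented, and min_df is lowered to 1 so the toy vocabulary is not pruned away):

import numpy as np
from tritopic.core.keywords import KeywordExtractor

docs = [
    "cats and dogs make good pets",
    "my cat sleeps all day",
    "stock markets fell sharply today",
    "investors sold shares amid heavy losses",
]
labels = np.array([0, 0, 1, 1])  # topic assignments, e.g. from a clustering step

extractor = KeywordExtractor(method="ctfidf", n_keywords=5, min_df=1)
topic_keywords = extractor.extract_all_topics(docs, labels)
for topic_id, (keywords, scores) in topic_keywords.items():
    print(topic_id, list(zip(keywords, [round(s, 3) for s in scores])))

Note that extract_all_topics skips the label -1, presumably reserved for outlier documents by the clustering step.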

class KeyphraseExtractor:
    """
    Extract keyphrases (multi-word) using YAKE or TextRank.
    """

    def __init__(
        self,
        method: Literal["yake", "textrank"] = "yake",
        n_keyphrases: int = 10,
        max_ngram: int = 3,
    ):
        self.method = method
        self.n_keyphrases = n_keyphrases
        self.max_ngram = max_ngram

    def extract(self, text: str) -> list[tuple[str, float]]:
        """Extract keyphrases from text."""
        if self.method == "yake":
            return self._extract_yake(text)
        else:
            raise ValueError(f"Unknown method: {self.method}")

    def _extract_yake(self, text: str) -> list[tuple[str, float]]:
        """Extract using YAKE algorithm."""
        try:
            import yake
        except ImportError:
            # Fallback to simple extraction
            return self._simple_extract(text)

        kw_extractor = yake.KeywordExtractor(
            lan="en",
            n=self.max_ngram,
            dedupLim=0.7,
            top=self.n_keyphrases,
            features=None,
        )

        keywords = kw_extractor.extract_keywords(text)

        # YAKE returns (keyword, score) where lower score is better
        # Invert for consistency
        max_score = max(s for _, s in keywords) if keywords else 1
        return [(kw, 1 - s / max_score) for kw, s in keywords]

    def _simple_extract(self, text: str) -> list[tuple[str, float]]:
        """Simple n-gram frequency extraction."""
        import re
        from collections import Counter

        # Tokenize
        tokens = re.findall(r'\b\w+\b', text.lower())

        # Generate n-grams
        ngrams = []
        for n in range(1, self.max_ngram + 1):
            for i in range(len(tokens) - n + 1):
                ngram = " ".join(tokens[i:i+n])
                ngrams.append(ngram)

        # Count and return top
        counts = Counter(ngrams)
        top = counts.most_common(self.n_keyphrases)

        max_count = top[0][1] if top else 1
        return [(phrase, count / max_count) for phrase, count in top]