tritopic-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,337 @@
+ """
+ Keyword Extraction for TriTopic
+ ================================
+
+ Extract representative keywords for topics using:
+ - c-TF-IDF (class-based TF-IDF, like BERTopic)
+ - BM25 scoring
+ - KeyBERT (embedding-based)
+ """
+
+ from __future__ import annotations
+
+ import re
+ from collections import Counter
+ from typing import Literal
+
+ import numpy as np
+ from sklearn.feature_extraction.text import CountVectorizer
+
+
+ class KeywordExtractor:
+     """
+     Extract keywords for topics.
+
+     Supports multiple extraction methods for flexibility.
+
+     Parameters
+     ----------
+     method : str
+         Extraction method: "ctfidf", "bm25", or "keybert".
+     n_keywords : int
+         Number of keywords to extract per topic. Default: 10
+     ngram_range : tuple
+         N-gram range for keyword extraction. Default: (1, 2)
+     min_df : int
+         Minimum document frequency for vocabulary terms. Default: 2
+     max_df : float
+         Maximum document frequency (proportion of documents). Default: 0.95
+     """
+
+     def __init__(
+         self,
+         method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf",
+         n_keywords: int = 10,
+         ngram_range: tuple[int, int] = (1, 2),
+         min_df: int = 2,
+         max_df: float = 0.95,
+     ):
+         self.method = method
+         self.n_keywords = n_keywords
+         self.ngram_range = ngram_range
+         self.min_df = min_df
+         self.max_df = max_df
+
+         # Fitted lazily on the first c-TF-IDF call and reused across topics
+         self._vectorizer = None
+         self._vocabulary = None
+
+     def extract(
+         self,
+         topic_docs: list[str],
+         all_docs: list[str] | None = None,
+         n_keywords: int | None = None,
+     ) -> tuple[list[str], list[float]]:
+         """
+         Extract keywords from topic documents.
+
+         Parameters
+         ----------
+         topic_docs : list[str]
+             Documents belonging to the topic.
+         all_docs : list[str], optional
+             All documents in the corpus (needed for c-TF-IDF and BM25;
+             defaults to topic_docs).
+         n_keywords : int, optional
+             Override the default n_keywords.
+
+         Returns
+         -------
+         keywords : list[str]
+             Top keywords for the topic.
+         scores : list[float]
+             Keyword scores.
+         """
+         n = n_keywords or self.n_keywords
+
+         if self.method == "ctfidf":
+             return self._extract_ctfidf(topic_docs, all_docs or topic_docs, n)
+         elif self.method == "bm25":
+             return self._extract_bm25(topic_docs, all_docs or topic_docs, n)
+         elif self.method == "keybert":
+             return self._extract_keybert(topic_docs, n)
+         else:
+             raise ValueError(f"Unknown method: {self.method}")
+
+     def _extract_ctfidf(
+         self,
+         topic_docs: list[str],
+         all_docs: list[str],
+         n_keywords: int,
+     ) -> tuple[list[str], list[float]]:
+         """
+         Extract keywords using class-based TF-IDF (c-TF-IDF).
+
+         c-TF-IDF treats all documents in a topic as a single "class document"
+         and computes TF-IDF against the corpus. This highlights words that
+         are distinctive for the topic.
+         """
+         # Fit the vectorizer on the full corpus on first use
+         if self._vectorizer is None:
+             self._vectorizer = CountVectorizer(
+                 ngram_range=self.ngram_range,
+                 stop_words="english",
+                 min_df=self.min_df,
+                 max_df=self.max_df,
+             )
+             self._vectorizer.fit(all_docs)
+             self._vocabulary = self._vectorizer.get_feature_names_out()
+
+         # Concatenate topic docs into a single "class document"
+         topic_text = " ".join(topic_docs)
+
+         # Term frequencies for the topic
+         topic_tf = self._vectorizer.transform([topic_text]).toarray()[0]
+
+         # Document frequencies across all docs (kept sparse to avoid
+         # materialising the full document-term matrix)
+         all_counts = self._vectorizer.transform(all_docs)
+         doc_freq = np.asarray((all_counts > 0).sum(axis=0)).ravel()
+
+         # IDF: log(N / (1 + df))
+         idf = np.log(len(all_docs) / (1 + doc_freq))
+
+         # c-TF-IDF = normalized TF * IDF (smoothed to avoid division by zero)
+         topic_tf_normalized = topic_tf / (topic_tf.sum() + 1e-10)
+         ctfidf_scores = topic_tf_normalized * idf
+
+         # Top keywords by score
+         top_indices = np.argsort(ctfidf_scores)[::-1][:n_keywords]
+
+         keywords = [self._vocabulary[i] for i in top_indices]
+         scores = [float(ctfidf_scores[i]) for i in top_indices]
+
+         return keywords, scores
+
+     def _extract_bm25(
+         self,
+         topic_docs: list[str],
+         all_docs: list[str],
+         n_keywords: int,
+     ) -> tuple[list[str], list[float]]:
+         """
+         Extract keywords using BM25 scoring.
+
+         BM25 is more robust to document length variations than TF-IDF.
+         """
+         from rank_bm25 import BM25Okapi
+
+         # Lowercase regex tokenizer with a small built-in stopword list
+         def tokenize(text):
+             tokens = re.findall(r'\b\w+\b', text.lower())
+             stopwords = {
+                 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
+                 'for', 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were',
+                 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
+                 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall',
+                 'this', 'that', 'these', 'those', 'it', 'its', 'as', 'if', 'then',
+             }
+             return [t for t in tokens if t not in stopwords and len(t) > 2]
+
+         # Tokenize all docs
+         tokenized_all = [tokenize(doc) for doc in all_docs]
+         tokenized_topic = [tokenize(doc) for doc in topic_docs]
+
+         # Build vocabulary from topic docs
+         topic_vocab = Counter()
+         for tokens in tokenized_topic:
+             topic_vocab.update(tokens)
+
+         # Fit BM25 on all docs
+         bm25 = BM25Okapi(tokenized_all)
+
+         # Score each word in the topic vocabulary
+         word_scores = {}
+         for word, freq in topic_vocab.items():
+             # Use the word as a one-term query against every document
+             doc_scores = bm25.get_scores([word])
+
+             # Average score across the corpus, weighted by frequency in the topic
+             avg_score = np.mean(doc_scores)
+             word_scores[word] = avg_score * np.log1p(freq)
+
+         # Sort by score
+         sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
+
+         keywords = [w for w, _ in sorted_words[:n_keywords]]
+         scores = [s for _, s in sorted_words[:n_keywords]]
+
+         # Normalize scores to [0, 1], guarding against an empty or all-zero list
+         max_score = max(scores) if scores and max(scores) > 0 else 1.0
+         scores = [float(s / max_score) for s in scores]
+
+         return keywords, scores
+
+     def _extract_keybert(
+         self,
+         topic_docs: list[str],
+         n_keywords: int,
+     ) -> tuple[list[str], list[float]]:
+         """
+         Extract keywords using KeyBERT (embedding-based).
+
+         KeyBERT finds keywords by comparing candidate embeddings
+         to the document embedding.
+         """
+         from keybert import KeyBERT
+
+         # Concatenate topic docs
+         topic_text = " ".join(topic_docs)
+
+         # Initialize KeyBERT
+         kw_model = KeyBERT()
+
+         # Extract keywords
+         keywords_with_scores = kw_model.extract_keywords(
+             topic_text,
+             keyphrase_ngram_range=self.ngram_range,
+             stop_words="english",
+             top_n=n_keywords,
+             use_mmr=True,  # Maximal Marginal Relevance for diversity
+             diversity=0.5,
+         )
+
+         keywords = [kw for kw, score in keywords_with_scores]
+         scores = [float(score) for kw, score in keywords_with_scores]
+
+         return keywords, scores
+
+     def extract_all_topics(
+         self,
+         documents: list[str],
+         labels: np.ndarray,
+         n_keywords: int | None = None,
+     ) -> dict[int, tuple[list[str], list[float]]]:
+         """
+         Extract keywords for all topics at once.
+
+         Parameters
+         ----------
+         documents : list[str]
+             All documents.
+         labels : np.ndarray
+             Topic assignment per document; the outlier label -1 is skipped.
+         n_keywords : int, optional
+             Override the default n_keywords.
+
+         Returns
+         -------
+         topic_keywords : dict
+             Mapping from topic_id to (keywords, scores).
+         """
+         result = {}
+
+         for topic_id in np.unique(labels):
+             if topic_id == -1:
+                 continue
+
+             mask = labels == topic_id
+             topic_docs = [documents[i] for i in np.where(mask)[0]]
+
+             keywords, scores = self.extract(topic_docs, documents, n_keywords)
+             result[int(topic_id)] = (keywords, scores)
+
+         return result
+
+
+ class KeyphraseExtractor:
+     """
+     Extract keyphrases (multi-word) using YAKE or TextRank.
+     """
+
+     def __init__(
+         self,
+         method: Literal["yake", "textrank"] = "yake",
+         n_keyphrases: int = 10,
+         max_ngram: int = 3,
+     ):
+         self.method = method
+         self.n_keyphrases = n_keyphrases
+         self.max_ngram = max_ngram
+
+     def extract(self, text: str) -> list[tuple[str, float]]:
+         """Extract keyphrases from text."""
+         if self.method == "yake":
+             return self._extract_yake(text)
+         elif self.method == "textrank":
+             raise NotImplementedError("TextRank extraction is not implemented yet")
+         else:
+             raise ValueError(f"Unknown method: {self.method}")
+
+     def _extract_yake(self, text: str) -> list[tuple[str, float]]:
+         """Extract using YAKE algorithm."""
+         try:
+             import yake
+         except ImportError:
+             # yake is an optional dependency; fall back to simple n-gram counts
+             return self._simple_extract(text)
+
+         kw_extractor = yake.KeywordExtractor(
+             lan="en",
+             n=self.max_ngram,
+             dedupLim=0.7,
+             top=self.n_keyphrases,
+             features=None,
+         )
+
+         keywords = kw_extractor.extract_keywords(text)
+
+         # YAKE returns (keyword, score) pairs where lower is better;
+         # invert so that higher means better, for consistency
+         max_score = max((s for _, s in keywords), default=0.0) or 1.0
+         return [(kw, 1 - s / max_score) for kw, s in keywords]
+
+     def _simple_extract(self, text: str) -> list[tuple[str, float]]:
+         """Simple n-gram frequency extraction (fallback when yake is unavailable)."""
+         # Tokenize
+         tokens = re.findall(r'\b\w+\b', text.lower())
+
+         # Generate n-grams up to max_ngram
+         ngrams = []
+         for n in range(1, self.max_ngram + 1):
+             for i in range(len(tokens) - n + 1):
+                 ngram = " ".join(tokens[i:i + n])
+                 ngrams.append(ngram)
+
+         # Count and return the top phrases, normalized by the largest count
+         counts = Counter(ngrams)
+         top = counts.most_common(self.n_keyphrases)
+
+         max_count = top[0][1] if top else 1
+         return [(phrase, count / max_count) for phrase, count in top]
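A minimal usage sketch, assuming the module above ships as `tritopic.keywords` and that numpy and scikit-learn are installed. The documents, labels, and parameter values below are made-up illustrations (min_df=1 only because the toy corpus is tiny):

    import numpy as np
    from tritopic.keywords import KeywordExtractor, KeyphraseExtractor

    docs = [
        "the cat sat on the mat and the cat purred",
        "dogs bark at cats in the garden",
        "kittens and cats love chasing mice",
        "python code for topic modeling with embeddings",
        "keyword extraction from documents using tfidf",
        "topic models cluster documents by keywords",
    ]
    labels = np.array([0, 0, 0, 1, 1, 1])

    # c-TF-IDF keywords per topic
    extractor = KeywordExtractor(method="ctfidf", n_keywords=5, min_df=1)
    for topic_id, (keywords, scores) in extractor.extract_all_topics(docs, labels).items():
        print(topic_id, list(zip(keywords, scores)))

    # Keyphrases for one topic (falls back to plain n-gram counts
    # when the optional yake dependency is missing)
    phraser = KeyphraseExtractor(method="yake", n_keyphrases=5, max_ngram=2)
    print(phraser.extract(" ".join(docs[:3])))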