tritopic 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tritopic might be problematic.

tritopic/core/keywords.py CHANGED
@@ -1,337 +1,301 @@
 """
-Keyword Extraction for TriTopic
-================================
+Keyword Extraction Module
 
-Extract representative keywords for topics using:
-- c-TF-IDF (class-based TF-IDF, like BERTopic)
-- BM25 scoring
+Extracts topic keywords using various methods:
+- c-TF-IDF (class-based TF-IDF)
+- BM25
 - KeyBERT (embedding-based)
 """
 
-from __future__ import annotations
-
-from typing import Literal
-from collections import Counter
-
+from typing import List, Dict, Set, Optional, Literal
 import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from collections import Counter
 
 
 class KeywordExtractor:
     """
-    Extract keywords for topics.
-
-    Supports multiple extraction methods for flexibility.
-
-    Parameters
-    ----------
-    method : str
-        Extraction method: "ctfidf", "bm25", or "keybert"
-    n_keywords : int
-        Number of keywords to extract per topic. Default: 10
-    ngram_range : tuple
-        N-gram range for keyword extraction. Default: (1, 2)
+    Extracts keywords for topics using various methods.
     """
 
     def __init__(
         self,
         method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf",
         n_keywords: int = 10,
-        ngram_range: tuple[int, int] = (1, 2),
+        ngram_range: tuple = (1, 2),
         min_df: int = 2,
-        max_df: float = 0.95,
+        stopwords: Set[str] = None,
     ):
+        """
+        Initialize the keyword extractor.
+
+        Parameters
+        ----------
+        method : str
+            Extraction method
+        n_keywords : int
+            Number of keywords per topic
+        ngram_range : tuple
+            N-gram range
+        min_df : int
+            Minimum document frequency
+        stopwords : set
+            Stopwords to exclude
+        """
         self.method = method
         self.n_keywords = n_keywords
         self.ngram_range = ngram_range
         self.min_df = min_df
-        self.max_df = max_df
-
-        self._vectorizer = None
-        self._vocabulary = None
+        self.stopwords = stopwords or set()
 
     def extract(
         self,
-        topic_docs: list[str],
-        all_docs: list[str] | None = None,
-        n_keywords: int | None = None,
-    ) -> tuple[list[str], list[float]]:
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+        embeddings: np.ndarray = None,
+    ) -> Dict[int, List[str]]:
         """
-        Extract keywords from topic documents.
+        Extract keywords for each topic.
 
         Parameters
         ----------
-        topic_docs : list[str]
-            Documents belonging to the topic.
-        all_docs : list[str], optional
-            All documents in corpus (needed for c-TF-IDF).
-        n_keywords : int, optional
-            Override default n_keywords.
+        documents : List[str]
+            Original documents
+        labels : np.ndarray
+            Topic labels
+        tokenized_documents : List[List[str]], optional
+            Pre-tokenized documents
+        embeddings : np.ndarray, optional
+            Document embeddings (for KeyBERT)
 
         Returns
         -------
-        keywords : list[str]
-            Top keywords for the topic.
-        scores : list[float]
-            Keyword scores.
+        Dict[int, List[str]]
+            Keywords per topic
         """
-        n = n_keywords or self.n_keywords
-
         if self.method == "ctfidf":
-            return self._extract_ctfidf(topic_docs, all_docs or topic_docs, n)
+            return self._extract_ctfidf(documents, labels, tokenized_documents)
         elif self.method == "bm25":
-            return self._extract_bm25(topic_docs, all_docs or topic_docs, n)
+            return self._extract_bm25(documents, labels, tokenized_documents)
         elif self.method == "keybert":
-            return self._extract_keybert(topic_docs, n)
+            return self._extract_keybert(documents, labels, embeddings)
         else:
             raise ValueError(f"Unknown method: {self.method}")
 
     def _extract_ctfidf(
         self,
-        topic_docs: list[str],
-        all_docs: list[str],
-        n_keywords: int,
-    ) -> tuple[list[str], list[float]]:
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+    ) -> Dict[int, List[str]]:
         """
-        Extract keywords using class-based TF-IDF (c-TF-IDF).
+        Extract keywords using c-TF-IDF.
 
-        c-TF-IDF treats all documents in a topic as a single "class document"
-        and computes TF-IDF against the corpus. This highlights words that
-        are distinctive for the topic.
+        c-TF-IDF treats each topic as a single "document" by
+        concatenating all documents in the topic.
         """
-        # Fit vectorizer on all docs if not already
-        if self._vectorizer is None:
-            self._vectorizer = CountVectorizer(
-                ngram_range=self.ngram_range,
-                stop_words="english",
-                min_df=self.min_df,
-                max_df=self.max_df,
-            )
-            self._vectorizer.fit(all_docs)
-            self._vocabulary = self._vectorizer.get_feature_names_out()
+        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 
-        # Concatenate topic docs into a single "class document"
-        topic_text = " ".join(topic_docs)
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-        # Get term frequencies for topic
-        topic_tf = self._vectorizer.transform([topic_text]).toarray()[0]
+        # Create topic documents (concatenate all docs in each topic)
+        topic_docs = []
+        topic_ids = []
 
-        # Get term frequencies across all docs
-        all_tf = self._vectorizer.transform(all_docs).toarray()
+        for topic_id in unique_topics:
+            mask = labels == topic_id
+            topic_text = ' '.join(documents[i] for i, m in enumerate(mask) if m)
+            topic_docs.append(topic_text)
+            topic_ids.append(topic_id)
+
+        # Vectorize
+        vectorizer = CountVectorizer(
+            ngram_range=self.ngram_range,
+            min_df=1,  # We're using topic-level docs
+            stop_words=list(self.stopwords) if self.stopwords else None,
+        )
 
-        # Compute IDF: log(N / (1 + df))
-        doc_freq = np.sum(all_tf > 0, axis=0)
-        idf = np.log(len(all_docs) / (1 + doc_freq))
+        count_matrix = vectorizer.fit_transform(topic_docs)
+        feature_names = vectorizer.get_feature_names_out()
 
-        # c-TF-IDF = TF * IDF (with smoothing)
-        topic_tf_normalized = topic_tf / (topic_tf.sum() + 1e-10)
-        ctfidf_scores = topic_tf_normalized * idf
+        # Apply TF-IDF transformation
+        tfidf = TfidfTransformer()
+        tfidf_matrix = tfidf.fit_transform(count_matrix)
 
-        # Get top keywords
-        top_indices = np.argsort(ctfidf_scores)[::-1][:n_keywords]
+        # Extract top keywords per topic
+        keywords = {}
 
-        keywords = [self._vocabulary[i] for i in top_indices]
-        scores = [float(ctfidf_scores[i]) for i in top_indices]
+        for i, topic_id in enumerate(topic_ids):
+            scores = tfidf_matrix[i].toarray().flatten()
+            top_indices = np.argsort(scores)[::-1]
+
+            topic_keywords = []
+            for idx in top_indices:
+                word = feature_names[idx]
+                if len(topic_keywords) >= self.n_keywords:
+                    break
+                if word.lower() not in self.stopwords:
+                    topic_keywords.append(word)
+
+            keywords[topic_id] = topic_keywords
 
-        return keywords, scores
+        return keywords
 
     def _extract_bm25(
         self,
-        topic_docs: list[str],
-        all_docs: list[str],
-        n_keywords: int,
-    ) -> tuple[list[str], list[float]]:
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+    ) -> Dict[int, List[str]]:
         """
         Extract keywords using BM25 scoring.
-
-        BM25 is more robust to document length variations than TF-IDF.
         """
-        from rank_bm25 import BM25Okapi
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-        # Tokenize
-        def tokenize(text):
-            # Simple tokenization
-            import re
-            tokens = re.findall(r'\b\w+\b', text.lower())
-            # Remove stopwords
-            stopwords = {
-                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
-                'for', 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were',
-                'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
-                'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall',
-                'this', 'that', 'these', 'those', 'it', 'its', 'as', 'if', 'then',
-            }
-            return [t for t in tokens if t not in stopwords and len(t) > 2]
+        # Tokenize if not provided
+        if tokenized_documents is None:
+            tokenized_documents = [doc.lower().split() for doc in documents]
 
-        # Tokenize all docs
-        tokenized_all = [tokenize(doc) for doc in all_docs]
-        tokenized_topic = [tokenize(doc) for doc in topic_docs]
+        # Filter stopwords
+        tokenized_documents = [
+            [w for w in tokens if w not in self.stopwords]
+            for tokens in tokenized_documents
+        ]
 
-        # Build vocabulary from topic docs
-        topic_vocab = Counter()
-        for tokens in tokenized_topic:
-            topic_vocab.update(tokens)
+        keywords = {}
 
-        # Fit BM25 on all docs
-        bm25 = BM25Okapi(tokenized_all)
-
-        # Score each word in topic vocabulary
-        word_scores = {}
-        for word, freq in topic_vocab.items():
-            # Use word as query
-            scores = bm25.get_scores([word])
+        for topic_id in unique_topics:
+            mask = labels == topic_id
+            topic_tokens = [tokenized_documents[i] for i, m in enumerate(mask) if m]
+            other_tokens = [tokenized_documents[i] for i, m in enumerate(mask) if not m]
 
-            # Average score weighted by frequency in topic
-            avg_score = np.mean(scores)
-            word_scores[word] = avg_score * np.log1p(freq)
-
-        # Sort by score
-        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
-
-        keywords = [w for w, s in sorted_words[:n_keywords]]
-        scores = [s for w, s in sorted_words[:n_keywords]]
-
-        # Normalize scores
-        max_score = max(scores) if scores else 1
-        scores = [s / max_score for s in scores]
+            # Count term frequencies in topic
+            topic_tf = Counter()
+            for tokens in topic_tokens:
+                topic_tf.update(tokens)
+
+            # Count document frequencies across all docs
+            all_tokens = tokenized_documents
+            df = Counter()
+            for tokens in all_tokens:
+                df.update(set(tokens))
+
+            # Compute BM25-like scores
+            N = len(all_tokens)
+            avgdl = np.mean([len(t) for t in all_tokens])
+            k1, b = 1.5, 0.75
+
+            scores = {}
+            topic_len = sum(len(t) for t in topic_tokens)
+
+            for term, freq in topic_tf.items():
+                if df[term] < self.min_df:
+                    continue
+
+                idf = np.log((N - df[term] + 0.5) / (df[term] + 0.5) + 1)
+                tf_normalized = freq * (k1 + 1) / (freq + k1 * (1 - b + b * topic_len / avgdl))
+                scores[term] = idf * tf_normalized
+
+            # Get top keywords
+            sorted_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+            keywords[topic_id] = [term for term, _ in sorted_terms[:self.n_keywords]]
 
-        return keywords, scores
+        return keywords
 
     def _extract_keybert(
         self,
-        topic_docs: list[str],
-        n_keywords: int,
-    ) -> tuple[list[str], list[float]]:
+        documents: List[str],
+        labels: np.ndarray,
+        embeddings: np.ndarray = None,
+    ) -> Dict[int, List[str]]:
         """
         Extract keywords using KeyBERT (embedding-based).
-
-        KeyBERT finds keywords by comparing candidate embeddings
-        to the document embedding.
         """
-        from keybert import KeyBERT
+        try:
+            from keybert import KeyBERT
+        except ImportError:
+            # Fall back to c-TF-IDF
+            import warnings
+            warnings.warn("KeyBERT not installed, falling back to c-TF-IDF")
+            return self._extract_ctfidf(documents, labels, None)
 
-        # Concatenate topic docs
-        topic_text = " ".join(topic_docs)
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
         # Initialize KeyBERT
         kw_model = KeyBERT()
 
-        # Extract keywords
-        keywords_with_scores = kw_model.extract_keywords(
-            topic_text,
-            keyphrase_ngram_range=self.ngram_range,
-            stop_words="english",
-            top_n=n_keywords,
-            use_mmr=True,  # Maximal Marginal Relevance for diversity
-            diversity=0.5,
-        )
-
-        keywords = [kw for kw, score in keywords_with_scores]
-        scores = [float(score) for kw, score in keywords_with_scores]
-
-        return keywords, scores
-
-    def extract_all_topics(
-        self,
-        documents: list[str],
-        labels: np.ndarray,
-        n_keywords: int | None = None,
-    ) -> dict[int, tuple[list[str], list[float]]]:
-        """
-        Extract keywords for all topics at once.
-
-        Parameters
-        ----------
-        documents : list[str]
-            All documents.
-        labels : np.ndarray
-            Topic assignments.
-        n_keywords : int, optional
-            Override default n_keywords.
-
-        Returns
-        -------
-        topic_keywords : dict
-            Mapping from topic_id to (keywords, scores).
-        """
-        result = {}
+        keywords = {}
 
-        for topic_id in np.unique(labels):
-            if topic_id == -1:
-                continue
-
+        for topic_id in unique_topics:
             mask = labels == topic_id
-            topic_docs = [documents[i] for i in np.where(mask)[0]]
+            topic_text = ' '.join(documents[i] for i, m in enumerate(mask) if m)
 
-            keywords, scores = self.extract(topic_docs, documents, n_keywords)
-            result[int(topic_id)] = (keywords, scores)
+            # Extract keywords
+            kw_results = kw_model.extract_keywords(
+                topic_text,
+                keyphrase_ngram_range=self.ngram_range,
+                stop_words=list(self.stopwords) if self.stopwords else None,
+                top_n=self.n_keywords,
+            )
+
+            keywords[topic_id] = [kw for kw, _ in kw_results]
 
-        return result
+        return keywords
 
 
-class KeyphraseExtractor:
-    """
-    Extract keyphrases (multi-word) using YAKE or TextRank.
+def compute_keyword_scores(
+    keywords: Dict[int, List[str]],
+    documents: List[str],
+    labels: np.ndarray,
+) -> Dict[int, List[tuple]]:
     """
+    Compute scores for keywords based on their discriminative power.
 
-    def __init__(
-        self,
-        method: Literal["yake", "textrank"] = "yake",
-        n_keyphrases: int = 10,
-        max_ngram: int = 3,
-    ):
-        self.method = method
-        self.n_keyphrases = n_keyphrases
-        self.max_ngram = max_ngram
-
-    def extract(self, text: str) -> list[tuple[str, float]]:
-        """Extract keyphrases from text."""
-        if self.method == "yake":
-            return self._extract_yake(text)
-        else:
-            raise ValueError(f"Unknown method: {self.method}")
+    Returns keywords with their scores.
+    """
+    # Get document frequencies per topic
+    topic_dfs = {}
+    unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-    def _extract_yake(self, text: str) -> list[tuple[str, float]]:
-        """Extract using YAKE algorithm."""
-        try:
-            import yake
-        except ImportError:
-            # Fallback to simple extraction
-            return self._simple_extract(text)
-
-        kw_extractor = yake.KeywordExtractor(
-            lan="en",
-            n=self.max_ngram,
-            dedupLim=0.7,
-            top=self.n_keyphrases,
-            features=None,
-        )
+    for topic_id in unique_topics:
+        mask = labels == topic_id
+        topic_docs = [documents[i].lower() for i, m in enumerate(mask) if m]
 
-        keywords = kw_extractor.extract_keywords(text)
+        df = Counter()
+        for doc in topic_docs:
+            words = set(doc.split())
+            df.update(words)
 
-        # YAKE returns (keyword, score) where lower score is better
-        # Invert for consistency
-        max_score = max(s for _, s in keywords) if keywords else 1
-        return [(kw, 1 - s/max_score) for kw, s in keywords]
+        topic_dfs[topic_id] = df
 
-    def _simple_extract(self, text: str) -> list[tuple[str, float]]:
-        """Simple n-gram frequency extraction."""
-        import re
-        from collections import Counter
-
-        # Tokenize
-        tokens = re.findall(r'\b\w+\b', text.lower())
-
-        # Generate n-grams
-        ngrams = []
-        for n in range(1, self.max_ngram + 1):
-            for i in range(len(tokens) - n + 1):
-                ngram = " ".join(tokens[i:i+n])
-                ngrams.append(ngram)
-
-        # Count and return top
-        counts = Counter(ngrams)
-        top = counts.most_common(self.n_keyphrases)
+    # Compute scores
+    scored_keywords = {}
+
+    for topic_id, topic_keywords in keywords.items():
+        scored = []
+        topic_df = topic_dfs[topic_id]
+        n_topic_docs = sum(labels == topic_id)
+
+        for keyword in topic_keywords:
+            # TF in topic
+            tf = topic_df.get(keyword.lower(), 0) / n_topic_docs
+
+            # DF across other topics (for IDF-like scoring)
+            other_df = sum(
+                topic_dfs[t].get(keyword.lower(), 0)
+                for t in unique_topics if t != topic_id
+            )
+            n_other_docs = sum(labels != topic_id)
+
+            if n_other_docs > 0:
+                other_ratio = other_df / n_other_docs
+                # Discriminative score
+                score = tf / (other_ratio + 0.1)
+            else:
+                score = tf
+
+            scored.append((keyword, round(score, 4)))
 
-        max_count = top[0][1] if top else 1
-        return [(phrase, count/max_count) for phrase, count in top]
+        scored_keywords[topic_id] = scored
+
+    return scored_keywords
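
For readers comparing the two versions: in 0.1.0, KeywordExtractor.extract() took the documents of a single topic and returned a (keywords, scores) tuple, with extract_all_topics() looping over topics; in 1.1.0, extract() takes the full corpus plus a topic-label array and returns a dict mapping each topic id to its keyword list, and per-keyword scores come from the new module-level compute_keyword_scores() function. The snippet below is a minimal usage sketch of the 1.1.0 interface, not taken from the package's documentation: the import path tritopic.core.keywords is inferred from the file path above, and the documents, labels, and stopword set are invented for illustration.

# Hedged usage sketch of the 1.1.0 API (assumed, not from the package docs).
import numpy as np

from tritopic.core.keywords import KeywordExtractor, compute_keyword_scores

# Toy corpus and topic assignments (invented for illustration); -1 would mark outliers.
documents = [
    "the cat sat on the mat",
    "dogs and cats make popular pets",
    "stock prices fell sharply today",
    "the market rallied after the earnings report",
]
labels = np.array([0, 0, 1, 1])

extractor = KeywordExtractor(
    method="ctfidf",
    n_keywords=5,
    stopwords={"the", "and", "on", "after"},
)

# Dict mapping topic id -> list of keywords for topics 0 and 1.
keywords = extractor.extract(documents, labels)

# Separate call recovers (keyword, score) pairs per topic in 1.1.0.
scored = compute_keyword_scores(keywords, documents, labels)

The same call shape applies for method="bm25" and method="keybert"; per the diff above, the KeyBERT path falls back to c-TF-IDF with a warning when the keybert package is not installed.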