tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tritopic/__init__.py +22 -32
- tritopic/config.py +305 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/METADATA +92 -48
- tritopic-1.0.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/top_level.txt +0 -0
tritopic/core/keywords.py
CHANGED
@@ -1,337 +1,301 @@
 """
-Keyword Extraction
-================================
+Keyword Extraction Module
 
-
-- c-TF-IDF (class-based TF-IDF
-- BM25
+Extracts topic keywords using various methods:
+- c-TF-IDF (class-based TF-IDF)
+- BM25
 - KeyBERT (embedding-based)
 """
 
-from
-
-from typing import Literal
-from collections import Counter
-
+from typing import List, Dict, Set, Optional, Literal
 import numpy as np
-from
+from collections import Counter
 
 
 class KeywordExtractor:
     """
-
-
-    Supports multiple extraction methods for flexibility.
-
-    Parameters
-    ----------
-    method : str
-        Extraction method: "ctfidf", "bm25", or "keybert"
-    n_keywords : int
-        Number of keywords to extract per topic. Default: 10
-    ngram_range : tuple
-        N-gram range for keyword extraction. Default: (1, 2)
+    Extracts keywords for topics using various methods.
    """
 
     def __init__(
         self,
         method: Literal["ctfidf", "bm25", "keybert"] = "ctfidf",
         n_keywords: int = 10,
-        ngram_range: tuple
+        ngram_range: tuple = (1, 2),
         min_df: int = 2,
-
+        stopwords: Set[str] = None,
     ):
+        """
+        Initialize the keyword extractor.
+
+        Parameters
+        ----------
+        method : str
+            Extraction method
+        n_keywords : int
+            Number of keywords per topic
+        ngram_range : tuple
+            N-gram range
+        min_df : int
+            Minimum document frequency
+        stopwords : set
+            Stopwords to exclude
+        """
         self.method = method
         self.n_keywords = n_keywords
         self.ngram_range = ngram_range
         self.min_df = min_df
-        self.
-
-        self._vectorizer = None
-        self._vocabulary = None
+        self.stopwords = stopwords or set()
 
     def extract(
         self,
-
-
-
-
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+        embeddings: np.ndarray = None,
+    ) -> Dict[int, List[str]]:
         """
-        Extract keywords
+        Extract keywords for each topic.
 
         Parameters
         ----------
-
-
-
-
-
-
+        documents : List[str]
+            Original documents
+        labels : np.ndarray
+            Topic labels
+        tokenized_documents : List[List[str]], optional
+            Pre-tokenized documents
+        embeddings : np.ndarray, optional
+            Document embeddings (for KeyBERT)
 
         Returns
         -------
-
-
-        scores : list[float]
-            Keyword scores.
+        Dict[int, List[str]]
+            Keywords per topic
         """
-        n = n_keywords or self.n_keywords
-
         if self.method == "ctfidf":
-            return self._extract_ctfidf(
+            return self._extract_ctfidf(documents, labels, tokenized_documents)
         elif self.method == "bm25":
-            return self._extract_bm25(
+            return self._extract_bm25(documents, labels, tokenized_documents)
         elif self.method == "keybert":
-            return self._extract_keybert(
+            return self._extract_keybert(documents, labels, embeddings)
         else:
             raise ValueError(f"Unknown method: {self.method}")
 
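The reworked extract() above dispatches on self.method and returns a plain mapping from topic id to keyword list, rather than the old per-call keyword/score pair. As a reviewing aid, a minimal usage sketch based only on the signatures shown in this diff; the corpus, labels, and stopword set below are hypothetical:

import numpy as np
from tritopic.core.keywords import KeywordExtractor

# Hypothetical toy corpus: two topics, three documents each
docs = [
    "neural networks for image classification",
    "deep learning improves vision models",
    "convolutional networks excel at image recognition",
    "stock markets fell on inflation fears",
    "the central bank raised interest rates again",
    "bond yields climb as markets react",
]
labels = np.array([0, 0, 0, 1, 1, 1])

extractor = KeywordExtractor(
    method="ctfidf",
    n_keywords=5,
    stopwords={"the", "for", "at", "on", "as"},  # 0.1.0 hard-coded stop_words="english" instead
)
topic_keywords = extractor.extract(docs, labels)  # Dict[int, List[str]], one entry per topic

Documents labelled -1 are skipped by every method, since each one iterates only over labels >= 0. The diff continues with the c-TF-IDF path: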
     def _extract_ctfidf(
         self,
-
-
-
-    ) ->
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+    ) -> Dict[int, List[str]]:
         """
-        Extract keywords using
+        Extract keywords using c-TF-IDF.
 
-        c-TF-IDF treats
-
-        are distinctive for the topic.
+        c-TF-IDF treats each topic as a single "document" by
+        concatenating all documents in the topic.
         """
-
-        if self._vectorizer is None:
-            self._vectorizer = CountVectorizer(
-                ngram_range=self.ngram_range,
-                stop_words="english",
-                min_df=self.min_df,
-                max_df=self.max_df,
-            )
-            self._vectorizer.fit(all_docs)
-            self._vocabulary = self._vectorizer.get_feature_names_out()
+        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 
-
-        topic_text = " ".join(topic_docs)
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-        #
-
+        # Create topic documents (concatenate all docs in each topic)
+        topic_docs = []
+        topic_ids = []
 
-
-
+        for topic_id in unique_topics:
+            mask = labels == topic_id
+            topic_text = ' '.join(documents[i] for i, m in enumerate(mask) if m)
+            topic_docs.append(topic_text)
+            topic_ids.append(topic_id)
+
+        # Vectorize
+        vectorizer = CountVectorizer(
+            ngram_range=self.ngram_range,
+            min_df=1,  # We're using topic-level docs
+            stop_words=list(self.stopwords) if self.stopwords else None,
+        )
 
-
-
-        idf = np.log(len(all_docs) / (1 + doc_freq))
+        count_matrix = vectorizer.fit_transform(topic_docs)
+        feature_names = vectorizer.get_feature_names_out()
 
-        #
-
-
+        # Apply TF-IDF transformation
+        tfidf = TfidfTransformer()
+        tfidf_matrix = tfidf.fit_transform(count_matrix)
 
-        #
-
+        # Extract top keywords per topic
+        keywords = {}
 
-
-
+        for i, topic_id in enumerate(topic_ids):
+            scores = tfidf_matrix[i].toarray().flatten()
+            top_indices = np.argsort(scores)[::-1]
+
+            topic_keywords = []
+            for idx in top_indices:
+                word = feature_names[idx]
+                if len(topic_keywords) >= self.n_keywords:
+                    break
+                if word.lower() not in self.stopwords:
+                    topic_keywords.append(word)
+
+            keywords[topic_id] = topic_keywords
 
-        return keywords
+        return keywords
 
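As its docstring says, the c-TF-IDF path above treats each topic as one concatenated pseudo-document: the CountVectorizer/TfidfTransformer pair is fitted on one row per topic, so terms concentrated in a single topic tend to outscore terms shared across topics. A standalone sketch of just that transformation, on two hypothetical topic documents:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Each string stands in for all documents of one topic, concatenated
topic_docs = [
    "neural networks deep learning image classification image networks",
    "stock markets inflation interest rates bond yields markets",
]

counts = CountVectorizer(ngram_range=(1, 2)).fit_transform(topic_docs)
tfidf_matrix = TfidfTransformer().fit_transform(counts)  # rows = topics, columns = n-grams

The BM25 path follows: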
     def _extract_bm25(
         self,
-
-
-
-    ) ->
+        documents: List[str],
+        labels: np.ndarray,
+        tokenized_documents: List[List[str]] = None,
+    ) -> Dict[int, List[str]]:
         """
         Extract keywords using BM25 scoring.
-
-        BM25 is more robust to document length variations than TF-IDF.
         """
-
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-        # Tokenize
-
-
-        import re
-        tokens = re.findall(r'\b\w+\b', text.lower())
-        # Remove stopwords
-        stopwords = {
-            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
-            'for', 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were',
-            'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
-            'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall',
-            'this', 'that', 'these', 'those', 'it', 'its', 'as', 'if', 'then',
-        }
-        return [t for t in tokens if t not in stopwords and len(t) > 2]
+        # Tokenize if not provided
+        if tokenized_documents is None:
+            tokenized_documents = [doc.lower().split() for doc in documents]
 
-        #
-
-
+        # Filter stopwords
+        tokenized_documents = [
+            [w for w in tokens if w not in self.stopwords]
+            for tokens in tokenized_documents
+        ]
 
-
-        topic_vocab = Counter()
-        for tokens in tokenized_topic:
-            topic_vocab.update(tokens)
+        keywords = {}
 
-
-
-
-
-        word_scores = {}
-        for word, freq in topic_vocab.items():
-            # Use word as query
-            scores = bm25.get_scores([word])
+        for topic_id in unique_topics:
+            mask = labels == topic_id
+            topic_tokens = [tokenized_documents[i] for i, m in enumerate(mask) if m]
+            other_tokens = [tokenized_documents[i] for i, m in enumerate(mask) if not m]
 
-        #
-
-
-
-
-
-
-
-
-
-
-
-
+            # Count term frequencies in topic
+            topic_tf = Counter()
+            for tokens in topic_tokens:
+                topic_tf.update(tokens)
+
+            # Count document frequencies across all docs
+            all_tokens = tokenized_documents
+            df = Counter()
+            for tokens in all_tokens:
+                df.update(set(tokens))
+
+            # Compute BM25-like scores
+            N = len(all_tokens)
+            avgdl = np.mean([len(t) for t in all_tokens])
+            k1, b = 1.5, 0.75
+
+            scores = {}
+            topic_len = sum(len(t) for t in topic_tokens)
+
+            for term, freq in topic_tf.items():
+                if df[term] < self.min_df:
+                    continue
+
+                idf = np.log((N - df[term] + 0.5) / (df[term] + 0.5) + 1)
+                tf_normalized = freq * (k1 + 1) / (freq + k1 * (1 - b + b * topic_len / avgdl))
+                scores[term] = idf * tf_normalized
+
+            # Get top keywords
+            sorted_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+            keywords[topic_id] = [term for term, _ in sorted_terms[:self.n_keywords]]
 
-        return keywords
+        return keywords
 
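The BM25 method above replaces the old per-word bm25.get_scores() querying with an explicit Okapi-style formula using k1 = 1.5 and b = 0.75, and it skips terms whose overall document frequency falls below min_df. A worked example of the arithmetic with made-up counts, mirroring the expressions added above:

import numpy as np

# Hypothetical counts for one candidate term
N = 100          # total number of documents
df_term = 10     # documents containing the term
freq = 25        # occurrences of the term within the topic
topic_len = 400  # total tokens in the topic
avgdl = 80       # average document length in tokens
k1, b = 1.5, 0.75

idf = np.log((N - df_term + 0.5) / (df_term + 0.5) + 1)                    # ~2.26
tf_norm = freq * (k1 + 1) / (freq + k1 * (1 - b + b * topic_len / avgdl))  # ~2.02
score = idf * tf_norm                                                      # ~4.56

The KeyBERT path, with its new import fallback, follows: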
     def _extract_keybert(
         self,
-
-
-
+        documents: List[str],
+        labels: np.ndarray,
+        embeddings: np.ndarray = None,
+    ) -> Dict[int, List[str]]:
         """
         Extract keywords using KeyBERT (embedding-based).
-
-        KeyBERT finds keywords by comparing candidate embeddings
-        to the document embedding.
         """
-
+        try:
+            from keybert import KeyBERT
+        except ImportError:
+            # Fall back to c-TF-IDF
+            import warnings
+            warnings.warn("KeyBERT not installed, falling back to c-TF-IDF")
+            return self._extract_ctfidf(documents, labels, None)
 
-
-        topic_text = " ".join(topic_docs)
+        unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
         # Initialize KeyBERT
         kw_model = KeyBERT()
 
-
-        keywords_with_scores = kw_model.extract_keywords(
-            topic_text,
-            keyphrase_ngram_range=self.ngram_range,
-            stop_words="english",
-            top_n=n_keywords,
-            use_mmr=True,  # Maximal Marginal Relevance for diversity
-            diversity=0.5,
-        )
-
-        keywords = [kw for kw, score in keywords_with_scores]
-        scores = [float(score) for kw, score in keywords_with_scores]
-
-        return keywords, scores
-
-    def extract_all_topics(
-        self,
-        documents: list[str],
-        labels: np.ndarray,
-        n_keywords: int | None = None,
-    ) -> dict[int, tuple[list[str], list[float]]]:
-        """
-        Extract keywords for all topics at once.
-
-        Parameters
-        ----------
-        documents : list[str]
-            All documents.
-        labels : np.ndarray
-            Topic assignments.
-        n_keywords : int, optional
-            Override default n_keywords.
-
-        Returns
-        -------
-        topic_keywords : dict
-            Mapping from topic_id to (keywords, scores).
-        """
-        result = {}
+        keywords = {}
 
-        for topic_id in
-            if topic_id == -1:
-                continue
-
+        for topic_id in unique_topics:
             mask = labels == topic_id
-
+            topic_text = ' '.join(documents[i] for i, m in enumerate(mask) if m)
 
-
-
+            # Extract keywords
+            kw_results = kw_model.extract_keywords(
+                topic_text,
+                keyphrase_ngram_range=self.ngram_range,
+                stop_words=list(self.stopwords) if self.stopwords else None,
+                top_n=self.n_keywords,
+            )
+
+            keywords[topic_id] = [kw for kw, _ in kw_results]
 
-        return
+        return keywords
 
 
-
-
-
+def compute_keyword_scores(
+    keywords: Dict[int, List[str]],
+    documents: List[str],
+    labels: np.ndarray,
+) -> Dict[int, List[tuple]]:
     """
+    Compute scores for keywords based on their discriminative power.
 
-
-
-
-
-
-    ):
-        self.method = method
-        self.n_keyphrases = n_keyphrases
-        self.max_ngram = max_ngram
-
-    def extract(self, text: str) -> list[tuple[str, float]]:
-        """Extract keyphrases from text."""
-        if self.method == "yake":
-            return self._extract_yake(text)
-        else:
-            raise ValueError(f"Unknown method: {self.method}")
+    Returns keywords with their scores.
+    """
+    # Get document frequencies per topic
+    topic_dfs = {}
+    unique_topics = sorted([t for t in np.unique(labels) if t >= 0])
 
-
-
-
-            import yake
-        except ImportError:
-            # Fallback to simple extraction
-            return self._simple_extract(text)
-
-        kw_extractor = yake.KeywordExtractor(
-            lan="en",
-            n=self.max_ngram,
-            dedupLim=0.7,
-            top=self.n_keyphrases,
-            features=None,
-        )
+    for topic_id in unique_topics:
+        mask = labels == topic_id
+        topic_docs = [documents[i].lower() for i, m in enumerate(mask) if m]
 
-
+        df = Counter()
+        for doc in topic_docs:
+            words = set(doc.split())
+            df.update(words)
 
-
-        # Invert for consistency
-        max_score = max(s for _, s in keywords) if keywords else 1
-        return [(kw, 1 - s/max_score) for kw, s in keywords]
+        topic_dfs[topic_id] = df
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Compute scores
+    scored_keywords = {}
+
+    for topic_id, topic_keywords in keywords.items():
+        scored = []
+        topic_df = topic_dfs[topic_id]
+        n_topic_docs = sum(labels == topic_id)
+
+        for keyword in topic_keywords:
+            # TF in topic
+            tf = topic_df.get(keyword.lower(), 0) / n_topic_docs
+
+            # DF across other topics (for IDF-like scoring)
+            other_df = sum(
+                topic_dfs[t].get(keyword.lower(), 0)
+                for t in unique_topics if t != topic_id
+            )
+            n_other_docs = sum(labels != topic_id)
+
+            if n_other_docs > 0:
+                other_ratio = other_df / n_other_docs
+                # Discriminative score
+                score = tf / (other_ratio + 0.1)
+            else:
+                score = tf
+
+            scored.append((keyword, round(score, 4)))
 
-
-
+        scored_keywords[topic_id] = scored
+
+    return scored_keywords