tritopic-0.1.0-py3-none-any.whl → tritopic-1.1.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Potentially problematic release: this version of tritopic might be problematic.
- tritopic/__init__.py +22 -32
- tritopic/config.py +289 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/METADATA +91 -51
- tritopic-1.1.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.1.0.dist-info}/top_level.txt +0 -0
tritopic/core/graph.py
ADDED
@@ -0,0 +1,435 @@
```python
"""
Graph Construction Module

Builds document similarity graphs using various methods:
- kNN (k-Nearest Neighbors)
- Mutual kNN (bidirectional connections only)
- SNN (Shared Nearest Neighbors)
- Hybrid (combination of MkNN and SNN)
"""

from typing import Literal, Optional, Tuple
import numpy as np
from scipy import sparse
import warnings


class GraphBuilder:
    """
    Builds document similarity graphs for topic modeling.

    Supports multiple graph construction methods and multi-view fusion.
    """

    def __init__(
        self,
        n_neighbors: int = 15,
        metric: str = "cosine",
        graph_type: Literal["knn", "mutual_knn", "snn", "hybrid"] = "hybrid",
        snn_weight: float = 0.5,
    ):
        """
        Initialize the graph builder.

        Parameters
        ----------
        n_neighbors : int
            Number of neighbors for kNN
        metric : str
            Distance metric
        graph_type : str
            Type of graph to build
        snn_weight : float
            Weight for SNN in hybrid graph (0-1)
        """
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.graph_type = graph_type
        self.snn_weight = snn_weight

    def build_from_similarity(
        self,
        similarity_matrix: np.ndarray,
    ) -> sparse.csr_matrix:
        """
        Build a graph from a similarity matrix.

        Parameters
        ----------
        similarity_matrix : np.ndarray
            Pairwise similarity matrix (n x n)

        Returns
        -------
        sparse.csr_matrix
            Adjacency matrix of the graph
        """
        n = similarity_matrix.shape[0]

        # Get kNN indices for each document
        # Note: We need k+1 because the first neighbor is the document itself
        k = min(self.n_neighbors + 1, n)
        knn_indices = np.argsort(-similarity_matrix, axis=1)[:, :k]

        if self.graph_type == "knn":
            return self._build_knn_graph(similarity_matrix, knn_indices)
        elif self.graph_type == "mutual_knn":
            return self._build_mutual_knn_graph(similarity_matrix, knn_indices)
        elif self.graph_type == "snn":
            return self._build_snn_graph(similarity_matrix, knn_indices)
        elif self.graph_type == "hybrid":
            return self._build_hybrid_graph(similarity_matrix, knn_indices)
        else:
            raise ValueError(f"Unknown graph type: {self.graph_type}")

    def _build_knn_graph(
        self,
        similarity_matrix: np.ndarray,
        knn_indices: np.ndarray,
    ) -> sparse.csr_matrix:
        """Build standard kNN graph."""
        n = similarity_matrix.shape[0]
        k = knn_indices.shape[1]

        rows = np.repeat(np.arange(n), k)
        cols = knn_indices.flatten()

        # Get similarity values for each edge
        data = similarity_matrix[rows, cols]

        # Create sparse matrix
        graph = sparse.csr_matrix((data, (rows, cols)), shape=(n, n))

        # Make symmetric by taking maximum
        graph = graph.maximum(graph.T)

        return graph
```
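One detail worth noting: because `knn_indices` keeps each document as its own first neighbor (hence `k + 1`), the kNN graph retains self-loops with weight equal to each document's self-similarity. If a downstream step expects a loop-free adjacency, a caller can strip them after construction — a minimal sketch, not part of the released code:

```python
graph.setdiag(0)         # drop self-loop weights in place
graph.eliminate_zeros()  # remove the now-explicit zeros from storage
```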
```python
    def _build_mutual_knn_graph(
        self,
        similarity_matrix: np.ndarray,
        knn_indices: np.ndarray,
    ) -> sparse.csr_matrix:
        """
        Build mutual kNN graph.

        An edge exists between i and j only if:
        - j is in i's k-nearest neighbors AND
        - i is in j's k-nearest neighbors
        """
        n = similarity_matrix.shape[0]

        # Create kNN indicator matrix
        knn_mask = np.zeros((n, n), dtype=bool)
        for i in range(n):
            knn_mask[i, knn_indices[i]] = True

        # Mutual kNN: both directions must exist
        mutual_mask = knn_mask & knn_mask.T

        # Create adjacency matrix with similarity weights
        adjacency = np.where(mutual_mask, similarity_matrix, 0)

        return sparse.csr_matrix(adjacency)
```
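The boolean mask here is a dense n × n array, so memory scales quadratically with corpus size. For larger corpora, the same mutual-kNN filter can be expressed with sparse operations on a directed kNN adjacency — a hedged sketch under that assumption (`mutual_knn_sparse` is a hypothetical helper, not part of the release):

```python
from scipy import sparse

def mutual_knn_sparse(directed_knn: sparse.csr_matrix) -> sparse.csr_matrix:
    """Keep edge (i, j) only if both directed edges i->j and j->i exist."""
    mask = (directed_knn > 0).multiply((directed_knn > 0).T)  # boolean intersection
    return directed_knn.multiply(mask).tocsr()
```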
```python
    def _build_snn_graph(
        self,
        similarity_matrix: np.ndarray,
        knn_indices: np.ndarray,
    ) -> sparse.csr_matrix:
        """
        Build Shared Nearest Neighbors (SNN) graph.

        Edge weight = |shared neighbors| / k
        """
        n = similarity_matrix.shape[0]
        k = knn_indices.shape[1] - 1  # Exclude self

        # Create neighbor sets (excluding self)
        neighbor_sets = [set(knn_indices[i, 1:]) for i in range(n)]

        # Compute SNN similarity
        rows, cols, data = [], [], []

        for i in range(n):
            for j in knn_indices[i, 1:]:  # Only consider kNN neighbors
                if j > i:  # Avoid duplicate computation
                    shared = len(neighbor_sets[i] & neighbor_sets[j])
                    if shared > 0:
                        snn_sim = shared / k
                        rows.extend([i, j])
                        cols.extend([j, i])
                        data.extend([snn_sim, snn_sim])

        return sparse.csr_matrix((data, (rows, cols)), shape=(n, n))
```
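For intuition about the `shared / k` weight: with k = 4, if document i's nearest neighbors are {a, b, c, d} and document j's are {b, c, e, f}, the pair shares two neighbors and the edge weight is 2/4 = 0.5. The same computation as a toy snippet:

```python
k = 4
neighbors_i = {"a", "b", "c", "d"}
neighbors_j = {"b", "c", "e", "f"}
weight = len(neighbors_i & neighbors_j) / k  # 2 shared / 4 -> 0.5
```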
```python
    def _build_hybrid_graph(
        self,
        similarity_matrix: np.ndarray,
        knn_indices: np.ndarray,
    ) -> sparse.csr_matrix:
        """
        Build hybrid graph combining MkNN and SNN:

            hybrid = (1 - snn_weight) * mknn_graph + snn_weight * snn_graph
        """
        mknn_graph = self._build_mutual_knn_graph(similarity_matrix, knn_indices)
        snn_graph = self._build_snn_graph(similarity_matrix, knn_indices)

        # Normalize graphs to [0, 1]
        mknn_max = mknn_graph.max()
        snn_max = snn_graph.max()

        if mknn_max > 0:
            mknn_graph = mknn_graph / mknn_max
        if snn_max > 0:
            snn_graph = snn_graph / snn_max

        # Combine with weights
        alpha = 1 - self.snn_weight  # MkNN weight
        hybrid = alpha * mknn_graph + self.snn_weight * snn_graph

        return hybrid
```
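With the class complete, here is a minimal end-to-end sketch of how it is presumably driven. Only `GraphBuilder` and its arguments come from the file above; the random embeddings stand in for a real encoder:

```python
import numpy as np
from tritopic.core.graph import GraphBuilder

rng = np.random.default_rng(42)
embeddings = rng.normal(size=(200, 64))
# L2-normalize rows so the dot product below is cosine similarity
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)
similarity = embeddings @ embeddings.T

builder = GraphBuilder(n_neighbors=15, graph_type="hybrid", snn_weight=0.5)
graph = builder.build_from_similarity(similarity)  # sparse matrix, shape (200, 200)
```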
```python
class MultiViewGraphBuilder:
    """
    Builds multi-view graphs by combining semantic, lexical, and metadata similarities.
    """

    def __init__(
        self,
        n_neighbors: int = 15,
        graph_type: str = "hybrid",
        snn_weight: float = 0.5,
        semantic_weight: float = 0.5,
        lexical_weight: float = 0.3,
        metadata_weight: float = 0.2,
    ):
        """
        Initialize the multi-view graph builder.

        Parameters
        ----------
        n_neighbors : int
            Number of neighbors for kNN
        graph_type : str
            Type of graph
        snn_weight : float
            Weight for SNN in hybrid
        semantic_weight : float
            Weight for semantic view
        lexical_weight : float
            Weight for lexical view
        metadata_weight : float
            Weight for metadata view
        """
        self.n_neighbors = n_neighbors
        self.graph_type = graph_type
        self.snn_weight = snn_weight
        self.semantic_weight = semantic_weight
        self.lexical_weight = lexical_weight
        self.metadata_weight = metadata_weight

        self.graph_builder = GraphBuilder(
            n_neighbors=n_neighbors,
            graph_type=graph_type,
            snn_weight=snn_weight,
        )

    def build(
        self,
        semantic_similarity: np.ndarray,
        lexical_similarity: Optional[np.ndarray] = None,
        metadata_similarity: Optional[np.ndarray] = None,
    ) -> sparse.csr_matrix:
        """
        Build multi-view graph.

        Parameters
        ----------
        semantic_similarity : np.ndarray
            Semantic similarity matrix (from embeddings)
        lexical_similarity : np.ndarray, optional
            Lexical similarity matrix (from TF-IDF/BM25)
        metadata_similarity : np.ndarray, optional
            Metadata similarity matrix

        Returns
        -------
        sparse.csr_matrix
            Combined multi-view graph
        """
        # Normalize weights based on available views
        weights = {'semantic': self.semantic_weight}
        if lexical_similarity is not None:
            weights['lexical'] = self.lexical_weight
        if metadata_similarity is not None:
            weights['metadata'] = self.metadata_weight

        total_weight = sum(weights.values())
        weights = {k: v / total_weight for k, v in weights.items()}

        # Build combined similarity matrix
        combined_similarity = weights['semantic'] * semantic_similarity

        if lexical_similarity is not None:
            combined_similarity += weights['lexical'] * lexical_similarity

        if metadata_similarity is not None:
            combined_similarity += weights['metadata'] * metadata_similarity

        # Build graph from combined similarity
        return self.graph_builder.build_from_similarity(combined_similarity)
```
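Note how `build` renormalizes the view weights over whichever matrices are actually supplied: with the defaults (0.5 / 0.3 / 0.2) and no metadata view, the effective weights become 0.5/0.8 = 0.625 for the semantic view and 0.3/0.8 = 0.375 for the lexical view. A minimal usage sketch, assuming the two similarity matrices were computed beforehand:

```python
mv_builder = MultiViewGraphBuilder(semantic_weight=0.5, lexical_weight=0.3)
graph = mv_builder.build(
    semantic_similarity=semantic_sim,  # e.g. cosine similarity of embeddings
    lexical_similarity=lexical_sim,    # e.g. from compute_lexical_similarity below
)
```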
```python
def compute_lexical_similarity(
    documents: list,
    tokenized_documents: list,
    method: str = "tfidf",
    ngram_range: tuple = (1, 2),
    max_features: int = 10000,
    stopwords: set = None,
) -> np.ndarray:
    """
    Compute lexical similarity matrix from documents.

    Parameters
    ----------
    documents : list
        Original documents (for BM25)
    tokenized_documents : list
        Tokenized documents
    method : str
        Method: "tfidf" or "bm25"
    ngram_range : tuple
        N-gram range
    max_features : int
        Maximum vocabulary size
    stopwords : set
        Stopwords to exclude

    Returns
    -------
    np.ndarray
        Lexical similarity matrix
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Convert tokenized documents back to strings for vectorizer
    processed_docs = [' '.join(tokens) for tokens in tokenized_documents]

    if method == "tfidf":
        vectorizer = TfidfVectorizer(
            ngram_range=ngram_range,
            max_features=max_features,
            stop_words=list(stopwords) if stopwords else None,
        )
        tfidf_matrix = vectorizer.fit_transform(processed_docs)

        # Compute cosine similarity
        similarity = (tfidf_matrix @ tfidf_matrix.T).toarray()

    elif method == "bm25":
        try:
            from rank_bm25 import BM25Okapi

            bm25 = BM25Okapi(tokenized_documents)

            n = len(tokenized_documents)
            similarity = np.zeros((n, n))

            for i, tokens in enumerate(tokenized_documents):
                scores = bm25.get_scores(tokens)
                similarity[i] = scores

            # Normalize to [0, 1]
            max_val = similarity.max()
            if max_val > 0:
                similarity = similarity / max_val

        except ImportError:
            warnings.warn("rank_bm25 not installed, falling back to TF-IDF")
            return compute_lexical_similarity(
                documents, tokenized_documents, "tfidf", ngram_range, max_features, stopwords
            )
    else:
        raise ValueError(f"Unknown method: {method}")

    # Ensure diagonal is 1
    np.fill_diagonal(similarity, 1.0)

    # Clip to [0, 1]
    similarity = np.clip(similarity, 0, 1)

    return similarity
```
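One caveat in the BM25 branch: BM25 scores are not symmetric (scoring document j against the tokens of document i generally differs from the reverse), so unlike the TF-IDF path the returned matrix is only approximately symmetric. If a downstream step requires symmetry, averaging with the transpose is a cheap fix — a sketch of what a caller might do, not something this function does:

```python
similarity = 0.5 * (similarity + similarity.T)  # symmetrize the BM25 score matrix
```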
```python
def compute_metadata_similarity(
    metadata: "pd.DataFrame",
    categorical_cols: list = None,
    numerical_cols: list = None,
) -> np.ndarray:
    """
    Compute similarity matrix from metadata.

    Parameters
    ----------
    metadata : pd.DataFrame
        Metadata dataframe
    categorical_cols : list
        Categorical column names
    numerical_cols : list
        Numerical column names

    Returns
    -------
    np.ndarray
        Metadata similarity matrix
    """
    import pandas as pd

    n = len(metadata)
    similarity = np.zeros((n, n))

    # Auto-detect column types if not specified
    if categorical_cols is None and numerical_cols is None:
        categorical_cols = metadata.select_dtypes(include=['object', 'category']).columns.tolist()
        numerical_cols = metadata.select_dtypes(include=['number']).columns.tolist()

    # Categorical similarity (Jaccard-like)
    if categorical_cols:
        for col in categorical_cols:
            values = metadata[col].values
            for i in range(n):
                for j in range(i, n):
                    if values[i] == values[j]:
                        similarity[i, j] += 1 / len(categorical_cols)
                        similarity[j, i] += 1 / len(categorical_cols)

    # Numerical similarity (1 - normalized distance)
    if numerical_cols:
        from sklearn.preprocessing import MinMaxScaler

        num_data = metadata[numerical_cols].values
        scaler = MinMaxScaler()
        num_scaled = scaler.fit_transform(num_data)

        from scipy.spatial.distance import cdist
        distances = cdist(num_scaled, num_scaled, metric='euclidean')
        max_dist = distances.max()
        if max_dist > 0:
            num_similarity = 1 - distances / max_dist
        else:
            num_similarity = np.ones((n, n))

        # Combine with categorical
        if categorical_cols:
            similarity = 0.5 * similarity + 0.5 * num_similarity
        else:
            similarity = num_similarity

    # Ensure diagonal is 1
    np.fill_diagonal(similarity, 1.0)

    return similarity
```
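The categorical pass above runs a Python double loop over all O(n²) pairs per column. The same match-fraction matrix (identical off the diagonal, which is overwritten with 1.0 anyway) can be computed with NumPy broadcasting — a hedged sketch; `categorical_similarity` is a hypothetical helper, not part of the release:

```python
import numpy as np
import pandas as pd

def categorical_similarity(metadata: pd.DataFrame, categorical_cols: list) -> np.ndarray:
    """Fraction of categorical columns on which each pair of rows agrees."""
    n = len(metadata)
    similarity = np.zeros((n, n))
    for col in categorical_cols:
        values = metadata[col].to_numpy()
        # Pairwise equality via broadcasting; bool matrix averaged over columns
        similarity += (values[:, None] == values[None, :]) / len(categorical_cols)
    return similarity
```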