tritopic 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -1,493 +0,0 @@
- """
- Graph Builder for TriTopic
- ==========================
- 
- Constructs similarity graphs using multiple strategies:
- - Mutual kNN: keep only edges where each node is in the other's neighborhood
- - SNN (Shared Nearest Neighbors): weight edges by the number of shared neighbors
- - Multi-view fusion: combine semantic, lexical, and metadata graphs
- """
- 
- from __future__ import annotations
- 
- from typing import Literal
- 
- import numpy as np
- from scipy.sparse import csr_matrix, lil_matrix
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.neighbors import NearestNeighbors
- 
- 
- class GraphBuilder:
-     """
-     Build similarity graphs for topic modeling.
- 
-     Supports multiple graph construction strategies for robust clustering.
- 
-     Parameters
-     ----------
-     n_neighbors : int
-         Number of neighbors for the kNN graph. Default: 15
-     metric : str
-         Distance metric. Default: "cosine"
-     graph_type : str
-         Type of graph: "knn", "mutual_knn", "snn", or "hybrid"
-     snn_weight : float
-         Weight for SNN edges in hybrid mode. Default: 0.5
-     """
- 
-     def __init__(
-         self,
-         n_neighbors: int = 15,
-         metric: str = "cosine",
-         graph_type: Literal["knn", "mutual_knn", "snn", "hybrid"] = "hybrid",
-         snn_weight: float = 0.5,
-     ):
-         self.n_neighbors = n_neighbors
-         self.metric = metric
-         self.graph_type = graph_type
-         self.snn_weight = snn_weight
- 
-         # Vectorizer for the lexical view; fitted lazily in build_lexical_matrix
-         self._tfidf_vectorizer = TfidfVectorizer(
-             max_features=10000,
-             stop_words="english",
-             ngram_range=(1, 2),
-             min_df=2,
-             max_df=0.95,
-         )
- 
-     def build_knn_graph(
-         self,
-         embeddings: np.ndarray,
-         n_neighbors: int | None = None,
-     ) -> csr_matrix:
-         """
-         Build a basic kNN graph.
- 
-         Parameters
-         ----------
-         embeddings : np.ndarray
-             Document embeddings of shape (n_docs, n_dims).
-         n_neighbors : int, optional
-             Override the default n_neighbors.
- 
-         Returns
-         -------
-         adjacency : csr_matrix
-             Sparse adjacency matrix with cosine-similarity weights.
-         """
-         k = n_neighbors or self.n_neighbors
-         n_samples = embeddings.shape[0]
- 
-         # Fit nearest neighbors (+1 because each point is its own neighbor)
-         nn = NearestNeighbors(
-             n_neighbors=min(k + 1, n_samples),
-             metric=self.metric,
-             algorithm="auto",
-         )
-         nn.fit(embeddings)
- 
-         # Get distances and indices
-         distances, indices = nn.kneighbors(embeddings)
- 
-         # Convert distances to similarities
-         if self.metric == "cosine":
-             similarities = 1 - distances
-         else:
-             # For other metrics, use inverse distance
-             similarities = 1 / (1 + distances)
- 
-         # Build the adjacency matrix
-         adjacency = lil_matrix((n_samples, n_samples))
- 
-         for i in range(n_samples):
-             for j_idx, j in enumerate(indices[i]):
-                 if i != j:  # Skip self-loops
-                     adjacency[i, j] = similarities[i, j_idx]
- 
-         return adjacency.tocsr()
- 
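A minimal smoke test for the plain kNN path (illustrative only; random unit vectors stand in for real document embeddings, and `GraphBuilder` is the class above):

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 32))
    X /= np.linalg.norm(X, axis=1, keepdims=True)  # unit-norm "embeddings"

    builder = GraphBuilder(n_neighbors=10, graph_type="knn")
    adj = builder.build_knn_graph(X)
    print(adj.shape)  # (100, 100)
    print(adj.nnz)    # roughly 100 * 10 directed edges

Note that the result is directed: j can be among i's neighbors without the reverse holding, which is exactly what the mutual variant below filters out.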
-     def build_mutual_knn_graph(
-         self,
-         embeddings: np.ndarray,
-         n_neighbors: int | None = None,
-     ) -> csr_matrix:
-         """
-         Build a mutual kNN graph.
- 
-         Edge (i, j) exists only if i is in j's neighbors AND j is in i's
-         neighbors. This removes "one-way" connections that often represent
-         noise.
- 
-         Parameters
-         ----------
-         embeddings : np.ndarray
-             Document embeddings.
-         n_neighbors : int, optional
-             Override the default n_neighbors.
- 
-         Returns
-         -------
-         adjacency : csr_matrix
-             Sparse adjacency matrix.
-         """
-         k = n_neighbors or self.n_neighbors
-         n_samples = embeddings.shape[0]
- 
-         # Get the kNN graph first
-         nn = NearestNeighbors(
-             n_neighbors=min(k + 1, n_samples),
-             metric=self.metric,
-             algorithm="auto",
-         )
-         nn.fit(embeddings)
-         distances, indices = nn.kneighbors(embeddings)
- 
-         if self.metric == "cosine":
-             similarities = 1 - distances
-         else:
-             similarities = 1 / (1 + distances)
- 
-         # Build neighbor sets for the mutuality check (skip self at index 0)
-         neighbor_sets = [set(indices[i][1:]) for i in range(n_samples)]
- 
-         # Build the mutual kNN adjacency
-         adjacency = lil_matrix((n_samples, n_samples))
- 
-         for i in range(n_samples):
-             for j_idx, j in enumerate(indices[i][1:], 1):  # Skip self
-                 if i in neighbor_sets[j]:
-                     # Average the two directed similarities
-                     sim_ij = similarities[i, j_idx]
-                     j_indices = list(indices[j])
-                     if i in j_indices:
-                         sim_ji = similarities[j, j_indices.index(i)]
-                     else:
-                         sim_ji = sim_ij
- 
-                     avg_sim = (sim_ij + sim_ji) / 2
-                     adjacency[i, j] = avg_sim
-                     adjacency[j, i] = avg_sim
- 
-         return adjacency.tocsr()
- 
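The mutuality filter is easiest to see on a toy asymmetric case. In this hypothetical setup, the outlier C keeps a one-way edge to B in the plain kNN graph, but the mutual graph drops it because B's nearest neighbor is A:

    import numpy as np

    X = np.array([[0.0], [1.0], [10.0]])  # A, B, and the outlier C

    builder = GraphBuilder(n_neighbors=1, metric="euclidean")
    print(builder.build_knn_graph(X)[2, 1] > 0)         # True: C -> B exists
    print(builder.build_mutual_knn_graph(X)[2, 1] > 0)  # False: not mutual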
-     def build_snn_graph(
-         self,
-         embeddings: np.ndarray,
-         n_neighbors: int | None = None,
-     ) -> csr_matrix:
-         """
-         Build a Shared Nearest Neighbors (SNN) graph.
- 
-         Edge weight = number of shared neighbors between two nodes,
-         normalized by k. Because weights depend on neighborhood overlap
-         rather than raw distances, the graph is robust to noise and
-         outliers.
- 
-         Parameters
-         ----------
-         embeddings : np.ndarray
-             Document embeddings.
-         n_neighbors : int, optional
-             Override the default n_neighbors.
- 
-         Returns
-         -------
-         adjacency : csr_matrix
-             Sparse adjacency matrix with SNN weights in [0, 1].
-         """
-         k = n_neighbors or self.n_neighbors
-         n_samples = embeddings.shape[0]
- 
-         # Get kNN indices
-         nn = NearestNeighbors(
-             n_neighbors=min(k + 1, n_samples),
-             metric=self.metric,
-             algorithm="auto",
-         )
-         nn.fit(embeddings)
-         _, indices = nn.kneighbors(embeddings)
- 
-         # Build neighbor sets, excluding self so weights stay in [0, 1]
-         neighbor_sets = [set(indices[i][1:]) for i in range(n_samples)]
- 
-         # Compute SNN: edge weight = |N(i) ∩ N(j)| / k. Iterate over every
-         # directed kNN edge (not only pairs with i < j), so that one-way
-         # edges whose target has the lower index are not silently dropped;
-         # writing both (i, j) and (j, i) keeps the matrix symmetric.
-         adjacency = lil_matrix((n_samples, n_samples))
- 
-         for i in range(n_samples):
-             for j in neighbor_sets[i]:
-                 if j == i:  # Guard against duplicate points
-                     continue
-                 shared = len(neighbor_sets[i] & neighbor_sets[j])
-                 if shared > 0:
-                     weight = shared / k
-                     adjacency[i, j] = weight
-                     adjacency[j, i] = weight
- 
-         return adjacency.tocsr()
- 
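A sketch of the weight range on synthetic data (two well-separated Gaussian blobs; names are illustrative). Since the overlap count is at most k, all SNN weights land in [0, 1]:

    import numpy as np

    rng = np.random.default_rng(1)
    X = np.vstack([
        rng.normal(loc=0.0, scale=0.3, size=(50, 8)),
        rng.normal(loc=3.0, scale=0.3, size=(50, 8)),
    ])

    builder = GraphBuilder(n_neighbors=10, metric="euclidean")
    snn = builder.build_snn_graph(X)
    print(snn.max() <= 1.0)  # True: weights are shared counts divided by k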
-     def build_hybrid_graph(
-         self,
-         embeddings: np.ndarray,
-         n_neighbors: int | None = None,
-     ) -> csr_matrix:
-         """
-         Build a hybrid graph combining mutual kNN and SNN.
- 
-         This combines the strengths of both views:
-         - Mutual kNN for strong direct connections
-         - SNN for structural similarity
- 
-         Parameters
-         ----------
-         embeddings : np.ndarray
-             Document embeddings.
-         n_neighbors : int, optional
-             Override the default n_neighbors.
- 
-         Returns
-         -------
-         adjacency : csr_matrix
-             Combined adjacency matrix.
-         """
-         mutual_adj = self.build_mutual_knn_graph(embeddings, n_neighbors)
-         snn_adj = self.build_snn_graph(embeddings, n_neighbors)
- 
-         # Max-normalize both views so they mix on a comparable scale
-         mutual_max = mutual_adj.max() if mutual_adj.nnz > 0 else 1
-         snn_max = snn_adj.max() if snn_adj.nnz > 0 else 1
- 
-         if mutual_max > 0:
-             mutual_adj = mutual_adj / mutual_max
-         if snn_max > 0:
-             snn_adj = snn_adj / snn_max
- 
-         # Linear blend controlled by snn_weight
-         combined = (1 - self.snn_weight) * mutual_adj + self.snn_weight * snn_adj
- 
-         return combined.tocsr()
- 
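Because both views are max-normalized before mixing, `snn_weight` interpolates linearly between them. A quick illustrative sweep (synthetic data again):

    import numpy as np

    rng = np.random.default_rng(2)
    X = rng.normal(size=(60, 16))

    # snn_weight=0.0 weights only the mutual kNN view, 1.0 only the SNN view
    for w in (0.0, 0.5, 1.0):
        builder = GraphBuilder(n_neighbors=8, snn_weight=w, metric="euclidean")
        print(w, builder.build_hybrid_graph(X).nnz)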
-     def build_lexical_matrix(
-         self,
-         documents: list[str],
-     ) -> csr_matrix:
-         """
-         Build a TF-IDF matrix for lexical similarity.
- 
-         Parameters
-         ----------
-         documents : list[str]
-             Document texts.
- 
-         Returns
-         -------
-         tfidf_matrix : csr_matrix
-             TF-IDF sparse matrix.
-         """
-         return self._tfidf_vectorizer.fit_transform(documents)
- 
-     def build_lexical_graph(
-         self,
-         tfidf_matrix: csr_matrix,
-         n_neighbors: int | None = None,
-     ) -> csr_matrix:
-         """
-         Build a lexical-similarity graph from a TF-IDF matrix.
- 
-         Parameters
-         ----------
-         tfidf_matrix : csr_matrix
-             TF-IDF matrix.
-         n_neighbors : int, optional
-             Override the default n_neighbors.
- 
-         Returns
-         -------
-         adjacency : csr_matrix
-             Lexical-similarity adjacency matrix.
-         """
-         k = n_neighbors or self.n_neighbors
-         n_samples = tfidf_matrix.shape[0]
- 
-         # Nearest neighbors with cosine metric on TF-IDF (brute force for sparse input)
-         nn = NearestNeighbors(
-             n_neighbors=min(k + 1, n_samples),
-             metric="cosine",
-             algorithm="brute",
-         )
-         nn.fit(tfidf_matrix)
-         distances, indices = nn.kneighbors(tfidf_matrix)
- 
-         # Convert distances to similarities
-         similarities = 1 - distances
- 
-         # Keep only mutual kNN edges for the lexical view
-         neighbor_sets = [set(indices[i][1:]) for i in range(n_samples)]
-         adjacency = lil_matrix((n_samples, n_samples))
- 
-         for i in range(n_samples):
-             for j_idx, j in enumerate(indices[i][1:], 1):
-                 if i in neighbor_sets[j]:
-                     adjacency[i, j] = similarities[i, j_idx]
-                     adjacency[j, i] = similarities[i, j_idx]
- 
-         return adjacency.tocsr()
- 
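The two lexical steps chain together. A minimal sketch with toy documents; note that min_df=2 in the vectorizer means a term must occur in at least two documents to survive:

    docs = [
        "graph based topic modeling",
        "topic modeling with graphs",
        "neural embeddings for documents",
        "document embeddings and neural networks",
    ]

    builder = GraphBuilder(n_neighbors=2)
    tfidf = builder.build_lexical_matrix(docs)
    lex_graph = builder.build_lexical_graph(tfidf)
    print(tfidf.shape, lex_graph.shape)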
-     def build_metadata_graph(
-         self,
-         metadata: "pd.DataFrame",
-     ) -> csr_matrix:
-         """
-         Build a metadata-similarity graph.
- 
-         Documents with matching metadata get connected. Note that the
-         numerical branch is O(n^2) in the number of documents.
- 
-         Parameters
-         ----------
-         metadata : pd.DataFrame
-             Metadata DataFrame with the same row order as the documents.
- 
-         Returns
-         -------
-         adjacency : csr_matrix
-             Metadata-similarity adjacency matrix.
-         """
-         import pandas as pd
- 
-         n_samples = len(metadata)
-         adjacency = lil_matrix((n_samples, n_samples))
- 
-         for col in metadata.columns:
-             if metadata[col].dtype == "object" or metadata[col].dtype.name == "category":
-                 # Categorical column: connect documents with an exact match
-                 for value in metadata[col].unique():
-                     if pd.isna(value):
-                         continue
-                     mask = metadata[col] == value
-                     indices = np.where(mask)[0]
- 
-                     # Connect all pairs in this group
-                     for i in range(len(indices)):
-                         for j in range(i + 1, len(indices)):
-                             idx_i, idx_j = indices[i], indices[j]
-                             adjacency[idx_i, idx_j] += 1
-                             adjacency[idx_j, idx_i] += 1
-             else:
-                 # Numerical column: similarity from normalized distance.
-                 # Cast to float so np.isnan also works for integer and
-                 # nullable dtypes.
-                 values = metadata[col].to_numpy(dtype=float, na_value=np.nan)
-                 if np.isnan(values).all():
-                     continue
- 
-                 # Min-max normalize (epsilon avoids division by zero)
-                 values = (values - np.nanmin(values)) / (np.nanmax(values) - np.nanmin(values) + 1e-10)
- 
-                 # Add similarity only for close values
-                 for i in range(n_samples):
-                     for j in range(i + 1, n_samples):
-                         if not (np.isnan(values[i]) or np.isnan(values[j])):
-                             sim = 1 - abs(values[i] - values[j])
-                             if sim > 0.8:  # Keep only strong similarity
-                                 adjacency[i, j] += sim
-                                 adjacency[j, i] += sim
- 
-         # Max-normalize the combined weights
-         max_val = adjacency.max()
-         if max_val > 0:
-             adjacency = adjacency / max_val
- 
-         return adjacency.tocsr()
- 
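A small sketch of the metadata path, assuming a pandas DataFrame with one categorical and one numeric column (toy values):

    import pandas as pd

    meta = pd.DataFrame({
        "source": ["blog", "blog", "news", "news"],
        "year": [2020, 2021, 2021, 2021],
    })

    meta_graph = GraphBuilder().build_metadata_graph(meta)

    # Rows 0/1 share "source"; rows 1/2/3 have identical normalized years;
    # rows 2/3 match on both columns and, after max-normalization, carry
    # the largest weight.
    print(meta_graph.toarray().round(2))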
-     def build_multiview_graph(
-         self,
-         semantic_embeddings: np.ndarray,
-         lexical_matrix: csr_matrix | None = None,
-         metadata_graph: csr_matrix | None = None,
-         weights: dict[str, float] | None = None,
-     ) -> "igraph.Graph":
-         """
-         Build a combined multi-view graph.
- 
-         Fuses semantic, lexical, and metadata views into a single graph
-         for robust community detection.
- 
-         Parameters
-         ----------
-         semantic_embeddings : np.ndarray
-             Document embeddings.
-         lexical_matrix : csr_matrix, optional
-             TF-IDF matrix for the lexical view.
-         metadata_graph : csr_matrix, optional
-             Pre-computed metadata adjacency.
-         weights : dict, optional
-             Weights for each view. Keys: "semantic", "lexical", "metadata".
-             Default: {"semantic": 0.5, "lexical": 0.3, "metadata": 0.2}.
- 
-         Returns
-         -------
-         graph : igraph.Graph
-             Combined weighted graph.
-         """
-         import igraph as ig
- 
-         weights = weights or {"semantic": 0.5, "lexical": 0.3, "metadata": 0.2}
-         n_samples = semantic_embeddings.shape[0]
- 
-         # Build the semantic graph with the configured strategy
-         if self.graph_type == "knn":
-             semantic_adj = self.build_knn_graph(semantic_embeddings)
-         elif self.graph_type == "mutual_knn":
-             semantic_adj = self.build_mutual_knn_graph(semantic_embeddings)
-         elif self.graph_type == "snn":
-             semantic_adj = self.build_snn_graph(semantic_embeddings)
-         else:  # hybrid
-             semantic_adj = self.build_hybrid_graph(semantic_embeddings)
- 
-         # Max-normalize
-         if semantic_adj.max() > 0:
-             semantic_adj = semantic_adj / semantic_adj.max()
- 
-         # Start with the semantic view
-         combined_adj = weights["semantic"] * semantic_adj
- 
-         # Add the lexical view if available
-         if lexical_matrix is not None and weights.get("lexical", 0) > 0:
-             lexical_adj = self.build_lexical_graph(lexical_matrix)
-             if lexical_adj.max() > 0:
-                 lexical_adj = lexical_adj / lexical_adj.max()
-             combined_adj = combined_adj + weights["lexical"] * lexical_adj
- 
-         # Add the metadata view if available
-         if metadata_graph is not None and weights.get("metadata", 0) > 0:
-             if metadata_graph.max() > 0:
-                 metadata_graph = metadata_graph / metadata_graph.max()
-             combined_adj = combined_adj + weights["metadata"] * metadata_graph
- 
-         # Convert to igraph
-         combined_adj = combined_adj.tocoo()
- 
-         edges = list(zip(combined_adj.row, combined_adj.col))
-         weights_list = combined_adj.data.tolist()
- 
-         # Deduplicate undirected edges, keeping the maximum weight
-         edge_weights = {}
-         for (i, j), w in zip(edges, weights_list):
-             key = (min(i, j), max(i, j))
-             if key not in edge_weights or w > edge_weights[key]:
-                 edge_weights[key] = w
- 
-         edges = list(edge_weights.keys())
-         weights_list = list(edge_weights.values())
- 
-         # Create the weighted, undirected graph
-         graph = ig.Graph(n=n_samples, edges=edges, directed=False)
-         graph.es["weight"] = weights_list
- 
-         return graph
- 
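An end-to-end illustration of the fused graph feeding community detection. Everything here is hypothetical scaffolding: two synthetic blobs stand in for encoder output, the documents are aligned with the blobs, and Louvain via python-igraph's community_multilevel is one reasonable detector choice (the package itself may use something else, e.g. Leiden):

    import numpy as np

    rng = np.random.default_rng(3)
    X = np.vstack([
        rng.normal(scale=0.2, size=(40, 32)) + 1.0,  # blob / topic 1
        rng.normal(scale=0.2, size=(40, 32)) - 1.0,  # blob / topic 2
    ])
    docs = ["graph topic model"] * 40 + ["neural document embedding"] * 40

    builder = GraphBuilder(n_neighbors=10)
    tfidf = builder.build_lexical_matrix(docs)
    graph = builder.build_multiview_graph(
        X,
        lexical_matrix=tfidf,
        weights={"semantic": 0.7, "lexical": 0.3, "metadata": 0.0},
    )

    clusters = graph.community_multilevel(weights="weight")
    print(len(clusters))  # likely 2: the blobs are well separated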
-     def get_feature_names(self) -> list[str]:
-         """Get TF-IDF feature names (for keyword extraction)."""
-         if hasattr(self._tfidf_vectorizer, "get_feature_names_out"):
-             return list(self._tfidf_vectorizer.get_feature_names_out())
-         # Fallback for scikit-learn < 1.0
-         return list(self._tfidf_vectorizer.get_feature_names())
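
get_feature_names only works after build_lexical_matrix has fitted the vectorizer. A sketch of per-topic keyword extraction by mean TF-IDF, reusing builder and tfidf from the previous sketch with a hypothetical two-cluster labeling:

    import numpy as np

    labels = np.array([0] * 40 + [1] * 40)
    terms = np.array(builder.get_feature_names())

    for topic in np.unique(labels):
        mean_tfidf = np.asarray(tfidf[labels == topic].mean(axis=0)).ravel()
        print(topic, list(terms[np.argsort(mean_tfidf)[::-1][:5]]))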