tritopic 0.1.0-py3-none-any.whl → 1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tritopic/core/graph.py ADDED
@@ -0,0 +1,435 @@
+ """
+ Graph Construction Module
+
+ Builds document similarity graphs using various methods:
+ - kNN (k-Nearest Neighbors)
+ - Mutual kNN (bidirectional connections only)
+ - SNN (Shared Nearest Neighbors)
+ - Hybrid (combination of MkNN and SNN)
+ """
+
+ from typing import Literal, Optional, Tuple
+ import numpy as np
+ from scipy import sparse
+ import warnings
+
+
+ class GraphBuilder:
+     """
+     Builds document similarity graphs for topic modeling.
+
+     Supports multiple graph construction methods and multi-view fusion.
+     """
+
+     def __init__(
+         self,
+         n_neighbors: int = 15,
+         metric: str = "cosine",
+         graph_type: Literal["knn", "mutual_knn", "snn", "hybrid"] = "hybrid",
+         snn_weight: float = 0.5,
+     ):
+         """
+         Initialize the graph builder.
+
+         Parameters
+         ----------
+         n_neighbors : int
+             Number of neighbors for kNN
+         metric : str
+             Distance metric
+         graph_type : str
+             Type of graph to build
+         snn_weight : float
+             Weight for SNN in hybrid graph (0-1)
+         """
+         self.n_neighbors = n_neighbors
+         self.metric = metric
+         self.graph_type = graph_type
+         self.snn_weight = snn_weight
+
+     def build_from_similarity(
+         self,
+         similarity_matrix: np.ndarray,
+     ) -> sparse.csr_matrix:
+         """
+         Build a graph from a similarity matrix.
+
+         Parameters
+         ----------
+         similarity_matrix : np.ndarray
+             Pairwise similarity matrix (n x n)
+
+         Returns
+         -------
+         sparse.csr_matrix
+             Adjacency matrix of the graph
+         """
+         n = similarity_matrix.shape[0]
+
+         # Get kNN indices for each document
+         # Note: We need k+1 because the first neighbor is the document itself
+         k = min(self.n_neighbors + 1, n)
+         knn_indices = np.argsort(-similarity_matrix, axis=1)[:, :k]
+
+         if self.graph_type == "knn":
+             return self._build_knn_graph(similarity_matrix, knn_indices)
+         elif self.graph_type == "mutual_knn":
+             return self._build_mutual_knn_graph(similarity_matrix, knn_indices)
+         elif self.graph_type == "snn":
+             return self._build_snn_graph(similarity_matrix, knn_indices)
+         elif self.graph_type == "hybrid":
+             return self._build_hybrid_graph(similarity_matrix, knn_indices)
+         else:
+             raise ValueError(f"Unknown graph type: {self.graph_type}")
+
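Example — building a graph from cosine similarities (a minimal sketch, not part of the package; the random embeddings and the normalization step are illustrative):

import numpy as np
from tritopic.core.graph import GraphBuilder

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(100, 32))
# Cosine similarity = dot product of L2-normalized rows
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)
similarity = embeddings @ embeddings.T

builder = GraphBuilder(n_neighbors=15, graph_type="hybrid", snn_weight=0.5)
graph = builder.build_from_similarity(similarity)  # sparse.csr_matrix, shape (100, 100)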
+     def _build_knn_graph(
+         self,
+         similarity_matrix: np.ndarray,
+         knn_indices: np.ndarray,
+     ) -> sparse.csr_matrix:
+         """Build standard kNN graph."""
+         n = similarity_matrix.shape[0]
+         k = knn_indices.shape[1]
+
+         rows = np.repeat(np.arange(n), k)
+         cols = knn_indices.flatten()
+
+         # Get similarity values for each edge
+         data = similarity_matrix[rows, cols]
+
+         # Drop self-edges: the first neighbor returned for each document
+         # is the document itself (see build_from_similarity)
+         mask = rows != cols
+
+         # Create sparse matrix
+         graph = sparse.csr_matrix((data[mask], (rows[mask], cols[mask])), shape=(n, n))
+
+         # Make symmetric by taking maximum
+         graph = graph.maximum(graph.T)
+
+         return graph
+
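The `maximum(graph.T)` step symmetrizes by union: an edge survives if it exists in either direction, in contrast to the intersection used by the mutual-kNN builder below. A tiny self-contained demonstration (values are made up):

import numpy as np
from scipy import sparse

# Directed edge 0 -> 1 with weight 0.9, but no 1 -> 0 edge
directed = sparse.csr_matrix(np.array([[0.0, 0.9], [0.0, 0.0]]))

undirected = directed.maximum(directed.T)
print(undirected.toarray())
# [[0.  0.9]
#  [0.9 0. ]]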
+     def _build_mutual_knn_graph(
+         self,
+         similarity_matrix: np.ndarray,
+         knn_indices: np.ndarray,
+     ) -> sparse.csr_matrix:
+         """
+         Build mutual kNN graph.
+
+         An edge exists between i and j only if:
+         - j is in i's k-nearest neighbors AND
+         - i is in j's k-nearest neighbors
+         """
+         n = similarity_matrix.shape[0]
+
+         # Create kNN indicator matrix
+         knn_mask = np.zeros((n, n), dtype=bool)
+         for i in range(n):
+             knn_mask[i, knn_indices[i]] = True
+
+         # Mutual kNN: both directions must exist
+         mutual_mask = knn_mask & knn_mask.T
+
+         # Drop self-loops: every document trivially lists itself
+         np.fill_diagonal(mutual_mask, False)
+
+         # Create adjacency matrix with similarity weights
+         adjacency = np.where(mutual_mask, similarity_matrix, 0)
+
+         return sparse.csr_matrix(adjacency)
+
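For contrast with the union above, mutual kNN keeps an edge only when both directions exist. A toy check of the mask logic (neighbor lists are invented for illustration):

import numpy as np

# Row i marks i's neighbors:
knn_mask = np.array([
    [False, True,  False],   # 0 lists 1
    [False, False, True ],   # 1 lists 2 (but not 0)
    [False, True,  False],   # 2 lists 1
])

mutual = knn_mask & knn_mask.T
print(mutual)
# Edge 0-1 is dropped (1 does not list 0); edge 1-2 survives.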
+     def _build_snn_graph(
+         self,
+         similarity_matrix: np.ndarray,
+         knn_indices: np.ndarray,
+     ) -> sparse.csr_matrix:
+         """
+         Build Shared Nearest Neighbors (SNN) graph.
+
+         Edge weight = |shared neighbors| / k
+         """
+         n = similarity_matrix.shape[0]
+         k = knn_indices.shape[1] - 1  # Exclude self
+
+         # Create neighbor sets (excluding self)
+         neighbor_sets = [set(knn_indices[i, 1:]) for i in range(n)]
+
+         # Compute SNN similarity
+         rows, cols, data = [], [], []
+
+         for i in range(n):
+             for j in knn_indices[i, 1:]:  # Only consider kNN neighbors
+                 if j > i:  # Avoid duplicate computation
+                     shared = len(neighbor_sets[i] & neighbor_sets[j])
+                     if shared > 0:
+                         snn_sim = shared / k
+                         rows.extend([i, j])
+                         cols.extend([j, i])
+                         data.extend([snn_sim, snn_sim])
+
+         return sparse.csr_matrix((data, (rows, cols)), shape=(n, n))
+
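A worked example of the SNN weight, using made-up neighbor sets with k = 4:

# Neighbor sets of two documents (self already excluded), k = 4
neighbors_i = {2, 5, 7, 9}
neighbors_j = {2, 7, 8, 11}

shared = len(neighbors_i & neighbors_j)  # {2, 7} -> 2
snn_sim = shared / 4                     # 0.5
print(snn_sim)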
+     def _build_hybrid_graph(
+         self,
+         similarity_matrix: np.ndarray,
+         knn_indices: np.ndarray,
+     ) -> sparse.csr_matrix:
+         """
+         Build hybrid graph combining MkNN and SNN.
+
+         hybrid = (1 - snn_weight) * mknn_edge + snn_weight * snn_edge
+         """
+         mknn_graph = self._build_mutual_knn_graph(similarity_matrix, knn_indices)
+         snn_graph = self._build_snn_graph(similarity_matrix, knn_indices)
+
+         # Normalize graphs to [0, 1]
+         mknn_max = mknn_graph.max()
+         snn_max = snn_graph.max()
+
+         if mknn_max > 0:
+             mknn_graph = mknn_graph / mknn_max
+         if snn_max > 0:
+             snn_graph = snn_graph / snn_max
+
+         # Combine with weights
+         alpha = 1 - self.snn_weight  # MkNN weight
+         hybrid = alpha * mknn_graph + self.snn_weight * snn_graph
+
+         return hybrid
+
+
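The hybrid edge weight is a convex combination of the two normalized graphs. A small numeric sketch with illustrative values:

snn_weight = 0.5
alpha = 1 - snn_weight          # MkNN weight

mknn_edge = 0.8                 # normalized MkNN weight for some pair (i, j)
snn_edge = 0.5                  # normalized SNN weight for the same pair

hybrid_edge = alpha * mknn_edge + snn_weight * snn_edge
print(hybrid_edge)              # 0.65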
+ class MultiViewGraphBuilder:
+     """
+     Builds multi-view graphs by combining semantic, lexical, and metadata similarities.
+     """
+
+     def __init__(
+         self,
+         n_neighbors: int = 15,
+         graph_type: str = "hybrid",
+         snn_weight: float = 0.5,
+         semantic_weight: float = 0.5,
+         lexical_weight: float = 0.3,
+         metadata_weight: float = 0.2,
+     ):
+         """
+         Initialize the multi-view graph builder.
+
+         Parameters
+         ----------
+         n_neighbors : int
+             Number of neighbors for kNN
+         graph_type : str
+             Type of graph
+         snn_weight : float
+             Weight for SNN in hybrid
+         semantic_weight : float
+             Weight for semantic view
+         lexical_weight : float
+             Weight for lexical view
+         metadata_weight : float
+             Weight for metadata view
+         """
+         self.n_neighbors = n_neighbors
+         self.graph_type = graph_type
+         self.snn_weight = snn_weight
+         self.semantic_weight = semantic_weight
+         self.lexical_weight = lexical_weight
+         self.metadata_weight = metadata_weight
+
+         self.graph_builder = GraphBuilder(
+             n_neighbors=n_neighbors,
+             graph_type=graph_type,
+             snn_weight=snn_weight,
+         )
+
+     def build(
+         self,
+         semantic_similarity: np.ndarray,
+         lexical_similarity: Optional[np.ndarray] = None,
+         metadata_similarity: Optional[np.ndarray] = None,
+     ) -> sparse.csr_matrix:
+         """
+         Build multi-view graph.
+
+         Parameters
+         ----------
+         semantic_similarity : np.ndarray
+             Semantic similarity matrix (from embeddings)
+         lexical_similarity : np.ndarray, optional
+             Lexical similarity matrix (from TF-IDF/BM25)
+         metadata_similarity : np.ndarray, optional
+             Metadata similarity matrix
+
+         Returns
+         -------
+         sparse.csr_matrix
+             Combined multi-view graph
+         """
+         # Normalize weights based on available views
+         weights = {'semantic': self.semantic_weight}
+         if lexical_similarity is not None:
+             weights['lexical'] = self.lexical_weight
+         if metadata_similarity is not None:
+             weights['metadata'] = self.metadata_weight
+
+         total_weight = sum(weights.values())
+         weights = {k: v / total_weight for k, v in weights.items()}
+
+         # Build combined similarity matrix
+         combined_similarity = weights['semantic'] * semantic_similarity
+
+         if lexical_similarity is not None:
+             combined_similarity += weights['lexical'] * lexical_similarity
+
+         if metadata_similarity is not None:
+             combined_similarity += weights['metadata'] * metadata_similarity
+
+         # Build graph from combined similarity
+         return self.graph_builder.build_from_similarity(combined_similarity)
+
+
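A usage sketch for the multi-view builder; the two similarity matrices here are random stand-ins for real embedding and TF-IDF similarities:

import numpy as np
from tritopic.core.graph import MultiViewGraphBuilder

rng = np.random.default_rng(0)

def random_similarity(n):
    """Symmetric matrix in [0, 1] with unit diagonal, for illustration only."""
    s = rng.random((n, n))
    s = 0.5 * (s + s.T)
    np.fill_diagonal(s, 1.0)
    return s

semantic = random_similarity(50)
lexical = random_similarity(50)

builder = MultiViewGraphBuilder(semantic_weight=0.6, lexical_weight=0.4)
graph = builder.build(semantic, lexical_similarity=lexical)
# With no metadata view, weights are renormalized over the views provided:
# 0.6 / (0.6 + 0.4) = 0.6 and 0.4 / (0.6 + 0.4) = 0.4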
+ def compute_lexical_similarity(
+     documents: list,
+     tokenized_documents: list,
+     method: str = "tfidf",
+     ngram_range: tuple = (1, 2),
+     max_features: int = 10000,
+     stopwords: set = None,
+ ) -> np.ndarray:
+     """
+     Compute lexical similarity matrix from documents.
+
+     Parameters
+     ----------
+     documents : list
+         Original documents (currently unused; both methods operate on
+         tokenized_documents)
+     tokenized_documents : list
+         Tokenized documents
+     method : str
+         Method: "tfidf" or "bm25"
+     ngram_range : tuple
+         N-gram range
+     max_features : int
+         Maximum vocabulary size
+     stopwords : set
+         Stopwords to exclude
+
+     Returns
+     -------
+     np.ndarray
+         Lexical similarity matrix
+     """
+     from sklearn.feature_extraction.text import TfidfVectorizer
+
+     # Convert tokenized documents back to strings for vectorizer
+     processed_docs = [' '.join(tokens) for tokens in tokenized_documents]
+
+     if method == "tfidf":
+         vectorizer = TfidfVectorizer(
+             ngram_range=ngram_range,
+             max_features=max_features,
+             stop_words=list(stopwords) if stopwords else None,
+         )
+         tfidf_matrix = vectorizer.fit_transform(processed_docs)
+
+         # Compute cosine similarity (rows are L2-normalized by TfidfVectorizer)
+         similarity = (tfidf_matrix @ tfidf_matrix.T).toarray()
+
+     elif method == "bm25":
+         try:
+             from rank_bm25 import BM25Okapi
+
+             bm25 = BM25Okapi(tokenized_documents)
+
+             n = len(tokenized_documents)
+             similarity = np.zeros((n, n))
+
+             for i, tokens in enumerate(tokenized_documents):
+                 scores = bm25.get_scores(tokens)
+                 similarity[i] = scores
+
+             # BM25 scores are query-dependent, so the raw matrix is
+             # asymmetric; symmetrize by averaging both directions
+             similarity = 0.5 * (similarity + similarity.T)
+
+             # Normalize to [0, 1]
+             max_val = similarity.max()
+             if max_val > 0:
+                 similarity = similarity / max_val
+
+         except ImportError:
+             warnings.warn("rank_bm25 not installed, falling back to TF-IDF")
+             return compute_lexical_similarity(
+                 documents, tokenized_documents, "tfidf", ngram_range, max_features, stopwords
+             )
+     else:
+         raise ValueError(f"Unknown method: {method}")
+
+     # Ensure diagonal is 1
+     np.fill_diagonal(similarity, 1.0)
+
+     # Clip to [0, 1]
+     similarity = np.clip(similarity, 0, 1)
+
+     return similarity
+
+
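A minimal call with a toy corpus (the whitespace tokenization is just for illustration):

from tritopic.core.graph import compute_lexical_similarity

docs = [
    "graphs connect similar documents",
    "documents form topic clusters",
    "bananas are yellow",
]
tokens = [d.split() for d in docs]

sim = compute_lexical_similarity(docs, tokens, method="tfidf", ngram_range=(1, 1))
print(sim.shape)  # (3, 3); unit diagonal, off-diagonal cosine similarities in [0, 1]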
+ def compute_metadata_similarity(
+     metadata: "pd.DataFrame",
+     categorical_cols: list = None,
+     numerical_cols: list = None,
+ ) -> np.ndarray:
+     """
+     Compute similarity matrix from metadata.
+
+     Parameters
+     ----------
+     metadata : pd.DataFrame
+         Metadata dataframe
+     categorical_cols : list
+         Categorical column names
+     numerical_cols : list
+         Numerical column names
+
+     Returns
+     -------
+     np.ndarray
+         Metadata similarity matrix
+     """
+     n = len(metadata)
+     similarity = np.zeros((n, n))
+
+     # Auto-detect column types if not specified
+     if categorical_cols is None and numerical_cols is None:
+         categorical_cols = metadata.select_dtypes(include=['object', 'category']).columns.tolist()
+         numerical_cols = metadata.select_dtypes(include=['number']).columns.tolist()
+
+     # Categorical similarity: fraction of categorical columns with equal values
+     if categorical_cols:
+         for col in categorical_cols:
+             values = metadata[col].values
+             matches = values[:, None] == values[None, :]
+             similarity += matches / len(categorical_cols)
+
+     # Numerical similarity (1 - normalized distance)
+     if numerical_cols:
+         from sklearn.preprocessing import MinMaxScaler
+
+         num_data = metadata[numerical_cols].values
+         scaler = MinMaxScaler()
+         num_scaled = scaler.fit_transform(num_data)
+
+         from scipy.spatial.distance import cdist
+         distances = cdist(num_scaled, num_scaled, metric='euclidean')
+         max_dist = distances.max()
+         if max_dist > 0:
+             num_similarity = 1 - distances / max_dist
+         else:
+             num_similarity = np.ones((n, n))
+
+         # Combine with categorical
+         if categorical_cols:
+             similarity = 0.5 * similarity + 0.5 * num_similarity
+         else:
+             similarity = num_similarity
+
+     # Ensure diagonal is 1
+     np.fill_diagonal(similarity, 1.0)
+
+     return similarity
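And a sketch for the metadata view, with a hypothetical two-column frame:

import pandas as pd
from tritopic.core.graph import compute_metadata_similarity

meta = pd.DataFrame({
    "source": ["blog", "blog", "paper"],   # categorical
    "year": [2021, 2022, 2022],            # numerical
})

sim = compute_metadata_similarity(meta)
# Column types are auto-detected; rows 0 and 1 share "source",
# rows 1 and 2 share "year", and the two views are averaged 50/50.
print(sim)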