tritopic 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.

Potentially problematic release.


This version of tritopic might be problematic.

@@ -1,331 +1,317 @@
 """
-Consensus Leiden Clustering
-============================
+Clustering Module

-Robust community detection with:
-- Leiden algorithm (better than Louvain)
-- Consensus clustering for stability
-- Resolution parameter tuning
+Implements Consensus Leiden clustering for stable topic discovery.
 """

-from __future__ import annotations
-
-from typing import Any
-
+from typing import List, Optional, Tuple
 import numpy as np
-from scipy.cluster.hierarchy import linkage, fcluster
-from sklearn.metrics import adjusted_rand_score
-from collections import Counter
+from scipy import sparse
+import warnings


 class ConsensusLeiden:
     """
-    Leiden clustering with consensus for stability.
-
-    Runs multiple Leiden clusterings with different seeds and combines
-    results using consensus clustering. This dramatically improves
-    reproducibility and reduces sensitivity to random initialization.
+    Consensus clustering using Leiden algorithm.

-    Parameters
-    ----------
-    resolution : float
-        Resolution parameter for Leiden. Higher = more clusters. Default: 1.0
-    n_runs : int
-        Number of consensus runs. Default: 10
-    random_state : int
-        Random seed for reproducibility. Default: 42
-    consensus_threshold : float
-        Minimum agreement ratio for consensus. Default: 0.5
+    Runs Leiden multiple times and builds a co-assignment matrix
+    to find stable cluster assignments.
     """

     def __init__(
         self,
         resolution: float = 1.0,
         n_runs: int = 10,
-        random_state: int = 42,
-        consensus_threshold: float = 0.5,
+        min_cluster_size: int = 5,
+        random_state: Optional[int] = 42,
     ):
+        """
+        Initialize the consensus clustering.
+
+        Parameters
+        ----------
+        resolution : float
+            Resolution parameter for Leiden
+        n_runs : int
+            Number of clustering runs
+        min_cluster_size : int
+            Minimum cluster size
+        random_state : int, optional
+            Random seed for reproducibility
+        """
         self.resolution = resolution
         self.n_runs = n_runs
+        self.min_cluster_size = min_cluster_size
         self.random_state = random_state
-        self.consensus_threshold = consensus_threshold

-        self.labels_: np.ndarray | None = None
-        self.stability_score_: float | None = None
-        self._all_partitions: list[np.ndarray] = []
+        self._check_dependencies()
+
+    def _check_dependencies(self):
+        """Check if required packages are installed."""
+        try:
+            import leidenalg
+            import igraph
+        except ImportError:
+            raise ImportError(
+                "leidenalg and python-igraph are required for clustering. "
+                "Install with: pip install leidenalg python-igraph"
+            )

     def fit_predict(
         self,
-        graph: "igraph.Graph",
-        min_cluster_size: int = 5,
-        resolution: float | None = None,
+        graph: sparse.csr_matrix,
     ) -> np.ndarray:
         """
-        Fit Leiden clustering with consensus.
+        Fit the consensus clustering and predict labels.

         Parameters
         ----------
-        graph : igraph.Graph
-            Input graph with edge weights.
-        min_cluster_size : int
-            Minimum cluster size. Smaller clusters become outliers.
-        resolution : float, optional
-            Override default resolution.
+        graph : sparse.csr_matrix
+            Adjacency matrix of the document graph

         Returns
         -------
-        labels : np.ndarray
-            Cluster assignments. -1 for outliers.
+        np.ndarray
+            Cluster labels for each document
         """
-        import leidenalg as la
+        import leidenalg
+        import igraph as ig
+
+        n = graph.shape[0]

-        res = resolution or self.resolution
-        n_nodes = graph.vcount()
+        # Convert sparse matrix to igraph
+        sources, targets = graph.nonzero()
+        weights = np.array(graph[sources, targets]).flatten()

-        # Run multiple Leiden clusterings
-        self._all_partitions = []
+        # Create igraph graph
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()
+
+        # Run Leiden multiple times
+        partitions = []

         for run in range(self.n_runs):
-            seed = self.random_state + run
+            seed = None if self.random_state is None else self.random_state + run

-            # Run Leiden
-            partition = la.find_partition(
-                graph,
-                la.RBConfigurationVertexPartition,
-                weights="weight",
-                resolution_parameter=res,
+            partition = leidenalg.find_partition(
+                g,
+                leidenalg.RBConfigurationVertexPartition,
+                weights='weight',
+                resolution_parameter=self.resolution,
                 seed=seed,
             )

-            # Convert to labels
             labels = np.array(partition.membership)
-            self._all_partitions.append(labels)
+            partitions.append(labels)

-        # Compute consensus
-        self.labels_ = self._compute_consensus(self._all_partitions)
+        # Build co-assignment matrix
+        co_assignment = self._build_co_assignment_matrix(partitions)

-        # Handle small clusters as outliers
-        self.labels_ = self._handle_small_clusters(self.labels_, min_cluster_size)
+        # Final clustering on co-assignment matrix
+        final_labels = self._final_clustering(co_assignment, g)

-        # Compute stability score
-        self.stability_score_ = self._compute_stability()
+        # Apply minimum cluster size constraint
+        final_labels = self._apply_min_cluster_size(final_labels)

-        return self.labels_
+        return final_labels

-    def _compute_consensus(self, partitions: list[np.ndarray]) -> np.ndarray:
+    def _build_co_assignment_matrix(
+        self,
+        partitions: List[np.ndarray],
+    ) -> np.ndarray:
         """
-        Compute consensus partition from multiple runs.
+        Build co-assignment matrix from multiple partitions.

-        Uses co-occurrence matrix and hierarchical clustering.
+        C[i,j] = fraction of runs where i and j are in the same cluster
         """
-        n_nodes = len(partitions[0])
-        n_runs = len(partitions)
-
-        # Build co-occurrence matrix
-        # co_occur[i,j] = fraction of runs where i and j are in same cluster
-        co_occur = np.zeros((n_nodes, n_nodes))
-
-        for partition in partitions:
-            for cluster_id in np.unique(partition):
-                members = np.where(partition == cluster_id)[0]
-                for i in members:
-                    for j in members:
-                        co_occur[i, j] += 1
-
-        co_occur /= n_runs
-
-        # Convert co-occurrence to distance
-        distance = 1 - co_occur
-
-        # Hierarchical clustering on distance matrix
-        # Use condensed form for linkage
-        condensed = []
-        for i in range(n_nodes):
-            for j in range(i + 1, n_nodes):
-                condensed.append(distance[i, j])
-        condensed = np.array(condensed)
-
-        # Average linkage tends to work well for consensus
-        Z = linkage(condensed, method="average")
-
-        # Cut at threshold that matches approximate number of clusters
-        # from the most frequent partition
-        n_clusters_list = [len(np.unique(p)) for p in partitions]
-        median_n_clusters = int(np.median(n_clusters_list))
-
-        # Find optimal cut
-        best_labels = None
-        best_score = -1
-
-        for n_clusters in range(max(2, median_n_clusters - 2), median_n_clusters + 3):
-            try:
-                labels = fcluster(Z, n_clusters, criterion="maxclust")
-                labels = labels - 1  # 0-indexed
-
-                # Score by average ARI with original partitions
-                ari_scores = [adjusted_rand_score(labels, p) for p in partitions]
-                avg_ari = np.mean(ari_scores)
-
-                if avg_ari > best_score:
-                    best_score = avg_ari
-                    best_labels = labels
-            except Exception:
-                continue
-
-        if best_labels is None:
-            # Fallback to most common partition
-            best_labels = partitions[0]
-
-        return best_labels
+        n = len(partitions[0])
+        co_assignment = np.zeros((n, n))
+
+        for labels in partitions:
+            # Documents with same label get co-assignment
+            for label in np.unique(labels):
+                mask = labels == label
+                indices = np.where(mask)[0]
+                for i in indices:
+                    for j in indices:
+                        co_assignment[i, j] += 1
+
+        # Normalize by number of runs
+        co_assignment /= len(partitions)
+
+        return co_assignment
+
+    def _final_clustering(
+        self,
+        co_assignment: np.ndarray,
+        original_graph: "igraph.Graph",
+    ) -> np.ndarray:
+        """
+        Perform final clustering on the co-assignment matrix.
+        """
+        import leidenalg
+        import igraph as ig
+
+        n = co_assignment.shape[0]
+
+        # Threshold the co-assignment matrix
+        # Keep only edges where co-assignment > 0.5 (majority of runs)
+        threshold = 0.5
+        adjacency = np.where(co_assignment > threshold, co_assignment, 0)
+
+        # Create graph from co-assignment
+        sources, targets = np.where(adjacency > 0)
+        weights = adjacency[sources, targets]
+
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()
+
+        # Final Leiden run
+        partition = leidenalg.find_partition(
+            g,
+            leidenalg.RBConfigurationVertexPartition,
+            weights='weight',
+            resolution_parameter=self.resolution,
+            seed=self.random_state,
+        )
+
+        return np.array(partition.membership)

-    def _handle_small_clusters(
+    def _apply_min_cluster_size(
         self,
         labels: np.ndarray,
-        min_size: int,
     ) -> np.ndarray:
-        """Mark small clusters as outliers (-1)."""
-        result = labels.copy()
+        """
+        Apply minimum cluster size constraint.

-        for cluster_id in np.unique(labels):
-            if cluster_id == -1:
-                continue
-
-            size = np.sum(labels == cluster_id)
-            if size < min_size:
-                result[labels == cluster_id] = -1
+        Small clusters are marked as outliers (-1).
+        """
+        unique_labels, counts = np.unique(labels, return_counts=True)

-        # Relabel to consecutive integers
-        unique_labels = sorted([l for l in np.unique(result) if l != -1])
-        label_map = {old: new for new, old in enumerate(unique_labels)}
-        label_map[-1] = -1
+        # Find small clusters
+        small_clusters = unique_labels[counts < self.min_cluster_size]

-        result = np.array([label_map[l] for l in result])
+        # Mark as outliers
+        result = labels.copy()
+        for small_label in small_clusters:
+            result[labels == small_label] = -1
+
+        # Renumber remaining clusters from 0
+        if len(np.unique(result[result >= 0])) > 0:
+            unique_valid = np.unique(result[result >= 0])
+            label_map = {old: new for new, old in enumerate(unique_valid)}
+
+            for old, new in label_map.items():
+                result[labels == old] = new

         return result

-    def _compute_stability(self) -> float:
-        """Compute stability score as average pairwise ARI."""
-        if len(self._all_partitions) < 2:
-            return 1.0
-
-        ari_scores = []
-        for i in range(len(self._all_partitions)):
-            for j in range(i + 1, len(self._all_partitions)):
-                ari = adjusted_rand_score(
-                    self._all_partitions[i],
-                    self._all_partitions[j]
-                )
-                ari_scores.append(ari)
-
-        return float(np.mean(ari_scores))
-
     def find_optimal_resolution(
         self,
-        graph: "igraph.Graph",
-        resolution_range: tuple[float, float] = (0.1, 2.0),
+        graph: sparse.csr_matrix,
+        resolution_range: Tuple[float, float] = (0.1, 2.0),
         n_steps: int = 10,
-        target_n_topics: int | None = None,
+        target_n_topics: Optional[int] = None,
     ) -> float:
         """
         Find optimal resolution parameter.

         Parameters
         ----------
-        graph : igraph.Graph
-            Input graph.
+        graph : sparse.csr_matrix
+            Document graph
         resolution_range : tuple
-            Range of resolutions to search.
+            Range of resolutions to search
         n_steps : int
-            Number of resolutions to try.
+            Number of steps in search
         target_n_topics : int, optional
-            If provided, find resolution closest to this number of topics.
+            Target number of topics

         Returns
         -------
-        optimal_resolution : float
-            Best resolution parameter.
+        float
+            Optimal resolution
         """
-        import leidenalg as la
-
         resolutions = np.linspace(resolution_range[0], resolution_range[1], n_steps)
-        results = []
+        best_resolution = self.resolution
+        best_score = float('-inf')

         for res in resolutions:
-            partition = la.find_partition(
-                graph,
-                la.RBConfigurationVertexPartition,
-                weights="weight",
-                resolution_parameter=res,
-                seed=self.random_state,
-            )
+            self.resolution = res
+            labels = self.fit_predict(graph)

-            n_clusters = len(set(partition.membership))
-            modularity = partition.modularity
+            n_topics = len(np.unique(labels[labels >= 0]))

-            results.append({
-                "resolution": res,
-                "n_clusters": n_clusters,
-                "modularity": modularity,
-            })
-
-        if target_n_topics is not None:
-            # Find closest to target
-            best = min(results, key=lambda x: abs(x["n_clusters"] - target_n_topics))
-        else:
-            # Find highest modularity
-            best = max(results, key=lambda x: x["modularity"])
-
-        return best["resolution"]
-
-
-class HDBSCANClusterer:
-    """
-    Alternative clustering using HDBSCAN.
-
-    Useful for datasets with varying density or many outliers.
-    """
-
-    def __init__(
-        self,
-        min_cluster_size: int = 10,
-        min_samples: int = 5,
-        metric: str = "euclidean",
-    ):
-        self.min_cluster_size = min_cluster_size
-        self.min_samples = min_samples
-        self.metric = metric
+            if target_n_topics is not None:
+                # Score based on closeness to target
+                score = -abs(n_topics - target_n_topics)
+            else:
+                # Score based on modularity (higher is better)
+                score = self._compute_modularity(graph, labels)
+
+            if score > best_score:
+                best_score = score
+                best_resolution = res

-        self.labels_: np.ndarray | None = None
-        self.probabilities_: np.ndarray | None = None
+        self.resolution = best_resolution
+        return best_resolution

-    def fit_predict(
+    def _compute_modularity(
         self,
-        embeddings: np.ndarray,
-        **kwargs,
-    ) -> np.ndarray:
-        """
-        Fit HDBSCAN clustering.
+        graph: sparse.csr_matrix,
+        labels: np.ndarray,
+    ) -> float:
+        """Compute modularity of a partition."""
+        import igraph as ig

-        Parameters
-        ----------
-        embeddings : np.ndarray
-            Document embeddings (optionally reduced with UMAP first).
-
-        Returns
-        -------
-        labels : np.ndarray
-            Cluster assignments. -1 for outliers.
-        """
-        import hdbscan
+        n = graph.shape[0]
+        sources, targets = graph.nonzero()
+        weights = np.array(graph[sources, targets]).flatten()

-        clusterer = hdbscan.HDBSCAN(
-            min_cluster_size=self.min_cluster_size,
-            min_samples=self.min_samples,
-            metric=self.metric,
-            **kwargs,
-        )
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()

-        self.labels_ = clusterer.fit_predict(embeddings)
-        self.probabilities_ = clusterer.probabilities_
+        # Filter out outliers
+        valid_labels = labels.copy()
+        valid_labels[labels < 0] = 0  # Temporarily assign to cluster 0

-        return self.labels_
+        return g.modularity(valid_labels.tolist(), weights='weight')
+
+
+def compute_clustering_stability(
+    labels1: np.ndarray,
+    labels2: np.ndarray,
+) -> float:
+    """
+    Compute stability between two label assignments using Adjusted Rand Index.
+
+    Parameters
+    ----------
+    labels1 : np.ndarray
+        First label assignment
+    labels2 : np.ndarray
+        Second label assignment
+
+    Returns
+    -------
+    float
+        Adjusted Rand Index (0-1, higher is more stable)
+    """
+    from sklearn.metrics import adjusted_rand_score
+
+    # Filter out outliers from both
+    mask = (labels1 >= 0) & (labels2 >= 0)
+
+    if mask.sum() < 2:
+        return 0.0
+
+    return adjusted_rand_score(labels1[mask], labels2[mask])
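For orientation, below is a minimal usage sketch of the 1.1.0 interface introduced in this diff. It assumes the module is importable as tritopic.clustering (the diff shows only the module body, not its path inside the package), that leidenalg and python-igraph are installed, and the toy graph, variable names, and parameter values are illustrative only.

# Minimal sketch against the 1.1.0 API shown above.
# Assumption: the module ships as `tritopic.clustering`; this diff does not show the import path.
import numpy as np
from scipy import sparse

from tritopic.clustering import ConsensusLeiden, compute_clustering_stability

# Toy symmetric adjacency matrix for 6 documents forming two dense blocks.
rows = [0, 1, 0, 2, 1, 2, 3, 4, 3, 5, 4, 5]
cols = [1, 0, 2, 0, 2, 1, 4, 3, 5, 3, 5, 4]
graph = sparse.csr_matrix(([1.0] * len(rows), (rows, cols)), shape=(6, 6))

# min_cluster_size=1 keeps the tiny toy clusters from being marked as outliers (-1).
clusterer = ConsensusLeiden(resolution=1.0, n_runs=5, min_cluster_size=1, random_state=42)
labels = clusterer.fit_predict(graph)

# Stability between two independent fits, via the new module-level helper (Adjusted Rand Index).
other = ConsensusLeiden(resolution=1.0, n_runs=5, min_cluster_size=1, random_state=7)
print(labels, compute_clustering_stability(labels, other.fit_predict(graph)))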