tritopic 0.1.0-py3-none-any.whl → 1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tritopic/__init__.py +22 -32
- tritopic/config.py +305 -0
- tritopic/core/__init__.py +0 -17
- tritopic/core/clustering.py +229 -243
- tritopic/core/embeddings.py +151 -157
- tritopic/core/graph.py +435 -0
- tritopic/core/keywords.py +213 -249
- tritopic/core/refinement.py +231 -0
- tritopic/core/representatives.py +560 -0
- tritopic/labeling.py +313 -0
- tritopic/model.py +718 -0
- tritopic/multilingual/__init__.py +38 -0
- tritopic/multilingual/detection.py +208 -0
- tritopic/multilingual/stopwords.py +467 -0
- tritopic/multilingual/tokenizers.py +275 -0
- tritopic/visualization.py +371 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/METADATA +92 -48
- tritopic-1.0.0.dist-info/RECORD +20 -0
- tritopic/core/graph_builder.py +0 -493
- tritopic/core/model.py +0 -810
- tritopic/labeling/__init__.py +0 -5
- tritopic/labeling/llm_labeler.py +0 -279
- tritopic/utils/__init__.py +0 -13
- tritopic/utils/metrics.py +0 -254
- tritopic/visualization/__init__.py +0 -5
- tritopic/visualization/plotter.py +0 -523
- tritopic-0.1.0.dist-info/RECORD +0 -18
- tritopic-0.1.0.dist-info/licenses/LICENSE +0 -21
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/WHEEL +0 -0
- {tritopic-0.1.0.dist-info → tritopic-1.0.0.dist-info}/top_level.txt +0 -0
tritopic/core/clustering.py
CHANGED
@@ -1,331 +1,317 @@
 """
-============================
+Clustering Module
 
-- Leiden algorithm (better than Louvain)
-- Consensus clustering for stability
-- Resolution parameter tuning
+Implements Consensus Leiden clustering for stable topic discovery.
 """
 
-from typing import Any
+from typing import List, Optional, Tuple
 import numpy as np
-from collections import Counter
+from scipy import sparse
+import warnings
 
 
 class ConsensusLeiden:
     """
-    Runs multiple Leiden clusterings with different seeds and combines
-    results using consensus clustering. This dramatically improves
-    reproducibility and reduces sensitivity to random initialization.
+    Consensus clustering using Leiden algorithm.
 
-    resolution : float
-        Resolution parameter for Leiden. Higher = more clusters. Default: 1.0
-    n_runs : int
-        Number of consensus runs. Default: 10
-    random_state : int
-        Random seed for reproducibility. Default: 42
-    consensus_threshold : float
-        Minimum agreement ratio for consensus. Default: 0.5
+    Runs Leiden multiple times and builds a co-assignment matrix
+    to find stable cluster assignments.
     """
 
     def __init__(
         self,
         resolution: float = 1.0,
         n_runs: int = 10,
+        min_cluster_size: int = 5,
+        random_state: Optional[int] = 42,
     ):
+        """
+        Initialize the consensus clustering.
+
+        Parameters
+        ----------
+        resolution : float
+            Resolution parameter for Leiden
+        n_runs : int
+            Number of clustering runs
+        min_cluster_size : int
+            Minimum cluster size
+        random_state : int, optional
+            Random seed for reproducibility
+        """
         self.resolution = resolution
         self.n_runs = n_runs
+        self.min_cluster_size = min_cluster_size
         self.random_state = random_state
-        self.consensus_threshold = consensus_threshold
 
+        self._check_dependencies()
+
+    def _check_dependencies(self):
+        """Check if required packages are installed."""
+        try:
+            import leidenalg
+            import igraph
+        except ImportError:
+            raise ImportError(
+                "leidenalg and python-igraph are required for clustering. "
+                "Install with: pip install leidenalg python-igraph"
+            )
 
     def fit_predict(
         self,
-        min_cluster_size: int = 5,
-        resolution: float | None = None,
+        graph: sparse.csr_matrix,
    ) -> np.ndarray:
         """
+        Fit the consensus clustering and predict labels.
 
         Parameters
         ----------
-        min_cluster_size : int
-            Minimum cluster size. Smaller clusters become outliers.
-        resolution : float, optional
-            Override default resolution.
+        graph : sparse.csr_matrix
+            Adjacency matrix of the document graph
 
         Returns
         -------
+        np.ndarray
+            Cluster labels for each document
         """
-        import leidenalg
+        import leidenalg
+        import igraph as ig
+
+        n = graph.shape[0]
 
+        # Convert sparse matrix to igraph
+        sources, targets = graph.nonzero()
+        weights = np.array(graph[sources, targets]).flatten()
 
+        # Create igraph graph
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()
+
+        # Run Leiden multiple times
+        partitions = []
 
         for run in range(self.n_runs):
-            seed = self.random_state + run
+            seed = None if self.random_state is None else self.random_state + run
 
-                resolution_parameter=res,
+            partition = leidenalg.find_partition(
+                g,
+                leidenalg.RBConfigurationVertexPartition,
+                weights='weight',
+                resolution_parameter=self.resolution,
                 seed=seed,
             )
 
-            # Convert to labels
             labels = np.array(partition.membership)
+            partitions.append(labels)
 
+        # Build co-assignment matrix
+        co_assignment = self._build_co_assignment_matrix(partitions)
 
+        # Final clustering on co-assignment matrix
+        final_labels = self._final_clustering(co_assignment, g)
 
+        # Apply minimum cluster size constraint
+        final_labels = self._apply_min_cluster_size(final_labels)
 
+        return final_labels
 
+    def _build_co_assignment_matrix(
+        self,
+        partitions: List[np.ndarray],
+    ) -> np.ndarray:
         """
+        Build co-assignment matrix from multiple partitions.
 
+        C[i,j] = fraction of runs where i and j are in the same cluster
         """
-        # Fallback to most common partition
-        best_labels = partitions[0]
-        return best_labels
+        n = len(partitions[0])
+        co_assignment = np.zeros((n, n))
+
+        for labels in partitions:
+            # Documents with same label get co-assignment
+            for label in np.unique(labels):
+                mask = labels == label
+                indices = np.where(mask)[0]
+                for i in indices:
+                    for j in indices:
+                        co_assignment[i, j] += 1
+
+        # Normalize by number of runs
+        co_assignment /= len(partitions)
+
+        return co_assignment
+
+    def _final_clustering(
+        self,
+        co_assignment: np.ndarray,
+        original_graph: "igraph.Graph",
+    ) -> np.ndarray:
+        """
+        Perform final clustering on the co-assignment matrix.
+        """
+        import leidenalg
+        import igraph as ig
+
+        n = co_assignment.shape[0]
+
+        # Threshold the co-assignment matrix
+        # Keep only edges where co-assignment > 0.5 (majority of runs)
+        threshold = 0.5
+        adjacency = np.where(co_assignment > threshold, co_assignment, 0)
+
+        # Create graph from co-assignment
+        sources, targets = np.where(adjacency > 0)
+        weights = adjacency[sources, targets]
+
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()
+
+        # Final Leiden run
+        partition = leidenalg.find_partition(
+            g,
+            leidenalg.RBConfigurationVertexPartition,
+            weights='weight',
+            resolution_parameter=self.resolution,
+            seed=self.random_state,
+        )
+
+        return np.array(partition.membership)
 
-    def
+    def _apply_min_cluster_size(
         self,
         labels: np.ndarray,
-        min_size: int,
     ) -> np.ndarray:
-        """
+        """
+        Apply minimum cluster size constraint.
 
-            size = np.sum(labels == cluster_id)
-            if size < min_size:
-                result[labels == cluster_id] = -1
+        Small clusters are marked as outliers (-1).
+        """
+        unique_labels, counts = np.unique(labels, return_counts=True)
 
-        label_map = {old: new for new, old in enumerate(unique_labels)}
-        label_map[-1] = -1
+        # Find small clusters
+        small_clusters = unique_labels[counts < self.min_cluster_size]
 
+        # Mark as outliers
+        result = labels.copy()
+        for small_label in small_clusters:
+            result[labels == small_label] = -1
+
+        # Renumber remaining clusters from 0
+        if len(np.unique(result[result >= 0])) > 0:
+            unique_valid = np.unique(result[result >= 0])
+            label_map = {old: new for new, old in enumerate(unique_valid)}
+
+            for old, new in label_map.items():
+                result[labels == old] = new
 
         return result
 
-    def _compute_stability(self) -> float:
-        """Compute stability score as average pairwise ARI."""
-        if len(self._all_partitions) < 2:
-            return 1.0
-
-        ari_scores = []
-        for i in range(len(self._all_partitions)):
-            for j in range(i + 1, len(self._all_partitions)):
-                ari = adjusted_rand_score(
-                    self._all_partitions[i],
-                    self._all_partitions[j]
-                )
-                ari_scores.append(ari)
-
-        return float(np.mean(ari_scores))
-
     def find_optimal_resolution(
         self,
+        graph: sparse.csr_matrix,
+        resolution_range: Tuple[float, float] = (0.1, 2.0),
         n_steps: int = 10,
+        target_n_topics: Optional[int] = None,
     ) -> float:
         """
         Find optimal resolution parameter.
 
         Parameters
         ----------
+        graph : sparse.csr_matrix
+            Document graph
         resolution_range : tuple
-            Range of resolutions to search
+            Range of resolutions to search
         n_steps : int
-            Number of
+            Number of steps in search
         target_n_topics : int, optional
+            Target number of topics
 
         Returns
         -------
+        float
+            Optimal resolution
         """
-        import leidenalg as la
         resolutions = np.linspace(resolution_range[0], resolution_range[1], n_steps)
+        best_resolution = self.resolution
+        best_score = float('-inf')
 
         for res in resolutions:
-                la.RBConfigurationVertexPartition,
-                weights="weight",
-                resolution_parameter=res,
-                seed=self.random_state,
-            )
+            self.resolution = res
+            labels = self.fit_predict(graph)
 
-            modularity = partition.modularity
+            n_topics = len(np.unique(labels[labels >= 0]))
 
-        # Find highest modularity
-        best = max(results, key=lambda x: x["modularity"])
-
-        return best["resolution"]
-
-
-class HDBSCANClusterer:
-    """
-    Alternative clustering using HDBSCAN.
-
-    Useful for datasets with varying density or many outliers.
-    """
-
-    def __init__(
-        self,
-        min_cluster_size: int = 10,
-        min_samples: int = 5,
-        metric: str = "euclidean",
-    ):
-        self.min_cluster_size = min_cluster_size
-        self.min_samples = min_samples
-        self.metric = metric
+            if target_n_topics is not None:
+                # Score based on closeness to target
+                score = -abs(n_topics - target_n_topics)
+            else:
+                # Score based on modularity (higher is better)
+                score = self._compute_modularity(graph, labels)
+
+            if score > best_score:
+                best_score = score
+                best_resolution = res
 
+        self.resolution = best_resolution
+        return best_resolution
 
+    def _compute_modularity(
         self,
+        graph: sparse.csr_matrix,
+        labels: np.ndarray,
+    ) -> float:
+        """Compute modularity of a partition."""
+        import igraph as ig
 
-            Document embeddings (optionally reduced with UMAP first).
-
-        Returns
-        -------
-        labels : np.ndarray
-            Cluster assignments. -1 for outliers.
-        """
-        import hdbscan
+        n = graph.shape[0]
+        sources, targets = graph.nonzero()
+        weights = np.array(graph[sources, targets]).flatten()
 
+        g = ig.Graph(directed=False)
+        g.add_vertices(n)
+        edges = list(zip(sources.tolist(), targets.tolist()))
+        g.add_edges(edges)
+        g.es['weight'] = weights.tolist()
 
+        # Filter out outliers
+        valid_labels = labels.copy()
+        valid_labels[labels < 0] = 0  # Temporarily assign to cluster 0
 
+        return g.modularity(valid_labels.tolist(), weights='weight')
+
+
+def compute_clustering_stability(
+    labels1: np.ndarray,
+    labels2: np.ndarray,
+) -> float:
+    """
+    Compute stability between two label assignments using Adjusted Rand Index.
+
+    Parameters
+    ----------
+    labels1 : np.ndarray
+        First label assignment
+    labels2 : np.ndarray
+        Second label assignment
+
+    Returns
+    -------
+    float
+        Adjusted Rand Index (0-1, higher is more stable)
+    """
+    from sklearn.metrics import adjusted_rand_score
+
+    # Filter out outliers from both
+    mask = (labels1 >= 0) & (labels2 >= 0)
+
+    if mask.sum() < 2:
+        return 0.0
+
+    return adjusted_rand_score(labels1[mask], labels2[mask])