compsil 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compsil/__init__.py +1 -0
- compsil/compsil.py +377 -0
- compsil-0.1.0.dist-info/METADATA +377 -0
- compsil-0.1.0.dist-info/RECORD +7 -0
- compsil-0.1.0.dist-info/WHEEL +5 -0
- compsil-0.1.0.dist-info/licenses/LICENSE +21 -0
- compsil-0.1.0.dist-info/top_level.txt +1 -0
compsil/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .compsil import CompSil
|
compsil/compsil.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from sklearn.cluster import KMeans
|
|
4
|
+
from sklearn.metrics import silhouette_score, silhouette_samples
|
|
5
|
+
from sklearn.utils import resample
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
from joblib import Parallel, delayed
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CompSil:
    """
    Composite Silhouette (CompSil)

    Scores candidate cluster counts by blending the micro-averaged Silhouette
    (mean over all points) with the macro-averaged Silhouette (mean of the
    per-cluster means) over repeated KMeans clusterings of subsamples.

    Per subsample b=1..B for each k:
      - compute S_micro^(b), S_macro^(b)
      - Δ_b = S_micro^(b) - S_macro^(b)
      - D = max_b |Δ_b|
      - r_raw_b = Δ_b/(D+eps) in [-1,1]
      - r_b = tanh(r_raw_b)
      - w_b = (1+r_b)/2
      - S_mM^(b) = w_b*S_micro^(b) + (1-w_b)*S_macro^(b)

    Aggregate for each k:
      - mean score: S_mM(k) = mean_b S_mM^(b)
      - std score: std_S_mM(k) = std_b S_mM^(b)
      - standard error: se_S_mM(k) = std_S_mM(k) / sqrt(B_eff)
      - lower conf bound: LCB_S_mM(k) = S_mM(k) - se_S_mM(k)

    Selection:
      - k* = argmax_k S_mM(k)  (or argmax_k LCB_S_mM(k))

    Sampling size mechanism:
      - explicit `sample_size` (int or fraction), or
      - sample_size=None or sample_size="auto": the class chooses a subsample
        size automatically based on dataset size N and the maximum k in
        `k_values`.

    Parameters:
      - data: ndarray or DataFrame, shape (n_samples, n_features)
      - ground_truth: int, optional
          Only used by plot_results(), where it is drawn as a vertical line.
      - k_values: iterable or int, default=range(2, 11)
          A single integer (built-in or NumPy) is treated as one candidate.
      - num_samples: int, default=10 (B); must be >= 1
      - sample_size: int | float | None | "auto", default="auto"
          * int: absolute subsample size m
          * float in (0,1]: treated as fraction f, m = floor(f*N)
          * None or "auto": compute m automatically (see _auto_sample_size)
      - random_state: int, default=42
      - n_jobs: int, default=-1
      - eps: float, default=1e-12

    Raises:
      - ValueError: empty dataset, num_samples < 1, a float sample_size
        outside (0, 1], or a resolved sample size below 2 or above n_samples.
    """

    def __init__(self,
                 data,
                 ground_truth=None,
                 k_values=range(2, 11),
                 num_samples=10,
                 sample_size="auto",
                 random_state=42,
                 n_jobs=-1,
                 eps=1e-12):
        self.data = data
        self.ground_truth = ground_truth

        # NumPy integer scalars are neither instances of the built-in int nor
        # iterable, so they must be recognised explicitly as "a single k".
        if isinstance(k_values, (int, np.integer)):
            self.k_values = [int(k_values)]
        else:
            self.k_values = list(k_values)

        self.num_samples = int(num_samples)
        # Fail fast: with zero subsamples evaluate() has nothing to aggregate
        # and would otherwise fail on tuple unpacking.
        if self.num_samples < 1:
            raise ValueError(f"num_samples must be >= 1, got {self.num_samples}.")

        self.random_state = int(random_state)
        self.n_jobs = int(n_jobs)
        self.eps = float(eps)

        self._results = []
        self.results_df = pd.DataFrame()
        self.score_ = None  # set only if one k is evaluated

        self.n_samples_ = int(len(self.data))
        if self.n_samples_ <= 0:
            raise ValueError("Empty dataset.")

        self.sample_size = self._resolve_sample_size(sample_size)
        self.sample_fraction_ = self.sample_size / self.n_samples_

        if self.sample_size < 2:
            raise ValueError(f"Resolved sample_size={self.sample_size} is too small.")
        if self.sample_size > self.n_samples_:
            raise ValueError(
                f"Resolved sample_size={self.sample_size} is larger than n_samples={self.n_samples_}."
            )

    def _resolve_sample_size(self, sample_size):
        """
        Translate the user-facing `sample_size` argument into an integer m.

        None / "auto" (case-insensitive) delegates to _auto_sample_size();
        a float (built-in or NumPy) is a fraction of n_samples_ and is
        clamped to [2, n_samples_]; anything else is converted with int().
        Range checks on the returned value are performed by __init__.
        """
        n = self.n_samples_

        # Auto
        if sample_size is None or (isinstance(sample_size, str) and sample_size.lower() == "auto"):
            return self._auto_sample_size()

        # Fraction mode (float in (0,1]); np.floating also covers NumPy
        # float types that do not subclass the built-in float.
        if isinstance(sample_size, (float, np.floating)):
            fraction = float(sample_size)
            if not (0.0 < fraction <= 1.0):
                raise ValueError("If sample_size is a float, it must be in (0, 1].")
            m = int(np.floor(fraction * n))
            return max(2, min(m, n))

        # Int mode
        return int(sample_size)

    def _auto_sample_size(self):
        """
        Automatic subsample size selection (no user-facing hyperparameters).

        Heuristic:
          1) Ensure a minimum average points-per-cluster at k_max: m >= 30 * k_max
          2) Use a baseline fraction depending on dataset size:
               - N <= 2000:   0.8N
               - N <= 20000:  0.6N
               - otherwise:   0.4N
          3) Take the maximum of (1) and (2), then clamp to [2, N].
        """
        n = self.n_samples_
        # With no candidates, fall back to k_max=2 so a size is still defined.
        k_max = int(max(self.k_values)) if self.k_values else 2

        m_min = 30 * k_max

        if n <= 2000:
            m_base = int(np.floor(0.80 * n))
        elif n <= 20000:
            m_base = int(np.floor(0.60 * n))
        else:
            m_base = int(np.floor(0.40 * n))

        m = max(m_min, m_base)
        return min(max(2, m), n)

    def evaluate_sample(self, k, i):
        """
        One subsampling iteration for fixed k.

        Draws `sample_size` rows without replacement, clusters them with
        KMeans (n_init=1) and computes both Silhouette aggregates. The seed
        for both the draw and KMeans is `random_state + i`.

        Returns: (smicro, smacro, diff, nan)
          The fourth element is a placeholder and is always NaN: the composite
          score needs D = max_b |Δ_b| over all subsamples, so it is computed
          in evaluate(). All four values are NaN when silhouette_samples
          rejects the labelling with a ValueError.
        """
        seed = self.random_state + i

        sampled_data = resample(
            self.data,
            n_samples=self.sample_size,
            replace=False,
            random_state=seed
        )

        kmeans = KMeans(n_clusters=k, random_state=seed, n_init=1)
        labels = np.asarray(kmeans.fit_predict(sampled_data))

        try:
            s = silhouette_samples(sampled_data, labels)  # compute once
        except ValueError:
            # Only the ValueError from silhouette_samples marks an invalid
            # trial; any other exception is a real error and propagates.
            return np.nan, np.nan, np.nan, np.nan

        # micro silhouette: mean over all points
        smicro = float(np.mean(s))

        # macro silhouette: mean of the per-cluster means
        cluster_means = [float(np.mean(s[labels == u])) for u in np.unique(labels)]
        smacro = float(np.mean(cluster_means)) if cluster_means else np.nan

        diff = smicro - smacro
        return smicro, smacro, diff, np.nan

    @staticmethod
    def _tanh_rb_weights_from_differences(differences, eps=1e-12):
        """
        Given Δ_b over b=1..B, compute:
          D = max |Δ_b|            (over finite entries only)
          r_raw_b = Δ_b/(D+eps)    clipped to [-1, 1]
          r_b = tanh(r_raw_b)
          w_b = (1+r_b)/2

        Returns (w, r, D). Non-finite entries of `differences` yield NaN in
        both w and r. If no entry is finite, w and r are all NaN and D is NaN.
        If D == 0 every finite entry gets r=0 and w=0.5.
        """
        d = np.asarray(differences, dtype=float)
        finite = np.isfinite(d)

        if not np.any(finite):
            return d * np.nan, d * np.nan, np.nan

        D = float(np.max(np.abs(d[finite])))

        if D == 0.0:
            # No disagreement anywhere: equal weighting of both views.
            r = np.where(finite, 0.0, np.nan)
        else:
            r_raw = np.clip(d / (D + float(eps)), -1.0, 1.0)
            r = np.where(finite, np.tanh(r_raw), np.nan)

        w = np.where(finite, 0.5 * (1.0 + r), np.nan)
        return w, r, D

    @staticmethod
    def _finite_mean(values):
        """Return the NaN-ignoring mean of `values`, or NaN if none is finite."""
        return float(np.nanmean(values)) if np.any(np.isfinite(values)) else np.nan

    def evaluate(self):
        """
        Evaluate over k_values using subsampled clustering.
        Stores results in self.results_df (one row per k) and, when exactly
        one k is evaluated, the composite score in self.score_.

        Output columns (per k):
          - k
          - avg S_micro
          - avg S_macro
          - w_micro   (mean of per-subsample weights; descriptive)
          - S_mM      (mean of per-subsample composites)
          - std S_mM  (sample std, ddof=1; 0.0 when only one valid subsample)
          - se S_mM
          - LCB S_mM  (S_mM - se)
          - B_eff     (number of subsamples with a finite composite)
          - sample_size
          - sample_fraction
        """
        self._results = []
        self.score_ = None  # do not keep a score from a previous run

        for k in self.k_values:
            results = Parallel(n_jobs=self.n_jobs)(
                delayed(self.evaluate_sample)(k, i) for i in range(self.num_samples)
            )

            smicro_list, smacro_list, differences, _ = zip(*results)

            smicro_arr = np.asarray(smicro_list, dtype=float)
            smacro_arr = np.asarray(smacro_list, dtype=float)
            diff_arr = np.asarray(differences, dtype=float)

            avg_smicro = self._finite_mean(smicro_arr)
            avg_smacro = self._finite_mean(smacro_arr)

            # weights
            w_b, r_b, D = self._tanh_rb_weights_from_differences(diff_arr, eps=self.eps)

            # per-subsample composite
            S_b = w_b * smicro_arr + (1.0 - w_b) * smacro_arr

            # mean composite
            S_mM = self._finite_mean(S_b)

            # descriptive mean weight
            w_micro_mean = self._finite_mean(w_b)

            # LCB components computed from S_b across valid subsamples
            B_eff = int(np.sum(np.isfinite(S_b)))
            if B_eff >= 2:
                std_smm = float(np.nanstd(S_b, ddof=1))
                se_smm = std_smm / np.sqrt(B_eff)
            elif B_eff == 1:
                # A single valid trial carries no spread information.
                std_smm = 0.0
                se_smm = 0.0
            else:
                std_smm = np.nan
                se_smm = np.nan

            lcb_smm = (S_mM - se_smm) if (np.isfinite(S_mM) and np.isfinite(se_smm)) else np.nan

            if len(self.k_values) == 1:
                self.score_ = S_mM

            self._results.append({
                'k': int(k),
                'avg S_micro': avg_smicro,
                'avg S_macro': avg_smacro,
                'w_micro': w_micro_mean,
                'S_mM': S_mM,
                'std S_mM': std_smm,
                'se S_mM': se_smm,
                'LCB S_mM': lcb_smm,
                'B_eff': B_eff,
                'sample_size': int(self.sample_size),
                'sample_fraction': float(self.sample_fraction_),
            })

        self.results_df = pd.DataFrame(self._results)

    def plot_results(self):
        """
        Plot S_mM and individual averages vs k.

        Each curve's maximum is marked with a star; a curve with no non-NaN
        value gets no star. If `ground_truth` is set it is drawn as a dashed
        vertical line.

        Raises:
          - ValueError: no results yet, or results for only one k.
        """
        if self.results_df.empty:
            raise ValueError("No results available. Run evaluate() first.")
        if len(self.results_df) == 1:
            raise ValueError("Cannot plot with only one k. Evaluate multiple k values.")

        df = self.results_df

        plt.figure(figsize=(10, 4))

        if self.ground_truth is not None:
            plt.axvline(
                x=self.ground_truth, color='red', linestyle='--', linewidth=2.5,
                label='Ground Truth'
            )

        # (results column == legend label, colour, line style)
        curves = (
            ('avg S_micro', 'orange', '-'),
            ('avg S_macro', 'blue', '-'),
            ('S_mM', 'green', '--'),
        )
        for column, color, linestyle in curves:
            plt.plot(
                df['k'], df[column],
                marker='o', linestyle=linestyle, color=color, linewidth=4, markersize=8, label=column
            )
            # idxmax() is unusable on an all-NaN column, so drop NaNs first.
            valid = df[column].dropna()
            if not valid.empty:
                best = valid.idxmax()
                plt.plot(df.loc[best, 'k'], valid.loc[best], marker='*', color=color, markersize=18)

        plt.xlabel('k', fontsize=15)
        plt.xticks(self.k_values, fontsize=14)
        plt.yticks(fontsize=14)
        plt.tick_params(axis='y', which='both', length=0)
        plt.grid(axis='y', linestyle='--')

        ax = plt.gca()
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['top'].set_visible(False)

        handles, labels = ax.get_legend_handles_labels()
        ax.legend(
            handles, labels,
            loc="lower left",
            bbox_to_anchor=(0, 1.02, 1, 0.2),
            mode="expand",
            ncol=len(labels),
            frameon=False,
            fontsize=12
        )

        plt.tight_layout()
        plt.show()

    def get_optimal_k(self, use_lcb=False):
        """
        Return optimal k.

        By default, uses max selection:
            k* = argmax_k (S_mM)  (use_lcb for argmax_k (LCB S_mM) selection)

        NaN scores are ignored. When only one k was evaluated, that k is
        returned without looking at its score.

        Raises:
          - ValueError: no results yet, the score column is missing, or every
            value in the score column is NaN.
        """
        if self.results_df.empty:
            raise ValueError("No results available. Run evaluate() first.")

        if len(self.results_df) == 1:
            return int(self.results_df['k'].iloc[0])

        col = 'LCB S_mM' if use_lcb else 'S_mM'
        if col not in self.results_df.columns:
            raise ValueError(f"Missing column '{col}'. Run evaluate() first.")

        # idxmax() on an all-NaN column cannot yield a usable row label, so
        # report that case explicitly instead of failing inside .loc.
        valid = self.results_df[col].dropna()
        if valid.empty:
            raise ValueError(f"No valid (non-NaN) values in column '{col}'; cannot select k.")

        return int(self.results_df.loc[valid.idxmax(), 'k'])

    def get_results_dataframe(self):
        """
        Return a copy of the results DataFrame indexed by k.

        Raises:
          - ValueError: no results yet.
        """
        if self.results_df.empty:
            raise ValueError("No results available. Run evaluate() first.")
        return self.results_df.set_index('k')
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: compsil
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CompSil: Composite Silhouette for Cluster-Count Selection
|
|
5
|
+
Home-page: https://github.com/semoglou/compsil
|
|
6
|
+
Author: Aggelos Semoglou
|
|
7
|
+
Author-email: a.semoglou@outlook.gr
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy>=1.22
|
|
24
|
+
Requires-Dist: pandas>=1.5
|
|
25
|
+
Requires-Dist: scikit-learn>=1.4
|
|
26
|
+
Requires-Dist: matplotlib>=3.6
|
|
27
|
+
Requires-Dist: joblib>=1.2
|
|
28
|
+
Dynamic: author
|
|
29
|
+
Dynamic: author-email
|
|
30
|
+
Dynamic: classifier
|
|
31
|
+
Dynamic: description
|
|
32
|
+
Dynamic: description-content-type
|
|
33
|
+
Dynamic: home-page
|
|
34
|
+
Dynamic: license
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
Dynamic: requires-dist
|
|
37
|
+
Dynamic: requires-python
|
|
38
|
+
Dynamic: summary
|
|
39
|
+
|
|
40
|
+
# CompSil
|
|
41
|
+
|
|
42
|
+
<p align="center">
|
|
43
|
+
<a href="https://pypi.org/project/compsil/"><img src="https://img.shields.io/pypi/v/compsil.svg?color=blue" alt="PyPI version"></a>
|
|
44
|
+
<a href="https://pypi.org/project/compsil/"><img src="https://img.shields.io/badge/python-3.9%2B-blue" alt="Python 3.9+"></a>
|
|
45
|
+
<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/license-MIT-yellow.svg" alt="License: MIT"></a>
|
|
46
|
+
<a href="https://pepy.tech/project/compsil"><img src="https://pepy.tech/badge/compsil" alt="Downloads"></a>
|
|
47
|
+
<a href="#"><img src="https://img.shields.io/badge/ECML%20PKDD-2026-green" alt="ECML PKDD 2026"></a>
|
|
48
|
+
</p>
|
|
49
|
+
|
|
50
|
+
<table>
|
|
51
|
+
<tr>
|
|
52
|
+
<td>
|
|
53
|
+
|
|
54
|
+
📄 **Accepted at _ECML PKDD 2026_**
|
|
55
|
+
|
|
56
|
+
**Composite Silhouette**
|
|
57
|
+
|
|
58
|
+
</td>
|
|
59
|
+
</tr>
|
|
60
|
+
</table>
|
|
61
|
+
|
|
62
|
+
**CompSil** is an open-source Python package for selecting the number of clusters in unlabeled data using **Composite Silhouette**, an internal validation criterion that adaptively combines micro- and macro-averaged Silhouette scores across repeated subsampled clusterings.
|
|
63
|
+
|
|
64
|
+
### Composite Silhouette: A Subsampling-based Aggregation Strategy
|
|
65
|
+
|
|
66
|
+
Selecting the number of clusters is a central challenge in unsupervised learning, where ground-truth labels are usually unavailable.
|
|
67
|
+
|
|
68
|
+
The standard Silhouette coefficient is one of the most widely used internal validation metrics for this task. However, its usual **micro-averaged** form aggregates Silhouette values over all data points, which can make the score strongly influenced by large clusters. In imbalanced datasets, this may mask poor separation or instability in smaller but meaningful groups.
|
|
69
|
+
|
|
70
|
+
A natural alternative is **macro-averaging**, where Silhouette values are first averaged within each cluster and then averaged across clusters. This gives every cluster equal influence, reducing the dominance of majority groups. However, macro-averaging can also overemphasize small, noisy, or under-represented clusters.
|
|
71
|
+
|
|
72
|
+
The distinction between micro- and macro-averaged Silhouette aggregation is discussed in detail in [**Revisiting Silhouette Aggregation**](https://arxiv.org/abs/2401.05831) by Pavlopoulos, Vardakas, and Likas. The corresponding repository is available here: [https://github.com/ipavlopoulos/revisiting-silhouette-aggregation](https://github.com/ipavlopoulos/revisiting-silhouette-aggregation).
|
|
73
|
+
|
|
74
|
+
For users who only need direct Silhouette computation, including sample-level, micro-averaged, and macro-averaged Silhouette scores with or without approximation, see the companion Silhouette package: [https://github.com/semoglou/sil_score](https://github.com/semoglou/sil_score).
|
|
75
|
+
|
|
76
|
+
<img src="https://raw.githubusercontent.com/semoglou/compsil/main/figs/aggr.png" alt="Micro vs Macro Silhouette Aggregation" width="700">
|
|
77
|
+
|
|
78
|
+
These complementary failure modes create a practical dilemma:
|
|
79
|
+
|
|
80
|
+
- **Micro-averaging** reflects global, point-wise clustering quality but can favor majority clusters.
|
|
81
|
+
|
|
82
|
+
- **Macro-averaging** reflects cluster-wise balance but can overemphasize small or noisy groups.
|
|
83
|
+
|
|
84
|
+
In many applications, it is unclear in advance which view should be trusted.
|
|
85
|
+
|
|
86
|
+
**CompSil** addresses this issue by using the disagreement between micro- and macro-averaged Silhouette scores as a local signal for adaptive aggregation.
|
|
87
|
+
|
|
88
|
+
Composite Silhouette evaluates candidate numbers of clusters through repeated subsampled clusterings. For each candidate value of `k`, the method:
|
|
89
|
+
|
|
90
|
+
1. Draws multiple subsamples of the dataset.
|
|
91
|
+
|
|
92
|
+
2. Clusters each subsample.
|
|
93
|
+
|
|
94
|
+
3. Computes both micro- and macro-averaged Silhouette scores.
|
|
95
|
+
|
|
96
|
+
4. Measures their discrepancy.
|
|
97
|
+
|
|
98
|
+
5. Converts this discrepancy into a smooth convex weight.
|
|
99
|
+
|
|
100
|
+
6. Combines the two Silhouette views into a subsample-level composite score.
|
|
101
|
+
|
|
102
|
+
7. Averages the composite scores across subsamples.
|
|
103
|
+
|
|
104
|
+
<img src="https://raw.githubusercontent.com/semoglou/compsil/main/figs/smmp.png" alt="Composite Silhouette pipeline" width="700">
|
|
105
|
+
|
|
106
|
+
For each subsample, Composite Silhouette combines the two views as:
|
|
107
|
+
|
|
108
|
+
```text
|
|
109
|
+
|
|
110
|
+
S_mM = w * S_micro + (1 - w) * S_macro
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
where the weight `w` is determined adaptively from the normalized discrepancy between `S_micro` and `S_macro`.
|
|
115
|
+
|
|
116
|
+
This produces a single internal validation score that can be maximized over candidate values of `k`.
|
|
117
|
+
|
|
118
|
+
CompSil enables:
|
|
119
|
+
|
|
120
|
+
- Selection of the number of clusters without labels.
|
|
121
|
+
|
|
122
|
+
- Adaptive balancing of micro- and macro-averaged Silhouette.
|
|
123
|
+
|
|
124
|
+
- More robust cluster-count selection under size imbalance.
|
|
125
|
+
|
|
126
|
+
- Repeated subsampling for stable internal validation.
|
|
127
|
+
|
|
128
|
+
- Optional lower-confidence-bound selection using subsampling variability.
|
|
129
|
+
|
|
130
|
+
#
|
|
131
|
+
|
|
132
|
+
## Citation
|
|
133
|
+
|
|
134
|
+
If you find this work useful, please consider citing:
|
|
135
|
+
|
|
136
|
+
Semoglou, A., Likas, A., & Pavlopoulos, J. (2026). Composite Silhouette.
|
|
137
|
+
|
|
138
|
+
Accepted at *ECML PKDD 2026*.
|
|
139
|
+
|
|
140
|
+
```bibtex
|
|
141
|
+
@inproceedings{semoglou2026composite,
|
|
142
|
+
title = {Composite Silhouette},
|
|
143
|
+
author = {Semoglou, Aggelos and Likas, Aristidis and Pavlopoulos, John},
|
|
144
|
+
booktitle = {Proceedings of the European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases},
|
|
145
|
+
year = {2026}
|
|
146
|
+
}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
The preprint is also available on arXiv: [https://arxiv.org/abs/2604.13816](https://arxiv.org/abs/2604.13816)
|
|
150
|
+
|
|
151
|
+
## Installation
|
|
152
|
+
|
|
153
|
+
Install **CompSil** from [PyPI](https://pypi.org/project/compsil/):
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
pip install compsil
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Import the main class in Python as:
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from compsil import CompSil
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## API Reference
|
|
166
|
+
|
|
167
|
+
CompSil provides a simple class-based interface for evaluating Composite Silhouette over one or more candidate numbers of clusters.
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
#### `CompSil`
|
|
172
|
+
|
|
173
|
+
Computes Composite Silhouette for candidate cluster counts using repeated subsampled clusterings.
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
CompSil(
|
|
177
|
+
data,
|
|
178
|
+
ground_truth=None,
|
|
179
|
+
k_values=range(2, 11),
|
|
180
|
+
num_samples=10,
|
|
181
|
+
sample_size="auto",
|
|
182
|
+
random_state=42,
|
|
183
|
+
n_jobs=-1,
|
|
184
|
+
eps=1e-12,
|
|
185
|
+
)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
**Inputs**
|
|
189
|
+
|
|
190
|
+
- `data`: array-like of shape `(n_samples, n_features)`
|
|
191
|
+
Input data matrix.
|
|
192
|
+
|
|
193
|
+
- `ground_truth`: int or None, default `None`
|
|
194
|
+
Optional reference number of clusters.
|
|
195
|
+
Used only for visualization.
|
|
196
|
+
|
|
197
|
+
- `k_values`: iterable of int or int, default `range(2, 11)`
|
|
198
|
+
A single candidate number of clusters, or an iterable of candidate numbers, to evaluate.
|
|
199
|
+
|
|
200
|
+
- `num_samples`: int, default `10`
|
|
201
|
+
Number of subsamples used for each candidate value of `k`.
|
|
202
|
+
|
|
203
|
+
- `sample_size`: int, float, None, or `"auto"`, default `"auto"`
|
|
204
|
+
Subsample size used in each repeated clustering.
|
|
205
|
+
- If `int`, it is interpreted as the absolute subsample size.
|
|
206
|
+
- If `float` in `(0, 1]`, it is interpreted as a fraction of the dataset size.
|
|
207
|
+
- If `None` or `"auto"`, the subsample size is selected automatically from the dataset size and the largest candidate value of `k`.
|
|
208
|
+
|
|
209
|
+
- `random_state`: int, default `42`
|
|
210
|
+
Base random seed used for reproducible subsampling and clustering.
|
|
211
|
+
|
|
212
|
+
- `n_jobs`: int, default `-1`
|
|
213
|
+
Number of parallel jobs used during evaluation.
|
|
214
|
+
|
|
215
|
+
- `eps`: float, default `1e-12`
|
|
216
|
+
Numerical stability constant used when normalizing micro–macro discrepancies.
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
#### `evaluate`
|
|
221
|
+
|
|
222
|
+
Evaluates Composite Silhouette over all candidate values of `k`.
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
model.evaluate()
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
After calling `evaluate`, the results are stored in:
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
model.results_df
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
The results table contains:
|
|
235
|
+
|
|
236
|
+
- `k`: candidate number of clusters.
|
|
237
|
+
- `avg S_micro`: average micro-averaged Silhouette across subsamples.
|
|
238
|
+
- `avg S_macro`: average macro-averaged Silhouette across subsamples.
|
|
239
|
+
- `w_micro`: average adaptive weight assigned to the micro view.
|
|
240
|
+
- `S_mM`: Composite Silhouette score.
|
|
241
|
+
- `std S_mM`: standard deviation of subsample-level composite scores.
|
|
242
|
+
- `se S_mM`: standard error of the Composite Silhouette estimate.
|
|
243
|
+
- `LCB S_mM`: lower-confidence-bound score, computed as `S_mM - se S_mM`.
|
|
244
|
+
- `B_eff`: number of valid subsampling trials.
|
|
245
|
+
- `sample_size`: resolved subsample size.
|
|
246
|
+
- `sample_fraction`: resolved subsample fraction.
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
#### `get_optimal_k`
|
|
251
|
+
|
|
252
|
+
Returns the selected number of clusters.
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
model.get_optimal_k(use_lcb=False)
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
**Inputs**
|
|
259
|
+
|
|
260
|
+
- `use_lcb`: bool, default `False`
|
|
261
|
+
If `False`, selects the `k` that maximizes `S_mM`.
|
|
262
|
+
If `True`, selects the `k` that maximizes `LCB S_mM`.
|
|
263
|
+
|
|
264
|
+
**Returns**
|
|
265
|
+
|
|
266
|
+
- `optimal_k`: int
|
|
267
|
+
Selected number of clusters.
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
#### `get_results_dataframe`
|
|
272
|
+
|
|
273
|
+
Returns the results as a pandas DataFrame indexed by `k`.
|
|
274
|
+
|
|
275
|
+
```python
|
|
276
|
+
results = model.get_results_dataframe()
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
**Returns**
|
|
280
|
+
|
|
281
|
+
- `results`: pandas DataFrame
|
|
282
|
+
Table containing the Composite Silhouette results for all candidate values of `k`.
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
#### `plot_results`
|
|
287
|
+
|
|
288
|
+
Plots the Composite Silhouette curve together with the subsample-averaged micro- and macro-averaged Silhouette curves.
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
model.plot_results()
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
If `ground_truth` was provided, it is shown as a vertical reference line.
|
|
295
|
+
|
|
296
|
+
## Quick Start
|
|
297
|
+
|
|
298
|
+
This example creates a simple synthetic dataset with five Gaussian clusters, evaluates candidate values of `k`, and selects the number of clusters using Composite Silhouette.
|
|
299
|
+
|
|
300
|
+
```python
|
|
301
|
+
from sklearn.datasets import make_blobs
|
|
302
|
+
from sklearn.preprocessing import StandardScaler
|
|
303
|
+
from compsil import CompSil
|
|
304
|
+
|
|
305
|
+
# Create a simple synthetic dataset
|
|
306
|
+
X, y = make_blobs(
|
|
307
|
+
n_samples=1000,
|
|
308
|
+
centers=5,
|
|
309
|
+
n_features=10,
|
|
310
|
+
cluster_std=1.5,
|
|
311
|
+
random_state=42,
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
# Standardize the data
|
|
315
|
+
X = StandardScaler().fit_transform(X)
|
|
316
|
+
|
|
317
|
+
# Initialize Composite Silhouette
|
|
318
|
+
model = CompSil(
|
|
319
|
+
data=X,
|
|
320
|
+
ground_truth=5,
|
|
321
|
+
k_values=range(2, 11),
|
|
322
|
+
num_samples=10,
|
|
323
|
+
sample_size="auto",
|
|
324
|
+
random_state=0,
|
|
325
|
+
n_jobs=-1,
|
|
326
|
+
)
|
|
327
|
+
|
|
328
|
+
# Evaluate all candidate k values
|
|
329
|
+
model.evaluate()
|
|
330
|
+
|
|
331
|
+
# Select the number of clusters
|
|
332
|
+
best_k = model.get_optimal_k()
|
|
333
|
+
|
|
334
|
+
print("Selected k:", best_k)
|
|
335
|
+
|
|
336
|
+
# Inspect the full results table
|
|
337
|
+
results = model.get_results_dataframe()
|
|
338
|
+
print(results)
|
|
339
|
+
|
|
340
|
+
# Plot the Composite Silhouette curve
|
|
341
|
+
model.plot_results()
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
The `S_mM` column in the results table contains the Composite Silhouette score for each candidate number of clusters. The selected number of clusters is the value of `k` that maximizes `S_mM`.
|
|
345
|
+
|
|
346
|
+
CompSil can also be used to evaluate a single candidate number of clusters. In this case, pass an integer to `k_values`.
|
|
347
|
+
|
|
348
|
+
```python
|
|
349
|
+
# Evaluate a single candidate k
|
|
350
|
+
model = CompSil(
|
|
351
|
+
data=X,
|
|
352
|
+
k_values=5
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
model.evaluate()
|
|
356
|
+
|
|
357
|
+
# Composite Silhouette score for k=5
|
|
358
|
+
print("Composite Silhouette score:", model.score_)
|
|
359
|
+
|
|
360
|
+
# Full results table
|
|
361
|
+
results = model.get_results_dataframe()
|
|
362
|
+
print(results)
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
When a single value of `k` is evaluated, `model.score_` stores the corresponding Composite Silhouette score.
|
|
366
|
+
|
|
367
|
+
## Acknowledgments
|
|
368
|
+
This work was supported by [_Archimedes Research Unit_](https://archimedesai.gr/), [_Athena Research Center_](https://www.athenarc.gr/en).
|
|
369
|
+
|
|
370
|
+
## License
|
|
371
|
+
This project is licensed under the [MIT License](https://github.com/semoglou/compsil/blob/main/LICENSE).
|
|
372
|
+
|
|
373
|
+
## Links
|
|
374
|
+
- Package: [PyPI](https://pypi.org/project/compsil/)
|
|
375
|
+
- Paper: Accepted at ECML PKDD 2026
|
|
376
|
+
- DOI: Coming soon
|
|
377
|
+
- Preprint: [arXiv:2604.13816](https://arxiv.org/abs/2604.13816)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
compsil/__init__.py,sha256=ZxBnwgnAisd0YC5shzSWTNhQsWiTbfRxBW4340kj8p4,29
|
|
2
|
+
compsil/compsil.py,sha256=KKx6da1QVDJa04hrMDPeqz18zurwvhwfdUa3pexjPVk,13133
|
|
3
|
+
compsil-0.1.0.dist-info/licenses/LICENSE,sha256=kWn4HCOo2H6AuCu5oBQd2VJYyCuWyILiElYZKNZh5JM,1073
|
|
4
|
+
compsil-0.1.0.dist-info/METADATA,sha256=3NISXriDBCYYUwsNVRmUDkBMP2vJclnmajbgTm8GMyg,12238
|
|
5
|
+
compsil-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
compsil-0.1.0.dist-info/top_level.txt,sha256=yQVXj3-Zb3n52XwCIlLsvomfByXMcXa2Lm1ShGsD7BE,8
|
|
7
|
+
compsil-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Aggelos Semoglou
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
compsil
|