compsil 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
compsil/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .compsil import CompSil
compsil/compsil.py ADDED
@@ -0,0 +1,377 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.cluster import KMeans
4
+ from sklearn.metrics import silhouette_score, silhouette_samples
5
+ from sklearn.utils import resample
6
+ import matplotlib.pyplot as plt
7
+ from joblib import Parallel, delayed
8
+
9
+
10
class CompSil:
    """
    Composite Silhouette (CompSil)

    Per subsample b=1..B for each k:
      - compute S_micro^(b), S_macro^(b)
      - Δ_b = S_micro^(b) - S_macro^(b)
      - D = max_b |Δ_b|
      - r_raw_b = Δ_b/(D+eps) in [-1,1]
      - r_b = tanh(r_raw_b)
      - w_b = (1+r_b)/2
      - S_mM^(b) = w_b*S_micro^(b) + (1-w_b)*S_macro^(b)

    Aggregate for each k:
      - mean score:       S_mM(k)     = mean_b S_mM^(b)
      - std score:        std_S_mM(k) = std_b S_mM^(b)
      - standard error:   se_S_mM(k)  = std_S_mM(k) / sqrt(B_eff)
      - lower conf bound: LCB_S_mM(k) = S_mM(k) - se_S_mM(k)

    Selection:
      - k* = argmax_k S_mM(k)   (or argmax_k LCB_S_mM(k))

    Sampling size mechanism:
      - explicit `sample_size` (int or fraction), or
      - sample_size=None or sample_size="auto": the class chooses a subsample
        size automatically based on dataset size N and the maximum k in
        `k_values`.

    Parameters:
      - data: ndarray or DataFrame, shape (n_samples, n_features)
      - ground_truth: int, optional (only drawn as a reference line when plotting)
      - k_values: iterable or int, default=range(2, 11)
      - num_samples: int, default=10 (B); must be at least 1
      - sample_size: int | float | None | "auto", default="auto"
          * int: absolute subsample size m
          * float in (0,1]: treated as fraction f, m = floor(f*N)
          * None or "auto": compute m automatically (see _auto_sample_size)
      - random_state: int, default=42
      - n_jobs: int or None, default=-1 (forwarded to joblib.Parallel)
      - eps: float, default=1e-12

    Raises:
      - ValueError: empty dataset, num_samples < 1, or a resolved sample size
        that is smaller than 2 or larger than the dataset.
    """

    def __init__(self,
                 data,
                 ground_truth=None,
                 k_values=range(2, 11),
                 num_samples=10,
                 sample_size="auto",
                 random_state=42,
                 n_jobs=-1,
                 eps=1e-12):
        self.data = data
        self.ground_truth = ground_truth

        # A single candidate may be a built-in int or a NumPy integer scalar;
        # anything else is treated as an iterable of candidates.
        if isinstance(k_values, (int, np.integer)):
            self.k_values = [int(k_values)]
        else:
            self.k_values = list(k_values)

        self.num_samples = int(num_samples)
        if self.num_samples < 1:
            # Fail here rather than with an unpacking error inside evaluate().
            raise ValueError("num_samples must be at least 1.")

        self.random_state = int(random_state)
        # None is passed through unchanged so joblib can apply its own default.
        self.n_jobs = None if n_jobs is None else int(n_jobs)
        self.eps = float(eps)

        self._results = []
        self.results_df = pd.DataFrame()
        self.score_ = None  # set only if one k is evaluated

        self.n_samples_ = int(len(self.data))
        if self.n_samples_ <= 0:
            raise ValueError("Empty dataset.")

        self.sample_size = self._resolve_sample_size(sample_size)
        self.sample_fraction_ = self.sample_size / self.n_samples_

        if self.sample_size < 2:
            raise ValueError(f"Resolved sample_size={self.sample_size} is too small.")
        if self.sample_size > self.n_samples_:
            raise ValueError(
                f"Resolved sample_size={self.sample_size} is larger than n_samples={self.n_samples_}."
            )

    def _resolve_sample_size(self, sample_size):
        """
        Translate the user-facing `sample_size` argument into an absolute size.

        None / "auto" (case-insensitive) delegates to `_auto_sample_size`; a
        float is a fraction of the dataset and must lie in (0, 1]; any other
        value is converted with int(). Range checks against the dataset size
        are done by the caller.
        """
        wants_auto = sample_size is None or (
            isinstance(sample_size, str) and sample_size.lower() == "auto"
        )
        if wants_auto:
            return self._auto_sample_size()

        if isinstance(sample_size, float):
            if not (0.0 < sample_size <= 1.0):
                raise ValueError("If sample_size is a float, it must be in (0, 1].")
            n = self.n_samples_
            m = int(np.floor(sample_size * n))
            # At least 2 points, never more than n (the caller rejects 2 > n).
            return max(2, min(m, n))

        return int(sample_size)

    def _auto_sample_size(self):
        """
        Automatic subsample size selection (no user-facing hyperparameters).

        Heuristic:
          1) Ensure a minimum average points-per-cluster at k_max: m >= 30 * k_max
          2) Use a baseline fraction depending on dataset size:
               - N <= 2000:  0.8N
               - N <= 20000: 0.6N
               - otherwise:  0.4N
          3) Take the maximum of (1) and (2), then cap at N.
        """
        n = self.n_samples_
        k_max = int(max(self.k_values)) if len(self.k_values) > 0 else 2
        m_min = 30 * k_max

        if n <= 2000:
            fraction = 0.80
        elif n <= 20000:
            fraction = 0.60
        else:
            fraction = 0.40
        m_base = int(np.floor(fraction * n))

        return min(max(2, m_min, m_base), n)

    def evaluate_sample(self, k, i):
        """
        One subsampling iteration for fixed k.

        Draws `sample_size` rows without replacement, clusters them with
        KMeans (n_init=1) and computes the micro and macro silhouette. Both
        the subsample and KMeans are seeded with `random_state + i`.

        Returns: (smicro, smacro, diff, nan)
          The fourth slot is a placeholder: the composite score needs
          D = max_b |Δ_b| over all subsamples, so it is computed later in
          `evaluate`. If the silhouette computation rejects the labelling
          with a ValueError, all four entries are NaN and the subsample is
          ignored during aggregation.
        """
        seed = self.random_state + i

        sampled_data = resample(
            self.data,
            n_samples=self.sample_size,
            replace=False,
            random_state=seed
        )

        labels = KMeans(n_clusters=k, random_state=seed, n_init=1).fit_predict(sampled_data)

        try:
            s = silhouette_samples(sampled_data, labels)  # compute once
        except ValueError:
            # Only the invalid-labelling case is treated as "no score";
            # unrelated failures are allowed to propagate.
            return np.nan, np.nan, np.nan, np.nan

        # micro silhouette: mean over points
        smicro = float(np.mean(s))

        # macro silhouette: mean over per-cluster means
        labs = np.asarray(labels)
        cluster_means = [float(np.mean(s[labs == u])) for u in np.unique(labs)]
        smacro = float(np.mean(cluster_means)) if cluster_means else np.nan

        return smicro, smacro, smicro - smacro, np.nan

    @staticmethod
    def _tanh_rb_weights_from_differences(differences, eps=1e-12):
        """
        Given Δ_b over b=1..B, compute:
          D = max |Δ_b|           (over finite entries only)
          r_raw_b = Δ_b/(D+eps)   in [-1, 1]
          r_b = tanh(r_raw_b)
          w_b = (1+r_b)/2         in (0,1)

        Returns (w, r, D). Non-finite entries of `differences` yield NaN in
        both w and r. If no entry is finite, D is NaN as well. If D == 0,
        every finite entry gets r = 0 and w = 0.5.
        """
        d = np.asarray(differences, dtype=float)
        finite = np.isfinite(d)

        w = np.full_like(d, np.nan)
        r = np.full_like(d, np.nan)

        if not np.any(finite):
            return w, r, np.nan

        D = float(np.max(np.abs(d[finite])))

        if D == 0.0:
            r[finite] = 0.0
            w[finite] = 0.5
            return w, r, D

        r_raw = np.clip(d[finite] / (D + float(eps)), -1.0, 1.0)
        r[finite] = np.tanh(r_raw)
        w[finite] = 0.5 * (1.0 + r[finite])
        return w, r, D

    @staticmethod
    def _nanmean_or_nan(values):
        """Return the NaN-ignoring mean, or NaN when no entry is finite."""
        if np.any(np.isfinite(values)):
            return float(np.nanmean(values))
        return np.nan

    def _summarize_k(self, k, results):
        """
        Aggregate the per-subsample tuples of one k into a results row.

        `results` is a sequence of (smicro, smacro, diff, placeholder) tuples
        as returned by `evaluate_sample`.
        """
        table = np.asarray(results, dtype=float)
        smicro_arr = table[:, 0]
        smacro_arr = table[:, 1]
        diff_arr = table[:, 2]

        # weights
        w_b, _, _ = self._tanh_rb_weights_from_differences(diff_arr, eps=self.eps)

        # per-subsample composite and its mean
        S_b = w_b * smicro_arr + (1.0 - w_b) * smacro_arr
        S_mM = self._nanmean_or_nan(S_b)

        # LCB components computed from S_b across valid subsamples
        B_eff = int(np.sum(np.isfinite(S_b)))
        if B_eff >= 2:
            std_smm = float(np.nanstd(S_b, ddof=1))
            se_smm = std_smm / np.sqrt(B_eff)
        elif B_eff == 1:
            # A single valid subsample carries no spread information.
            std_smm = 0.0
            se_smm = 0.0
        else:
            std_smm = np.nan
            se_smm = np.nan

        if np.isfinite(S_mM) and np.isfinite(se_smm):
            lcb_smm = S_mM - se_smm
        else:
            lcb_smm = np.nan

        return {
            'k': int(k),
            'avg S_micro': self._nanmean_or_nan(smicro_arr),
            'avg S_macro': self._nanmean_or_nan(smacro_arr),
            'w_micro': self._nanmean_or_nan(w_b),
            'S_mM': S_mM,
            'std S_mM': std_smm,
            'se S_mM': se_smm,
            'LCB S_mM': lcb_smm,
            'B_eff': B_eff,
            'sample_size': int(self.sample_size),
            'sample_fraction': float(self.sample_fraction_),
        }

    def evaluate(self):
        """
        Evaluate over k_values using subsampled clustering.
        Stores results in self.results_df.

        Output columns (per k):
          - avg S_micro
          - avg S_macro
          - w_micro   (mean of per-subsample weights; descriptive)
          - S_mM      (mean of per-subsample composites)
          - std S_mM
          - se S_mM
          - LCB S_mM  (S_mM - se)
          - B_eff     (number of subsamples with a finite composite)
          - sample_size, sample_fraction

        When exactly one k is evaluated, its S_mM is also stored in
        self.score_; otherwise self.score_ is None.
        """
        self._results = []
        self.score_ = None

        for k in self.k_values:
            results = Parallel(n_jobs=self.n_jobs)(
                delayed(self.evaluate_sample)(k, i) for i in range(self.num_samples)
            )
            self._results.append(self._summarize_k(k, results))

        if len(self._results) == 1:
            self.score_ = self._results[0]['S_mM']

        self.results_df = pd.DataFrame(self._results)

    def _peak_of(self, column):
        """
        Return (k, value) of the row maximising `column`, ignoring NaN.

        Raises ValueError when the column holds no non-NaN value, instead of
        letting idxmax() fail in a pandas-version-dependent way.
        """
        scores = self.results_df[column].dropna()
        if scores.empty:
            raise ValueError(f"No finite values available in column '{column}'.")
        best = scores.idxmax()
        return self.results_df.loc[best, 'k'], scores.loc[best]

    def plot_results(self):
        """
        Plot S_mM and individual averages vs k.

        Each curve's maximum is marked with a star; `ground_truth`, when set,
        is drawn as a dashed vertical line.

        Raises ValueError if evaluate() has not produced results, if only one
        k was evaluated, or if a plotted column has no finite value.
        """
        if self.results_df.empty:
            raise ValueError("No results available. Run evaluate() first.")
        if len(self.results_df) == 1:
            raise ValueError("Cannot plot with only one k. Evaluate multiple k values.")

        # (column / legend label, colour, line style), in drawing order.
        curves = (
            ('avg S_micro', 'orange', '-'),
            ('avg S_macro', 'blue', '-'),
            ('S_mM', 'green', '--'),
        )
        # Resolve all maxima first so a failure does not leave an empty figure.
        peaks = {column: self._peak_of(column) for column, _, _ in curves}

        plt.figure(figsize=(10, 4))

        if self.ground_truth is not None:
            plt.axvline(
                x=self.ground_truth, color='red', linestyle='--', linewidth=2.5,
                label='Ground Truth'
            )

        for column, color, linestyle in curves:
            plt.plot(
                self.results_df['k'], self.results_df[column],
                marker='o', linestyle=linestyle, color=color, linewidth=4, markersize=8,
                label=column
            )
            peak_k, peak_value = peaks[column]
            plt.plot(peak_k, peak_value, marker='*', color=color, markersize=18)

        plt.xlabel('k', fontsize=15)
        plt.xticks(self.k_values, fontsize=14)
        plt.yticks(fontsize=14)
        plt.tick_params(axis='y', which='both', length=0)
        plt.grid(axis='y', linestyle='--')

        ax = plt.gca()
        for side in ('right', 'left', 'top'):
            ax.spines[side].set_visible(False)

        handles, labels = ax.get_legend_handles_labels()
        ax.legend(
            handles, labels,
            loc="lower left",
            bbox_to_anchor=(0, 1.02, 1, 0.2),
            mode="expand",
            ncol=len(labels),
            frameon=False,
            fontsize=12
        )

        plt.tight_layout()
        plt.show()

    def get_optimal_k(self, use_lcb=False):
        """
        Return optimal k.

        By default, uses max selection:
          k* = argmax_k (S_mM)   (use_lcb for argmax_k (LCB S_mM) selection)

        With a single evaluated k, that k is returned directly.

        Raises ValueError if evaluate() has not produced results, if the
        score column is missing, or if it contains no finite value.
        """
        if self.results_df.empty:
            raise ValueError("No results available. Run evaluate() first.")

        if len(self.results_df) == 1:
            return int(self.results_df['k'].iloc[0])

        col = 'LCB S_mM' if use_lcb else 'S_mM'
        if col not in self.results_df.columns:
            raise ValueError(f"Missing column '{col}'. Run evaluate() first.")

        best_k, _ = self._peak_of(col)
        return int(best_k)

    def get_results_dataframe(self):
        """
        Return results DataFrame indexed by k.

        A new DataFrame is returned; self.results_df is left unchanged.
        """
        if self.results_df.empty:
            raise ValueError("No results available. Run evaluate() first.")
        return self.results_df.set_index('k')
@@ -0,0 +1,377 @@
1
+ Metadata-Version: 2.4
2
+ Name: compsil
3
+ Version: 0.1.0
4
+ Summary: CompSil: Composite Silhouette for Cluster-Count Selection
5
+ Home-page: https://github.com/semoglou/compsil
6
+ Author: Aggelos Semoglou
7
+ Author-email: a.semoglou@outlook.gr
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: numpy>=1.22
24
+ Requires-Dist: pandas>=1.5
25
+ Requires-Dist: scikit-learn>=1.4
26
+ Requires-Dist: matplotlib>=3.6
27
+ Requires-Dist: joblib>=1.2
28
+ Dynamic: author
29
+ Dynamic: author-email
30
+ Dynamic: classifier
31
+ Dynamic: description
32
+ Dynamic: description-content-type
33
+ Dynamic: home-page
34
+ Dynamic: license
35
+ Dynamic: license-file
36
+ Dynamic: requires-dist
37
+ Dynamic: requires-python
38
+ Dynamic: summary
39
+
40
+ # CompSil
41
+
42
+ <p align="center">
43
+ <a href="https://pypi.org/project/compsil/"><img src="https://img.shields.io/pypi/v/compsil.svg?color=blue" alt="PyPI version"></a>&nbsp;&nbsp;
44
+ <a href="https://pypi.org/project/compsil/"><img src="https://img.shields.io/badge/python-3.9%2B-blue" alt="Python 3.9+"></a>
45
+ <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/license-MIT-yellow.svg" alt="License: MIT"></a>&nbsp;&nbsp;
46
+ <a href="https://pepy.tech/project/compsil"><img src="https://pepy.tech/badge/compsil" alt="Downloads"></a>&nbsp;&nbsp;
47
+ <a href="#"><img src="https://img.shields.io/badge/ECML%20PKDD-2026-green" alt="ECML PKDD 2026"></a>
48
+ </p>
49
+
50
+ <table>
51
+ <tr>
52
+ <td>
53
+
54
+ 📄 **Accepted at _ECML PKDD 2026_**
55
+
56
+ **Composite Silhouette**
57
+
58
+ </td>
59
+ </tr>
60
+ </table>
61
+
62
+ **CompSil** is an open-source Python package for selecting the number of clusters in unlabeled data using **Composite Silhouette**, an internal validation criterion that adaptively combines micro- and macro-averaged Silhouette scores across repeated subsampled clusterings.
63
+
64
+ ### Composite Silhouette: A Subsampling-based Aggregation Strategy
65
+
66
+ Selecting the number of clusters is a central challenge in unsupervised learning, where ground-truth labels are usually unavailable.
67
+
68
+ The standard Silhouette coefficient is one of the most widely used internal validation metrics for this task. However, its usual **micro-averaged** form aggregates Silhouette values over all data points, which can make the score strongly influenced by large clusters. In imbalanced datasets, this may mask poor separation or instability in smaller but meaningful groups.
69
+
70
+ A natural alternative is **macro-averaging**, where Silhouette values are first averaged within each cluster and then averaged across clusters. This gives every cluster equal influence, reducing the dominance of majority groups. However, macro-averaging can also overemphasize small, noisy, or under-represented clusters.
71
+
72
+ The distinction between micro- and macro-averaged Silhouette aggregation is discussed in detail in [**Revisiting Silhouette Aggregation**](https://arxiv.org/abs/2401.05831) by Pavlopoulos, Vardakas, and Likas. The corresponding repository is available here: [https://github.com/ipavlopoulos/revisiting-silhouette-aggregation](https://github.com/ipavlopoulos/revisiting-silhouette-aggregation).
73
+
74
+ For users who only need direct Silhouette computation, including sample-level, micro-averaged, and macro-averaged Silhouette scores with or without approximation, see the companion Silhouette package: [https://github.com/semoglou/sil_score](https://github.com/semoglou/sil_score).
75
+
76
+ <img src="https://raw.githubusercontent.com/semoglou/compsil/main/figs/aggr.png" alt="Micro vs Macro Silhouette Aggregation" width="700">
77
+
78
+ These complementary failure modes create a practical dilemma:
79
+
80
+ - **Micro-averaging** reflects global, point-wise clustering quality but can favor majority clusters.
81
+
82
+ - **Macro-averaging** reflects cluster-wise balance but can overemphasize small or noisy groups.
83
+
84
+ In many applications, it is unclear in advance which view should be trusted.
85
+
86
+ **CompSil** addresses this issue by using the disagreement between micro- and macro-averaged Silhouette scores as a local signal for adaptive aggregation.
87
+
88
+ Composite Silhouette evaluates candidate numbers of clusters through repeated subsampled clusterings. For each candidate value of `k`, the method:
89
+
90
+ 1. Draws multiple subsamples of the dataset.
91
+
92
+ 2. Clusters each subsample.
93
+
94
+ 3. Computes both micro- and macro-averaged Silhouette scores.
95
+
96
+ 4. Measures their discrepancy.
97
+
98
+ 5. Converts this discrepancy into a smooth convex weight.
99
+
100
+ 6. Combines the two Silhouette views into a subsample-level composite score.
101
+
102
+ 7. Averages the composite scores across subsamples.
103
+
104
+ <img src="https://raw.githubusercontent.com/semoglou/compsil/main/figs/smmp.png" alt="Composite Silhouette pipeline" width="700">
105
+
106
+ For each subsample, Composite Silhouette combines the two views as:
107
+
108
+ ```text
109
+
110
+ S_mM = w * S_micro + (1 - w) * S_macro
111
+
112
+ ```
113
+
114
+ where the weight `w` is determined adaptively from the normalized discrepancy between `S_micro` and `S_macro`.
115
+
116
+ This produces a single internal validation score that can be maximized over candidate values of `k`.
117
+
118
+ CompSil enables:
119
+
120
+ - Selection of the number of clusters without labels.
121
+
122
+ - Adaptive balancing of micro- and macro-averaged Silhouette.
123
+
124
+ - More robust cluster-count selection under size imbalance.
125
+
126
+ - Repeated subsampling for stable internal validation.
127
+
128
+ - Optional lower-confidence-bound selection using subsampling variability.
129
+
130
+ #
131
+
132
+ ## Citation
133
+
134
+ If you find this work useful, please consider citing:
135
+
136
+ Semoglou, A., Likas, A., & Pavlopoulos, J. (2026). Composite Silhouette.
137
+
138
+ Accepted at *ECML PKDD 2026*.
139
+
140
+ ```bibtex
141
+ @inproceedings{semoglou2026composite,
142
+ title = {Composite Silhouette},
143
+ author = {Semoglou, Aggelos and Likas, Aristidis and Pavlopoulos, John},
144
+ booktitle = {Proceedings of the European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases},
145
+ year = {2026}
146
+ }
147
+ ```
148
+
149
+ The preprint is also available on arXiv: [https://arxiv.org/abs/2604.13816](https://arxiv.org/abs/2604.13816)
150
+
151
+ ## Installation
152
+
153
+ Install **CompSil** from [PyPI](https://pypi.org/project/compsil/):
154
+
155
+ ```bash
156
+ pip install compsil
157
+ ```
158
+
159
+ Import the main class in Python as:
160
+
161
+ ```python
162
+ from compsil import CompSil
163
+ ```
164
+
165
+ ## API Reference
166
+
167
+ CompSil provides a simple class-based interface for evaluating Composite Silhouette over one or more candidate numbers of clusters.
168
+
169
+ ---
170
+
171
+ #### `CompSil`
172
+
173
+ Computes Composite Silhouette for candidate cluster counts using repeated subsampled clusterings.
174
+
175
+ ```python
176
+ CompSil(
177
+ data,
178
+ ground_truth=None,
179
+ k_values=range(2, 11),
180
+ num_samples=10,
181
+ sample_size="auto",
182
+ random_state=42,
183
+ n_jobs=-1,
184
+ eps=1e-12,
185
+ )
186
+ ```
187
+
188
+ **Inputs**
189
+
190
+ - `data`: array-like of shape `(n_samples, n_features)`
191
+ Input data matrix.
192
+
193
+ - `ground_truth`: int or None, default `None`
194
+ Optional reference number of clusters.
195
+ Used only for visualization.
196
+
197
+ - `k_values`: iterable of int or int, default `range(2, 11)`
198
+ Candidate number or candidate numbers of clusters to evaluate.
199
+
200
+ - `num_samples`: int, default `10`
201
+ Number of subsamples used for each candidate value of `k`.
202
+
203
+ - `sample_size`: int, float, None, or `"auto"`, default `"auto"`
204
+ Subsample size used in each repeated clustering.
205
+ - If `int`, it is interpreted as the absolute subsample size.
206
+ - If `float` in `(0, 1]`, it is interpreted as a fraction of the dataset size.
207
+ - If `None` or `"auto"`, the subsample size is selected automatically from the dataset size and the largest candidate value of `k`.
208
+
209
+ - `random_state`: int, default `42`
210
+ Base random seed used for reproducible subsampling and clustering.
211
+
212
+ - `n_jobs`: int, default `-1`
213
+ Number of parallel jobs used during evaluation.
214
+
215
+ - `eps`: float, default `1e-12`
216
+ Numerical stability constant used when normalizing micro–macro discrepancies.
217
+
218
+ ---
219
+
220
+ #### `evaluate`
221
+
222
+ Evaluates Composite Silhouette over all candidate values of `k`.
223
+
224
+ ```python
225
+ model.evaluate()
226
+ ```
227
+
228
+ After calling `evaluate`, the results are stored in:
229
+
230
+ ```python
231
+ model.results_df
232
+ ```
233
+
234
+ The results table contains:
235
+
236
+ - `k`: candidate number of clusters.
237
+ - `avg S_micro`: average micro-averaged Silhouette across subsamples.
238
+ - `avg S_macro`: average macro-averaged Silhouette across subsamples.
239
+ - `w_micro`: average adaptive weight assigned to the micro view.
240
+ - `S_mM`: Composite Silhouette score.
241
+ - `std S_mM`: standard deviation of subsample-level composite scores.
242
+ - `se S_mM`: standard error of the Composite Silhouette estimate.
243
+ - `LCB S_mM`: lower-confidence-bound score, computed as `S_mM - se S_mM`.
244
+ - `B_eff`: number of valid subsampling trials.
245
+ - `sample_size`: resolved subsample size.
246
+ - `sample_fraction`: resolved subsample fraction.
247
+
248
+ ---
249
+
250
+ #### `get_optimal_k`
251
+
252
+ Returns the selected number of clusters.
253
+
254
+ ```python
255
+ model.get_optimal_k(use_lcb=False)
256
+ ```
257
+
258
+ **Inputs**
259
+
260
+ - `use_lcb`: bool, default `False`
261
+ If `False`, selects the `k` that maximizes `S_mM`.
262
+ If `True`, selects the `k` that maximizes `LCB S_mM`.
263
+
264
+ **Returns**
265
+
266
+ - `optimal_k`: int
267
+ Selected number of clusters.
268
+
269
+ ---
270
+
271
+ #### `get_results_dataframe`
272
+
273
+ Returns the results as a pandas DataFrame indexed by `k`.
274
+
275
+ ```python
276
+ results = model.get_results_dataframe()
277
+ ```
278
+
279
+ **Returns**
280
+
281
+ - `results`: pandas DataFrame
282
+ Table containing the Composite Silhouette results for all candidate values of `k`.
283
+
284
+ ---
285
+
286
+ #### `plot_results`
287
+
288
+ Plots the Composite Silhouette curve together with the subsample-averaged micro- and macro-averaged Silhouette curves.
289
+
290
+ ```python
291
+ model.plot_results()
292
+ ```
293
+
294
+ If `ground_truth` was provided, it is shown as a vertical reference line.
295
+
296
+ ## Quick Start
297
+
298
+ This example creates a simple synthetic dataset with five Gaussian clusters, evaluates candidate values of `k`, and selects the number of clusters using Composite Silhouette.
299
+
300
+ ```python
301
+ from sklearn.datasets import make_blobs
302
+ from sklearn.preprocessing import StandardScaler
303
+ from compsil import CompSil
304
+
305
+ # Create a simple synthetic dataset
306
+ X, y = make_blobs(
307
+ n_samples=1000,
308
+ centers=5,
309
+ n_features=10,
310
+ cluster_std=1.5,
311
+ random_state=42,
312
+ )
313
+
314
+ # Standardize the data
315
+ X = StandardScaler().fit_transform(X)
316
+
317
+ # Initialize Composite Silhouette
318
+ model = CompSil(
319
+ data=X,
320
+ ground_truth=5,
321
+ k_values=range(2, 11),
322
+ num_samples=10,
323
+ sample_size="auto",
324
+ random_state=0,
325
+ n_jobs=-1,
326
+ )
327
+
328
+ # Evaluate all candidate k values
329
+ model.evaluate()
330
+
331
+ # Select the number of clusters
332
+ best_k = model.get_optimal_k()
333
+
334
+ print("Selected k:", best_k)
335
+
336
+ # Inspect the full results table
337
+ results = model.get_results_dataframe()
338
+ print(results)
339
+
340
+ # Plot the Composite Silhouette curve
341
+ model.plot_results()
342
+ ```
343
+
344
+ The `S_mM` column in the results table contains the Composite Silhouette score for each candidate number of clusters. The selected number of clusters is the value of `k` that maximizes `S_mM`.
345
+
346
+ CompSil can also be used to evaluate a single candidate number of clusters. In this case, pass an integer to `k_values`.
347
+
348
+ ```python
349
+ # Evaluate a single candidate k
350
+ model = CompSil(
351
+ data=X,
352
+ k_values=5
353
+ )
354
+
355
+ model.evaluate()
356
+
357
+ # Composite Silhouette score for k=5
358
+ print("Composite Silhouette score:", model.score_)
359
+
360
+ # Full results table
361
+ results = model.get_results_dataframe()
362
+ print(results)
363
+ ```
364
+
365
+ When a single value of `k` is evaluated, `model.score_` stores the corresponding Composite Silhouette score.
366
+
367
+ ## Acknowledgments
368
+ This work was supported by [_Archimedes Research Unit_](https://archimedesai.gr/), [_Athena Research Center_](https://www.athenarc.gr/en).
369
+
370
+ ## License
371
+ This project is licensed under the [MIT License](https://github.com/semoglou/compsil/blob/main/LICENSE).
372
+
373
+ ## Links
374
+ - Package: [PyPI](https://pypi.org/project/compsil/)
375
+ - Paper: Accepted at ECML PKDD 2026
376
+ - DOI: Coming soon
377
+ - Preprint: [arXiv:2604.13816](https://arxiv.org/abs/2604.13816)
@@ -0,0 +1,7 @@
1
+ compsil/__init__.py,sha256=ZxBnwgnAisd0YC5shzSWTNhQsWiTbfRxBW4340kj8p4,29
2
+ compsil/compsil.py,sha256=KKx6da1QVDJa04hrMDPeqz18zurwvhwfdUa3pexjPVk,13133
3
+ compsil-0.1.0.dist-info/licenses/LICENSE,sha256=kWn4HCOo2H6AuCu5oBQd2VJYyCuWyILiElYZKNZh5JM,1073
4
+ compsil-0.1.0.dist-info/METADATA,sha256=3NISXriDBCYYUwsNVRmUDkBMP2vJclnmajbgTm8GMyg,12238
5
+ compsil-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
6
+ compsil-0.1.0.dist-info/top_level.txt,sha256=yQVXj3-Zb3n52XwCIlLsvomfByXMcXa2Lm1ShGsD7BE,8
7
+ compsil-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Aggelos Semoglou
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ compsil