subselect 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Kevin Kammler
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: subselect
3
+ Version: 0.0.1
4
+ Summary: Subset-contrast unsupervised model selection for anomaly detection
5
+ Project-URL: Homepage, https://github.com/k-kammler/subselect
6
+ Project-URL: Repository, https://github.com/k-kammler/subselect
7
+ Project-URL: Issues, https://github.com/k-kammler/subselect/issues
8
+ Author-email: Kevin Kammler <k.kammler@uni-mainz.de>
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 Kevin Kammler
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: anomaly detection,hyperparameter selection,model selection,outlier detection,unsupervised learning
32
+ Classifier: Development Status :: 2 - Pre-Alpha
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Operating System :: OS Independent
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.10
38
+ Classifier: Programming Language :: Python :: 3.11
39
+ Classifier: Programming Language :: Python :: 3.12
40
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
41
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
42
+ Requires-Python: >=3.10
43
+ Description-Content-Type: text/markdown
44
+
45
+ # subselect
46
+
47
+ **Subset-contrast unsupervised model selection for anomaly detection.**
48
+
49
+ Given a pool of trained anomaly-detection models with no ground-truth labels,
50
+ pick the best one by measuring the contrast between each model's top-ranked
51
+ points (predicted anomalies) and the rest of the data.
52
+
53
+ ## Why
54
+
55
+ In anomaly detection, practitioners must choose among many models (Isolation
56
+ Forest, LOF, kNN, OCSVM, ...) with different hyperparameters. The standard way
57
+ to evaluate models is supervised, using metrics like ROC-AUC that require
58
+ labelled anomalies. In practice, labelled anomalies are rarely available; if
59
+ they were, the problem would already be partly solved.
60
+
61
+ `subselect` selects a model from a pool **without any labels**. A good model
62
+ concentrates real anomalies at the top of its score ranking, so its top subset
63
+ looks distinct from the rest of the data in feature space; a bad model's top
64
+ subset looks like ordinary data. Measuring that contrast with a density or
65
+ distance metric gives an unsupervised ranking that correlates with the hidden
66
+ supervised ROC-AUC ranking.
67
+
68
+ ## Install
69
+
70
+ ```bash
71
+ pip install subselect
72
+ ```
73
+
74
+ Requires Python 3.10+ with numpy, scipy, pandas, and scikit-learn.
75
+
76
+ ## Quickstart
77
+
78
+ ```python
79
+ import subselect as ss
80
+
81
+ # scores: (n_samples, n_models) anomaly score per (sample, model); higher = more anomalous
82
+ # X: (n_samples, n_features) original feature matrix
83
+ # model_names: length-n_models identifiers aligned with the columns of scores
84
+ best = ss.Evaluator().select(scores=scores, X=X, model_names=model_names)
85
+ print("Picked:", model_names[best])
86
+ ```
87
+
88
+ `Evaluator()` selects with a single default metric (Mahalanobis distance). Pass
89
+ a list of metrics and `ensemble=True` to get a pick per metric and add their
90
+ rank-aggregated ensemble (the strongest configuration in our benchmarks):
91
+
92
+ ```python
93
+ ev = ss.Evaluator(metrics=["mahalanobis_distance", "gmm_likelihood"], ensemble=True)
94
+ results = ev.evaluate(scores, X, model_names)
95
+ print(results.selected_per_metric) # pick per metric, plus 'ensemble'
96
+ ```
97
+
98
+ ## Shipped metrics
99
+
100
+ Five contrast metrics across the density and distance families:
101
+
102
+ | name | family | direction |
103
+ |------|--------|-----------|
104
+ | `mahalanobis_distance` | distance, global covariance | higher = better |
105
+ | `knn_avg_distance` | distance, local | higher = better |
106
+ | `lof_score` | local density (LOF) | higher = better |
107
+ | `kde_log_likelihood` | density, non-parametric | lower = better |
108
+ | `gmm_likelihood` | density, parametric | lower = better |
109
+
110
+ `ss.list_metrics()` lists them and `ss.metric_sets["core"]` is all five.
111
+
112
+ ## Custom contrast metric
113
+
114
+ Three ways to add your own:
115
+
116
+ ```python
117
+ # 1. Subclass - most flexible
118
+ class MyMetric(ss.ContrastMetric):
119
+ direction = +1 # higher value -> better model
120
+ kind = "per-point"
121
+
122
+ def fit_reference(self, X_ref): # fit on the complement (the reference)
123
+ return self
124
+
125
+ def score_subset(self, X_sub):
126
+ return {"mean": ..., "std": ..., "median": ...}
127
+
128
+ # 2. Decorator - one function
129
+ @ss.contrast_metric(direction=+1, kind="per-point", name="my_metric")
130
+ def my_metric(subset_X, reference_X):
131
+ return float(...)
132
+
133
+ # 3. Inline callable - quickest: pass (callable, direction) tuples
134
+ ss.Evaluator(metrics=[(my_metric_callable, +1)]).select(scores, X, model_names)
135
+ ```
136
+
137
+ ## Citation
138
+
139
+ See [CITATION.cff](https://github.com/k-kammler/subselect/blob/HEAD/CITATION.cff). A paper describing the method is forthcoming.
140
+
141
+ ## License
142
+
143
+ MIT. See [LICENSE](https://github.com/k-kammler/subselect/blob/HEAD/LICENSE).
@@ -0,0 +1,99 @@
1
+ # subselect
2
+
3
+ **Subset-contrast unsupervised model selection for anomaly detection.**
4
+
5
+ Given a pool of trained anomaly-detection models with no ground-truth labels,
6
+ pick the best one by measuring the contrast between each model's top-ranked
7
+ points (predicted anomalies) and the rest of the data.
8
+
9
+ ## Why
10
+
11
+ In anomaly detection, practitioners must choose among many models (Isolation
12
+ Forest, LOF, kNN, OCSVM, ...) with different hyperparameters. The standard way
13
+ to evaluate models is supervised, using metrics like ROC-AUC that require
14
+ labelled anomalies. In practice, labelled anomalies are rarely available; if
15
+ they were, the problem would already be partly solved.
16
+
17
+ `subselect` selects a model from a pool **without any labels**. A good model
18
+ concentrates real anomalies at the top of its score ranking, so its top subset
19
+ looks distinct from the rest of the data in feature space; a bad model's top
20
+ subset looks like ordinary data. Measuring that contrast with a density or
21
+ distance metric gives an unsupervised ranking that correlates with the hidden
22
+ supervised ROC-AUC ranking.
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install subselect
28
+ ```
29
+
30
+ Requires Python 3.10+ with numpy, scipy, pandas, and scikit-learn.
31
+
32
+ ## Quickstart
33
+
34
+ ```python
35
+ import subselect as ss
36
+
37
+ # scores: (n_samples, n_models) anomaly score per (sample, model); higher = more anomalous
38
+ # X: (n_samples, n_features) original feature matrix
39
+ # model_names: length-n_models identifiers aligned with the columns of scores
40
+ best = ss.Evaluator().select(scores=scores, X=X, model_names=model_names)
41
+ print("Picked:", model_names[best])
42
+ ```
43
+
44
+ `Evaluator()` selects with a single default metric (Mahalanobis distance). Pass
45
+ a list of metrics and `ensemble=True` to get a pick per metric and add their
46
+ rank-aggregated ensemble (the strongest configuration in our benchmarks):
47
+
48
+ ```python
49
+ ev = ss.Evaluator(metrics=["mahalanobis_distance", "gmm_likelihood"], ensemble=True)
50
+ results = ev.evaluate(scores, X, model_names)
51
+ print(results.selected_per_metric) # pick per metric, plus 'ensemble'
52
+ ```
53
+
54
+ ## Shipped metrics
55
+
56
+ Five contrast metrics across the density and distance families:
57
+
58
+ | name | family | direction |
59
+ |------|--------|-----------|
60
+ | `mahalanobis_distance` | distance, global covariance | higher = better |
61
+ | `knn_avg_distance` | distance, local | higher = better |
62
+ | `lof_score` | local density (LOF) | higher = better |
63
+ | `kde_log_likelihood` | density, non-parametric | lower = better |
64
+ | `gmm_likelihood` | density, parametric | lower = better |
65
+
66
+ `ss.list_metrics()` lists them and `ss.metric_sets["core"]` is all five.
67
+
68
+ ## Custom contrast metric
69
+
70
+ Three ways to add your own:
71
+
72
+ ```python
73
+ # 1. Subclass - most flexible
74
+ class MyMetric(ss.ContrastMetric):
75
+ direction = +1 # higher value -> better model
76
+ kind = "per-point"
77
+
78
+ def fit_reference(self, X_ref): # fit on the complement (the reference)
79
+ return self
80
+
81
+ def score_subset(self, X_sub):
82
+ return {"mean": ..., "std": ..., "median": ...}
83
+
84
+ # 2. Decorator - one function
85
+ @ss.contrast_metric(direction=+1, kind="per-point", name="my_metric")
86
+ def my_metric(subset_X, reference_X):
87
+ return float(...)
88
+
89
+ # 3. Inline callable - quickest: pass (callable, direction) tuples
90
+ ss.Evaluator(metrics=[(my_metric_callable, +1)]).select(scores, X, model_names)
91
+ ```
92
+
93
+ ## Citation
94
+
95
+ See [CITATION.cff](https://github.com/k-kammler/subselect/blob/HEAD/CITATION.cff). A paper describing the method is forthcoming.
96
+
97
+ ## License
98
+
99
+ MIT. See [LICENSE](https://github.com/k-kammler/subselect/blob/HEAD/LICENSE).
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.21"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "subselect"
7
+ version = "0.0.1"
8
+ description = "Subset-contrast unsupervised model selection for anomaly detection"
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Kevin Kammler", email = "k.kammler@uni-mainz.de" },
14
+ ]
15
+ keywords = [
16
+ "anomaly detection",
17
+ "outlier detection",
18
+ "model selection",
19
+ "unsupervised learning",
20
+ "hyperparameter selection",
21
+ ]
22
+ classifiers = [
23
+ "Development Status :: 2 - Pre-Alpha",
24
+ "Intended Audience :: Science/Research",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Operating System :: OS Independent",
27
+ "Programming Language :: Python :: 3",
28
+ "Programming Language :: Python :: 3.10",
29
+ "Programming Language :: Python :: 3.11",
30
+ "Programming Language :: Python :: 3.12",
31
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
32
+ "Topic :: Scientific/Engineering :: Information Analysis",
33
+ ]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/k-kammler/subselect"
37
+ Repository = "https://github.com/k-kammler/subselect"
38
+ Issues = "https://github.com/k-kammler/subselect/issues"
39
+
40
+ [tool.hatch.build.targets.wheel]
41
+ packages = ["subselect"]
@@ -0,0 +1,3 @@
1
+ """Subset-contrast unsupervised model selection for anomaly detection."""
2
+ __version__ = "0.0.1"
3
+ __all__ = ["__version__"]