separatix 0.1.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- separatix-0.1.0a1/LICENSE +21 -0
- separatix-0.1.0a1/PKG-INFO +172 -0
- separatix-0.1.0a1/README.md +143 -0
- separatix-0.1.0a1/pyproject.toml +49 -0
- separatix-0.1.0a1/separatix/__init__.py +8 -0
- separatix-0.1.0a1/separatix/api.py +37 -0
- separatix-0.1.0a1/separatix/config.py +42 -0
- separatix-0.1.0a1/separatix/constants.py +57 -0
- separatix-0.1.0a1/separatix/densify.py +106 -0
- separatix-0.1.0a1/separatix/exceptions.py +13 -0
- separatix-0.1.0a1/separatix/metrics/__init__.py +1 -0
- separatix-0.1.0a1/separatix/metrics/audit.py +61 -0
- separatix-0.1.0a1/separatix/metrics/baseline.py +21 -0
- separatix-0.1.0a1/separatix/metrics/boundary.py +54 -0
- separatix-0.1.0a1/separatix/metrics/geometry.py +80 -0
- separatix-0.1.0a1/separatix/metrics/graph.py +72 -0
- separatix-0.1.0a1/separatix/metrics/neighborhood.py +96 -0
- separatix-0.1.0a1/separatix/metrics/topology.py +130 -0
- separatix-0.1.0a1/separatix/models/__init__.py +1 -0
- separatix-0.1.0a1/separatix/models/probes.py +384 -0
- separatix-0.1.0a1/separatix/models/scoring.py +151 -0
- separatix-0.1.0a1/separatix/preprocessing.py +13 -0
- separatix-0.1.0a1/separatix/profiler.py +171 -0
- separatix-0.1.0a1/separatix/recommendation/__init__.py +1 -0
- separatix-0.1.0a1/separatix/recommendation/engine.py +240 -0
- separatix-0.1.0a1/separatix/recommendation/text.py +60 -0
- separatix-0.1.0a1/separatix/report.py +37 -0
- separatix-0.1.0a1/separatix/sampling.py +124 -0
- separatix-0.1.0a1/separatix/utils/__init__.py +1 -0
- separatix-0.1.0a1/separatix/utils/json.py +20 -0
- separatix-0.1.0a1/separatix/utils/random.py +10 -0
- separatix-0.1.0a1/separatix/utils/warnings.py +11 -0
- separatix-0.1.0a1/separatix/validation.py +92 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: separatix
|
|
3
|
+
Version: 0.1.0a1
|
|
4
|
+
Summary: Diagnostic profiling of labeled embeddings for classification model complexity guidance.
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: Niklas Melton
|
|
8
|
+
Author-email: niklas@example.com
|
|
9
|
+
Requires-Python: >=3.9,<3.15
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Provides-Extra: examples
|
|
19
|
+
Provides-Extra: pandas
|
|
20
|
+
Provides-Extra: tda
|
|
21
|
+
Requires-Dist: matplotlib (>=3.6) ; extra == "examples"
|
|
22
|
+
Requires-Dist: numpy (>=1.23)
|
|
23
|
+
Requires-Dist: pandas (>=1.5) ; extra == "pandas" or extra == "examples"
|
|
24
|
+
Requires-Dist: ripser (>=0.6) ; extra == "tda"
|
|
25
|
+
Requires-Dist: scikit-learn (>=1.2)
|
|
26
|
+
Requires-Dist: scipy (>=1.9)
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
[Separatix on GitHub](https://github.com/NiklasMelton/Separatix)
|
|
30
|
+
|
|
31
|
+
# separatix
|
|
32
|
+
|
|
33
|
+
`separatix` profiles labeled feature spaces before classifier training and
|
|
34
|
+
returns transparent, confidence-aware guidance about apparent classification
|
|
35
|
+
complexity.
|
|
36
|
+
|
|
37
|
+
The intended use case includes learned embeddings, but the package is not
|
|
38
|
+
restricted to embeddings. It also works on raw feature matrices when you want a
|
|
39
|
+
coarse diagnostic of whether the observed class geometry looks mostly linear,
|
|
40
|
+
smoothly nonlinear, local or kernel-like, fragmented, bottlenecked, or too
|
|
41
|
+
unreliable to trust.
|
|
42
|
+
|
|
43
|
+
`separatix` does not claim to pick the optimal classifier. It is a pretraining
|
|
44
|
+
diagnostic and auditing tool designed to make its reasoning visible.
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install separatix
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
To install the latest development version directly from GitHub:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install "git+https://github.com/NiklasMelton/Separatix.git@develop"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Quick start
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from separatix import diagnose
|
|
62
|
+
|
|
63
|
+
recommendation = diagnose(X, y, random_state=0)
|
|
64
|
+
print(recommendation)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
For a structured audit:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from separatix import diagnose
|
|
71
|
+
|
|
72
|
+
report = diagnose(X, y, return_report=True, random_state=0)
|
|
73
|
+
print(report.recommendation_text)
|
|
74
|
+
print(report.decision_path)
|
|
75
|
+
print(report.scores)
|
|
76
|
+
print(report.to_json())
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## What It Accepts
|
|
80
|
+
|
|
81
|
+
- Dense NumPy arrays
|
|
82
|
+
- SciPy sparse matrices
|
|
83
|
+
- pandas DataFrames and Series when pandas is installed
|
|
84
|
+
- Binary and multiclass classification targets
|
|
85
|
+
- String or numeric labels treated as categorical class identifiers
|
|
86
|
+
|
|
87
|
+
Regression, multilabel classification, and multioutput classification are not
|
|
88
|
+
supported.
|
|
89
|
+
|
|
90
|
+
## What It Returns
|
|
91
|
+
|
|
92
|
+
By default, `diagnose(...)` returns a plain-text recommendation. With
|
|
93
|
+
`return_report=True`, it returns a `DiagnosticReport` that includes:
|
|
94
|
+
|
|
95
|
+
- the recommendation label
|
|
96
|
+
- plain-text recommendation text
|
|
97
|
+
- confidence level
|
|
98
|
+
- underlying metric groups
|
|
99
|
+
- normalized summary scores
|
|
100
|
+
- a visible decision path
|
|
101
|
+
- warnings and skipped diagnostics
|
|
102
|
+
- sampling and densification events
|
|
103
|
+
- preprocessing and runtime metadata
|
|
104
|
+
|
|
105
|
+
The report is JSON-serializable through `report.to_dict()` and `report.to_json()`.
|
|
106
|
+
|
|
107
|
+
## Recommendation Categories
|
|
108
|
+
|
|
109
|
+
- `linear_likely_sufficient`
|
|
110
|
+
- `smooth_nonlinear_recommended`
|
|
111
|
+
- `kernel_or_local_recommended`
|
|
112
|
+
- `high_capacity_or_partitioning_recommended`
|
|
113
|
+
- `feature_or_label_bottleneck_likely`
|
|
114
|
+
- `insufficient_data_or_unreliable_geometry`
|
|
115
|
+
- `inconclusive`
|
|
116
|
+
|
|
117
|
+
These categories are intentionally coarse. They describe the apparent geometry
|
|
118
|
+
and difficulty of the labeled feature space, not a guaranteed best model choice.
|
|
119
|
+
|
|
120
|
+
## Decision Pipeline
|
|
121
|
+
|
|
122
|
+
The recommendation is produced by a fixed, inspectable pipeline:
|
|
123
|
+
|
|
124
|
+
1. Validate inputs and encode labels.
|
|
125
|
+
2. Audit class counts, imbalance, sparsity, and basic dataset conditions.
|
|
126
|
+
3. Compute geometry, neighborhood, and boundary-related diagnostics.
|
|
127
|
+
4. Run simple probe models and compare them to a dummy baseline.
|
|
128
|
+
5. Aggregate the raw metrics into normalized scores such as signal,
|
|
129
|
+
linearity, nonlinearity, overlap, fragmentation, and reliability.
|
|
130
|
+
6. Apply explicit rule-based branching to map those scores to a recommendation
|
|
131
|
+
category and confidence level.
|
|
132
|
+
7. Render both a plain-language summary and a structured report.
|
|
133
|
+
|
|
134
|
+
The full rationale and decision rules are documented in
|
|
135
|
+
[docs/decision_pipeline.md](docs/decision_pipeline.md).
|
|
136
|
+
|
|
137
|
+
## Sparse Inputs And Memory Behavior
|
|
138
|
+
|
|
139
|
+
Sparse matrices are accepted directly. Diagnostics that need dense data use a
|
|
140
|
+
shared densification policy rather than a separate dense-only code path. When a
|
|
141
|
+
step would require densification, `separatix` can fail, skip, or warn and
|
|
142
|
+
subsample before densifying, depending on configuration. These events are
|
|
143
|
+
recorded in the report.
|
|
144
|
+
|
|
145
|
+
## Examples
|
|
146
|
+
|
|
147
|
+
- [examples/basic_breast_cancer.py](examples/basic_breast_cancer.py)
|
|
148
|
+
- [examples/linear_hyperplane_visual.py](examples/linear_hyperplane_visual.py)
|
|
149
|
+
- [examples/curvilinear_boundary_visual.py](examples/curvilinear_boundary_visual.py)
|
|
150
|
+
- [examples/high_dimensional_linear_hyperplane.py](examples/high_dimensional_linear_hyperplane.py)
|
|
151
|
+
- [examples/high_dimensional_curvilinear_hyperplane.py](examples/high_dimensional_curvilinear_hyperplane.py)
|
|
152
|
+
- [examples/moons_vs_linear.py](examples/moons_vs_linear.py)
|
|
153
|
+
- [examples/circles_kernel_signal.py](examples/circles_kernel_signal.py)
|
|
154
|
+
- [examples/multiclass_wine.py](examples/multiclass_wine.py)
|
|
155
|
+
- [examples/sparse_text_like_embeddings.py](examples/sparse_text_like_embeddings.py)
|
|
156
|
+
|
|
157
|
+
## Related Work
|
|
158
|
+
|
|
159
|
+
This package is not an implementation of a published dataset-complexity
|
|
160
|
+
procedure, but the project is adjacent to and inspired by prior work on
|
|
161
|
+
classification complexity and data geometry. In particular, we would like to acknowledge:
|
|
162
|
+
|
|
163
|
+
- Ho and Basu, "Complexity Measures of Supervised Classification Problems"
|
|
164
|
+
([PDF](https://sci2s.ugr.es/keel/pdf/algorithm/articulo/2002-IEEE-TPAMI-Ho-DC.pdf))
|
|
165
|
+
- Lorena, Garcia, Lehmann, Souto, and Ho, "How Complex Is Your
|
|
166
|
+
Classification Problem? A Survey on Measuring Classification Complexity"
|
|
167
|
+
([DOI](https://doi.org/10.1145/3347711),
|
|
168
|
+
[PDF](https://dl.acm.org/doi/epdf/10.1145/3347711))
|
|
169
|
+
|
|
170
|
+
We do not follow those procedures directly, but they are relevant background
|
|
171
|
+
for why geometry-aware pretraining diagnostics are useful.
|
|
172
|
+
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
[Separatix on GitHub](https://github.com/NiklasMelton/Separatix)
|
|
2
|
+
|
|
3
|
+
# separatix
|
|
4
|
+
|
|
5
|
+
`separatix` profiles labeled feature spaces before classifier training and
|
|
6
|
+
returns transparent, confidence-aware guidance about apparent classification
|
|
7
|
+
complexity.
|
|
8
|
+
|
|
9
|
+
The intended use case includes learned embeddings, but the package is not
|
|
10
|
+
restricted to embeddings. It also works on raw feature matrices when you want a
|
|
11
|
+
coarse diagnostic of whether the observed class geometry looks mostly linear,
|
|
12
|
+
smoothly nonlinear, local or kernel-like, fragmented, bottlenecked, or too
|
|
13
|
+
unreliable to trust.
|
|
14
|
+
|
|
15
|
+
`separatix` does not claim to pick the optimal classifier. It is a pretraining
|
|
16
|
+
diagnostic and auditing tool designed to make its reasoning visible.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install separatix
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
To install the latest development version directly from GitHub:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install "git+https://github.com/NiklasMelton/Separatix.git@develop"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quick start
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from separatix import diagnose
|
|
34
|
+
|
|
35
|
+
recommendation = diagnose(X, y, random_state=0)
|
|
36
|
+
print(recommendation)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
For a structured audit:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from separatix import diagnose
|
|
43
|
+
|
|
44
|
+
report = diagnose(X, y, return_report=True, random_state=0)
|
|
45
|
+
print(report.recommendation_text)
|
|
46
|
+
print(report.decision_path)
|
|
47
|
+
print(report.scores)
|
|
48
|
+
print(report.to_json())
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## What It Accepts
|
|
52
|
+
|
|
53
|
+
- Dense NumPy arrays
|
|
54
|
+
- SciPy sparse matrices
|
|
55
|
+
- pandas DataFrames and Series when pandas is installed
|
|
56
|
+
- Binary and multiclass classification targets
|
|
57
|
+
- String or numeric labels treated as categorical class identifiers
|
|
58
|
+
|
|
59
|
+
Regression, multilabel classification, and multioutput classification are not
|
|
60
|
+
supported.
|
|
61
|
+
|
|
62
|
+
## What It Returns
|
|
63
|
+
|
|
64
|
+
By default, `diagnose(...)` returns a plain-text recommendation. With
|
|
65
|
+
`return_report=True`, it returns a `DiagnosticReport` that includes:
|
|
66
|
+
|
|
67
|
+
- the recommendation label
|
|
68
|
+
- plain-text recommendation text
|
|
69
|
+
- confidence level
|
|
70
|
+
- underlying metric groups
|
|
71
|
+
- normalized summary scores
|
|
72
|
+
- a visible decision path
|
|
73
|
+
- warnings and skipped diagnostics
|
|
74
|
+
- sampling and densification events
|
|
75
|
+
- preprocessing and runtime metadata
|
|
76
|
+
|
|
77
|
+
The report is JSON-serializable through `report.to_dict()` and `report.to_json()`.
|
|
78
|
+
|
|
79
|
+
## Recommendation Categories
|
|
80
|
+
|
|
81
|
+
- `linear_likely_sufficient`
|
|
82
|
+
- `smooth_nonlinear_recommended`
|
|
83
|
+
- `kernel_or_local_recommended`
|
|
84
|
+
- `high_capacity_or_partitioning_recommended`
|
|
85
|
+
- `feature_or_label_bottleneck_likely`
|
|
86
|
+
- `insufficient_data_or_unreliable_geometry`
|
|
87
|
+
- `inconclusive`
|
|
88
|
+
|
|
89
|
+
These categories are intentionally coarse. They describe the apparent geometry
|
|
90
|
+
and difficulty of the labeled feature space, not a guaranteed best model choice.
|
|
91
|
+
|
|
92
|
+
## Decision Pipeline
|
|
93
|
+
|
|
94
|
+
The recommendation is produced by a fixed, inspectable pipeline:
|
|
95
|
+
|
|
96
|
+
1. Validate inputs and encode labels.
|
|
97
|
+
2. Audit class counts, imbalance, sparsity, and basic dataset conditions.
|
|
98
|
+
3. Compute geometry, neighborhood, and boundary-related diagnostics.
|
|
99
|
+
4. Run simple probe models and compare them to a dummy baseline.
|
|
100
|
+
5. Aggregate the raw metrics into normalized scores such as signal,
|
|
101
|
+
linearity, nonlinearity, overlap, fragmentation, and reliability.
|
|
102
|
+
6. Apply explicit rule-based branching to map those scores to a recommendation
|
|
103
|
+
category and confidence level.
|
|
104
|
+
7. Render both a plain-language summary and a structured report.
|
|
105
|
+
|
|
106
|
+
The full rationale and decision rules are documented in
|
|
107
|
+
[docs/decision_pipeline.md](docs/decision_pipeline.md).
|
|
108
|
+
|
|
109
|
+
## Sparse Inputs And Memory Behavior
|
|
110
|
+
|
|
111
|
+
Sparse matrices are accepted directly. Diagnostics that need dense data use a
|
|
112
|
+
shared densification policy rather than a separate dense-only code path. When a
|
|
113
|
+
step would require densification, `separatix` can fail, skip, or warn and
|
|
114
|
+
subsample before densifying, depending on configuration. These events are
|
|
115
|
+
recorded in the report.
|
|
116
|
+
|
|
117
|
+
## Examples
|
|
118
|
+
|
|
119
|
+
- [examples/basic_breast_cancer.py](examples/basic_breast_cancer.py)
|
|
120
|
+
- [examples/linear_hyperplane_visual.py](examples/linear_hyperplane_visual.py)
|
|
121
|
+
- [examples/curvilinear_boundary_visual.py](examples/curvilinear_boundary_visual.py)
|
|
122
|
+
- [examples/high_dimensional_linear_hyperplane.py](examples/high_dimensional_linear_hyperplane.py)
|
|
123
|
+
- [examples/high_dimensional_curvilinear_hyperplane.py](examples/high_dimensional_curvilinear_hyperplane.py)
|
|
124
|
+
- [examples/moons_vs_linear.py](examples/moons_vs_linear.py)
|
|
125
|
+
- [examples/circles_kernel_signal.py](examples/circles_kernel_signal.py)
|
|
126
|
+
- [examples/multiclass_wine.py](examples/multiclass_wine.py)
|
|
127
|
+
- [examples/sparse_text_like_embeddings.py](examples/sparse_text_like_embeddings.py)
|
|
128
|
+
|
|
129
|
+
## Related Work
|
|
130
|
+
|
|
131
|
+
This package is not an implementation of a published dataset-complexity
|
|
132
|
+
procedure, but the project is adjacent to and inspired by prior work on
|
|
133
|
+
classification complexity and data geometry. In particular, we would like to acknowledge:
|
|
134
|
+
|
|
135
|
+
- Ho and Basu, "Complexity Measures of Supervised Classification Problems"
|
|
136
|
+
([PDF](https://sci2s.ugr.es/keel/pdf/algorithm/articulo/2002-IEEE-TPAMI-Ho-DC.pdf))
|
|
137
|
+
- Lorena, Garcia, Lehmann, Souto, and Ho, "How Complex Is Your
|
|
138
|
+
Classification Problem? A Survey on Measuring Classification Complexity"
|
|
139
|
+
([DOI](https://doi.org/10.1145/3347711),
|
|
140
|
+
[PDF](https://dl.acm.org/doi/epdf/10.1145/3347711))
|
|
141
|
+
|
|
142
|
+
We do not follow those procedures directly, but they are relevant background
|
|
143
|
+
for why geometry-aware pretraining diagnostics are useful.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "separatix"
|
|
3
|
+
version = "0.1.0a1"
|
|
4
|
+
description = "Diagnostic profiling of labeled embeddings for classification model complexity guidance."
|
|
5
|
+
authors = ["Niklas Melton <niklas@example.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
packages = [{ include = "separatix" }]
|
|
9
|
+
|
|
10
|
+
[tool.poetry.dependencies]
|
|
11
|
+
python = ">=3.9,<3.15"
|
|
12
|
+
numpy = ">=1.23"
|
|
13
|
+
scipy = ">=1.9"
|
|
14
|
+
scikit-learn = ">=1.2"
|
|
15
|
+
pandas = { version = ">=1.5", optional = true }
|
|
16
|
+
matplotlib = { version = ">=3.6", optional = true }
|
|
17
|
+
ripser = { version = ">=0.6", optional = true }
|
|
18
|
+
|
|
19
|
+
[tool.poetry.group.dev.dependencies]
|
|
20
|
+
pytest = ">=7"
|
|
21
|
+
pytest-cov = ">=4"
|
|
22
|
+
ruff = ">=0.5"
|
|
23
|
+
mypy = ">=1"
|
|
24
|
+
build = ">=1"
|
|
25
|
+
twine = ">=5"
|
|
26
|
+
|
|
27
|
+
[tool.poetry.extras]
|
|
28
|
+
pandas = ["pandas"]
|
|
29
|
+
tda = ["ripser"]
|
|
30
|
+
examples = ["matplotlib", "pandas"]
|
|
31
|
+
|
|
32
|
+
[tool.ruff]
|
|
33
|
+
line-length = 88
|
|
34
|
+
|
|
35
|
+
[tool.ruff.lint]
|
|
36
|
+
select = ["E", "F", "I", "B", "UP"]
|
|
37
|
+
|
|
38
|
+
[tool.pytest.ini_options]
|
|
39
|
+
testpaths = ["tests"]
|
|
40
|
+
addopts = "-ra"
|
|
41
|
+
|
|
42
|
+
[tool.mypy]
|
|
43
|
+
python_version = "3.12"
|
|
44
|
+
warn_unused_configs = true
|
|
45
|
+
ignore_missing_imports = true
|
|
46
|
+
|
|
47
|
+
[build-system]
|
|
48
|
+
requires = ["poetry-core"]
|
|
49
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Public package exports for separatix."""
|
|
2
|
+
|
|
3
|
+
from separatix.api import diagnose
|
|
4
|
+
from separatix.config import ProfilerConfig
|
|
5
|
+
from separatix.profiler import ComplexityProfiler
|
|
6
|
+
from separatix.report import DiagnosticReport
|
|
7
|
+
|
|
8
|
+
__all__ = ["ComplexityProfiler", "DiagnosticReport", "ProfilerConfig", "diagnose"]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Functional API for separatix."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Literal
|
|
6
|
+
|
|
7
|
+
from separatix.profiler import ComplexityProfiler
|
|
8
|
+
from separatix.report import DiagnosticReport
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def diagnose(
    X: Any,
    y: Any,
    *,
    return_report: bool = False,
    budget: Literal["fast", "standard", "extended"] = "standard",
    topology: Literal["off", "auto", "graph", "persistent"] = "auto",
    densify_policy: Literal["fail", "warn_and_sample", "skip"] = "warn_and_sample",
    max_dense_mb: int = 512,
    max_samples: int | None = None,
    random_state: int | None = None,
    warn_on_densify: bool = True,
) -> str | DiagnosticReport:
    """Profile ``X`` and ``y`` in one call and return the recommendation.

    A ``ComplexityProfiler`` is built from the keyword options, fitted on
    ``(X, y)``, and its ``report_`` attribute is read back.

    Args:
        X: Feature matrix handed to ``ComplexityProfiler.fit``.
        y: Labels handed to ``ComplexityProfiler.fit``.
        return_report: When true, return the full report object instead of
            only its ``recommendation_text``.
        budget: Forwarded to ``ComplexityProfiler``.
        topology: Forwarded to ``ComplexityProfiler``.
        densify_policy: Forwarded to ``ComplexityProfiler``.
        max_dense_mb: Forwarded to ``ComplexityProfiler``.
        max_samples: Forwarded to ``ComplexityProfiler``.
        random_state: Forwarded to ``ComplexityProfiler``.
        warn_on_densify: Forwarded to ``ComplexityProfiler``.

    Returns:
        The report's ``recommendation_text`` by default, or the report itself
        when ``return_report`` is true.

    Raises:
        RuntimeError: If the fitted profiler's ``report_`` is ``None``.
    """
    profiler_options = {
        "budget": budget,
        "topology": topology,
        "densify_policy": densify_policy,
        "max_dense_mb": max_dense_mb,
        "max_samples": max_samples,
        "random_state": random_state,
        "warn_on_densify": warn_on_densify,
    }
    fitted = ComplexityProfiler(**profiler_options).fit(X, y)
    report = fitted.report_
    if report is None:
        raise RuntimeError("Profiler did not produce a report.")
    if return_report:
        return report
    return report.recommendation_text
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Configuration objects for separatix."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import asdict, dataclass
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from separatix.constants import BUDGETS
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class ProfilerConfig:
    """Settings for a separatix profiling run.

    Values are checked once, right after construction, by ``__post_init__``;
    an unsupported mode or a non-positive limit raises ``ValueError``.
    """

    budget: Literal["fast", "standard", "extended"] = "standard"
    topology: Literal["off", "auto", "graph", "persistent"] = "auto"
    densify_policy: Literal["fail", "warn_and_sample", "skip"] = "warn_and_sample"
    max_dense_mb: int = 512
    max_samples: int | None = None
    min_dense_samples: int = 200
    random_state: int | None = None
    warn_on_densify: bool = True
    n_jobs: int | None = None

    def __post_init__(self) -> None:
        """Reject unsupported modes and non-positive limits.

        Raises:
            ValueError: On the first field that fails its check; fields are
                checked in declaration order.
        """
        topology_modes = {"off", "auto", "graph", "persistent"}
        densify_policies = {"fail", "warn_and_sample", "skip"}

        # Mode fields: each must be one of a fixed set of names.
        if self.budget not in BUDGETS:
            raise ValueError(f"Unsupported budget: {self.budget!r}")
        if self.topology not in topology_modes:
            raise ValueError(f"Unsupported topology mode: {self.topology!r}")
        if self.densify_policy not in densify_policies:
            raise ValueError(f"Unsupported densify policy: {self.densify_policy!r}")

        # Numeric limits: each must be strictly positive (max_samples may be None).
        if self.max_dense_mb <= 0:
            raise ValueError("max_dense_mb must be positive.")
        sample_cap = self.max_samples
        if sample_cap is not None and sample_cap <= 0:
            raise ValueError("max_samples must be positive when provided.")
        if self.min_dense_samples <= 0:
            raise ValueError("min_dense_samples must be positive.")

    def to_dict(self) -> dict[str, object]:
        """Return the configuration fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Constants used across the separatix package."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# Recommendation category identifiers.
LINEAR_LIKELY_SUFFICIENT = "linear_likely_sufficient"
SMOOTH_NONLINEAR_RECOMMENDED = "smooth_nonlinear_recommended"
KERNEL_OR_LOCAL_RECOMMENDED = "kernel_or_local_recommended"
HIGH_CAPACITY_OR_PARTITIONING_RECOMMENDED = "high_capacity_or_partitioning_recommended"
FEATURE_OR_LABEL_BOTTLENECK_LIKELY = "feature_or_label_bottleneck_likely"
INSUFFICIENT_DATA_OR_UNRELIABLE_GEOMETRY = "insufficient_data_or_unreliable_geometry"
INCONCLUSIVE = "inconclusive"

# One short headline sentence per recommendation category.
RECOMMENDATION_LABELS = {
    LINEAR_LIKELY_SUFFICIENT: "Linear model likely sufficient.",
    SMOOTH_NONLINEAR_RECOMMENDED: "Smooth nonlinear model likely useful.",
    KERNEL_OR_LOCAL_RECOMMENDED: "Kernel or local model likely useful.",
    HIGH_CAPACITY_OR_PARTITIONING_RECOMMENDED: (
        "Higher-capacity or partitioning model likely useful."
    ),
    FEATURE_OR_LABEL_BOTTLENECK_LIKELY: "Feature or label bottleneck likely.",
    INSUFFICIENT_DATA_OR_UNRELIABLE_GEOMETRY: (
        "Insufficient data or unreliable geometry."
    ),
    INCONCLUSIVE: "Diagnostic result is inconclusive.",
}

# Settings table keyed by budget name.
BUDGETS = {
    "fast": {
        "max_probe_samples": 5_000,
        "max_neighbor_samples": 5_000,
        "max_boundary_samples": 2_000,
        "cv_folds": 3,
        "bootstrap_repeats": 0,
        "run_kernel_probe": False,
        "run_persistent_topology": False,
    },
    "standard": {
        "max_probe_samples": 20_000,
        "max_neighbor_samples": 10_000,
        "max_boundary_samples": 3_000,
        "cv_folds": 5,
        "bootstrap_repeats": 3,
        "run_kernel_probe": True,
        "run_persistent_topology": "auto",
    },
    "extended": {
        "max_probe_samples": 50_000,
        "max_neighbor_samples": 20_000,
        "max_boundary_samples": 5_000,
        "cv_folds": 5,
        "bootstrap_repeats": 10,
        "run_kernel_probe": True,
        "run_persistent_topology": "auto",
    },
}

# Confidence level names, listed from "low" to "high".
CONFIDENCE_LEVELS = ("low", "medium", "high")
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Dense conversion helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from math import floor
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from scipy import sparse
|
|
10
|
+
|
|
11
|
+
from separatix.config import ProfilerConfig
|
|
12
|
+
from separatix.exceptions import DensificationError, DensificationWarning
|
|
13
|
+
from separatix.sampling import stratified_subsample_indices
|
|
14
|
+
from separatix.utils.warnings import record_warning
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def ensure_dense_or_sample(
    X: Any,
    y: np.ndarray,
    *,
    reason: str,
    config: ProfilerConfig,
    report_context: dict[str, Any],
) -> dict[str, Any]:
    """Return a dense matrix, optionally after stratified subsampling.

    Non-sparse input is passed through ``np.asarray`` and returned as-is.
    Sparse input is densified in full when its estimated dense size fits
    within ``config.max_dense_mb``. Otherwise ``config.densify_policy``
    decides the outcome: ``"fail"`` raises, ``"skip"`` records a skipped
    diagnostic, and ``"warn_and_sample"`` densifies a stratified row subsample.

    Args:
        X: Dense array-like or SciPy sparse matrix.
        y: Label array; it is indexed with the same row indices as ``X`` when
            a subsample is taken.
        reason: Label for the step that needs dense data. It is used in
            events, warnings, skipped-diagnostic entries, and error messages.
        config: Supplies ``max_dense_mb``, ``densify_policy``,
            ``warn_on_densify``, ``max_samples``, ``min_dense_samples`` and
            ``random_state``.
        report_context: Mutable mapping. The lists stored under
            ``"densification_events"``, ``"warnings"`` and
            ``"skipped_diagnostics"`` are created if missing and appended to.

    Returns:
        A dictionary with keys ``"X"`` (dense array, or ``None`` when the step
        is skipped), ``"y"`` (labels, subsampled together with ``X`` when
        sampling was used), ``"performed"`` (whether a sparse-to-dense
        conversion happened) and ``"skipped"``.

    Raises:
        DensificationError: If the dense size exceeds the budget and the
            policy is ``"fail"``, or if the subsample would be too small and
            the policy is not ``"warn_and_sample"``.
    """
    densification_events = report_context.setdefault("densification_events", [])
    warnings_list = report_context.setdefault("warnings", [])
    skipped = report_context.setdefault("skipped_diagnostics", [])

    if not sparse.issparse(X):
        return {"X": np.asarray(X), "y": y, "performed": False, "skipped": False}

    dtype = X.dtype if X.dtype is not None else np.dtype(float)
    itemsize = np.dtype(dtype).itemsize
    n_rows, n_cols = int(X.shape[0]), int(X.shape[1])
    estimated_mb = n_rows * n_cols * itemsize / 1024**2
    event = {
        "operation": "densify",
        "reason": reason,
        "input_shape": [n_rows, n_cols],
        "estimated_full_dense_mb": float(estimated_mb),
        "max_dense_mb": config.max_dense_mb,
        "policy": config.densify_policy,
        "sampling_used": False,
        "n_original": n_rows,
        "n_used": n_rows,
        "status": "performed",
    }

    # Whole matrix fits in the budget: densify everything.
    if estimated_mb <= config.max_dense_mb:
        dense = X.toarray()
        densification_events.append(event)
        if config.warn_on_densify:
            record_warning(
                f"Sparse input densified for {reason}.",
                warnings_list,
                DensificationWarning,
            )
        return {"X": dense, "y": y, "performed": True, "skipped": False}

    if config.densify_policy == "fail":
        message = (
            f"Dense conversion for {reason} would exceed "
            f"max_dense_mb={config.max_dense_mb}."
        )
        raise DensificationError(message)

    if config.densify_policy == "skip":
        event["status"] = "skipped"
        densification_events.append(event)
        skipped.append(
            {
                "name": reason,
                "reason": "dense conversion exceeds configured memory budget",
            }
        )
        return {"X": None, "y": y, "performed": False, "skipped": True}

    # Largest row count whose dense form still fits in the memory budget.
    # n_cols cannot be zero here: a zero-width matrix has an estimated size
    # of 0 MB and was already handled by the full-densify branch above.
    max_rows = floor((config.max_dense_mb * 1024**2) / (n_cols * itemsize))
    n_used = min(n_rows, max_rows, config.max_samples or n_rows)
    if n_used < min(config.min_dense_samples, n_rows):
        skipped.append({"name": reason, "reason": "dense subsample would be too small"})
        event["status"] = "skipped_too_small"
        event["n_used"] = int(max(n_used, 0))
        densification_events.append(event)
        if config.densify_policy == "warn_and_sample":
            return {"X": None, "y": y, "performed": False, "skipped": True}
        raise DensificationError(f"Unable to densify enough samples for {reason}.")

    indices = stratified_subsample_indices(
        y,
        n_samples=n_used,
        random_state=config.random_state,
    )
    # Some sparse formats (e.g. COO, DIA, BSR) do not support row indexing.
    # Converting to CSR first makes the row selection work for every format;
    # for input that is already CSR, ``tocsr`` does not copy.
    dense = X.tocsr()[indices, :].toarray()
    event["sampling_used"] = True
    event["n_used"] = int(indices.shape[0])
    event["status"] = "performed_on_subsample"
    densification_events.append(event)
    if config.warn_on_densify:
        record_warning(
            f"Sparse input was stratified-subsampled then densified for {reason}.",
            warnings_list,
            DensificationWarning,
        )
    return {"X": dense, "y": y[indices], "performed": True, "skipped": False}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Custom exceptions and warnings for separatix."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SeparatixError(Exception):
    """Root exception type for errors raised by separatix."""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DensificationError(SeparatixError):
    """Signal that a required dense conversion is disallowed or impossible."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DensificationWarning(UserWarning):
    """Warning category used when sparse data are densified or subsampled."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Diagnostic metric modules."""
|