separatix 0.1.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. separatix-0.1.0a1/LICENSE +21 -0
  2. separatix-0.1.0a1/PKG-INFO +172 -0
  3. separatix-0.1.0a1/README.md +143 -0
  4. separatix-0.1.0a1/pyproject.toml +49 -0
  5. separatix-0.1.0a1/separatix/__init__.py +8 -0
  6. separatix-0.1.0a1/separatix/api.py +37 -0
  7. separatix-0.1.0a1/separatix/config.py +42 -0
  8. separatix-0.1.0a1/separatix/constants.py +57 -0
  9. separatix-0.1.0a1/separatix/densify.py +106 -0
  10. separatix-0.1.0a1/separatix/exceptions.py +13 -0
  11. separatix-0.1.0a1/separatix/metrics/__init__.py +1 -0
  12. separatix-0.1.0a1/separatix/metrics/audit.py +61 -0
  13. separatix-0.1.0a1/separatix/metrics/baseline.py +21 -0
  14. separatix-0.1.0a1/separatix/metrics/boundary.py +54 -0
  15. separatix-0.1.0a1/separatix/metrics/geometry.py +80 -0
  16. separatix-0.1.0a1/separatix/metrics/graph.py +72 -0
  17. separatix-0.1.0a1/separatix/metrics/neighborhood.py +96 -0
  18. separatix-0.1.0a1/separatix/metrics/topology.py +130 -0
  19. separatix-0.1.0a1/separatix/models/__init__.py +1 -0
  20. separatix-0.1.0a1/separatix/models/probes.py +384 -0
  21. separatix-0.1.0a1/separatix/models/scoring.py +151 -0
  22. separatix-0.1.0a1/separatix/preprocessing.py +13 -0
  23. separatix-0.1.0a1/separatix/profiler.py +171 -0
  24. separatix-0.1.0a1/separatix/recommendation/__init__.py +1 -0
  25. separatix-0.1.0a1/separatix/recommendation/engine.py +240 -0
  26. separatix-0.1.0a1/separatix/recommendation/text.py +60 -0
  27. separatix-0.1.0a1/separatix/report.py +37 -0
  28. separatix-0.1.0a1/separatix/sampling.py +124 -0
  29. separatix-0.1.0a1/separatix/utils/__init__.py +1 -0
  30. separatix-0.1.0a1/separatix/utils/json.py +20 -0
  31. separatix-0.1.0a1/separatix/utils/random.py +10 -0
  32. separatix-0.1.0a1/separatix/utils/warnings.py +11 -0
  33. separatix-0.1.0a1/separatix/validation.py +92 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,172 @@
1
+ Metadata-Version: 2.4
2
+ Name: separatix
3
+ Version: 0.1.0a1
4
+ Summary: Diagnostic profiling of labeled embeddings for classification model complexity guidance.
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Author: Niklas Melton
8
+ Author-email: niklas@example.com
9
+ Requires-Python: >=3.9,<3.15
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Provides-Extra: examples
19
+ Provides-Extra: pandas
20
+ Provides-Extra: tda
21
+ Requires-Dist: matplotlib (>=3.6) ; extra == "examples"
22
+ Requires-Dist: numpy (>=1.23)
23
+ Requires-Dist: pandas (>=1.5) ; extra == "pandas" or extra == "examples"
24
+ Requires-Dist: ripser (>=0.6) ; extra == "tda"
25
+ Requires-Dist: scikit-learn (>=1.2)
26
+ Requires-Dist: scipy (>=1.9)
27
+ Description-Content-Type: text/markdown
28
+
29
+ [![separatix logo](https://raw.githubusercontent.com/NiklasMelton/Separatix/develop/img/separatix_logo.png)](https://github.com/NiklasMelton/Separatix)
30
+
31
+ # separatix
32
+
33
+ `separatix` profiles labeled feature spaces before classifier training and
34
+ returns transparent, confidence-aware guidance about apparent classification
35
+ complexity.
36
+
37
+ The intended use case includes learned embeddings, but the package is not
38
+ restricted to embeddings. It also works on raw feature matrices when you want a
39
+ coarse diagnostic of whether the observed class geometry looks mostly linear,
40
+ smoothly nonlinear, local or kernel-like, fragmented, bottlenecked, or too
41
+ unreliable to trust.
42
+
43
+ `separatix` does not claim to pick the optimal classifier. It is a pretraining
44
+ diagnostic and auditing tool designed to make its reasoning visible.
45
+
46
+ ## Installation
47
+
48
+ ```bash
49
+ pip install separatix
50
+ ```
51
+
52
+ To install the latest development version directly from GitHub:
53
+
54
+ ```bash
55
+ pip install "git+https://github.com/NiklasMelton/Separatix.git@develop"
56
+ ```
57
+
58
+ ## Quick start
59
+
60
+ ```python
61
+ from separatix import diagnose
62
+
63
+ recommendation = diagnose(X, y, random_state=0)
64
+ print(recommendation)
65
+ ```
66
+
67
+ For a structured audit:
68
+
69
+ ```python
70
+ from separatix import diagnose
71
+
72
+ report = diagnose(X, y, return_report=True, random_state=0)
73
+ print(report.recommendation_text)
74
+ print(report.decision_path)
75
+ print(report.scores)
76
+ print(report.to_json())
77
+ ```
78
+
79
+ ## What It Accepts
80
+
81
+ - Dense NumPy arrays
82
+ - SciPy sparse matrices
83
+ - pandas DataFrames and Series when pandas is installed
84
+ - Binary and multiclass classification targets
85
+ - String or numeric labels treated as categorical class identifiers
86
+
87
+ Regression, multilabel classification, and multioutput classification are not
88
+ supported.
89
+
90
+ ## What It Returns
91
+
92
+ By default, `diagnose(...)` returns a plain-text recommendation. With
93
+ `return_report=True`, it returns a `DiagnosticReport` that includes:
94
+
95
+ - the recommendation label
96
+ - plain-text recommendation text
97
+ - confidence level
98
+ - underlying metric groups
99
+ - normalized summary scores
100
+ - a visible decision path
101
+ - warnings and skipped diagnostics
102
+ - sampling and densification events
103
+ - preprocessing and runtime metadata
104
+
105
+ The report is JSON-serializable through `report.to_dict()` and `report.to_json()`.
106
+
107
+ ## Recommendation Categories
108
+
109
+ - `linear_likely_sufficient`
110
+ - `smooth_nonlinear_recommended`
111
+ - `kernel_or_local_recommended`
112
+ - `high_capacity_or_partitioning_recommended`
113
+ - `feature_or_label_bottleneck_likely`
114
+ - `insufficient_data_or_unreliable_geometry`
115
+ - `inconclusive`
116
+
117
+ These categories are intentionally coarse. They describe the apparent geometry
118
+ and difficulty of the labeled feature space, not a guaranteed best model choice.
119
+
120
+ ## Decision Pipeline
121
+
122
+ The recommendation is produced by a fixed, inspectable pipeline:
123
+
124
+ 1. Validate inputs and encode labels.
125
+ 2. Audit class counts, imbalance, sparsity, and basic dataset conditions.
126
+ 3. Compute geometry, neighborhood, and boundary-related diagnostics.
127
+ 4. Run simple probe models and compare them to a dummy baseline.
128
+ 5. Aggregate the raw metrics into normalized scores such as signal,
129
+ linearity, nonlinearity, overlap, fragmentation, and reliability.
130
+ 6. Apply explicit rule-based branching to map those scores to a recommendation
131
+ category and confidence level.
132
+ 7. Render both a plain-language summary and a structured report.
133
+
134
+ The full rationale and decision rules are documented in
135
+ [docs/decision_pipeline.md](https://github.com/NiklasMelton/Separatix/blob/develop/docs/decision_pipeline.md).
136
+
137
+ ## Sparse Inputs And Memory Behavior
138
+
139
+ Sparse matrices are accepted directly. Diagnostics that need dense data use a
140
+ shared densification policy rather than a separate dense-only code path. When a
141
+ step would require densification, `separatix` can fail, skip, or warn and
142
+ subsample before densifying, depending on configuration. These events are
143
+ recorded in the report.
144
+
145
+ ## Examples
146
+
147
+ - [examples/basic_breast_cancer.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/basic_breast_cancer.py)
148
+ - [examples/linear_hyperplane_visual.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/linear_hyperplane_visual.py)
149
+ - [examples/curvilinear_boundary_visual.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/curvilinear_boundary_visual.py)
150
+ - [examples/high_dimensional_linear_hyperplane.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/high_dimensional_linear_hyperplane.py)
151
+ - [examples/high_dimensional_curvilinear_hyperplane.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/high_dimensional_curvilinear_hyperplane.py)
152
+ - [examples/moons_vs_linear.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/moons_vs_linear.py)
153
+ - [examples/circles_kernel_signal.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/circles_kernel_signal.py)
154
+ - [examples/multiclass_wine.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/multiclass_wine.py)
155
+ - [examples/sparse_text_like_embeddings.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/sparse_text_like_embeddings.py)
156
+
157
+ ## Related Work
158
+
159
+ This package is not an implementation of a published dataset-complexity
160
+ procedure, but the project is adjacent to and inspired by prior work on
161
+ classification complexity and data geometry. In particular, we would like to acknowledge:
162
+
163
+ - Ho and Basu, "Complexity Measures of Supervised Classification Problems"
164
+ ([PDF](https://sci2s.ugr.es/keel/pdf/algorithm/articulo/2002-IEEE-TPAMI-Ho-DC.pdf))
165
+ - Lorena, Garcia, Lehmann, Souto, and Ho, "How Complex Is Your
166
+ Classification Problem? A Survey on Measuring Classification Complexity"
167
+ ([DOI](https://doi.org/10.1145/3347711),
168
+ [PDF](https://dl.acm.org/doi/epdf/10.1145/3347711))
169
+
170
+ We do not follow those procedures directly, but they are relevant background
171
+ for why geometry-aware pretraining diagnostics are useful.
172
+
@@ -0,0 +1,143 @@
1
+ [![separatix logo](https://raw.githubusercontent.com/NiklasMelton/Separatix/develop/img/separatix_logo.png)](https://github.com/NiklasMelton/Separatix)
2
+
3
+ # separatix
4
+
5
+ `separatix` profiles labeled feature spaces before classifier training and
6
+ returns transparent, confidence-aware guidance about apparent classification
7
+ complexity.
8
+
9
+ The intended use case includes learned embeddings, but the package is not
10
+ restricted to embeddings. It also works on raw feature matrices when you want a
11
+ coarse diagnostic of whether the observed class geometry looks mostly linear,
12
+ smoothly nonlinear, local or kernel-like, fragmented, bottlenecked, or too
13
+ unreliable to trust.
14
+
15
+ `separatix` does not claim to pick the optimal classifier. It is a pretraining
16
+ diagnostic and auditing tool designed to make its reasoning visible.
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ pip install separatix
22
+ ```
23
+
24
+ To install the latest development version directly from GitHub:
25
+
26
+ ```bash
27
+ pip install "git+https://github.com/NiklasMelton/Separatix.git@develop"
28
+ ```
29
+
30
+ ## Quick start
31
+
32
+ ```python
33
+ from separatix import diagnose
34
+
35
+ recommendation = diagnose(X, y, random_state=0)
36
+ print(recommendation)
37
+ ```
38
+
39
+ For a structured audit:
40
+
41
+ ```python
42
+ from separatix import diagnose
43
+
44
+ report = diagnose(X, y, return_report=True, random_state=0)
45
+ print(report.recommendation_text)
46
+ print(report.decision_path)
47
+ print(report.scores)
48
+ print(report.to_json())
49
+ ```
50
+
51
+ ## What It Accepts
52
+
53
+ - Dense NumPy arrays
54
+ - SciPy sparse matrices
55
+ - pandas DataFrames and Series when pandas is installed
56
+ - Binary and multiclass classification targets
57
+ - String or numeric labels treated as categorical class identifiers
58
+
59
+ Regression, multilabel classification, and multioutput classification are not
60
+ supported.
61
+
62
+ ## What It Returns
63
+
64
+ By default, `diagnose(...)` returns a plain-text recommendation. With
65
+ `return_report=True`, it returns a `DiagnosticReport` that includes:
66
+
67
+ - the recommendation label
68
+ - plain-text recommendation text
69
+ - confidence level
70
+ - underlying metric groups
71
+ - normalized summary scores
72
+ - a visible decision path
73
+ - warnings and skipped diagnostics
74
+ - sampling and densification events
75
+ - preprocessing and runtime metadata
76
+
77
+ The report is JSON-serializable through `report.to_dict()` and `report.to_json()`.
78
+
79
+ ## Recommendation Categories
80
+
81
+ - `linear_likely_sufficient`
82
+ - `smooth_nonlinear_recommended`
83
+ - `kernel_or_local_recommended`
84
+ - `high_capacity_or_partitioning_recommended`
85
+ - `feature_or_label_bottleneck_likely`
86
+ - `insufficient_data_or_unreliable_geometry`
87
+ - `inconclusive`
88
+
89
+ These categories are intentionally coarse. They describe the apparent geometry
90
+ and difficulty of the labeled feature space, not a guaranteed best model choice.
91
+
92
+ ## Decision Pipeline
93
+
94
+ The recommendation is produced by a fixed, inspectable pipeline:
95
+
96
+ 1. Validate inputs and encode labels.
97
+ 2. Audit class counts, imbalance, sparsity, and basic dataset conditions.
98
+ 3. Compute geometry, neighborhood, and boundary-related diagnostics.
99
+ 4. Run simple probe models and compare them to a dummy baseline.
100
+ 5. Aggregate the raw metrics into normalized scores such as signal,
101
+ linearity, nonlinearity, overlap, fragmentation, and reliability.
102
+ 6. Apply explicit rule-based branching to map those scores to a recommendation
103
+ category and confidence level.
104
+ 7. Render both a plain-language summary and a structured report.
105
+
106
+ The full rationale and decision rules are documented in
107
+ [docs/decision_pipeline.md](https://github.com/NiklasMelton/Separatix/blob/develop/docs/decision_pipeline.md).
108
+
109
+ ## Sparse Inputs And Memory Behavior
110
+
111
+ Sparse matrices are accepted directly. Diagnostics that need dense data use a
112
+ shared densification policy rather than a separate dense-only code path. When a
113
+ step would require densification, `separatix` can fail, skip, or warn and
114
+ subsample before densifying, depending on configuration. These events are
115
+ recorded in the report.
116
+
117
+ ## Examples
118
+
119
+ - [examples/basic_breast_cancer.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/basic_breast_cancer.py)
120
+ - [examples/linear_hyperplane_visual.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/linear_hyperplane_visual.py)
121
+ - [examples/curvilinear_boundary_visual.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/curvilinear_boundary_visual.py)
122
+ - [examples/high_dimensional_linear_hyperplane.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/high_dimensional_linear_hyperplane.py)
123
+ - [examples/high_dimensional_curvilinear_hyperplane.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/high_dimensional_curvilinear_hyperplane.py)
124
+ - [examples/moons_vs_linear.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/moons_vs_linear.py)
125
+ - [examples/circles_kernel_signal.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/circles_kernel_signal.py)
126
+ - [examples/multiclass_wine.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/multiclass_wine.py)
127
+ - [examples/sparse_text_like_embeddings.py](https://github.com/NiklasMelton/Separatix/blob/develop/examples/sparse_text_like_embeddings.py)
128
+
129
+ ## Related Work
130
+
131
+ This package is not an implementation of a published dataset-complexity
132
+ procedure, but the project is adjacent to and inspired by prior work on
133
+ classification complexity and data geometry. In particular, we would like to acknowledge:
134
+
135
+ - Ho and Basu, "Complexity Measures of Supervised Classification Problems"
136
+ ([PDF](https://sci2s.ugr.es/keel/pdf/algorithm/articulo/2002-IEEE-TPAMI-Ho-DC.pdf))
137
+ - Lorena, Garcia, Lehmann, Souto, and Ho, "How Complex Is Your
138
+ Classification Problem? A Survey on Measuring Classification Complexity"
139
+ ([DOI](https://doi.org/10.1145/3347711),
140
+ [PDF](https://dl.acm.org/doi/epdf/10.1145/3347711))
141
+
142
+ We do not follow those procedures directly, but they are relevant background
143
+ for why geometry-aware pretraining diagnostics are useful.
@@ -0,0 +1,49 @@
1
+ [tool.poetry]
2
+ name = "separatix"
3
+ version = "0.1.0a1"
4
+ description = "Diagnostic profiling of labeled embeddings for classification model complexity guidance."
5
+ authors = ["Niklas Melton <niklas@example.com>"]
6
+ readme = "README.md"
7
+ license = "MIT"
8
+ packages = [{ include = "separatix" }]
9
+
10
+ [tool.poetry.dependencies]
11
+ python = ">=3.9,<3.15"
12
+ numpy = ">=1.23"
13
+ scipy = ">=1.9"
14
+ scikit-learn = ">=1.2"
15
+ pandas = { version = ">=1.5", optional = true }
16
+ matplotlib = { version = ">=3.6", optional = true }
17
+ ripser = { version = ">=0.6", optional = true }
18
+
19
+ [tool.poetry.group.dev.dependencies]
20
+ pytest = ">=7"
21
+ pytest-cov = ">=4"
22
+ ruff = ">=0.5"
23
+ mypy = ">=1"
24
+ build = ">=1"
25
+ twine = ">=5"
26
+
27
+ [tool.poetry.extras]
28
+ pandas = ["pandas"]
29
+ tda = ["ripser"]
30
+ examples = ["matplotlib", "pandas"]
31
+
32
+ [tool.ruff]
33
+ line-length = 88
34
+
35
+ [tool.ruff.lint]
36
+ select = ["E", "F", "I", "B", "UP"]
37
+
38
+ [tool.pytest.ini_options]
39
+ testpaths = ["tests"]
40
+ addopts = "-ra"
41
+
42
+ [tool.mypy]
43
+ python_version = "3.12"
44
+ warn_unused_configs = true
45
+ ignore_missing_imports = true
46
+
47
+ [build-system]
48
+ requires = ["poetry-core"]
49
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,8 @@
1
+ """Public package exports for separatix."""
2
+
3
+ from separatix.api import diagnose
4
+ from separatix.config import ProfilerConfig
5
+ from separatix.profiler import ComplexityProfiler
6
+ from separatix.report import DiagnosticReport
7
+
8
+ __all__ = ["ComplexityProfiler", "DiagnosticReport", "ProfilerConfig", "diagnose"]
@@ -0,0 +1,37 @@
1
+ """Functional API for separatix."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Literal
6
+
7
+ from separatix.profiler import ComplexityProfiler
8
+ from separatix.report import DiagnosticReport
9
+
10
+
11
def diagnose(
    X: Any,
    y: Any,
    *,
    return_report: bool = False,
    budget: Literal["fast", "standard", "extended"] = "standard",
    topology: Literal["off", "auto", "graph", "persistent"] = "auto",
    densify_policy: Literal["fail", "warn_and_sample", "skip"] = "warn_and_sample",
    max_dense_mb: int = 512,
    max_samples: int | None = None,
    random_state: int | None = None,
    warn_on_densify: bool = True,
) -> str | DiagnosticReport:
    """Profile ``X`` against labels ``y`` and return the resulting guidance.

    A ``ComplexityProfiler`` is built from the keyword options, fitted on
    ``(X, y)``, and its ``report_`` attribute is read back.

    Args:
        X: Feature matrix handed to ``ComplexityProfiler.fit``.
        y: Labels handed to ``ComplexityProfiler.fit``.
        return_report: When true, return the report object itself instead of
            its ``recommendation_text``.
        budget: Forwarded unchanged to ``ComplexityProfiler``.
        topology: Forwarded unchanged to ``ComplexityProfiler``.
        densify_policy: Forwarded unchanged to ``ComplexityProfiler``.
        max_dense_mb: Forwarded unchanged to ``ComplexityProfiler``.
        max_samples: Forwarded unchanged to ``ComplexityProfiler``.
        random_state: Forwarded unchanged to ``ComplexityProfiler``.
        warn_on_densify: Forwarded unchanged to ``ComplexityProfiler``.

    Returns:
        The fitted profiler's report when ``return_report`` is true, otherwise
        that report's ``recommendation_text``.

    Raises:
        RuntimeError: If the fitted profiler's ``report_`` is ``None``.
    """
    profiler_options = {
        "budget": budget,
        "topology": topology,
        "densify_policy": densify_policy,
        "max_dense_mb": max_dense_mb,
        "max_samples": max_samples,
        "random_state": random_state,
        "warn_on_densify": warn_on_densify,
    }
    fitted = ComplexityProfiler(**profiler_options).fit(X, y)
    result = fitted.report_
    if result is None:
        raise RuntimeError("Profiler did not produce a report.")
    if return_report:
        return result
    return result.recommendation_text
@@ -0,0 +1,42 @@
1
+ """Configuration objects for separatix."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import asdict, dataclass
6
+ from typing import Literal
7
+
8
+ from separatix.constants import BUDGETS
9
+
10
+
11
@dataclass
class ProfilerConfig:
    """Settings bundle for the separatix diagnostic profiler.

    Values are checked in ``__post_init__``; an unsupported option name or a
    non-positive limit raises ``ValueError``.
    """

    budget: Literal["fast", "standard", "extended"] = "standard"
    topology: Literal["off", "auto", "graph", "persistent"] = "auto"
    densify_policy: Literal["fail", "warn_and_sample", "skip"] = "warn_and_sample"
    max_dense_mb: int = 512
    max_samples: int | None = None
    min_dense_samples: int = 200
    random_state: int | None = None
    warn_on_densify: bool = True
    n_jobs: int | None = None

    def __post_init__(self) -> None:
        """Reject unsupported option names and non-positive limits.

        Raises:
            ValueError: If ``budget`` is not a key of ``BUDGETS``, if
                ``topology`` or ``densify_policy`` is outside its allowed set,
                or if ``max_dense_mb``, ``max_samples`` (when given) or
                ``min_dense_samples`` is not positive.
        """
        if self.budget not in BUDGETS:
            raise ValueError(f"Unsupported budget: {self.budget!r}")

        # Closed option sets: (wording used in the error, value, allowed set).
        choice_checks = (
            ("topology mode", self.topology, {"off", "auto", "graph", "persistent"}),
            (
                "densify policy",
                self.densify_policy,
                {"fail", "warn_and_sample", "skip"},
            ),
        )
        for label, value, allowed in choice_checks:
            if value not in allowed:
                raise ValueError(f"Unsupported {label}: {value!r}")

        if self.max_dense_mb <= 0:
            raise ValueError("max_dense_mb must be positive.")

        sample_cap = self.max_samples
        if sample_cap is not None and sample_cap <= 0:
            raise ValueError("max_samples must be positive when provided.")

        if self.min_dense_samples <= 0:
            raise ValueError("min_dense_samples must be positive.")

    def to_dict(self) -> dict[str, object]:
        """Return the field values as a plain dictionary built by ``asdict``."""
        field_values = asdict(self)
        return field_values
@@ -0,0 +1,57 @@
1
+ """Constants used across the separatix package."""
2
+
3
+ from __future__ import annotations
4
+
5
# Recommendation category identifiers.
LINEAR_LIKELY_SUFFICIENT = "linear_likely_sufficient"
SMOOTH_NONLINEAR_RECOMMENDED = "smooth_nonlinear_recommended"
KERNEL_OR_LOCAL_RECOMMENDED = "kernel_or_local_recommended"
HIGH_CAPACITY_OR_PARTITIONING_RECOMMENDED = "high_capacity_or_partitioning_recommended"
FEATURE_OR_LABEL_BOTTLENECK_LIKELY = "feature_or_label_bottleneck_likely"
INSUFFICIENT_DATA_OR_UNRELIABLE_GEOMETRY = "insufficient_data_or_unreliable_geometry"
INCONCLUSIVE = "inconclusive"

# One short headline sentence per recommendation category identifier.
RECOMMENDATION_LABELS = dict(
    (
        (LINEAR_LIKELY_SUFFICIENT, "Linear model likely sufficient."),
        (SMOOTH_NONLINEAR_RECOMMENDED, "Smooth nonlinear model likely useful."),
        (KERNEL_OR_LOCAL_RECOMMENDED, "Kernel or local model likely useful."),
        (
            HIGH_CAPACITY_OR_PARTITIONING_RECOMMENDED,
            "Higher-capacity or partitioning model likely useful.",
        ),
        (FEATURE_OR_LABEL_BOTTLENECK_LIKELY, "Feature or label bottleneck likely."),
        (
            INSUFFICIENT_DATA_OR_UNRELIABLE_GEOMETRY,
            "Insufficient data or unreliable geometry.",
        ),
        (INCONCLUSIVE, "Diagnostic result is inconclusive."),
    )
)

# Settings keyed by budget name; ``ProfilerConfig`` rejects any budget that is
# not a key here. ``run_persistent_topology`` is ``False`` for "fast" and the
# string "auto" for the other budgets.
BUDGETS = {
    "fast": dict(
        max_probe_samples=5_000,
        max_neighbor_samples=5_000,
        max_boundary_samples=2_000,
        cv_folds=3,
        bootstrap_repeats=0,
        run_kernel_probe=False,
        run_persistent_topology=False,
    ),
    "standard": dict(
        max_probe_samples=20_000,
        max_neighbor_samples=10_000,
        max_boundary_samples=3_000,
        cv_folds=5,
        bootstrap_repeats=3,
        run_kernel_probe=True,
        run_persistent_topology="auto",
    ),
    "extended": dict(
        max_probe_samples=50_000,
        max_neighbor_samples=20_000,
        max_boundary_samples=5_000,
        cv_folds=5,
        bootstrap_repeats=10,
        run_kernel_probe=True,
        run_persistent_topology="auto",
    ),
}

# Confidence level names, listed from "low" to "high".
CONFIDENCE_LEVELS = ("low", "medium", "high")
@@ -0,0 +1,106 @@
1
+ """Dense conversion helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from math import floor
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ from scipy import sparse
10
+
11
+ from separatix.config import ProfilerConfig
12
+ from separatix.exceptions import DensificationError, DensificationWarning
13
+ from separatix.sampling import stratified_subsample_indices
14
+ from separatix.utils.warnings import record_warning
15
+
16
+
17
def ensure_dense_or_sample(
    X: Any,
    y: np.ndarray,
    *,
    reason: str,
    config: ProfilerConfig,
    report_context: dict[str, Any],
) -> dict[str, Any]:
    """Return a dense matrix, optionally after stratified subsampling.

    Non-sparse input is passed through ``np.asarray``. Sparse input is
    densified in full when its estimated dense size is at most
    ``config.max_dense_mb``. Otherwise ``config.densify_policy`` decides:
    ``"fail"`` raises, ``"skip"`` records a skipped diagnostic, and any other
    policy densifies a row subsample chosen by
    ``stratified_subsample_indices`` (or skips when that subsample would be
    smaller than ``min(config.min_dense_samples, n_rows)``).

    Args:
        X: Dense array-like or SciPy sparse container.
        y: Labels; indexed with the sampled row indices on the subsample path.
        reason: Name of the diagnostic requesting dense data; used in event
            records, skip records, and warning/error messages.
        config: Supplies ``max_dense_mb``, ``densify_policy``,
            ``max_samples``, ``min_dense_samples``, ``random_state`` and
            ``warn_on_densify``.
        report_context: Mutable mapping; its ``"densification_events"``,
            ``"warnings"`` and ``"skipped_diagnostics"`` lists are created
            when missing and appended to.

    Returns:
        A dict with keys ``"X"`` (dense array, or ``None`` when skipped),
        ``"y"`` (labels matching the returned rows), ``"performed"`` (whether
        a sparse-to-dense conversion happened) and ``"skipped"``.

    Raises:
        DensificationError: If the policy is ``"fail"`` and the full dense
            size exceeds the limit, or if the subsample is too small under a
            policy other than ``"warn_and_sample"``.
    """
    densification_events = report_context.setdefault("densification_events", [])
    warnings_list = report_context.setdefault("warnings", [])
    skipped = report_context.setdefault("skipped_diagnostics", [])

    if not sparse.issparse(X):
        return {"X": np.asarray(X), "y": y, "performed": False, "skipped": False}

    n_rows, n_cols = int(X.shape[0]), int(X.shape[1])
    dtype = X.dtype if X.dtype is not None else np.dtype(float)
    itemsize = np.dtype(dtype).itemsize
    estimated_mb = n_rows * n_cols * itemsize / 1024**2
    event = {
        "operation": "densify",
        "reason": reason,
        "input_shape": [n_rows, n_cols],
        "estimated_full_dense_mb": float(estimated_mb),
        "max_dense_mb": config.max_dense_mb,
        "policy": config.densify_policy,
        "sampling_used": False,
        "n_original": n_rows,
        "n_used": n_rows,
        "status": "performed",
    }

    if estimated_mb <= config.max_dense_mb:
        dense = X.toarray()
        densification_events.append(event)
        if config.warn_on_densify:
            record_warning(
                f"Sparse input densified for {reason}.",
                warnings_list,
                DensificationWarning,
            )
        return {"X": dense, "y": y, "performed": True, "skipped": False}

    if config.densify_policy == "fail":
        message = (
            f"Dense conversion for {reason} would exceed "
            f"max_dense_mb={config.max_dense_mb}."
        )
        raise DensificationError(message)

    if config.densify_policy == "skip":
        event["status"] = "skipped"
        densification_events.append(event)
        skipped.append(
            {
                "name": reason,
                "reason": "dense conversion exceeds configured memory budget",
            }
        )
        return {"X": None, "y": y, "performed": False, "skipped": True}

    # Largest row count whose dense form fits the budget. n_cols cannot be
    # zero here: a zero-width matrix has estimated_mb == 0 and returned above.
    max_rows = floor((config.max_dense_mb * 1024**2) / (n_cols * itemsize))
    n_used = min(n_rows, max_rows, config.max_samples or n_rows)
    if n_used < min(config.min_dense_samples, n_rows):
        skipped.append({"name": reason, "reason": "dense subsample would be too small"})
        event["status"] = "skipped_too_small"
        event["n_used"] = int(max(n_used, 0))
        densification_events.append(event)
        if config.densify_policy == "warn_and_sample":
            return {"X": None, "y": y, "performed": False, "skipped": True}
        raise DensificationError(f"Unable to densify enough samples for {reason}.")

    indices = stratified_subsample_indices(
        y,
        n_samples=n_used,
        random_state=config.random_state,
    )
    # Some sparse formats (COO, DIA, BSR) cannot be row-indexed; go through
    # CSR so every sparse input format can be subsampled.
    dense = X.tocsr()[indices, :].toarray()
    event["sampling_used"] = True
    event["n_used"] = int(indices.shape[0])
    event["status"] = "performed_on_subsample"
    densification_events.append(event)
    if config.warn_on_densify:
        record_warning(
            f"Sparse input was stratified-subsampled then densified for {reason}.",
            warnings_list,
            DensificationWarning,
        )
    return {"X": dense, "y": y[indices], "performed": True, "skipped": False}
@@ -0,0 +1,13 @@
1
+ """Custom exceptions and warnings for separatix."""
2
+
3
+
4
class SeparatixError(Exception):
    """Root of the separatix exception hierarchy."""


class DensificationError(SeparatixError):
    """Signal that a required dense conversion is disallowed or impossible."""


class DensificationWarning(UserWarning):
    """Warning category for sparse data being densified or subsampled."""
@@ -0,0 +1 @@
1
+ """Diagnostic metric modules."""