separatix 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- separatix/__init__.py +8 -0
- separatix/api.py +37 -0
- separatix/config.py +42 -0
- separatix/constants.py +57 -0
- separatix/densify.py +106 -0
- separatix/exceptions.py +13 -0
- separatix/metrics/__init__.py +1 -0
- separatix/metrics/audit.py +61 -0
- separatix/metrics/baseline.py +21 -0
- separatix/metrics/boundary.py +54 -0
- separatix/metrics/geometry.py +80 -0
- separatix/metrics/graph.py +72 -0
- separatix/metrics/neighborhood.py +96 -0
- separatix/metrics/topology.py +130 -0
- separatix/models/__init__.py +1 -0
- separatix/models/probes.py +384 -0
- separatix/models/scoring.py +151 -0
- separatix/preprocessing.py +13 -0
- separatix/profiler.py +171 -0
- separatix/recommendation/__init__.py +1 -0
- separatix/recommendation/engine.py +240 -0
- separatix/recommendation/text.py +60 -0
- separatix/report.py +37 -0
- separatix/sampling.py +124 -0
- separatix/utils/__init__.py +1 -0
- separatix/utils/json.py +20 -0
- separatix/utils/random.py +10 -0
- separatix/utils/warnings.py +11 -0
- separatix/validation.py +92 -0
- separatix-0.1.0a1.dist-info/METADATA +172 -0
- separatix-0.1.0a1.dist-info/RECORD +33 -0
- separatix-0.1.0a1.dist-info/WHEEL +4 -0
- separatix-0.1.0a1.dist-info/licenses/LICENSE +21 -0
separatix/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Public package exports for separatix."""
|
|
2
|
+
|
|
3
|
+
from separatix.api import diagnose
|
|
4
|
+
from separatix.config import ProfilerConfig
|
|
5
|
+
from separatix.profiler import ComplexityProfiler
|
|
6
|
+
from separatix.report import DiagnosticReport
|
|
7
|
+
|
|
8
|
+
# Names re-exported as the public surface of the package.
__all__ = [
    "ComplexityProfiler",
    "DiagnosticReport",
    "ProfilerConfig",
    "diagnose",
]
|
separatix/api.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Functional API for separatix."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Literal
|
|
6
|
+
|
|
7
|
+
from separatix.profiler import ComplexityProfiler
|
|
8
|
+
from separatix.report import DiagnosticReport
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def diagnose(
    X: Any,
    y: Any,
    *,
    return_report: bool = False,
    budget: Literal["fast", "standard", "extended"] = "standard",
    topology: Literal["off", "auto", "graph", "persistent"] = "auto",
    densify_policy: Literal["fail", "warn_and_sample", "skip"] = "warn_and_sample",
    max_dense_mb: int = 512,
    max_samples: int | None = None,
    random_state: int | None = None,
    warn_on_densify: bool = True,
) -> str | DiagnosticReport:
    """Run a one-shot complexity diagnosis on ``X`` and ``y``.

    A ``ComplexityProfiler`` is built from the keyword options, fitted on
    the data, and its ``report_`` attribute is read back.

    Args:
        X: Feature matrix, forwarded unchanged to ``ComplexityProfiler.fit``.
        y: Labels, forwarded unchanged to ``ComplexityProfiler.fit``.
        return_report: When true, return the report object itself instead
            of its ``recommendation_text``.
        budget, topology, densify_policy, max_dense_mb, max_samples,
            random_state, warn_on_densify: Passed through by keyword to the
            ``ComplexityProfiler`` constructor.

    Returns:
        The fitted profiler's report when ``return_report`` is true,
        otherwise the report's ``recommendation_text``.

    Raises:
        RuntimeError: If the fitted profiler's ``report_`` is ``None``.
    """
    profiler_options = {
        "budget": budget,
        "topology": topology,
        "densify_policy": densify_policy,
        "max_dense_mb": max_dense_mb,
        "max_samples": max_samples,
        "random_state": random_state,
        "warn_on_densify": warn_on_densify,
    }
    fitted = ComplexityProfiler(**profiler_options).fit(X, y)
    outcome = fitted.report_
    if outcome is None:
        raise RuntimeError("Profiler did not produce a report.")
    if return_report:
        return outcome
    return outcome.recommendation_text
|
separatix/config.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Configuration objects for separatix."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import asdict, dataclass
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from separatix.constants import BUDGETS
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class ProfilerConfig:
    """Options accepted by the separatix diagnostic profiler.

    Values are checked once, right after construction; any unsupported
    choice or non-positive size raises ``ValueError``.
    """

    # Must be one of the keys of ``BUDGETS``.
    budget: Literal["fast", "standard", "extended"] = "standard"
    # Topology mode; checked against the four literals below.
    topology: Literal["off", "auto", "graph", "persistent"] = "auto"
    # What to do when a dense conversion would exceed ``max_dense_mb``.
    densify_policy: Literal["fail", "warn_and_sample", "skip"] = "warn_and_sample"
    # Memory ceiling (in MB) for dense conversions; must be > 0.
    max_dense_mb: int = 512
    # Optional sample cap; must be > 0 when given.
    max_samples: int | None = None
    # Must be > 0.
    min_dense_samples: int = 200
    random_state: int | None = None
    warn_on_densify: bool = True
    # Not validated by ``__post_init__``.
    n_jobs: int | None = None

    def __post_init__(self) -> None:
        """Reject unsupported choices and non-positive sizes."""
        choice_checks = (
            ("Unsupported budget", self.budget, BUDGETS),
            (
                "Unsupported topology mode",
                self.topology,
                {"off", "auto", "graph", "persistent"},
            ),
            (
                "Unsupported densify policy",
                self.densify_policy,
                {"fail", "warn_and_sample", "skip"},
            ),
        )
        for label, chosen, allowed in choice_checks:
            if chosen not in allowed:
                raise ValueError(f"{label}: {chosen!r}")

        if self.max_dense_mb <= 0:
            raise ValueError("max_dense_mb must be positive.")
        if self.max_samples is not None and self.max_samples <= 0:
            raise ValueError("max_samples must be positive when provided.")
        if self.min_dense_samples <= 0:
            raise ValueError("min_dense_samples must be positive.")

    def to_dict(self) -> dict[str, object]:
        """Return the configuration fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
|
separatix/constants.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Constants used across the separatix package."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# Machine-readable recommendation codes.
LINEAR_LIKELY_SUFFICIENT = "linear_likely_sufficient"
SMOOTH_NONLINEAR_RECOMMENDED = "smooth_nonlinear_recommended"
KERNEL_OR_LOCAL_RECOMMENDED = "kernel_or_local_recommended"
HIGH_CAPACITY_OR_PARTITIONING_RECOMMENDED = "high_capacity_or_partitioning_recommended"
FEATURE_OR_LABEL_BOTTLENECK_LIKELY = "feature_or_label_bottleneck_likely"
INSUFFICIENT_DATA_OR_UNRELIABLE_GEOMETRY = "insufficient_data_or_unreliable_geometry"
INCONCLUSIVE = "inconclusive"

# One-sentence human-readable label for each recommendation code.
RECOMMENDATION_LABELS = {
    LINEAR_LIKELY_SUFFICIENT: "Linear model likely sufficient.",
    SMOOTH_NONLINEAR_RECOMMENDED: "Smooth nonlinear model likely useful.",
    KERNEL_OR_LOCAL_RECOMMENDED: "Kernel or local model likely useful.",
    HIGH_CAPACITY_OR_PARTITIONING_RECOMMENDED: "Higher-capacity or partitioning model likely useful.",
    FEATURE_OR_LABEL_BOTTLENECK_LIKELY: "Feature or label bottleneck likely.",
    INSUFFICIENT_DATA_OR_UNRELIABLE_GEOMETRY: "Insufficient data or unreliable geometry.",
    INCONCLUSIVE: "Diagnostic result is inconclusive.",
}

# Per-budget limits and switches, keyed by budget name. Every budget
# carries the same seven settings.
BUDGETS = {
    "fast": {
        "max_probe_samples": 5000,
        "max_neighbor_samples": 5000,
        "max_boundary_samples": 2000,
        "cv_folds": 3,
        "bootstrap_repeats": 0,
        "run_kernel_probe": False,
        "run_persistent_topology": False,
    },
    "standard": {
        "max_probe_samples": 20000,
        "max_neighbor_samples": 10000,
        "max_boundary_samples": 3000,
        "cv_folds": 5,
        "bootstrap_repeats": 3,
        "run_kernel_probe": True,
        "run_persistent_topology": "auto",
    },
    "extended": {
        "max_probe_samples": 50000,
        "max_neighbor_samples": 20000,
        "max_boundary_samples": 5000,
        "cv_folds": 5,
        "bootstrap_repeats": 10,
        "run_kernel_probe": True,
        "run_persistent_topology": "auto",
    },
}

# Confidence levels, listed from lowest to highest.
CONFIDENCE_LEVELS = ("low", "medium", "high")
|
separatix/densify.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Dense conversion helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from math import floor
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from scipy import sparse
|
|
10
|
+
|
|
11
|
+
from separatix.config import ProfilerConfig
|
|
12
|
+
from separatix.exceptions import DensificationError, DensificationWarning
|
|
13
|
+
from separatix.sampling import stratified_subsample_indices
|
|
14
|
+
from separatix.utils.warnings import record_warning
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def ensure_dense_or_sample(
    X: Any,
    y: np.ndarray,
    *,
    reason: str,
    config: ProfilerConfig,
    report_context: dict[str, Any],
) -> dict[str, Any]:
    """Densify ``X`` for one diagnostic, subsampling or skipping if too large.

    Non-sparse input is returned as an array without consulting ``config``.
    Sparse input is converted whole when its estimated dense size fits in
    ``config.max_dense_mb``; otherwise ``config.densify_policy`` decides
    between raising, skipping, and densifying a stratified subsample.

    Args:
        X: Dense array-like or SciPy sparse matrix.
        y: Labels; indexed with the sampled rows when subsampling happens.
        reason: Name of the diagnostic requesting the conversion; used in
            events, warnings and error messages.
        config: Source of ``max_dense_mb``, ``densify_policy``,
            ``max_samples``, ``min_dense_samples``, ``random_state`` and
            ``warn_on_densify``.
        report_context: Mutable mapping; the lists under
            ``"densification_events"``, ``"warnings"`` and
            ``"skipped_diagnostics"`` are created if absent and appended to.

    Returns:
        A dict with keys ``"X"`` (dense array or ``None``), ``"y"``,
        ``"performed"`` and ``"skipped"``.

    Raises:
        DensificationError: When the matrix is over budget and the policy
            is ``"fail"``, or when the subsample would be too small and the
            policy is not ``"warn_and_sample"``.
    """
    event_log = report_context.setdefault("densification_events", [])
    warning_log = report_context.setdefault("warnings", [])
    skip_log = report_context.setdefault("skipped_diagnostics", [])

    def outcome(matrix: Any, labels: Any, *, performed: bool, skipped: bool) -> dict[str, Any]:
        return {"X": matrix, "y": labels, "performed": performed, "skipped": skipped}

    if not sparse.issparse(X):
        return outcome(np.asarray(X), y, performed=False, skipped=False)

    n_rows, n_cols = X.shape[0], X.shape[1]
    dtype = X.dtype if X.dtype is not None else np.dtype(float)
    itemsize = np.dtype(dtype).itemsize
    estimated_mb = n_rows * n_cols * itemsize / 1024**2
    policy = config.densify_policy

    # The same dict is mutated below before being appended to the event log.
    event = {
        "operation": "densify",
        "reason": reason,
        "input_shape": [int(n_rows), int(n_cols)],
        "estimated_full_dense_mb": float(estimated_mb),
        "max_dense_mb": config.max_dense_mb,
        "policy": policy,
        "sampling_used": False,
        "n_original": int(n_rows),
        "n_used": int(n_rows),
        "status": "performed",
    }

    # Case 1: the whole matrix fits in the memory budget.
    if estimated_mb <= config.max_dense_mb:
        full_dense = X.toarray()
        event_log.append(event)
        if config.warn_on_densify:
            record_warning(
                f"Sparse input densified for {reason}.",
                warning_log,
                DensificationWarning,
            )
        return outcome(full_dense, y, performed=True, skipped=False)

    # Case 2: over budget and the policy forbids any workaround.
    if policy == "fail":
        message = (
            f"Dense conversion for {reason} would exceed "
            f"max_dense_mb={config.max_dense_mb}."
        )
        raise DensificationError(message)

    # Case 3: over budget and the policy asks to skip the diagnostic.
    if policy == "skip":
        event["status"] = "skipped"
        event_log.append(event)
        skip_log.append(
            {
                "name": reason,
                "reason": "dense conversion exceeds configured memory budget",
            }
        )
        return outcome(None, y, performed=False, skipped=True)

    # Case 4: subsample down to the number of rows the budget can hold.
    max_rows = floor((config.max_dense_mb * 1024**2) / (n_cols * itemsize))
    n_used = min(n_rows, max_rows, config.max_samples or n_rows)
    if n_used < min(config.min_dense_samples, n_rows):
        skip_log.append({"name": reason, "reason": "dense subsample would be too small"})
        event["status"] = "skipped_too_small"
        event["n_used"] = int(max(n_used, 0))
        event_log.append(event)
        if policy != "warn_and_sample":
            raise DensificationError(f"Unable to densify enough samples for {reason}.")
        return outcome(None, y, performed=False, skipped=True)

    chosen = stratified_subsample_indices(
        y,
        n_samples=n_used,
        random_state=config.random_state,
    )
    sampled_dense = X[chosen, :].toarray()
    event.update(
        sampling_used=True,
        n_used=int(chosen.shape[0]),
        status="performed_on_subsample",
    )
    event_log.append(event)
    if config.warn_on_densify:
        record_warning(
            f"Sparse input was stratified-subsampled then densified for {reason}.",
            warning_log,
            DensificationWarning,
        )
    return outcome(sampled_dense, y[chosen], performed=True, skipped=False)
|
separatix/exceptions.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Custom exceptions and warnings for separatix."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SeparatixError(Exception):
    """Root of the separatix exception hierarchy."""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DensificationError(SeparatixError):
    """Signals that a required dense conversion is disallowed or impossible."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DensificationWarning(UserWarning):
    """Warning category used when sparse data are densified or subsampled."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Diagnostic metric modules."""
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Dataset audit metrics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def compute_dataset_audit(
    X: Any,
    y: np.ndarray,
    *,
    classes: np.ndarray,
    is_sparse: bool,
) -> dict[str, Any]:
    """Compute cheap dataset audit statistics.

    Args:
        X: Feature matrix. With ``is_sparse`` true it must expose ``nnz``,
            ``shape`` and ``dtype``; otherwise ``shape``, ``dtype`` and
            ``nbytes``.
        y: Labels. Each distinct value is used as an index into
            ``classes``, so they are presumably integer-encoded
            (NOTE(review): confirm against the caller).
        classes: Class names, indexed by the values found in ``y``.
        is_sparse: Selects the sparse or dense set of extra statistics.

    Returns:
        A dict with sample/feature/class counts, per-class counts and
        proportions, the imbalance ratio, and either sparsity statistics
        or the fraction of constant dense features.
    """
    class_ids, counts = np.unique(y, return_counts=True)
    proportions = counts / counts.sum()
    imbalance_ratio = float(counts.max() / max(1, counts.min()))

    result: dict[str, Any] = {
        "n_samples": int(X.shape[0]),
        "n_features": int(X.shape[1]),
        "n_classes": int(classes.shape[0]),
        "class_counts": {
            str(classes[label]): int(count)
            for label, count in zip(class_ids, counts)
        },
        "class_proportions": {
            str(classes[label]): float(share)
            for label, share in zip(class_ids, proportions)
        },
        "imbalance_ratio": imbalance_ratio,
        "is_sparse": is_sparse,
    }

    if is_sparse:
        n_cells = X.shape[0] * X.shape[1]
        density = float(X.nnz / n_cells) if X.shape[0] and X.shape[1] else 0.0
        result.update(
            {
                "nnz": int(X.nnz),
                "density": density,
                "sparsity_fraction": float(1.0 - density),
                "estimated_dense_memory_mb": float(
                    n_cells * X.dtype.itemsize / 1024**2
                ),
                "dtype": str(X.dtype),
            }
        )
        return result

    if X.shape[0] and X.shape[1]:
        # Compare min and max instead of testing std == 0.0: the standard
        # deviation of a constant float column can come out as a tiny
        # non-zero value because the column mean is rounded.
        is_constant = np.nanmin(X, axis=0) == np.nanmax(X, axis=0)
        constant_fraction = float(np.mean(is_constant))
    else:
        # No rows or no columns: there is no feature to call constant.
        constant_fraction = 0.0
    result.update(
        {
            "dtype": str(X.dtype),
            "constant_feature_fraction": constant_fraction,
            "estimated_dense_memory_mb": float(X.nbytes / 1024**2),
        }
    )
    return result
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Baseline metric helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def summarize_probe_family(probes: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Pick the probe with the highest balanced accuracy.

    Probes whose result has no ``"balanced_accuracy"`` key are ignored.
    On ties the probe that appears first in ``probes`` wins.

    Returns:
        A dict with ``"best_probe"`` (name or ``None``) and
        ``"best_probe_score"`` (its score or ``None``).
    """
    scored: dict[str, Any] = {}
    for probe_name, outcome in probes.items():
        if "balanced_accuracy" in outcome:
            scored[probe_name] = outcome["balanced_accuracy"]

    winner = max(scored, key=scored.get) if scored else None
    top_score = None if winner is None else scored.get(winner)
    return {"best_probe": winner, "best_probe_score": top_score}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Boundary candidate diagnostics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def compute_boundary_candidates(
    y: np.ndarray,
    neighborhood: dict[str, Any],
    probes: dict[str, dict[str, Any]],
) -> dict[str, Any]:
    """Extract boundary candidate indices from ambiguity and disagreement.

    A point is a candidate when its local entropy reaches the 75th
    percentile, or its local ambiguity reaches the larger of 0.33 and its
    own 75th percentile, or (when usable predictions are available) the
    ``"linear"`` and ``"knn"`` probes disagree on it.

    Args:
        y: Labels, indexed with the candidate positions. Assumed to be
            aligned row-for-row with the neighborhood arrays
            (NOTE(review): confirm when the neighborhood was computed on a
            subsample).
        neighborhood: Mapping that may hold per-point ``"local_entropy"``
            and ``"local_ambiguity"`` sequences.
        probes: Mapping of probe name to result; only the
            ``"predictions"`` of ``"linear"`` and ``"knn"`` are read.

    Returns:
        A dict with ``"candidate_indices"``, ``"candidate_fraction"``,
        ``"boundary_sample_size"``, ``"class_composition"`` and
        ``"warning"`` (a message or ``None``).
    """
    local_entropy = np.asarray(neighborhood.get("local_entropy", []), dtype=float)
    local_ambiguity = np.asarray(neighborhood.get("local_ambiguity", []), dtype=float)
    if local_entropy.size == 0 or local_ambiguity.size == 0:
        return {
            "candidate_indices": [],
            "candidate_fraction": 0.0,
            "boundary_sample_size": 0,
            "class_composition": {},
            "warning": "Boundary diagnostics unavailable.",
        }

    entropy_threshold = float(np.quantile(local_entropy, 0.75))
    ambiguity_threshold = max(0.33, float(np.quantile(local_ambiguity, 0.75)))
    candidate_mask = (local_entropy >= entropy_threshold) | (
        local_ambiguity >= ambiguity_threshold
    )

    linear_preds = probes.get("linear", {}).get("predictions")
    knn_preds = probes.get("knn", {}).get("predictions")
    n_points = candidate_mask.shape[0]
    # Both prediction vectors must line up with the mask; a mismatched
    # vector would make the element-wise comparison fail or broadcast.
    if (
        linear_preds is not None
        and knn_preds is not None
        and len(linear_preds) == n_points
        and len(knn_preds) == n_points
    ):
        disagreement = np.asarray(linear_preds) != np.asarray(knn_preds)
        candidate_mask = candidate_mask | disagreement

    indices = np.flatnonzero(candidate_mask)
    # np.unique on an empty selection yields no classes, so no extra
    # emptiness check is needed.
    found_classes, found_counts = np.unique(y[indices], return_counts=True)
    composition = {
        str(cls): int(count) for cls, count in zip(found_classes, found_counts)
    }
    too_small = indices.shape[0] < max(10, len(np.unique(y)))
    return {
        "candidate_indices": indices.tolist(),
        "candidate_fraction": float(indices.shape[0] / max(1, y.shape[0])),
        "boundary_sample_size": int(indices.shape[0]),
        "class_composition": composition,
        "warning": "Boundary sample is very small." if too_small else None,
    }
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Geometry reliability diagnostics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from scipy import sparse
|
|
9
|
+
from sklearn.decomposition import PCA, TruncatedSVD
|
|
10
|
+
from sklearn.metrics import pairwise_distances
|
|
11
|
+
|
|
12
|
+
from separatix.config import ProfilerConfig
|
|
13
|
+
from separatix.densify import ensure_dense_or_sample
|
|
14
|
+
from separatix.sampling import cap_samples_for_budget
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def compute_geometry_diagnostics(
    X: Any,
    y: np.ndarray,
    *,
    config: ProfilerConfig,
    report_context: dict[str, Any],
) -> dict[str, Any]:
    """Compute cheap geometry reliability metrics.

    The data are first capped with ``cap_samples_for_budget`` (reason
    ``"neighbors"``). Sparse input is embedded with ``TruncatedSVD`` and
    dense input with ``PCA``, using at most 10 components. Pairwise
    distances are computed on a dense copy of at most the first 250 rows.

    Args:
        X: Dense or sparse feature matrix.
        y: Labels, passed along to the sampling and densification helpers.
        config: Profiler configuration; ``random_state`` seeds the
            decomposition.
        report_context: Mutable mapping handed to
            ``ensure_dense_or_sample`` for event and warning bookkeeping.

    Returns:
        A dict of geometry metrics. ``"feature_scale_range_estimate"`` is
        ``None`` for sparse input, and ``"distance_concentration_proxy"``
        is ``None`` when the dense conversion was skipped.
    """
    X_used, y_used, sample_info = cap_samples_for_budget(
        X, y, config=config, reason="neighbors"
    )
    if sparse.issparse(X_used):
        n_features = X_used.shape[1]
        # TruncatedSVD cannot extract more components than there are
        # features, so the "at least 2" floor is clamped to the feature
        # count (matters only for single-feature input).
        n_components = min(10, max(2, n_features - 1), max(1, n_features))
        svd = TruncatedSVD(
            n_components=n_components,
            random_state=config.random_state,
        )
        embedding = svd.fit_transform(X_used)
        explained = svd.explained_variance_ratio_
        feature_scale_range = None
    else:
        dense = np.asarray(X_used)
        pca = PCA(
            n_components=min(10, dense.shape[1], max(1, dense.shape[0] - 1)),
            random_state=config.random_state,
        )
        embedding = pca.fit_transform(dense)
        explained = pca.explained_variance_ratio_
        std = np.std(dense, axis=0)
        # Ratio of the largest to the smallest non-zero feature spread;
        # falls back to 1.0 when every feature is constant.
        positive_std = std[std > 0]
        smallest_std = positive_std.min() if positive_std.size else 1.0
        feature_scale_range = float(std.max() / max(smallest_std, 1e-9))

    dense_for_dist = ensure_dense_or_sample(
        X_used,
        y_used,
        reason="geometry_distance_concentration",
        config=config,
        report_context=report_context,
    )
    if dense_for_dist["skipped"]:
        concentration = None
    else:
        # Only the first 250 rows are used to keep the pairwise distance
        # matrix small.
        sample = dense_for_dist["X"][:250]
        dists = pairwise_distances(sample)
        tri = dists[np.triu_indices_from(dists, k=1)]
        concentration = (
            float((tri.max() - tri.min()) / max(tri.mean(), 1e-9)) if tri.size else 0.0
        )

    # Count components that explain more than an equal share of variance.
    intrinsic_dim = float(np.sum(explained > (1.0 / max(embedding.shape[1], 1))))
    # Exponential of the entropy of the explained-variance ratios.
    effective_rank = float(
        np.exp(-np.sum(explained * np.log(np.clip(explained, 1e-12, None))))
    )
    return {
        "feature_scale_range_estimate": feature_scale_range,
        "effective_rank_estimate": effective_rank,
        "intrinsic_dimension_proxy": intrinsic_dim,
        "distance_concentration_proxy": concentration,
        "high_dimensionality_flag": bool(X.shape[1] > max(100, X.shape[0] // 2)),
        "sample_to_feature_ratio": float(X.shape[0] / max(1, X.shape[1])),
        "sampling": sample_info,
    }
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Graph fragmentation diagnostics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from scipy.sparse.csgraph import connected_components
|
|
9
|
+
from sklearn.neighbors import NearestNeighbors
|
|
10
|
+
|
|
11
|
+
from separatix.config import ProfilerConfig
|
|
12
|
+
from separatix.sampling import cap_samples_for_budget
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def compute_graph_fragmentation(
    X: Any,
    y: np.ndarray,
    boundary: dict[str, Any],
    *,
    config: ProfilerConfig,
) -> dict[str, Any]:
    """Measure how fragmented the boundary candidates are as a k-NN graph.

    The rows listed in ``boundary["candidate_indices"]`` are capped with
    ``cap_samples_for_budget`` (reason ``"boundary"``), linked into a
    symmetrised nearest-neighbour connectivity graph, and summarised by
    their connected components.

    Args:
        X: Feature matrix, indexed with the candidate positions.
        y: Labels, indexed with the candidate positions.
        boundary: Mapping that may hold ``"candidate_indices"``.
        config: Profiler configuration forwarded to the sampling helper.

    Returns:
        A dict of component statistics and a fragmentation score in
        ``[0, 1]``. With fewer than three candidates a fixed result with a
        ``"warning"`` message is returned instead.
    """
    candidate_idx = np.asarray(boundary.get("candidate_indices", []), dtype=int)
    if candidate_idx.size < 3:
        has_candidates = bool(candidate_idx.size)
        return {
            "component_count": 0,
            "largest_component_fraction": 1.0 if has_candidates else 0.0,
            "component_size_entropy": 0.0,
            "small_component_count": 0,
            "cross_class_edge_density": 0.0,
            "graph_fragmentation_score": 0.0,
            "warning": "Not enough boundary candidates for graph diagnostics.",
        }

    X_sub, y_sub, sample_info = cap_samples_for_budget(
        X[candidate_idx], y[candidate_idx], config=config, reason="boundary"
    )
    n_points = len(y_sub)

    # One extra neighbour is requested because every query point is also
    # part of the fitted data.
    k = min(n_points - 1, 10)
    knn = NearestNeighbors(n_neighbors=k + 1)
    knn.fit(X_sub)
    adjacency = knn.kneighbors_graph(X_sub, mode="connectivity")
    adjacency = adjacency.maximum(adjacency.T)

    n_components, membership = connected_components(adjacency, directed=False)
    sizes = np.bincount(membership)
    total = max(1, sizes.sum())
    shares = sizes / total
    raw_entropy = -np.sum(np.where(shares > 0, shares * np.log(shares), 0.0))
    entropy = float(raw_entropy / max(np.log(max(2, len(sizes))), 1e-9))

    # Each undirected edge is counted once via its upper-triangle entry.
    src, dst = adjacency.nonzero()
    upper = src < dst
    if np.any(upper):
        cross_class = np.mean(y_sub[src[upper]] != y_sub[dst[upper]])
    else:
        cross_class = 0.0

    largest = float(sizes.max() / total)
    component_count_scaled = min(1.0, n_components / max(3, n_points / 20))
    score_parts = [1.0 - largest, component_count_scaled, entropy]
    fragmentation = float(np.clip(np.mean(score_parts), 0.0, 1.0))

    return {
        "component_count": int(n_components),
        "largest_component_fraction": largest,
        "component_size_entropy": entropy,
        "small_component_count": int(np.sum(sizes <= 3)),
        "cross_class_edge_density": float(cross_class),
        "graph_fragmentation_score": fragmentation,
        "sampling": sample_info,
    }
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Neighborhood diagnostics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from scipy import sparse
|
|
9
|
+
from sklearn.neighbors import NearestNeighbors
|
|
10
|
+
|
|
11
|
+
from separatix.config import ProfilerConfig
|
|
12
|
+
from separatix.densify import ensure_dense_or_sample
|
|
13
|
+
from separatix.sampling import cap_samples_for_budget
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def compute_neighborhood_diagnostics(
    X: Any,
    y: np.ndarray,
    *,
    config: ProfilerConfig,
    report_context: dict[str, Any],
) -> dict[str, Any]:
    """Compute neighborhood overlap and ambiguity diagnostics.

    The data are capped with ``cap_samples_for_budget`` (reason
    ``"neighbors"``). For every point, its ``k`` nearest neighbours
    (``k`` between 3 and 15, following the square root of the sample
    count) give a normalised label entropy, an ambiguity score, a
    same-class fraction and the distance to the nearest differently
    labelled neighbour.

    Args:
        X: Dense or sparse feature matrix.
        y: Labels; used with ``np.bincount``, so they are assumed to be
            non-negative integers (NOTE(review): confirm against caller).
        config: Profiler configuration forwarded to the helpers.
        report_context: Mutable mapping handed to
            ``ensure_dense_or_sample`` if the sparse fit falls back to a
            dense conversion.

    Returns:
        A dict of aggregate metrics plus the per-point ``"local_entropy"``
        and ``"local_ambiguity"`` lists. When fewer than two samples are
        usable only fixed aggregates are returned; when the dense
        fallback is skipped only ``"sampling"`` and ``"skipped_reason"``
        are returned.
    """
    X_used, y_used, sample_info = cap_samples_for_budget(
        X, y, config=config, reason="neighbors"
    )

    def degenerate() -> dict[str, Any]:
        # Result used when there are too few points to have any neighbour.
        return {
            "mean_local_entropy": 0.0,
            "high_entropy_fraction": 0.0,
            "same_class_neighbor_fraction": 1.0,
            "cross_class_neighbor_fraction": 0.0,
            "mean_local_ambiguity": 0.0,
            "sampling": sample_info,
        }

    k = min(len(y_used) - 1, min(15, max(3, int(np.sqrt(len(y_used))))))
    if k < 1:
        return degenerate()

    X_fit = X_used
    if sparse.issparse(X_used):
        try:
            nn = NearestNeighbors(n_neighbors=k)
            nn.fit(X_fit)
        except TypeError:
            # Sparse fit was rejected: retry on a dense (possibly
            # subsampled) copy.
            dense_info = ensure_dense_or_sample(
                X_used,
                y_used,
                reason="neighborhood_diagnostics",
                config=config,
                report_context=report_context,
            )
            if dense_info["skipped"]:
                return {
                    "sampling": sample_info,
                    "skipped_reason": "dense conversion unavailable",
                }
            X_fit = dense_info["X"]
            y_used = dense_info["y"]
            # The dense copy may hold fewer rows; shrink k so the k + 1
            # query below never asks for more neighbours than exist.
            k = min(k, len(y_used) - 1)
            if k < 1:
                return degenerate()
            nn = NearestNeighbors(n_neighbors=k)
            nn.fit(X_fit)
    else:
        nn = NearestNeighbors(n_neighbors=k)
        nn.fit(X_fit)

    # A single query returns distances and indices together; column 0 is
    # dropped because each point is queried against data that contains it.
    distances, neighbor_idx = nn.kneighbors(
        X_fit, n_neighbors=k + 1, return_distance=True
    )
    neighbor_idx = neighbor_idx[:, 1:]
    neighbor_dist = distances[:, 1:]

    # Loop-invariant: number of distinct labels in the working sample.
    n_classes = len(np.unique(y_used))

    entropies = []
    ambiguities = []
    same_class = []
    enemy_distances = []
    for row_i, neigh in enumerate(neighbor_idx):
        neigh_labels = y_used[neigh]
        counts = np.bincount(neigh_labels, minlength=n_classes)
        probs = counts / max(1, counts.sum())
        positive_probs = probs[probs > 0]
        ent = -np.sum(positive_probs * np.log(positive_probs))
        # Normalise by the largest entropy possible for this many classes.
        entropies.append(float(ent / max(np.log(max(2, len(probs))), 1e-9)))
        ambiguities.append(float(1.0 - probs.max()))
        same_class.append(float(np.mean(neigh_labels == y_used[row_i])))
        enemy_mask = neigh_labels != y_used[row_i]
        if np.any(enemy_mask):
            enemy_distances.append(float(np.min(neighbor_dist[row_i][enemy_mask])))

    entropies_arr = np.array(entropies)
    return {
        "mean_local_entropy": float(entropies_arr.mean()),
        "high_entropy_fraction": float(np.mean(entropies_arr >= 0.5)),
        "same_class_neighbor_fraction": float(np.mean(same_class)),
        "cross_class_neighbor_fraction": float(1.0 - np.mean(same_class)),
        "nearest_enemy_distance_estimate": float(np.mean(enemy_distances))
        if enemy_distances
        else None,
        "mean_local_ambiguity": float(np.mean(ambiguities)),
        "local_entropy": entropies,
        "local_ambiguity": ambiguities,
        "sampling": sample_info,
    }
|