separatix 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
separatix/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Public package exports for separatix."""
2
+
3
+ from separatix.api import diagnose
4
+ from separatix.config import ProfilerConfig
5
+ from separatix.profiler import ComplexityProfiler
6
+ from separatix.report import DiagnosticReport
7
+
8
+ __all__ = ["ComplexityProfiler", "DiagnosticReport", "ProfilerConfig", "diagnose"]
separatix/api.py ADDED
@@ -0,0 +1,37 @@
1
+ """Functional API for separatix."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Literal
6
+
7
+ from separatix.profiler import ComplexityProfiler
8
+ from separatix.report import DiagnosticReport
9
+
10
+
11
def diagnose(
    X: Any,
    y: Any,
    *,
    return_report: bool = False,
    budget: Literal["fast", "standard", "extended"] = "standard",
    topology: Literal["off", "auto", "graph", "persistent"] = "auto",
    densify_policy: Literal["fail", "warn_and_sample", "skip"] = "warn_and_sample",
    max_dense_mb: int = 512,
    max_samples: int | None = None,
    random_state: int | None = None,
    warn_on_densify: bool = True,
) -> str | DiagnosticReport:
    """Fit a one-off profiler on ``(X, y)`` and return its verdict.

    Args:
        X: Feature data, handed unchanged to ``ComplexityProfiler.fit``.
        y: Labels, handed unchanged to ``ComplexityProfiler.fit``.
        return_report: When true, return the report object itself; otherwise
            return only its ``recommendation_text``.
        budget: Forwarded to the ``ComplexityProfiler`` constructor.
        topology: Forwarded to the ``ComplexityProfiler`` constructor.
        densify_policy: Forwarded to the ``ComplexityProfiler`` constructor.
        max_dense_mb: Forwarded to the ``ComplexityProfiler`` constructor.
        max_samples: Forwarded to the ``ComplexityProfiler`` constructor.
        random_state: Forwarded to the ``ComplexityProfiler`` constructor.
        warn_on_densify: Forwarded to the ``ComplexityProfiler`` constructor.

    Returns:
        The fitted profiler's report, or that report's recommendation text.

    Raises:
        RuntimeError: If the fitted profiler's ``report_`` is ``None``.
    """
    profiler_options = {
        "budget": budget,
        "topology": topology,
        "densify_policy": densify_policy,
        "max_dense_mb": max_dense_mb,
        "max_samples": max_samples,
        "random_state": random_state,
        "warn_on_densify": warn_on_densify,
    }
    fitted = ComplexityProfiler(**profiler_options).fit(X, y)
    report = fitted.report_
    if report is None:
        raise RuntimeError("Profiler did not produce a report.")
    if return_report:
        return report
    return report.recommendation_text
separatix/config.py ADDED
@@ -0,0 +1,42 @@
1
+ """Configuration objects for separatix."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import asdict, dataclass
6
+ from typing import Literal
7
+
8
+ from separatix.constants import BUDGETS
9
+
10
+
11
@dataclass
class ProfilerConfig:
    """Settings bundle for the separatix diagnostic profiler.

    Values are validated once, immediately after construction; the first
    failing check raises ``ValueError``.
    """

    # Budget preset name; must be a key of BUDGETS.
    budget: Literal["fast", "standard", "extended"] = "standard"
    # Topology mode; one of "off", "auto", "graph", "persistent".
    topology: Literal["off", "auto", "graph", "persistent"] = "auto"
    # Densification policy; one of "fail", "warn_and_sample", "skip".
    densify_policy: Literal["fail", "warn_and_sample", "skip"] = "warn_and_sample"
    # Must be strictly positive.
    max_dense_mb: int = 512
    # Optional; strictly positive when given.
    max_samples: int | None = None
    # Must be strictly positive.
    min_dense_samples: int = 200
    # The remaining fields are not validated by this class.
    random_state: int | None = None
    warn_on_densify: bool = True
    n_jobs: int | None = None

    def __post_init__(self) -> None:
        """Raise ``ValueError`` describing the first invalid setting, if any."""
        complaint = self._first_violation()
        if complaint is not None:
            raise ValueError(complaint)

    def _first_violation(self) -> str | None:
        """Return the message for the first failing check, or ``None``."""
        if self.budget not in BUDGETS:
            return f"Unsupported budget: {self.budget!r}"
        if self.topology not in {"off", "auto", "graph", "persistent"}:
            return f"Unsupported topology mode: {self.topology!r}"
        if self.densify_policy not in {"fail", "warn_and_sample", "skip"}:
            return f"Unsupported densify policy: {self.densify_policy!r}"
        if self.max_dense_mb <= 0:
            return "max_dense_mb must be positive."
        if self.max_samples is not None and self.max_samples <= 0:
            return "max_samples must be positive when provided."
        if self.min_dense_samples <= 0:
            return "min_dense_samples must be positive."
        return None

    def to_dict(self) -> dict[str, object]:
        """Return the configuration fields as a plain dictionary."""
        snapshot: dict[str, object] = asdict(self)
        return snapshot
separatix/constants.py ADDED
@@ -0,0 +1,57 @@
1
+ """Constants used across the separatix package."""
2
+
3
+ from __future__ import annotations
4
+
5
# Machine-readable recommendation codes.
LINEAR_LIKELY_SUFFICIENT = "linear_likely_sufficient"
SMOOTH_NONLINEAR_RECOMMENDED = "smooth_nonlinear_recommended"
KERNEL_OR_LOCAL_RECOMMENDED = "kernel_or_local_recommended"
HIGH_CAPACITY_OR_PARTITIONING_RECOMMENDED = "high_capacity_or_partitioning_recommended"
FEATURE_OR_LABEL_BOTTLENECK_LIKELY = "feature_or_label_bottleneck_likely"
INSUFFICIENT_DATA_OR_UNRELIABLE_GEOMETRY = "insufficient_data_or_unreliable_geometry"
INCONCLUSIVE = "inconclusive"

# (code, sentence) pairs, kept in declaration order.
_RECOMMENDATION_PAIRS = (
    (LINEAR_LIKELY_SUFFICIENT, "Linear model likely sufficient."),
    (SMOOTH_NONLINEAR_RECOMMENDED, "Smooth nonlinear model likely useful."),
    (KERNEL_OR_LOCAL_RECOMMENDED, "Kernel or local model likely useful."),
    (
        HIGH_CAPACITY_OR_PARTITIONING_RECOMMENDED,
        "Higher-capacity or partitioning model likely useful.",
    ),
    (FEATURE_OR_LABEL_BOTTLENECK_LIKELY, "Feature or label bottleneck likely."),
    (
        INSUFFICIENT_DATA_OR_UNRELIABLE_GEOMETRY,
        "Insufficient data or unreliable geometry.",
    ),
    (INCONCLUSIVE, "Diagnostic result is inconclusive."),
)

# Human-readable sentence for each recommendation code.
RECOMMENDATION_LABELS = dict(_RECOMMENDATION_PAIRS)
26
+
27
# Per-budget presets, keyed by budget name. Each preset carries the same
# seven settings; "run_persistent_topology" is either False or "auto".
BUDGETS = {
    "fast": dict(
        max_probe_samples=5000,
        max_neighbor_samples=5000,
        max_boundary_samples=2000,
        cv_folds=3,
        bootstrap_repeats=0,
        run_kernel_probe=False,
        run_persistent_topology=False,
    ),
    "standard": dict(
        max_probe_samples=20000,
        max_neighbor_samples=10000,
        max_boundary_samples=3000,
        cv_folds=5,
        bootstrap_repeats=3,
        run_kernel_probe=True,
        run_persistent_topology="auto",
    ),
    "extended": dict(
        max_probe_samples=50000,
        max_neighbor_samples=20000,
        max_boundary_samples=5000,
        cv_folds=5,
        bootstrap_repeats=10,
        run_kernel_probe=True,
        run_persistent_topology="auto",
    ),
}

# Confidence labels, listed from lowest to highest.
CONFIDENCE_LEVELS = ("low", "medium", "high")
separatix/densify.py ADDED
@@ -0,0 +1,106 @@
1
+ """Dense conversion helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from math import floor
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ from scipy import sparse
10
+
11
+ from separatix.config import ProfilerConfig
12
+ from separatix.exceptions import DensificationError, DensificationWarning
13
+ from separatix.sampling import stratified_subsample_indices
14
+ from separatix.utils.warnings import record_warning
15
+
16
+
17
def ensure_dense_or_sample(
    X: Any,
    y: np.ndarray,
    *,
    reason: str,
    config: ProfilerConfig,
    report_context: dict[str, Any],
) -> dict[str, Any]:
    """Produce a dense matrix for ``X``, subsampling rows when it is too large.

    Non-sparse input is passed through ``np.asarray``. Sparse input is
    converted in full when its estimated dense size fits within
    ``config.max_dense_mb``. Otherwise ``config.densify_policy`` decides:
    ``"fail"`` raises, ``"skip"`` records a skipped diagnostic, and any other
    value attempts a stratified row subsample that fits the budget.

    Args:
        X: Dense array-like or SciPy sparse matrix.
        y: Labels aligned with the rows of ``X``.
        reason: Name of the requesting diagnostic; used in events, warnings
            and error messages.
        config: Supplies ``max_dense_mb``, ``densify_policy``, ``max_samples``,
            ``min_dense_samples``, ``random_state`` and ``warn_on_densify``.
        report_context: Mutable mapping; the lists under
            ``"densification_events"``, ``"warnings"`` and
            ``"skipped_diagnostics"`` are created if missing and appended to.

    Returns:
        A dict with keys ``"X"`` (dense array or ``None``), ``"y"`` (labels
        matching the returned rows), ``"performed"`` and ``"skipped"``.

    Raises:
        DensificationError: When the policy is ``"fail"`` and the full dense
            matrix would exceed the budget, or when the subsample is too small
            and the policy is not ``"warn_and_sample"``.
    """
    # The bookkeeping lists are created even when X turns out to be dense.
    events = report_context.setdefault("densification_events", [])
    warning_log = report_context.setdefault("warnings", [])
    skipped_log = report_context.setdefault("skipped_diagnostics", [])

    if not sparse.issparse(X):
        return {"X": np.asarray(X), "y": y, "performed": False, "skipped": False}

    n_rows = X.shape[0]
    n_cols = X.shape[1]
    dtype = np.dtype(float) if X.dtype is None else X.dtype
    bytes_per_value = np.dtype(dtype).itemsize
    estimated_mb = n_rows * n_cols * bytes_per_value / 1024**2
    policy = config.densify_policy

    event = {
        "operation": "densify",
        "reason": reason,
        "input_shape": [int(n_rows), int(n_cols)],
        "estimated_full_dense_mb": float(estimated_mb),
        "max_dense_mb": config.max_dense_mb,
        "policy": policy,
        "sampling_used": False,
        "n_original": int(n_rows),
        "n_used": int(n_rows),
        "status": "performed",
    }

    # Case 1: the whole matrix fits inside the budget.
    if estimated_mb <= config.max_dense_mb:
        full_dense = X.toarray()
        events.append(event)
        if config.warn_on_densify:
            record_warning(
                f"Sparse input densified for {reason}.",
                warning_log,
                DensificationWarning,
            )
        return {"X": full_dense, "y": y, "performed": True, "skipped": False}

    # Case 2: over budget and the policy forbids sampling.
    if policy == "fail":
        raise DensificationError(
            f"Dense conversion for {reason} would exceed "
            f"max_dense_mb={config.max_dense_mb}."
        )

    if policy == "skip":
        event["status"] = "skipped"
        events.append(event)
        skipped_log.append(
            {
                "name": reason,
                "reason": "dense conversion exceeds configured memory budget",
            }
        )
        return {"X": None, "y": y, "performed": False, "skipped": True}

    # Case 3: over budget, try a row subsample that fits.
    row_budget = floor(
        (config.max_dense_mb * 1024**2) / (n_cols * bytes_per_value)
    )
    requested_rows = config.max_samples or n_rows
    n_used = min(n_rows, row_budget, requested_rows)

    if n_used < min(config.min_dense_samples, n_rows):
        event["status"] = "skipped_too_small"
        event["n_used"] = int(max(n_used, 0))
        events.append(event)
        skipped_log.append(
            {"name": reason, "reason": "dense subsample would be too small"}
        )
        if policy != "warn_and_sample":
            raise DensificationError(
                f"Unable to densify enough samples for {reason}."
            )
        return {"X": None, "y": y, "performed": False, "skipped": True}

    chosen_rows = stratified_subsample_indices(
        y,
        n_samples=n_used,
        random_state=config.random_state,
    )
    sampled_dense = X[chosen_rows, :].toarray()
    event.update(
        {
            "sampling_used": True,
            "n_used": int(chosen_rows.shape[0]),
            "status": "performed_on_subsample",
        }
    )
    events.append(event)
    if config.warn_on_densify:
        record_warning(
            f"Sparse input was stratified-subsampled then densified for {reason}.",
            warning_log,
            DensificationWarning,
        )
    return {
        "X": sampled_dense,
        "y": y[chosen_rows],
        "performed": True,
        "skipped": False,
    }
@@ -0,0 +1,13 @@
1
+ """Custom exceptions and warnings for separatix."""
2
+
3
+
4
class SeparatixError(Exception):
    """Root of the separatix exception hierarchy."""
6
+
7
+
8
class DensificationError(SeparatixError):
    """Signals that a required dense conversion is forbidden or not feasible."""
10
+
11
+
12
class DensificationWarning(UserWarning):
    """Warning category for sparse input being densified or subsampled."""
@@ -0,0 +1 @@
1
+ """Diagnostic metric modules."""
@@ -0,0 +1,61 @@
1
+ """Dataset audit metrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+
9
+
10
def compute_dataset_audit(
    X: Any,
    y: np.ndarray,
    *,
    classes: np.ndarray,
    is_sparse: bool,
) -> dict[str, Any]:
    """Compute cheap dataset audit statistics.

    Args:
        X: Feature matrix. When ``is_sparse`` is true it must expose ``nnz``,
            ``shape`` and ``dtype``; otherwise ``shape``, ``dtype`` and
            ``nbytes`` are read and it is passed to ``np.nanstd``.
        y: Label codes. Each distinct value is used as an index into
            ``classes`` (assumes integer codes -- TODO confirm with caller).
        classes: Original class labels, indexed by label code.
        is_sparse: Selects the sparse or the dense set of extra statistics.

    Returns:
        A dict with sample/feature/class counts, per-class counts and
        proportions keyed by ``str(class label)``, the imbalance ratio, and
        either sparse statistics (``nnz``, ``density``, ``sparsity_fraction``)
        or the dense ``constant_feature_fraction``, plus ``dtype`` and
        ``estimated_dense_memory_mb``.

    Empty input is handled without raising: with no labels the class mappings
    are empty and ``imbalance_ratio`` is reported as ``1.0``; a dense matrix
    with no rows or no columns reports ``constant_feature_fraction`` as
    ``0.0``.
    """
    class_ids, counts = np.unique(y, return_counts=True)
    if counts.size:
        proportions = counts / counts.sum()
        imbalance_ratio = float(counts.max() / max(1, counts.min()))
    else:
        # max()/min() of a zero-size array raise, so report a neutral ratio.
        proportions = counts.astype(float)
        imbalance_ratio = 1.0

    n_rows, n_cols = int(X.shape[0]), int(X.shape[1])
    result: dict[str, Any] = {
        "n_samples": n_rows,
        "n_features": n_cols,
        "n_classes": int(classes.shape[0]),
        "class_counts": {
            str(classes[code]): int(count) for code, count in zip(class_ids, counts)
        },
        "class_proportions": {
            str(classes[code]): float(share)
            for code, share in zip(class_ids, proportions)
        },
        "imbalance_ratio": imbalance_ratio,
        "is_sparse": is_sparse,
    }

    if is_sparse:
        density = float(X.nnz / (n_rows * n_cols)) if n_rows and n_cols else 0.0
        result.update(
            {
                "nnz": int(X.nnz),
                "density": density,
                "sparsity_fraction": float(1.0 - density),
                "estimated_dense_memory_mb": float(
                    n_rows * n_cols * X.dtype.itemsize / 1024**2
                ),
                "dtype": str(X.dtype),
            }
        )
    else:
        if n_rows and n_cols:
            constant_fraction = float(np.mean(np.nanstd(X, axis=0) == 0.0))
        else:
            # Reducing over an empty axis would warn and produce NaN.
            constant_fraction = 0.0
        result.update(
            {
                "dtype": str(X.dtype),
                "constant_feature_fraction": constant_fraction,
                "estimated_dense_memory_mb": float(X.nbytes / 1024**2),
            }
        )
    return result
@@ -0,0 +1,21 @@
1
+ """Baseline metric helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+
8
def summarize_probe_family(probes: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Pick the probe with the highest balanced accuracy.

    Only probes whose result contains a ``"balanced_accuracy"`` entry are
    considered. On ties the probe that appears first in ``probes`` wins.

    Returns:
        ``{"best_probe": name, "best_probe_score": score}``; both values are
        ``None`` when no probe reports a balanced accuracy.
    """
    scores: dict[str, Any] = {}
    for probe_name, probe_result in probes.items():
        if "balanced_accuracy" in probe_result:
            scores[probe_name] = probe_result["balanced_accuracy"]

    winner = max(scores, key=scores.__getitem__) if scores else None
    top_score = None if winner is None else scores.get(winner)
    return {"best_probe": winner, "best_probe_score": top_score}
@@ -0,0 +1,54 @@
1
+ """Boundary candidate diagnostics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+
9
+
10
def compute_boundary_candidates(
    y: np.ndarray,
    neighborhood: dict[str, Any],
    probes: dict[str, dict[str, Any]],
) -> dict[str, Any]:
    """Extract boundary candidate indices from ambiguity and disagreement.

    A sample is a candidate when its local entropy reaches the 75th
    percentile of all local entropies, or its local ambiguity reaches
    ``max(0.33, 75th percentile)``, or the ``"linear"`` and ``"knn"`` probes
    predict different labels for it.

    Args:
        y: Labels. Candidate positions are used to index ``y`` directly, so
            ``y`` is assumed to be aligned row-for-row with the neighborhood
            arrays. NOTE(review): confirm this holds when the neighborhood
            diagnostics were computed on a subsample.
        neighborhood: Mapping that may contain per-sample ``"local_entropy"``
            and ``"local_ambiguity"`` sequences.
        probes: Mapping of probe name to result; ``"predictions"`` of the
            ``"linear"`` and ``"knn"`` entries are used when both are present.

    Returns:
        A dict with ``candidate_indices``, ``candidate_fraction``,
        ``boundary_sample_size``, ``class_composition`` (counts keyed by
        ``str(label)``) and ``warning`` (a message or ``None``).
    """
    local_entropy = np.asarray(neighborhood.get("local_entropy", []), dtype=float)
    local_ambiguity = np.asarray(neighborhood.get("local_ambiguity", []), dtype=float)
    if local_entropy.size == 0 or local_ambiguity.size == 0:
        return {
            "candidate_indices": [],
            "candidate_fraction": 0.0,
            "boundary_sample_size": 0,
            "class_composition": {},
            "warning": "Boundary diagnostics unavailable.",
        }

    entropy_threshold = float(np.quantile(local_entropy, 0.75))
    ambiguity_threshold = max(0.33, float(np.quantile(local_ambiguity, 0.75)))
    candidate_mask = (local_entropy >= entropy_threshold) | (
        local_ambiguity >= ambiguity_threshold
    )

    linear_preds = probes.get("linear", {}).get("predictions")
    knn_preds = probes.get("knn", {}).get("predictions")
    if linear_preds is not None and knn_preds is not None:
        linear_arr = np.asarray(linear_preds)
        knn_arr = np.asarray(knn_preds)
        # Both prediction arrays must line up with the mask. Comparing arrays
        # of different lengths either raises or collapses to a single boolean
        # (depending on the NumPy version), so mismatched probes are ignored.
        if linear_arr.shape == knn_arr.shape == candidate_mask.shape:
            candidate_mask = candidate_mask | (linear_arr != knn_arr)

    indices = np.flatnonzero(candidate_mask)
    present, present_counts = np.unique(y[indices], return_counts=True)
    composition = {
        str(label): int(count) for label, count in zip(present, present_counts)
    }
    n_candidates = int(indices.shape[0])
    too_small = n_candidates < max(10, len(np.unique(y)))
    return {
        "candidate_indices": indices.tolist(),
        "candidate_fraction": float(n_candidates / max(1, y.shape[0])),
        "boundary_sample_size": n_candidates,
        "class_composition": composition,
        "warning": "Boundary sample is very small." if too_small else None,
    }
@@ -0,0 +1,80 @@
1
+ """Geometry reliability diagnostics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ from scipy import sparse
9
+ from sklearn.decomposition import PCA, TruncatedSVD
10
+ from sklearn.metrics import pairwise_distances
11
+
12
+ from separatix.config import ProfilerConfig
13
+ from separatix.densify import ensure_dense_or_sample
14
+ from separatix.sampling import cap_samples_for_budget
15
+
16
+
17
def compute_geometry_diagnostics(
    X: Any,
    y: np.ndarray,
    *,
    config: ProfilerConfig,
    report_context: dict[str, Any],
) -> dict[str, Any]:
    """Compute cheap geometry reliability metrics.

    The data are first capped with ``cap_samples_for_budget`` (reason
    ``"neighbors"``). Sparse input is projected with ``TruncatedSVD``; dense
    input with ``PCA``. A distance-spread statistic is computed on at most the
    first 250 rows of a dense copy obtained via ``ensure_dense_or_sample``.

    Args:
        X: Dense array-like or SciPy sparse feature matrix.
        y: Labels aligned with the rows of ``X``.
        config: Profiler configuration; ``random_state`` seeds the projection.
        report_context: Mutable report state passed on to
            ``ensure_dense_or_sample``.

    Returns:
        A dict with ``feature_scale_range_estimate`` (``None`` for sparse
        input), ``effective_rank_estimate``, ``intrinsic_dimension_proxy``,
        ``distance_concentration_proxy`` (``None`` when densification was
        skipped), ``high_dimensionality_flag``, ``sample_to_feature_ratio``
        and ``sampling``.
    """
    X_used, y_used, sample_info = cap_samples_for_budget(
        X, y, config=config, reason="neighbors"
    )
    if sparse.issparse(X_used):
        n_features = X_used.shape[1]
        # TruncatedSVD rejects n_components greater than the feature count, so
        # the floor of 2 components must not exceed n_features (1-column input).
        svd = TruncatedSVD(
            n_components=min(10, max(2, n_features - 1), n_features),
            random_state=config.random_state,
        )
        embedding = svd.fit_transform(X_used)
        explained = svd.explained_variance_ratio_
        feature_scale_range = None
    else:
        dense = np.asarray(X_used)
        pca = PCA(
            n_components=min(10, dense.shape[1], max(1, dense.shape[0] - 1)),
            random_state=config.random_state,
        )
        embedding = pca.fit_transform(dense)
        explained = pca.explained_variance_ratio_
        # Ratio of the largest per-feature std to the smallest non-zero one.
        std = np.std(dense, axis=0)
        smallest_positive_std = std[std > 0].min() if np.any(std > 0) else 1.0
        feature_scale_range = float(std.max() / max(smallest_positive_std, 1e-9))

    dense_for_dist = ensure_dense_or_sample(
        X_used,
        y_used,
        reason="geometry_distance_concentration",
        config=config,
        report_context=report_context,
    )
    if dense_for_dist["skipped"]:
        concentration = None
    else:
        # Pairwise distances are quadratic in rows, so only the first 250 rows
        # are used.
        sample = dense_for_dist["X"]
        if sample.shape[0] > 250:
            sample = sample[:250]
        dists = pairwise_distances(sample)
        tri = dists[np.triu_indices_from(dists, k=1)]
        concentration = (
            float((tri.max() - tri.min()) / max(tri.mean(), 1e-9)) if tri.size else 0.0
        )

    # Count components explaining more than an equal share of the variance.
    intrinsic_dim = float(np.sum(explained > (1.0 / max(embedding.shape[1], 1))))
    # Exponential of the entropy of the explained-variance ratios.
    effective_rank = float(
        np.exp(-np.sum(explained * np.log(np.clip(explained, 1e-12, None))))
    )
    return {
        "feature_scale_range_estimate": feature_scale_range,
        "effective_rank_estimate": effective_rank,
        "intrinsic_dimension_proxy": intrinsic_dim,
        "distance_concentration_proxy": concentration,
        "high_dimensionality_flag": bool(X.shape[1] > max(100, X.shape[0] // 2)),
        "sample_to_feature_ratio": float(X.shape[0] / max(1, X.shape[1])),
        "sampling": sample_info,
    }
@@ -0,0 +1,72 @@
1
+ """Graph fragmentation diagnostics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ from scipy.sparse.csgraph import connected_components
9
+ from sklearn.neighbors import NearestNeighbors
10
+
11
+ from separatix.config import ProfilerConfig
12
+ from separatix.sampling import cap_samples_for_budget
13
+
14
+
15
def compute_graph_fragmentation(
    X: Any,
    y: np.ndarray,
    boundary: dict[str, Any],
    *,
    config: ProfilerConfig,
) -> dict[str, Any]:
    """Measure how fragmented the neighbor graph over boundary candidates is.

    Rows listed in ``boundary["candidate_indices"]`` are selected from ``X``
    and ``y``, capped with ``cap_samples_for_budget`` (reason ``"boundary"``),
    and linked into a symmetrised nearest-neighbor connectivity graph whose
    connected components are then summarised.

    Args:
        X: Feature matrix, indexable by an integer array.
        y: Labels aligned with the rows of ``X``.
        boundary: Mapping that may contain ``"candidate_indices"``.
        config: Profiler configuration passed to ``cap_samples_for_budget``.

    Returns:
        A dict with ``component_count``, ``largest_component_fraction``,
        ``component_size_entropy``, ``small_component_count``,
        ``cross_class_edge_density`` and ``graph_fragmentation_score``. With
        fewer than three candidates a zeroed result carrying a ``warning`` is
        returned; otherwise the result also carries ``sampling``.
    """
    candidate_rows = np.asarray(boundary.get("candidate_indices", []), dtype=int)
    if candidate_rows.size < 3:
        return {
            "component_count": 0,
            "largest_component_fraction": 1.0 if candidate_rows.size else 0.0,
            "component_size_entropy": 0.0,
            "small_component_count": 0,
            "cross_class_edge_density": 0.0,
            "graph_fragmentation_score": 0.0,
            "warning": "Not enough boundary candidates for graph diagnostics.",
        }

    X_sub, y_sub, sample_info = cap_samples_for_budget(
        X[candidate_rows], y[candidate_rows], config=config, reason="boundary"
    )

    # One extra neighbor is requested on top of the (at most 10) wanted ones.
    neighbor_count = min(len(y_sub) - 1, 10) + 1
    knn = NearestNeighbors(n_neighbors=neighbor_count)
    knn.fit(X_sub)
    adjacency = knn.kneighbors_graph(X_sub, mode="connectivity")
    adjacency = adjacency.maximum(adjacency.T)

    n_components, membership = connected_components(adjacency, directed=False)
    sizes = np.bincount(membership)
    total_nodes = max(1, sizes.sum())
    shares = sizes / total_nodes
    raw_entropy = -np.sum(np.where(shares > 0, shares * np.log(shares), 0.0))
    entropy = float(raw_entropy / max(np.log(max(2, len(sizes))), 1e-9))

    # Look at each undirected edge once by keeping only source < target.
    src, dst = adjacency.nonzero()
    upper = src < dst
    if np.any(upper):
        cross_class = np.mean(y_sub[src[upper]] != y_sub[dst[upper]])
    else:
        cross_class = 0.0

    largest = float(sizes.max() / total_nodes)
    component_count_scaled = min(1.0, n_components / max(3, len(y_sub) / 20))
    blended = np.mean([1.0 - largest, component_count_scaled, entropy])
    fragmentation = float(np.clip(blended, 0.0, 1.0))

    return {
        "component_count": int(n_components),
        "largest_component_fraction": largest,
        "component_size_entropy": entropy,
        "small_component_count": int(np.sum(sizes <= 3)),
        "cross_class_edge_density": float(cross_class),
        "graph_fragmentation_score": fragmentation,
        "sampling": sample_info,
    }
@@ -0,0 +1,96 @@
1
+ """Neighborhood diagnostics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ from scipy import sparse
9
+ from sklearn.neighbors import NearestNeighbors
10
+
11
+ from separatix.config import ProfilerConfig
12
+ from separatix.densify import ensure_dense_or_sample
13
+ from separatix.sampling import cap_samples_for_budget
14
+
15
+
16
def _trivial_neighborhood(sample_info: Any) -> dict[str, Any]:
    """Return the fixed result used when too few samples exist for neighbors."""
    return {
        "mean_local_entropy": 0.0,
        "high_entropy_fraction": 0.0,
        "same_class_neighbor_fraction": 1.0,
        "cross_class_neighbor_fraction": 0.0,
        "mean_local_ambiguity": 0.0,
        "sampling": sample_info,
    }


def compute_neighborhood_diagnostics(
    X: Any,
    y: np.ndarray,
    *,
    config: ProfilerConfig,
    report_context: dict[str, Any],
) -> dict[str, Any]:
    """Compute neighborhood overlap and ambiguity diagnostics.

    The data are capped with ``cap_samples_for_budget`` (reason
    ``"neighbors"``). For every remaining sample the labels of its ``k``
    nearest neighbors are summarised, with
    ``k = min(n - 1, 15, max(3, floor(sqrt(n))))``.

    Args:
        X: Dense array-like or SciPy sparse feature matrix.
        y: Non-negative integer label codes aligned with the rows of ``X``
            (they are passed to ``np.bincount``).
        config: Profiler configuration.
        report_context: Mutable report state passed on to
            ``ensure_dense_or_sample`` if a dense fallback is needed.

    Returns:
        A dict of aggregate statistics plus the per-sample lists
        ``local_entropy`` and ``local_ambiguity`` and the ``sampling`` info.
        With fewer than two usable samples a fixed trivial result without the
        per-sample lists is returned. If a required dense fallback is skipped,
        only ``sampling`` and ``skipped_reason`` are returned.
    """
    X_used, y_used, sample_info = cap_samples_for_budget(
        X, y, config=config, reason="neighbors"
    )
    k = min(len(y_used) - 1, min(15, max(3, int(np.sqrt(len(y_used))))))
    if k < 1:
        return _trivial_neighborhood(sample_info)

    X_fit = X_used
    if sparse.issparse(X_used):
        try:
            nn = NearestNeighbors(n_neighbors=k)
            nn.fit(X_fit)
        except TypeError:
            # Fitting on the sparse matrix failed; fall back to a dense copy.
            dense_info = ensure_dense_or_sample(
                X_used,
                y_used,
                reason="neighborhood_diagnostics",
                config=config,
                report_context=report_context,
            )
            if dense_info["skipped"]:
                return {
                    "sampling": sample_info,
                    "skipped_reason": "dense conversion unavailable",
                }
            X_fit = dense_info["X"]
            y_used = dense_info["y"]
            # The fallback may have subsampled rows; shrink k so the k + 1
            # neighbors requested below never exceed the rows available.
            k = min(k, len(y_used) - 1)
            if k < 1:
                return _trivial_neighborhood(sample_info)
            nn = NearestNeighbors(n_neighbors=k)
            nn.fit(X_fit)
    else:
        nn = NearestNeighbors(n_neighbors=k)
        nn.fit(X_fit)

    # Single query for both distances and indices. The first column is dropped
    # on the assumption that it is the query point itself.
    # NOTE(review): with exact duplicate rows the first hit may be a twin.
    distances, neighbor_idx = nn.kneighbors(
        X_fit, n_neighbors=k + 1, return_distance=True
    )
    distances = distances[:, 1:]
    neighbor_idx = neighbor_idx[:, 1:]

    # One histogram width for every row, so the entropy normaliser is the same
    # for all samples even when the label codes have gaps.
    n_label_bins = max(len(np.unique(y_used)), int(np.max(y_used)) + 1)
    entropy_norm = max(np.log(max(2, n_label_bins)), 1e-9)

    entropies = []
    ambiguities = []
    same_class = []
    enemy_distances = []
    for row_i, neigh in enumerate(neighbor_idx):
        own_label = y_used[row_i]
        neigh_labels = y_used[neigh]
        counts = np.bincount(neigh_labels, minlength=n_label_bins)
        probs = counts / max(1, counts.sum())
        positive_probs = probs[probs > 0]
        ent = -np.sum(positive_probs * np.log(positive_probs))
        entropies.append(float(ent / entropy_norm))
        ambiguities.append(float(1.0 - probs.max()))
        same_class.append(float(np.mean(neigh_labels == own_label)))
        enemy_mask = neigh_labels != own_label
        if np.any(enemy_mask):
            enemy_distances.append(float(np.min(distances[row_i][enemy_mask])))

    entropies_arr = np.array(entropies)
    same_class_mean = np.mean(same_class)
    return {
        "mean_local_entropy": float(entropies_arr.mean()),
        "high_entropy_fraction": float(np.mean(entropies_arr >= 0.5)),
        "same_class_neighbor_fraction": float(same_class_mean),
        "cross_class_neighbor_fraction": float(1.0 - same_class_mean),
        "nearest_enemy_distance_estimate": float(np.mean(enemy_distances))
        if enemy_distances
        else None,
        "mean_local_ambiguity": float(np.mean(ambiguities)),
        "local_entropy": entropies,
        "local_ambiguity": ambiguities,
        "sampling": sample_info,
    }
+ }