benchmark-reliability 0.1.8__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/PKG-INFO +29 -34
  2. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/README.md +28 -33
  3. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/pyproject.toml +1 -1
  4. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/setup.py +1 -1
  5. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/benchmark_reliability.egg-info/PKG-INFO +29 -34
  6. benchmark_reliability-0.2.0/src/brf/analyzer.py +319 -0
  7. benchmark_reliability-0.1.8/src/brf/analyzer.py +0 -133
  8. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/setup.cfg +0 -0
  9. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/benchmark_reliability.egg-info/SOURCES.txt +0 -0
  10. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/benchmark_reliability.egg-info/dependency_links.txt +0 -0
  11. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/benchmark_reliability.egg-info/entry_points.txt +0 -0
  12. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/benchmark_reliability.egg-info/requires.txt +0 -0
  13. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/benchmark_reliability.egg-info/top_level.txt +0 -0
  14. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/__init__.py +0 -0
  15. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/cli.py +0 -0
  16. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/metrics/__init__.py +0 -0
  17. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/metrics/baseline_gap.py +0 -0
  18. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/metrics/instability.py +0 -0
  19. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/metrics/metadata.py +0 -0
  20. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/metrics/null_test.py +0 -0
  21. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/phase/__init__.py +0 -0
  22. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/phase/classifier.py +0 -0
  23. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/phase/embedding.py +0 -0
  24. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/phase/visualization.py +0 -0
  25. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/__init__.py +0 -0
  26. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/cli.py +0 -0
  27. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/manifest.yaml +0 -0
  28. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/__init__.py +0 -0
  29. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/assistments.py +0 -0
  30. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/college_scorecard.py +0 -0
  31. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/colleges_aaup.py +0 -0
  32. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/colleges_usnews.py +0 -0
  33. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/entrance_exam.py +0 -0
  34. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/higher_ed.py +0 -0
  35. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/mathe.py +0 -0
  36. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/mm_tba.py +0 -0
  37. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/oli.py +0 -0
  38. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/oulad.py +0 -0
  39. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/student_depression.py +0 -0
  40. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/student_dropout.py +0 -0
  41. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/tae.py +0 -0
  42. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/turkiye.py +0 -0
  43. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/uci_student.py +0 -0
  44. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/sources/xapi_edu.py +0 -0
  45. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/registry/verify.py +0 -0
  46. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/report/__init__.py +0 -0
  47. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/report/json_export.py +0 -0
  48. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/src/brf/report/latex_export.py +0 -0
  49. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/tests/test_analyzer.py +0 -0
  50. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/tests/test_metrics.py +0 -0
  51. {benchmark_reliability-0.1.8 → benchmark_reliability-0.2.0}/tests/test_phase.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: benchmark-reliability
3
- Version: 0.1.8
3
+ Version: 0.2.0
4
4
  Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry
5
5
  Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
6
6
  License: MIT
@@ -53,16 +53,27 @@ X, y, groups, metadata = source.prepare()
53
53
  X_scaled = StandardScaler().fit_transform(X)
54
54
  analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X_scaled, y, groups=groups)
55
55
 
56
- print(analyzer.brf_vector)
57
- # {'B': 0.12, 'I': 1.36, 'N': 0.93, 'M': 0.63,
58
- # 'S': -0.42, 'E': 0.75, 'class': 'Void'}
59
- print(source.metadata())
60
- # {'name': 'tae', 'display_name': 'Teaching Assistant Evaluation',
61
- # 'n_samples': 151, 'n_features': 4, 'n_groups': 25,
62
- # 'education_level': 'Higher Education', 'country': 'US', ...}
56
+ # Continuous metrics (always use these)
57
+ print(f"S={analyzer.S:.3f} E={analyzer.E:.3f}")
58
+ # S=-0.423 E=0.746
59
+
60
+ # Diagnostic report: explains WHY, not just a label
61
+ print(analyzer.diagnose()["summary"])
62
+ # "The model shows no detectable, stable signal (S=-0.423 <= 0). ..."
63
+
64
+ # Per-dimension details + actionable recommendations
65
+ for dim, issue in analyzer.diagnose()["details"].items():
66
+ print(f" {dim}: {issue}")
67
+
68
+ # Percentile rank against 25 audited benchmarks
69
+ print(analyzer.rank())
70
+ # {'S_percentile': 16.0, 'E_percentile': 36.0, ...}
71
+
72
+ # One-paragraph recommendation
73
+ print(analyzer.recommend())
63
74
  ```
64
75
 
65
- ### Browse the BRF Benchmark Registry
76
+ ### Browse the Registry
66
77
 
67
78
  ```bash
68
79
  $ brf registry list
@@ -96,23 +107,22 @@ $ brf registry info oulad
96
107
 
97
108
  ## BRF Metrics
98
109
 
110
+ The framework computes six continuous metrics. S and E are the primary
111
+ summary coordinates; the three-class label (Reliable/Void/Fragile) is
112
+ communication shorthand only --- **the signal is in the continuous values**.
113
+
99
114
  | Metric | Formula | Meaning |
100
115
  |--------|---------|---------|
101
116
  | B | mean(Delta R^2 vs mean baseline) | Predictive signal strength |
102
117
  | I | std(R^2) / max(\|mean(R^2)\|, 1e-4) + 1e-8 | Intrinsic instability |
103
118
  | N | fraction of folds where R^2_real > median(R^2_perm) | Null separation |
104
119
  | M | 0.5 * norm_group_entropy + 0.5 * group_balance | Metadata adequacy |
105
- | S | N - I | Stability |
106
- | E | B + M | Evidence |
120
+ | S | N - I | Stability (signal above noise) |
121
+ | E | B + M | Evidence (predictive + structural) |
107
122
 
108
- Three-regime classification (communication shorthand -- the signal is in the
109
- continuous S and E values):
110
-
111
- | Class | Condition | Meaning |
112
- |-------|-----------|---------|
113
- | **Reliable** | S > 0, E > 0.5 | Stable signal, adequate group structure |
114
- | **Void** | S <= 0 | No detectable signal beyond noise |
115
- | **Fragile** | S > 0, E <= 0.5 | Signal exists but group structure insufficient (rare) |
123
+ Use `analyzer.diagnose()` for per-dimension explanations and actionable
124
+ recommendations, or `analyzer.rank()` to see percentile scores against
125
+ the 25 benchmarks in the BRF Registry.
116
126
 
117
127
  ## CLI Reference
118
128
 
@@ -134,21 +144,6 @@ export_json(analyzer.brf_vector, "results.json")
134
144
  latex_table = export_latex(analyzer.brf_vector)
135
145
  ```
136
146
 
137
- ## BRF Benchmark Registry
138
-
139
- The Registry is a versioned, Dataset-as-Code collection of 18 unique
140
- group-aware educational prediction benchmarks (25 entries including
141
- alternative grouping views). Each dataset is a self-contained Python
142
- module with download, SHA-256 verification, and standardized
143
- preprocessing.
144
-
145
- - 16 DatasetSource modules (8 with SHA-256 verified)
146
- - Enriched metadata: education level, country, year, domain, grouping rationale
147
- - CLI for sync, verification, and inspection
148
- - Versioned releases (v1.5 current) with frozen snapshots for reproducibility
149
-
150
- Adding a dataset = one `.py` file. Auto-discovered on import.
151
-
152
147
  ## Citation
153
148
 
154
149
  To cite the BRF framework and package (JOSS paper forthcoming):
@@ -26,16 +26,27 @@ X, y, groups, metadata = source.prepare()
26
26
  X_scaled = StandardScaler().fit_transform(X)
27
27
  analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X_scaled, y, groups=groups)
28
28
 
29
- print(analyzer.brf_vector)
30
- # {'B': 0.12, 'I': 1.36, 'N': 0.93, 'M': 0.63,
31
- # 'S': -0.42, 'E': 0.75, 'class': 'Void'}
32
- print(source.metadata())
33
- # {'name': 'tae', 'display_name': 'Teaching Assistant Evaluation',
34
- # 'n_samples': 151, 'n_features': 4, 'n_groups': 25,
35
- # 'education_level': 'Higher Education', 'country': 'US', ...}
29
+ # Continuous metrics (always use these)
30
+ print(f"S={analyzer.S:.3f} E={analyzer.E:.3f}")
31
+ # S=-0.423 E=0.746
32
+
33
+ # Diagnostic report: explains WHY, not just a label
34
+ print(analyzer.diagnose()["summary"])
35
+ # "The model shows no detectable, stable signal (S=-0.423 <= 0). ..."
36
+
37
+ # Per-dimension details + actionable recommendations
38
+ for dim, issue in analyzer.diagnose()["details"].items():
39
+ print(f" {dim}: {issue}")
40
+
41
+ # Percentile rank against 25 audited benchmarks
42
+ print(analyzer.rank())
43
+ # {'S_percentile': 16.0, 'E_percentile': 36.0, ...}
44
+
45
+ # One-paragraph recommendation
46
+ print(analyzer.recommend())
36
47
  ```
37
48
 
38
- ### Browse the BRF Benchmark Registry
49
+ ### Browse the Registry
39
50
 
40
51
  ```bash
41
52
  $ brf registry list
@@ -69,23 +80,22 @@ $ brf registry info oulad
69
80
 
70
81
  ## BRF Metrics
71
82
 
83
+ The framework computes six continuous metrics. S and E are the primary
84
+ summary coordinates; the three-class label (Reliable/Void/Fragile) is
85
+ communication shorthand only --- **the signal is in the continuous values**.
86
+
72
87
  | Metric | Formula | Meaning |
73
88
  |--------|---------|---------|
74
89
  | B | mean(Delta R^2 vs mean baseline) | Predictive signal strength |
75
90
  | I | std(R^2) / max(\|mean(R^2)\|, 1e-4) + 1e-8 | Intrinsic instability |
76
91
  | N | fraction of folds where R^2_real > median(R^2_perm) | Null separation |
77
92
  | M | 0.5 * norm_group_entropy + 0.5 * group_balance | Metadata adequacy |
78
- | S | N - I | Stability |
79
- | E | B + M | Evidence |
93
+ | S | N - I | Stability (signal above noise) |
94
+ | E | B + M | Evidence (predictive + structural) |
80
95
 
81
- Three-regime classification (communication shorthand -- the signal is in the
82
- continuous S and E values):
83
-
84
- | Class | Condition | Meaning |
85
- |-------|-----------|---------|
86
- | **Reliable** | S > 0, E > 0.5 | Stable signal, adequate group structure |
87
- | **Void** | S <= 0 | No detectable signal beyond noise |
88
- | **Fragile** | S > 0, E <= 0.5 | Signal exists but group structure insufficient (rare) |
96
+ Use `analyzer.diagnose()` for per-dimension explanations and actionable
97
+ recommendations, or `analyzer.rank()` to see percentile scores against
98
+ the 25 benchmarks in the BRF Registry.
89
99
 
90
100
  ## CLI Reference
91
101
 
@@ -107,21 +117,6 @@ export_json(analyzer.brf_vector, "results.json")
107
117
  latex_table = export_latex(analyzer.brf_vector)
108
118
  ```
109
119
 
110
- ## BRF Benchmark Registry
111
-
112
- The Registry is a versioned, Dataset-as-Code collection of 18 unique
113
- group-aware educational prediction benchmarks (25 entries including
114
- alternative grouping views). Each dataset is a self-contained Python
115
- module with download, SHA-256 verification, and standardized
116
- preprocessing.
117
-
118
- - 16 DatasetSource modules (8 with SHA-256 verified)
119
- - Enriched metadata: education level, country, year, domain, grouping rationale
120
- - CLI for sync, verification, and inspection
121
- - Versioned releases (v1.5 current) with frozen snapshots for reproducibility
122
-
123
- Adding a dataset = one `.py` file. Auto-discovered on import.
124
-
125
120
  ## Citation
126
121
 
127
122
  To cite the BRF framework and package (JOSS paper forthcoming):
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "benchmark-reliability"
7
- version = "0.1.8"
7
+ version = "0.2.0"
8
8
  description = "Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry"
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="benchmark-reliability",
5
- version="0.1.8",
5
+ version="0.2.0",
6
6
  packages=find_packages(where="src"),
7
7
  package_dir={"": "src"},
8
8
  package_data={
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: benchmark-reliability
3
- Version: 0.1.8
3
+ Version: 0.2.0
4
4
  Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry
5
5
  Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
6
6
  License: MIT
@@ -53,16 +53,27 @@ X, y, groups, metadata = source.prepare()
53
53
  X_scaled = StandardScaler().fit_transform(X)
54
54
  analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X_scaled, y, groups=groups)
55
55
 
56
- print(analyzer.brf_vector)
57
- # {'B': 0.12, 'I': 1.36, 'N': 0.93, 'M': 0.63,
58
- # 'S': -0.42, 'E': 0.75, 'class': 'Void'}
59
- print(source.metadata())
60
- # {'name': 'tae', 'display_name': 'Teaching Assistant Evaluation',
61
- # 'n_samples': 151, 'n_features': 4, 'n_groups': 25,
62
- # 'education_level': 'Higher Education', 'country': 'US', ...}
56
+ # Continuous metrics (always use these)
57
+ print(f"S={analyzer.S:.3f} E={analyzer.E:.3f}")
58
+ # S=-0.423 E=0.746
59
+
60
+ # Diagnostic report: explains WHY, not just a label
61
+ print(analyzer.diagnose()["summary"])
62
+ # "The model shows no detectable, stable signal (S=-0.423 <= 0). ..."
63
+
64
+ # Per-dimension details + actionable recommendations
65
+ for dim, issue in analyzer.diagnose()["details"].items():
66
+ print(f" {dim}: {issue}")
67
+
68
+ # Percentile rank against 25 audited benchmarks
69
+ print(analyzer.rank())
70
+ # {'S_percentile': 16.0, 'E_percentile': 36.0, ...}
71
+
72
+ # One-paragraph recommendation
73
+ print(analyzer.recommend())
63
74
  ```
64
75
 
65
- ### Browse the BRF Benchmark Registry
76
+ ### Browse the Registry
66
77
 
67
78
  ```bash
68
79
  $ brf registry list
@@ -96,23 +107,22 @@ $ brf registry info oulad
96
107
 
97
108
  ## BRF Metrics
98
109
 
110
+ The framework computes six continuous metrics. S and E are the primary
111
+ summary coordinates; the three-class label (Reliable/Void/Fragile) is
112
+ communication shorthand only --- **the signal is in the continuous values**.
113
+
99
114
  | Metric | Formula | Meaning |
100
115
  |--------|---------|---------|
101
116
  | B | mean(Delta R^2 vs mean baseline) | Predictive signal strength |
102
117
  | I | std(R^2) / max(\|mean(R^2)\|, 1e-4) + 1e-8 | Intrinsic instability |
103
118
  | N | fraction of folds where R^2_real > median(R^2_perm) | Null separation |
104
119
  | M | 0.5 * norm_group_entropy + 0.5 * group_balance | Metadata adequacy |
105
- | S | N - I | Stability |
106
- | E | B + M | Evidence |
120
+ | S | N - I | Stability (signal above noise) |
121
+ | E | B + M | Evidence (predictive + structural) |
107
122
 
108
- Three-regime classification (communication shorthand -- the signal is in the
109
- continuous S and E values):
110
-
111
- | Class | Condition | Meaning |
112
- |-------|-----------|---------|
113
- | **Reliable** | S > 0, E > 0.5 | Stable signal, adequate group structure |
114
- | **Void** | S <= 0 | No detectable signal beyond noise |
115
- | **Fragile** | S > 0, E <= 0.5 | Signal exists but group structure insufficient (rare) |
123
+ Use `analyzer.diagnose()` for per-dimension explanations and actionable
124
+ recommendations, or `analyzer.rank()` to see percentile scores against
125
+ the 25 benchmarks in the BRF Registry.
116
126
 
117
127
  ## CLI Reference
118
128
 
@@ -134,21 +144,6 @@ export_json(analyzer.brf_vector, "results.json")
134
144
  latex_table = export_latex(analyzer.brf_vector)
135
145
  ```
136
146
 
137
- ## BRF Benchmark Registry
138
-
139
- The Registry is a versioned, Dataset-as-Code collection of 18 unique
140
- group-aware educational prediction benchmarks (25 entries including
141
- alternative grouping views). Each dataset is a self-contained Python
142
- module with download, SHA-256 verification, and standardized
143
- preprocessing.
144
-
145
- - 16 DatasetSource modules (8 with SHA-256 verified)
146
- - Enriched metadata: education level, country, year, domain, grouping rationale
147
- - CLI for sync, verification, and inspection
148
- - Versioned releases (v1.5 current) with frozen snapshots for reproducibility
149
-
150
- Adding a dataset = one `.py` file. Auto-discovered on import.
151
-
152
147
  ## Citation
153
148
 
154
149
  To cite the BRF framework and package (JOSS paper forthcoming):
@@ -0,0 +1,319 @@
1
+ import json
2
+ import math
3
+ import warnings
4
+ from pathlib import Path
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+ import numpy as np
8
+ from sklearn.base import clone
9
+ from sklearn.linear_model import Ridge
10
+ from sklearn.metrics import r2_score
11
+ from sklearn.preprocessing import StandardScaler
12
+
13
+ from .metrics import compute_b, compute_i, compute_m
14
+ from .phase import compute_phase_from_brf, classify_dataset
15
+
16
+
17
class BRFAnalyzer:
    """Compute the BRF metrics (B, I, N, M, S, E) for a regression dataset.

    ``fit`` draws ``n_splits`` random 80/20 train/test splits. On each split
    it fits a clone of ``model``, scores it against a train-mean baseline, and
    compares its R^2 with the median R^2 of refits on permuted training
    targets. The per-split values are aggregated through the project helpers
    ``compute_b``, ``compute_i``, ``compute_m``, ``compute_phase_from_brf`` and
    ``classify_dataset`` (defined elsewhere in the package).

    After fitting, the metrics are available as attributes (``B``, ``I``,
    ``N``, ``M``, ``S``, ``E``, ``class_``), as a dict via ``brf_vector``, and
    as text via ``diagnose()``, ``rank()`` and ``recommend()``.

    Args:
        n_splits: Number of random train/test splits; must be >= 2.
        n_permutations: Total permutation budget; it is spread over the
            splits, with at least 3 permutations per split.
        model: Estimator exposing ``fit``/``predict`` that can be cloned with
            ``sklearn.base.clone``. Defaults to ``Ridge(alpha=1.0)``.
        seed: Seed for the split RNG; the permutation RNG uses
            ``seed + 10_007``.
        scale: If true, standardize ``X`` with ``StandardScaler`` before
            splitting.

    Raises:
        ValueError: If ``n_splits`` is smaller than 2.
    """

    def __init__(
        self,
        n_splits: int = 30,
        n_permutations: int = 200,
        model=None,
        seed: int = 42,
        scale: bool = True,
    ):
        if n_splits < 2:
            raise ValueError("n_splits must be >= 2")
        self.n_splits = n_splits
        self.n_permutations = n_permutations
        # Compare against None explicitly: `model or Ridge(...)` evaluates
        # bool(model), which falls back to __len__ when the estimator defines
        # it. That can raise or report "empty" for an unfitted estimator and
        # silently swap the caller's model for Ridge.
        self.model = model if model is not None else Ridge(alpha=1.0)
        self.seed = seed
        self.scale = scale

        self._fitted = False
        self.B: Optional[float] = None
        self.I: Optional[float] = None
        self.N: Optional[float] = None
        self.M: Optional[float] = None
        self.S: Optional[float] = None
        self.E: Optional[float] = None
        self.class_: Optional[str] = None  # retained for backward compat
        self._registry_ref: Optional[List[Dict]] = None  # loaded lazily

    def _validate_inputs(self, X, y):
        """Coerce ``X``/``y`` to float arrays and reject unusable input.

        Returns:
            The pair ``(X, y)`` as float ndarrays.

        Raises:
            ValueError: If ``X`` is not 2D, ``y`` is not 1D, their lengths
                differ, there are fewer than 20 samples, or either array
                contains NaN/Inf.

        Warns:
            If ``y`` has at most 12 distinct values and all of them are whole
            numbers, since that looks like classification labels.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2D, got shape {X.shape}")
        if y.ndim != 1:
            raise ValueError(f"y must be 1D, got shape {y.shape}")
        if len(X) != len(y):
            raise ValueError(f"X and y length mismatch: {len(X)} vs {len(y)}")
        if len(X) < 20:
            raise ValueError(f"Need at least 20 samples, got {len(X)}")
        if not np.all(np.isfinite(X)):
            raise ValueError("X contains NaN or Inf values")
        if not np.all(np.isfinite(y)):
            raise ValueError("y contains NaN or Inf values")
        unique_y = np.unique(y)
        if len(unique_y) <= 12 and np.all(unique_y == unique_y.astype(int)):
            warnings.warn(
                "y appears to be integer classification labels "
                f"({len(unique_y)} unique values). "
                "BRF is designed for regression targets."
            )
        return X, y

    def fit(self, X, y, groups=None):
        """Run the repeated-split audit and store the BRF metrics on ``self``.

        Args:
            X: 2D feature array-like.
            y: 1D regression target array-like, same length as ``X``.
            groups: Optional per-sample group labels; passed to ``compute_m``
                and not used for splitting.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: On invalid ``X``/``y`` (see ``_validate_inputs``) or
                when ``groups`` does not have one entry per sample.
        """
        X, y = self._validate_inputs(X, y)
        n = len(y)
        if groups is not None and len(groups) != n:
            raise ValueError(
                f"groups and y length mismatch: {len(groups)} vs {n}"
            )

        if self.scale:
            # NOTE(review): the scaler is fit on all rows before splitting, so
            # held-out rows influence the scaling statistics. Left unchanged
            # because moving it inside the loop would shift every reported
            # metric; confirm whether that is acceptable.
            X = StandardScaler().fit_transform(X)

        # Separate generators keep the split sequence independent of how many
        # permutations are drawn per split.
        rng_cv = np.random.default_rng(self.seed)
        rng_perm = np.random.default_rng(self.seed + 10_007)

        r2_scores = []
        b_gains = []

        n_per_fold = max(3, math.ceil(self.n_permutations / self.n_splits))
        split = max(1, int(0.8 * n))  # loop-invariant 80/20 cut point
        exceed_count = 0

        for _ in range(self.n_splits):
            idx = rng_cv.permutation(n)
            train_idx = idx[:split]
            test_idx = idx[split:]

            Xtr, Xte = X[train_idx], X[test_idx]
            ytr, yte = y[train_idx], y[test_idx]

            # Baseline prediction: the training mean repeated for each test row.
            y_mean = np.full(len(yte), float(np.mean(ytr)))
            m = clone(self.model)
            m.fit(Xtr, ytr)
            y_pred = m.predict(Xte)

            r2_real = r2_score(yte, y_pred)
            r2_scores.append(r2_real)
            b_gains.append(compute_b(yte, y_pred, y_mean))

            # Null distribution: refit on shuffled training targets.
            perm_r2s = []
            for _ in range(n_per_fold):
                y_perm = rng_perm.permutation(ytr)
                m_perm = clone(self.model)
                m_perm.fit(Xtr, y_perm)
                y_pred_perm = m_perm.predict(Xte)
                perm_r2s.append(r2_score(yte, y_pred_perm))

            if r2_real > float(np.median(perm_r2s)):
                exceed_count += 1

        self.B = float(np.mean(b_gains))
        self.I = compute_i(r2_scores)
        self.N = exceed_count / self.n_splits
        self.M = compute_m(groups)
        self.S, self.E = compute_phase_from_brf(self.B, self.I, self.N, self.M)
        self.class_ = classify_dataset(self.S, self.E)
        self._fitted = True

        return self

    @property
    def brf_vector(self) -> dict:
        """Return all metrics plus the class label as a plain dict.

        Raises:
            RuntimeError: If ``fit()`` has not been called.
        """
        if not self._fitted:
            raise RuntimeError("call fit() before accessing brf_vector")
        return {
            "B": self.B,
            "I": self.I,
            "N": self.N,
            "M": self.M,
            "S": self.S,
            "E": self.E,
            "class": self.class_,
        }

    # ---- improved reporting (v0.2) ----

    def diagnose(self) -> Dict[str, object]:
        """Explain, per metric, why the dataset is in its current state.

        Returns:
            A dict with three keys:
            ``"summary"`` is a one-sentence synthesis based on ``S`` and ``E``.
            ``"details"`` maps each of ``"B"``, ``"I"``, ``"N"``, ``"M"`` to a
            description of that metric's level.
            ``"recommendations"`` maps only the metrics that need attention to
            a suggested action.

        Raises:
            RuntimeError: If ``fit()`` has not been called.
        """
        if not self._fitted:
            raise RuntimeError("call fit() before accessing diagnose()")

        issues = {}
        suggestions = {}

        # --- Predictive signal (B) ---
        if self.B < 0:
            issues["B"] = ("Model performs WORSE than the mean baseline "
                           f"(B={self.B:.3f}). The features carry no useful "
                           "predictive signal for this target.")
            suggestions["B"] = "Reconsider feature engineering or target definition."
        elif self.B < 0.05:
            issues["B"] = ("Marginal improvement over mean baseline "
                           f"(B={self.B:.3f}). Features explain very little variance.")
            suggestions["B"] = "Add more informative features or reframe the task."
        elif self.B < 0.2:
            issues["B"] = f"Moderate predictive signal (B={self.B:.3f})."
        else:
            issues["B"] = f"Strong predictive signal (B={self.B:.3f})."

        # --- Instability (I) ---
        if self.I > 1.0:
            issues["I"] = (f"High cross-split instability (I={self.I:.3f}). "
                           "Model R^2 varies dramatically depending on which "
                           "samples happen to be in the test set.")
            suggestions["I"] = ("Increase sample size (N), reduce feature count (p), "
                                "or use regularization.")
        elif self.I > 0.3:
            issues["I"] = f"Moderate instability (I={self.I:.3f})."
            suggestions["I"] = "Consider larger N or fewer features for more stable estimates."
        else:
            issues["I"] = (f"Low instability (I={self.I:.3f}). "
                           "Model is robust to data split variation.")

        # --- Null separation (N) ---
        if self.N < 0.5:
            issues["N"] = ("Model rarely beats permutation baseline "
                           f"(N={self.N:.3f}). The signal is indistinguishable "
                           "from random noise.")
            suggestions["N"] = ("The model is effectively fitting noise. "
                                "Consider whether a predictive relationship exists.")
        elif self.N < 0.8:
            issues["N"] = ("Model sometimes fails to beat permutation "
                           f"(N={self.N:.3f}). Signal is present but inconsistent.")
            suggestions["N"] = "Increase sample size or feature quality for more reliable separation."
        else:
            issues["N"] = ("Model consistently beats permutation "
                           f"(N={self.N:.3f}). Clear signal above noise.")

        # --- Metadata adequacy (M) ---
        if self.M < 0.1:
            issues["M"] = (f"Insufficient group metadata (M={self.M:.3f}). "
                           "Groups are too few, highly imbalanced, or absent.")
            suggestions["M"] = ("Add or improve group annotations. "
                                "Consider whether an alternative grouping variable "
                                "captures more meaningful structure.")
        elif self.M < 0.3:
            issues["M"] = (f"Weak group metadata (M={self.M:.3f}). "
                           "Group structure exists but is sparse or imbalanced.")
            suggestions["M"] = "Use a finer-grained grouping variable if available."
        elif self.M < 0.5:
            issues["M"] = f"Moderate group metadata (M={self.M:.3f})."
        else:
            issues["M"] = (f"Strong group metadata (M={self.M:.3f}). "
                           "Group structure is well-defined and balanced.")

        # --- Synthesis ---
        if self.S <= 0:
            primary = ("The model shows no detectable, stable signal "
                       f"(S={self.S:.3f} <= 0). Performance differences between "
                       "models on this benchmark may not be meaningful.")
        elif self.E <= 0.5:
            primary = ("Predictive signal is present (S>0) but the benchmark "
                       f"lacks sufficient evidence (E={self.E:.3f} <= 0.5). "
                       "Results may not generalize across groups.")
        else:
            primary = ("The benchmark shows stable predictive signal and "
                       f"adequate group structure (S={self.S:.3f}, E={self.E:.3f}). "
                       "Model comparisons are likely reproducible.")

        return {
            "summary": primary,
            "details": issues,
            # Only metrics with a suggestion were inserted, in B, I, N, M order.
            "recommendations": suggestions,
        }

    def rank(self) -> Dict[str, object]:
        """Percentile rank of ``S`` and ``E`` against the registry reference.

        A percentile is the share (0-100, one decimal) of reference values
        that are less than or equal to this dataset's value.

        Returns:
            ``{"S_percentile", "E_percentile", "reference"}`` when reference
            data is available. Otherwise both percentiles are ``None`` and a
            ``"note"`` key explains why. A single percentile is ``None`` if
            the reference holds no values for that metric.

        Raises:
            RuntimeError: If ``fit()`` has not been called.
        """
        if not self._fitted:
            raise RuntimeError("call fit() before accessing rank()")
        ref = self._load_registry_ref()
        # An empty reference list is as unusable as a missing file; without
        # this guard the percentile below would divide by zero.
        if not ref:
            return {"S_percentile": None, "E_percentile": None,
                    "note": "Registry data not available for ranking"}

        s_vals = [r["S"] for r in ref if r.get("S") is not None]
        e_vals = [r["E"] for r in ref if r.get("E") is not None]

        def pctile(vals, x):
            if not vals:
                return None
            return round(sum(1 for v in vals if v <= x) / len(vals) * 100, 1)

        return {
            "S_percentile": pctile(s_vals, self.S),
            "E_percentile": pctile(e_vals, self.E),
            "reference": f"BRF Registry v1.5 ({len(s_vals)} benchmarks)",
        }

    def recommend(self) -> str:
        """Return a short recommendation naming the most critical issue.

        Severity order is B < 0, then N < 0.5, then I > 1, then M < 0.1. If
        none of those apply, the first recommendation from ``diagnose()`` is
        used.

        Raises:
            RuntimeError: If ``fit()`` has not been called (via ``diagnose``).
        """
        recs = self.diagnose()["recommendations"]
        if not recs:
            return ("Benchmark metrics are within normal ranges. "
                    "No specific action recommended.")

        # Each condition below also makes diagnose() emit a recommendation
        # for that key, so the lookup cannot miss.
        if self.B < 0:
            top = "B"
        elif self.N < 0.5:
            top = "N"
        elif self.I > 1.0:
            top = "I"
        elif self.M < 0.1:
            top = "M"
        else:
            top = next(iter(recs))

        return (f"This benchmark has {len(recs)} dimension(s) needing attention. "
                f"Primary concern: {recs[top]}")

    def _load_registry_ref(self) -> Optional[List[Dict]]:
        """Load and cache the registry reference metrics used by ``rank()``.

        Returns:
            A list of dicts with keys ``S, E, B, I, N, M`` (a value is
            ``None`` when the entry lacks that key), or ``None`` if no
            readable registry file was found.
        """
        if self._registry_ref is not None:
            return self._registry_ref

        # NOTE(review): both candidates sit outside the ``brf`` package
        # directory, so they presumably only resolve in a source checkout
        # that has a sibling ``BRFRegistry`` folder -- confirm whether the
        # JSON should ship as package data instead.
        here = Path(__file__)
        candidates = [
            here.parent.parent.parent.parent
            / "BRFRegistry" / "results" / "registry_v1.5.json",
            here.parent.parent.parent
            / "BRFRegistry" / "results" / "registry_v1.5.json",
        ]
        metric_keys = ("S", "E", "B", "I", "N", "M")

        for p in candidates:
            if not p.is_file():
                continue
            try:
                with open(p, encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError) as exc:
                # json.JSONDecodeError is a ValueError. rank() has a graceful
                # "not available" path, so report the problem and move on.
                warnings.warn(f"Could not read registry reference {p}: {exc}")
                continue
            if not isinstance(data, dict):
                warnings.warn(f"Unexpected registry reference format in {p}")
                continue

            refs = []
            for v in data.values():
                result = v.get("brf_result") if isinstance(v, dict) else None
                if isinstance(result, dict) and result:
                    refs.append({k: result.get(k) for k in metric_keys})
            self._registry_ref = refs
            return refs
        return None
@@ -1,133 +0,0 @@
1
- import math
2
- import warnings
3
- from typing import Optional
4
-
5
- import numpy as np
6
- from sklearn.base import clone
7
- from sklearn.linear_model import Ridge
8
- from sklearn.metrics import r2_score
9
- from sklearn.preprocessing import StandardScaler
10
-
11
- from .metrics import compute_b, compute_i, compute_m
12
- from .phase import compute_phase_from_brf, classify_dataset
13
-
14
-
15
class BRFAnalyzer:
    """Compute the BRF metrics (B, I, N, M, S, E) for a regression dataset.

    ``fit`` draws ``n_splits`` random 80/20 train/test splits. On each split
    it fits a clone of ``model``, scores it against a train-mean baseline, and
    compares its R^2 with the median R^2 of refits on permuted training
    targets. Aggregation is delegated to the project helpers ``compute_b``,
    ``compute_i``, ``compute_m``, ``compute_phase_from_brf`` and
    ``classify_dataset`` (defined elsewhere in the package).

    Args:
        n_splits: Number of random train/test splits; must be >= 2.
        n_permutations: Total permutation budget; it is spread over the
            splits, with at least 3 permutations per split.
        model: Estimator exposing ``fit``/``predict`` that can be cloned with
            ``sklearn.base.clone``. Defaults to ``Ridge(alpha=1.0)``.
        seed: Seed for the split RNG; the permutation RNG uses
            ``seed + 10_007``.
        scale: If true, standardize ``X`` with ``StandardScaler`` before
            splitting.

    Raises:
        ValueError: If ``n_splits`` is smaller than 2.
    """

    def __init__(
        self,
        n_splits: int = 30,
        n_permutations: int = 200,
        model=None,
        seed: int = 42,
        scale: bool = True,
    ):
        if n_splits < 2:
            raise ValueError("n_splits must be >= 2")
        self.n_splits = n_splits
        self.n_permutations = n_permutations
        # Compare against None explicitly: `model or Ridge(...)` evaluates
        # bool(model), which falls back to __len__ when the estimator defines
        # it and can raise or silently replace the caller's model.
        self.model = model if model is not None else Ridge(alpha=1.0)
        self.seed = seed
        self.scale = scale

        self._fitted = False
        self.B: Optional[float] = None
        self.I: Optional[float] = None
        self.N: Optional[float] = None
        self.M: Optional[float] = None
        self.S: Optional[float] = None
        self.E: Optional[float] = None
        self.class_: Optional[str] = None

    def _validate_inputs(self, X, y):
        """Coerce ``X``/``y`` to float arrays and reject unusable input.

        Returns:
            The pair ``(X, y)`` as float ndarrays.

        Raises:
            ValueError: If ``X`` is not 2D, ``y`` is not 1D, their lengths
                differ, there are fewer than 20 samples, or either array
                contains NaN/Inf.

        Warns:
            If ``y`` has at most 12 distinct values and all of them are whole
            numbers, since that looks like classification labels.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2D, got shape {X.shape}")
        if y.ndim != 1:
            raise ValueError(f"y must be 1D, got shape {y.shape}")
        if len(X) != len(y):
            raise ValueError(f"X and y length mismatch: {len(X)} vs {len(y)}")
        if len(X) < 20:
            raise ValueError(f"Need at least 20 samples, got {len(X)}")
        if not np.all(np.isfinite(X)):
            raise ValueError("X contains NaN or Inf values")
        if not np.all(np.isfinite(y)):
            raise ValueError("y contains NaN or Inf values")
        unique_y = np.unique(y)
        if len(unique_y) <= 12 and np.all(unique_y == unique_y.astype(int)):
            warnings.warn(
                "y appears to be integer classification labels "
                f"({len(unique_y)} unique values). "
                "BRF is designed for regression targets."
            )
        return X, y

    def fit(self, X, y, groups=None):
        """Run the repeated-split audit and store the BRF metrics on ``self``.

        Args:
            X: 2D feature array-like.
            y: 1D regression target array-like, same length as ``X``.
            groups: Optional per-sample group labels; passed to ``compute_m``
                and not used for splitting.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: On invalid ``X``/``y`` (see ``_validate_inputs``) or
                when ``groups`` does not have one entry per sample.
        """
        X, y = self._validate_inputs(X, y)
        n = len(y)
        if groups is not None and len(groups) != n:
            raise ValueError(
                f"groups and y length mismatch: {len(groups)} vs {n}"
            )

        if self.scale:
            # NOTE(review): the scaler is fit on all rows before splitting, so
            # held-out rows influence the scaling statistics. Left unchanged
            # to keep reported metrics stable; confirm whether intended.
            X = StandardScaler().fit_transform(X)

        # Separate generators keep the split sequence independent of how many
        # permutations are drawn per split.
        rng_cv = np.random.default_rng(self.seed)
        rng_perm = np.random.default_rng(self.seed + 10_007)

        r2_scores = []
        b_gains = []

        n_per_fold = max(3, math.ceil(self.n_permutations / self.n_splits))
        split = max(1, int(0.8 * n))  # loop-invariant 80/20 cut point
        exceed_count = 0

        for _ in range(self.n_splits):
            idx = rng_cv.permutation(n)
            train_idx = idx[:split]
            test_idx = idx[split:]

            Xtr, Xte = X[train_idx], X[test_idx]
            ytr, yte = y[train_idx], y[test_idx]

            # Baseline prediction: the training mean repeated for each test row.
            y_mean = np.full(len(yte), float(np.mean(ytr)))
            m = clone(self.model)
            m.fit(Xtr, ytr)
            y_pred = m.predict(Xte)

            r2_real = r2_score(yte, y_pred)
            r2_scores.append(r2_real)
            b_gains.append(compute_b(yte, y_pred, y_mean))

            # Null distribution: refit on shuffled training targets.
            perm_r2s = []
            for _ in range(n_per_fold):
                y_perm = rng_perm.permutation(ytr)
                m_perm = clone(self.model)
                m_perm.fit(Xtr, y_perm)
                y_pred_perm = m_perm.predict(Xte)
                perm_r2s.append(r2_score(yte, y_pred_perm))

            if r2_real > float(np.median(perm_r2s)):
                exceed_count += 1

        self.B = float(np.mean(b_gains))
        self.I = compute_i(r2_scores)
        self.N = exceed_count / self.n_splits
        self.M = compute_m(groups)
        self.S, self.E = compute_phase_from_brf(self.B, self.I, self.N, self.M)
        self.class_ = classify_dataset(self.S, self.E)
        self._fitted = True

        return self

    @property
    def brf_vector(self) -> dict:
        """Return all metrics plus the class label as a plain dict.

        Raises:
            RuntimeError: If ``fit()`` has not been called.
        """
        if not self._fitted:
            raise RuntimeError("call fit() before accessing brf_vector")
        return {
            "B": self.B,
            "I": self.I,
            "N": self.N,
            "M": self.M,
            "S": self.S,
            "E": self.E,
            "class": self.class_,
        }