benchmark-reliability 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.1
2
+ Name: benchmark-reliability
3
+ Version: 0.1.0
4
+ Summary: Benchmark Reliability Framework (BRF) — dataset-level reliability auditing for predictive benchmarks
5
+ Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/zhanglizhuo/BenchmarkReliability
8
+ Project-URL: Repository, https://github.com/zhanglizhuo/BenchmarkReliability
9
+ Keywords: benchmark reliability,dataset auditing,educational AI,machine learning
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: numpy (>=1.21)
22
+ Requires-Dist: scikit-learn (>=1.0)
23
+ Requires-Dist: matplotlib (>=3.5)
24
+
25
+ # BenchmarkReliability — BRF Python Package
26
+
27
+ ## Target
28
+
29
+ Provide a standardized, pip-installable Python package that computes the Benchmark Reliability Framework (BRF) for any predictive dataset, enabling researchers to run the four-dimension audit protocol with a single API call.
30
+
31
+ ## Method
32
+
33
+ The package wraps the core logic from the BehaviorAudit project into a sklearn-style API:
34
+
35
+ ```python
36
+ from brf import BRFAnalyzer
37
+ from brf.phase import plot_phase_diagram
38
+ from brf.report import export_json
39
+
40
+ analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X, y, groups=groups)
41
+ print(analyzer.brf_vector) # (B, I, N, M) → (S, E) → class
42
+
43
+ # Visualization
44
+ plot_phase_diagram(
45
+ [analyzer.S], [analyzer.E],
46
+ labels=[analyzer.class_],
47
+ classes=[analyzer.class_],
48
+ )
49
+
50
+ # Export
51
+ export_json(analyzer.brf_vector, "results.json")
52
+ ```
53
+
54
+ ## Package Structure
55
+
56
+ ```
57
+ brf/
58
+ ├── __init__.py
59
+ ├── analyzer.py — BRFAnalyzer main class
60
+ ├── metrics/
61
+ │   ├── baseline_gap.py — B
62
+ │   ├── instability.py — I
63
+ │   ├── null_test.py — N (permutation test)
64
+ │   └── metadata.py — M
65
+ ├── phase/
66
+ │   ├── embedding.py — S = N - I, E = B + M
67
+ │   ├── classifier.py — Reliable / Fragile / Void
68
+ │   └── visualization.py — phase diagram, clustering plot
69
+ └── report/
70
+     ├── json_export.py
71
+     └── latex_export.py
72
+ ```
73
+
74
+ ## Steps
75
+
76
+ ### Phase 1: Package skeleton (1-2 weeks)
77
+ - [x] Initialize Python project with `pyproject.toml`
78
+ - [x] Implement `BRFAnalyzer` main class with fit/predict interface
79
+ - [x] Port `compute_b`, `compute_i`, `compute_n`, `compute_m` from BehaviorAudit
80
+ - [x] Write unit tests for each metric
81
+
82
+ ### Phase 2: Phase embedding + classification (1 week)
83
+ - [x] Implement `compute_phase(S, E)` and `classify_dataset(S, E)`
84
+ - [x] Build phase diagram visualization (matplotlib)
85
+ - [x] Test on all 7 datasets from BehaviorAudit; verify BRF output matches SR paper results
86
+
87
+ ### Phase 3: Documentation + distribution (1-2 weeks)
88
+ - [x] Write README with quick-start tutorial and API docs
89
+ - [ ] Publish to TestPyPI → PyPI
90
+ - [ ] Set up ReadTheDocs for auto-generated documentation
91
+ - [ ] Add GitHub Actions CI (test on Python 3.9–3.12)
92
+
93
+ ### Phase 4: HuggingFace Hub integration (optional, 1 week)
94
+ - [ ] Add HF dataset loading wrapper
95
+ - [ ] Allow `brf.fit(dataset_id="OULAD")` shorthand
96
+
97
+ ## Dependencies
98
+
99
+ - `numpy>=1.21`
100
+ - `scikit-learn>=1.0`
101
+ - `matplotlib>=3.5`
102
+ - No deep learning dependencies required
103
+
104
+ ## Relationship to Sister Repos
105
+
106
+ - `BehaviorAudit/`: source of the audit logic; this package refactors and generalizes it
107
+ - `LLMScoringAudit/`: first applied use case (MM-TBA × multiple LLMs)
108
+ - `BenchmarkPhase/`: large-scale application (30 datasets BRF leaderboard)
109
+ - `llm-annotation/`: cited for complementary MLLM pseudo-label reliability findings
110
+
111
+ ## Target Journal
112
+
113
+ - Journal of Open Source Software (JOSS) — tool paper, lightweight submission
114
+ - Followed by application papers in C&E / BJET
115
+
116
+ ## Timeline
117
+
118
+ - Phase 1–2: 3 weeks
119
+ - Phase 3: 2 weeks
120
+ - Phase 4: optional
121
+ - JOSS submission: after Phase 3
@@ -0,0 +1,18 @@
1
+ brf/__init__.py,sha256=JEXhX4YSatBkKOsNSEkmuwZpWbGW_bUVHJ9rUfloimQ,61
2
+ brf/analyzer.py,sha256=nvb5cqY79ddA9sC7RtyBaLp8QdtuQNNTJ1VNFzQ8xK8,4312
3
+ brf/metrics/__init__.py,sha256=kWx4ikozAbX_0fT0MtyG-wPNbN9Z7d52QkdKb72YC3Q,200
4
+ brf/metrics/baseline_gap.py,sha256=yAgq73ELMXsbWvEtKonV-y2ninY3LRJ9NTootLt65tw,311
5
+ brf/metrics/instability.py,sha256=4UnCzDNl3i26fmXtF1yZY01vmZgTxdsDA9ZFi5vNvBs,302
6
+ brf/metrics/metadata.py,sha256=hRIr8-tKMB-O2LPfKWhRBOG2oVatE2jDxj5RKIgNyS4,939
7
+ brf/metrics/null_test.py,sha256=DQLtYfzj1DKNlVSM3jofKWfAHMiVNiApVVVZAJvLCDY,747
8
+ brf/phase/__init__.py,sha256=nK_0X8XnMChPueDDJwCVzcV9Rw4cAJ9OX21_stNTg2I,213
9
+ brf/phase/classifier.py,sha256=Tb_TU2dLGwh9MjqklIh6v6t6lnfKbMHc-FwGVTfYHAc,212
10
+ brf/phase/embedding.py,sha256=2q3TAso_5UOQhvdweYCXiYk19J4ylj9InUqOHQjUbMU,181
11
+ brf/phase/visualization.py,sha256=U4aqC4slRAaM9fKqjWtEByyXV_oPGry93F-X0onyNKg,1696
12
+ brf/report/__init__.py,sha256=RgBGEAnAF_9t_QfKH7zGwBfzKHC5sdRe8pHYKCGWlEg,119
13
+ brf/report/json_export.py,sha256=ze54oM8La-WfdcMjZJChByRTotQkEiDhkRM-EqZ_B1k,318
14
+ brf/report/latex_export.py,sha256=oDUx9Y-lAwmWPX7EfuN8YXaMUioRidFFyRJlcBwUAT0,1131
15
+ benchmark_reliability-0.1.0.dist-info/METADATA,sha256=RwEddB7oKjgd-DYAxLDHBtCmK7F6NLsOQAFzyHBVu-c,4664
16
+ benchmark_reliability-0.1.0.dist-info/WHEEL,sha256=BNRMDyzLkkcmlv0J8ppDQkk2VED33SesJDynr9ED1gc,91
17
+ benchmark_reliability-0.1.0.dist-info/top_level.txt,sha256=eLsF702sQIv4BWW8eHaxxqdwePWflYHLNpVl5KzW93k,4
18
+ benchmark_reliability-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.3.4)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
brf/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .analyzer import BRFAnalyzer
2
+
3
+ __all__ = ["BRFAnalyzer"]
brf/analyzer.py ADDED
@@ -0,0 +1,133 @@
1
+ import math
2
+ import warnings
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+ from sklearn.base import clone
7
+ from sklearn.linear_model import Ridge
8
+ from sklearn.metrics import r2_score
9
+ from sklearn.preprocessing import StandardScaler
10
+
11
+ from .metrics import compute_b, compute_i, compute_m
12
+ from .phase import compute_phase_from_brf, classify_dataset
13
+
14
+
15
class BRFAnalyzer:
    """Compute the BRF vector (B, I, N, M), its phase (S, E) and a class label.

    ``fit`` draws ``n_splits`` random 80/20 train/test splits. On each split
    it fits a clone of ``model`` and records the test R² and the baseline
    gain from ``compute_b``. It also retrains clones on permuted training
    targets to build a per-split null distribution of test R².

    Results are stored on the instance after ``fit``:

    * ``B`` - mean of ``compute_b`` over the splits.
    * ``I`` - ``compute_i`` applied to the per-split R² scores.
    * ``N`` - fraction of splits whose real R² is strictly greater than the
      median R² of the permuted-target models for that split.
    * ``M`` - ``compute_m(groups)``.
    * ``S``, ``E`` - output of ``compute_phase_from_brf(B, I, N, M)``.
    * ``class_`` - output of ``classify_dataset(S, E)``.

    Parameters
    ----------
    n_splits : int
        Number of random splits; must be at least 2.
    n_permutations : int
        Total permutation budget. Each split runs
        ``max(3, ceil(n_permutations / n_splits))`` permutations.
    model : estimator, optional
        Object with ``fit``/``predict`` that ``sklearn.base.clone`` can copy.
        Defaults to ``Ridge(alpha=1.0)`` when ``None``.
    seed : int
        Seed for the split generator; the permutation generator is seeded
        with ``seed + 10_007``.
    scale : bool
        When true, ``X`` is standardized with ``StandardScaler`` before the
        splits are drawn.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 2.
    """

    def __init__(
        self,
        n_splits: int = 30,
        n_permutations: int = 200,
        model=None,
        seed: int = 42,
        scale: bool = True,
    ):
        if n_splits < 2:
            raise ValueError("n_splits must be >= 2")
        self.n_splits = n_splits
        self.n_permutations = n_permutations
        # Compare against None explicitly: truth-testing an estimator calls
        # its __len__ when it defines one, which can report "empty" or raise
        # (e.g. scikit-learn ensembles before fitting) and would discard or
        # reject a model the caller supplied on purpose.
        self.model = model if model is not None else Ridge(alpha=1.0)
        self.seed = seed
        self.scale = scale

        self._fitted = False
        self.B: Optional[float] = None
        self.I: Optional[float] = None
        self.N: Optional[float] = None
        self.M: Optional[float] = None
        self.S: Optional[float] = None
        self.E: Optional[float] = None
        self.class_: Optional[str] = None

    def _validate_inputs(self, X, y):
        """Coerce ``X``/``y`` to float arrays and check shape and finiteness.

        Returns the converted ``(X, y)`` pair.

        Raises ``ValueError`` when ``X`` is not 2D, ``y`` is not 1D, the
        lengths differ, there are fewer than 20 rows, or either array
        contains NaN/Inf. Emits a warning (without failing) when ``y`` has at
        most 12 distinct values that are all integers, since that looks like
        classification labels.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2D, got shape {X.shape}")
        if y.ndim != 1:
            raise ValueError(f"y must be 1D, got shape {y.shape}")
        if len(X) != len(y):
            raise ValueError(f"X and y length mismatch: {len(X)} vs {len(y)}")
        if len(X) < 20:
            raise ValueError(f"Need at least 20 samples, got {len(X)}")
        if not np.all(np.isfinite(X)):
            raise ValueError("X contains NaN or Inf values")
        if not np.all(np.isfinite(y)):
            raise ValueError("y contains NaN or Inf values")
        unique_y = np.unique(y)
        if len(unique_y) <= 12 and np.all(unique_y == unique_y.astype(int)):
            warnings.warn(
                "y appears to be integer classification labels "
                f"({len(unique_y)} unique values). "
                "BRF is designed for regression targets."
            )
        return X, y

    def fit(self, X, y, groups=None):
        """Run the split/permutation protocol and populate the BRF results.

        Parameters
        ----------
        X : array-like
            Feature matrix; validated by ``_validate_inputs``.
        y : array-like
            Regression target; validated by ``_validate_inputs``.
        groups : array-like, optional
            Group labels forwarded unchanged to ``compute_m``.

        Returns
        -------
        BRFAnalyzer
            ``self``, so calls can be chained.
        """
        X, y = self._validate_inputs(X, y)
        n = len(y)

        if self.scale:
            # NOTE(review): the scaler is fitted on every row before the
            # splits are drawn, so held-out rows contribute to the scaling
            # statistics. Confirm this is intended before changing it, since
            # moving it inside the loop would alter reported numbers.
            X = StandardScaler().fit_transform(X)

        # Separate generators keep the split sequence independent of how many
        # permutations are drawn.
        rng_cv = np.random.default_rng(self.seed)
        rng_perm = np.random.default_rng(self.seed + 10_007)

        # Both values are constant across splits, so compute them once.
        split = max(1, int(0.8 * n))
        n_per_fold = max(3, math.ceil(self.n_permutations / self.n_splits))

        r2_scores = []
        b_gains = []
        exceed_count = 0

        for _ in range(self.n_splits):
            idx = rng_cv.permutation(n)
            train_idx, test_idx = idx[:split], idx[split:]

            Xtr, Xte = X[train_idx], X[test_idx]
            ytr, yte = y[train_idx], y[test_idx]

            # Baseline: predict the training mean for every test row.
            y_mean = np.full(len(yte), float(np.mean(ytr)))
            m = clone(self.model)
            m.fit(Xtr, ytr)
            y_pred = m.predict(Xte)

            r2_real = r2_score(yte, y_pred)
            r2_scores.append(r2_real)
            b_gains.append(compute_b(yte, y_pred, y_mean))

            # Null distribution: retrain on shuffled training targets and
            # score against the untouched test targets.
            perm_r2s = []
            for _ in range(n_per_fold):
                y_perm = rng_perm.permutation(ytr)
                m_perm = clone(self.model)
                m_perm.fit(Xtr, y_perm)
                y_pred_perm = m_perm.predict(Xte)
                perm_r2s.append(r2_score(yte, y_pred_perm))

            if r2_real > float(np.median(perm_r2s)):
                exceed_count += 1

        self.B = float(np.mean(b_gains))
        self.I = compute_i(r2_scores)
        self.N = exceed_count / self.n_splits
        self.M = compute_m(groups)
        self.S, self.E = compute_phase_from_brf(self.B, self.I, self.N, self.M)
        self.class_ = classify_dataset(self.S, self.E)
        self._fitted = True

        return self

    @property
    def brf_vector(self) -> dict:
        """Return the fitted results as a dict.

        Keys are ``"B"``, ``"I"``, ``"N"``, ``"M"``, ``"S"``, ``"E"`` and
        ``"class"``.

        Raises ``RuntimeError`` if ``fit`` has not been called.
        """
        if not self._fitted:
            raise RuntimeError("call fit() before accessing brf_vector")
        return {
            "B": self.B,
            "I": self.I,
            "N": self.N,
            "M": self.M,
            "S": self.S,
            "E": self.E,
            "class": self.class_,
        }
@@ -0,0 +1,6 @@
1
+ from .baseline_gap import compute_b
2
+ from .instability import compute_i
3
+ from .null_test import compute_n
4
+ from .metadata import compute_m
5
+
6
+ __all__ = ["compute_b", "compute_i", "compute_n", "compute_m"]
@@ -0,0 +1,12 @@
1
+ import numpy as np
2
+ from sklearn.metrics import r2_score
3
+
4
+
5
def compute_b(
    y_true: np.ndarray,
    y_pred_model: np.ndarray,
    y_pred_baseline: np.ndarray,
) -> float:
    """Return the R² gain of the model predictions over the baseline.

    Both prediction vectors are scored against ``y_true`` with ``r2_score``;
    the result is the model's R² minus the baseline's R², as a Python float.
    """
    model_r2, baseline_r2 = (
        r2_score(y_true, predictions)
        for predictions in (y_pred_model, y_pred_baseline)
    )
    return float(model_r2 - baseline_r2)
@@ -0,0 +1,11 @@
1
+ from typing import Sequence
2
+
3
+ import numpy as np
4
+
5
+
6
def compute_i(
    r2_values: Sequence[float],
    eps: float = 1e-8,
    floor: float = 1e-4,
) -> float:
    """Return the instability of a set of R² scores.

    The value is the sample standard deviation (``ddof=1``) divided by
    ``max(|mean|, floor) + eps``. It behaves like a coefficient of variation
    whose denominator cannot collapse to zero when the mean R² is near zero.

    Parameters
    ----------
    r2_values : sequence of float
        R² scores, one per split; at least two are required.
    eps : float
        Small constant added to the denominator.
    floor : float
        Lower bound applied to ``|mean|`` before ``eps`` is added.

    Raises
    ------
    ValueError
        If fewer than two values are given. The ``ddof=1`` standard
        deviation is undefined there and would otherwise come back as NaN.
    """
    r2_arr = np.asarray(r2_values, dtype=float)
    if r2_arr.size < 2:
        raise ValueError(
            f"compute_i needs at least 2 values, got {r2_arr.size}"
        )
    spread = float(np.std(r2_arr, ddof=1))
    denom = max(abs(float(np.mean(r2_arr))), floor) + eps
    return spread / denom
@@ -0,0 +1,30 @@
1
+ from typing import Optional
2
+
3
+ import numpy as np
4
+
5
+
6
def compute_m(groups: Optional[np.ndarray] = None) -> float:
    """Score group labels by how evenly they are distributed.

    Returns ``0.0`` when ``groups`` is ``None`` or holds at most one distinct
    label. Otherwise returns the average of two terms computed from the
    per-group counts: the entropy of the group proportions divided by
    ``log(n_groups)``, and ``1 - std/mean`` of the counts clipped to
    ``[0, 1]``.

    Non-numeric labels are first mapped to integer codes.

    Raises
    ------
    ValueError
        If the (numeric) labels contain NaN or Inf.
    """
    if groups is None:
        return 0.0

    labels = np.asarray(groups)
    if not np.issubdtype(labels.dtype, np.number):
        # Replace arbitrary labels (e.g. strings) with their integer codes.
        labels = np.unique(labels, return_inverse=True)[1]
    if not np.isfinite(labels).all():
        raise ValueError("groups contains NaN or Inf values")

    distinct, sizes = np.unique(labels, return_counts=True)
    k = len(distinct)
    if k <= 1:
        return 0.0

    shares = sizes / sizes.sum()
    raw_entropy = -np.sum(shares * np.log(shares + 1e-10))
    ceiling = np.log(k)
    if ceiling > 0:
        entropy_term = raw_entropy / ceiling
    else:
        entropy_term = 0.0

    # One minus the coefficient of variation of the group sizes, clipped.
    balance_term = 1.0 - float(np.std(sizes) / (np.mean(sizes) + 1e-8))
    balance_term = max(0.0, min(1.0, balance_term))

    return float(0.5 * entropy_term + 0.5 * balance_term)
@@ -0,0 +1,25 @@
1
+ import numpy as np
2
+ from sklearn.metrics import r2_score
3
+
4
+
5
def compute_n(
    y_true: np.ndarray,
    y_pred_real: np.ndarray,
    n_permutations: int = 500,
    seed: int = 42,
) -> float:
    """Simple permutation test: shuffle y and compare R² against fixed predictions.

    Returns the fraction of permutations for which the real R² is greater
    than or equal to the R² obtained after shuffling ``y_true``.

    Does NOT retrain the model per permutation (see BRFAnalyzer for the
    per-fold retrain version used in the full BRF protocol).

    Parameters
    ----------
    y_true : np.ndarray
        Observed targets.
    y_pred_real : np.ndarray
        Predictions scored against both the real and the shuffled targets.
    n_permutations : int
        Number of shuffles; must be at least 1.
    seed : int
        Seed for the permutation generator.

    Raises
    ------
    ValueError
        If ``n_permutations`` is smaller than 1. Zero would otherwise divide
        by zero and a negative count would yield a meaningless ratio.
    """
    if n_permutations < 1:
        raise ValueError(f"n_permutations must be >= 1, got {n_permutations}")

    rng = np.random.default_rng(seed)
    r2_real = r2_score(y_true, y_pred_real)

    count_exceed = 0
    for _ in range(n_permutations):
        y_perm = rng.permutation(y_true)
        if r2_real >= r2_score(y_perm, y_pred_real):
            count_exceed += 1

    return count_exceed / n_permutations
brf/phase/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .embedding import compute_phase_from_brf
2
+ from .classifier import classify_dataset
3
+ from .visualization import plot_phase_diagram
4
+
5
+ __all__ = ["compute_phase_from_brf", "classify_dataset", "plot_phase_diagram"]
@@ -0,0 +1,7 @@
1
def classify_dataset(S: float, E: float, tau_s: float = 0.0, tau_e: float = 0.5) -> str:
    """Map a phase point ``(S, E)`` to ``"Void"``, ``"Fragile"`` or ``"Reliable"``.

    ``S <= tau_s`` gives ``"Void"``. Otherwise ``E <= tau_e`` gives
    ``"Fragile"``, and anything else is ``"Reliable"``.

    Raises
    ------
    ValueError
        If ``S`` or ``E`` is NaN. Every ``<=`` comparison with NaN is false,
        so an undefined score would otherwise fall through to "Reliable".
    """
    import math

    if math.isnan(S) or math.isnan(E):
        raise ValueError(f"S and E must not be NaN, got S={S}, E={E}")
    if S <= tau_s:
        return "Void"
    if E <= tau_e:
        return "Fragile"
    return "Reliable"
brf/phase/embedding.py ADDED
@@ -0,0 +1,12 @@
1
+ from typing import Tuple
2
+
3
+
4
def compute_phase_from_brf(
    B: float,
    I: float,
    N: float,
    M: float,
) -> Tuple[float, float]:
    """Return the phase coordinates ``(S, E)`` with ``S = N - I`` and ``E = B + M``."""
    return N - I, B + M
@@ -0,0 +1,52 @@
1
+ from typing import List, Optional
2
+
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+
6
+
7
def plot_phase_diagram(
    S_list: List[float],
    E_list: List[float],
    labels: Optional[List[str]] = None,
    classes: Optional[List[str]] = None,
    title: str = "BRF Phase Diagram",
    save_path: Optional[str] = None,
    tau_s: float = 0.0,
    tau_e: float = 0.5,
):
    """Scatter datasets in the (S, E) plane with the class boundaries drawn.

    Parameters
    ----------
    S_list, E_list : list of float
        Coordinates of each dataset, in matching order.
    labels : list of str, optional
        Text annotation placed at each point.
    classes : list of str, optional
        Class name per point. Points are coloured per class ("Reliable",
        "Fragile" and "Void" have fixed colours; any other name is grey).
        Without it all points share one colour.
    title : str
        Axes title.
    save_path : str, optional
        When given, the figure is also written there at 300 dpi.
    tau_s, tau_e : float
        Positions of the vertical and horizontal boundary lines.

    Returns
    -------
    The created matplotlib figure. It is not closed here.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    if classes is not None:
        color_map = {"Reliable": "#2ecc71", "Fragile": "#f39c12", "Void": "#e74c3c"}
        # Convert once instead of on every loop iteration.
        s_arr = np.array(S_list)
        e_arr = np.array(E_list)
        # dict.fromkeys keeps first-seen order, so the legend order is stable
        # across runs (iterating a set is not).
        for cls in dict.fromkeys(classes):
            mask = [c == cls for c in classes]
            ax.scatter(
                s_arr[mask],
                e_arr[mask],
                c=color_map.get(cls, "#95a5a6"),
                label=cls,
                s=80,
                edgecolors="black",
                linewidths=0.5,
                alpha=0.8,
            )
    else:
        ax.scatter(S_list, E_list, c="#3498db", s=80, edgecolors="black", linewidths=0.5)

    if labels:
        for i, label in enumerate(labels):
            ax.annotate(label, (S_list[i], E_list[i]), fontsize=8, alpha=0.8)

    ax.axhline(y=tau_e, color="gray", linestyle="--", alpha=0.4, label=f"E = {tau_e} (Fragile boundary)")
    ax.axvline(x=tau_s, color="gray", linestyle="--", alpha=0.4, label=f"S = {tau_s} (Void boundary)")

    # Build the legend only after every labelled artist exists; otherwise the
    # boundary-line labels above are never shown.
    ax.legend(fontsize=12)

    ax.set_xlabel("Signal Identifiability (S = N - I)", fontsize=12)
    ax.set_ylabel("Epistemic Completeness (E = B + M)", fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.grid(True, alpha=0.3)

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")

    return fig
brf/report/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .json_export import export_json
2
+ from .latex_export import export_latex
3
+
4
+ __all__ = ["export_json", "export_latex"]
@@ -0,0 +1,8 @@
1
+ import json
2
+
3
+
4
def export_json(brf_vector: dict, filepath: str) -> None:
    """Write ``brf_vector`` to ``filepath`` as indented UTF-8 JSON.

    Raises ``ValueError`` before touching the file if any value in the dict
    is ``None``.
    """
    for entry in brf_vector.values():
        if entry is None:
            raise ValueError("BRF vector contains None values; call fit() first")

    with open(filepath, "w", encoding="utf-8") as handle:
        json.dump(brf_vector, handle, indent=2, ensure_ascii=False)
@@ -0,0 +1,23 @@
1
def export_latex(brf_vector: dict) -> str:
    """Render the BRF vector as a LaTeX ``tabular`` using booktabs rules.

    The table lists B, I, N and M, then S and E, then the class, each numeric
    value formatted to three decimals. Returns the table as one
    newline-joined string.

    Raises ``ValueError`` if any value in ``brf_vector`` is ``None``.
    """
    if any(entry is None for entry in brf_vector.values()):
        raise ValueError("BRF vector contains None values; call fit() first")

    def numeric_row(key: str, label: str, meaning: str) -> str:
        # One "label & value & meaning \\" table row.
        return f"{label} & {brf_vector[key]:.3f} & {meaning} \\\\"

    dimension_rows = [
        numeric_row("B", "B (Baseline Gain)", "Model improvement over mean predictor"),
        numeric_row("I", "I (Instability)", "Sensitivity to split choice"),
        numeric_row("N", "N (Null Separability)", "Signal distinguishability from noise"),
        numeric_row("M", "M (Metadata Sufficiency)", "Group structure completeness"),
    ]
    phase_rows = [
        numeric_row("S", "S (Signal Identifiability)", "N - I"),
        numeric_row("E", "E (Epistemic Completeness)", "B + M"),
    ]
    class_row = f"Class & \\multicolumn{{2}}{{c}}{{{brf_vector['class']}}} \\\\"

    table = [
        r"\begin{tabular}{lcc}",
        r"\toprule",
        r"Dimension & Value & Interpretation \\",
        r"\midrule",
    ]
    table.extend(dimension_rows)
    table.append(r"\midrule")
    table.extend(phase_rows)
    table.append(r"\midrule")
    table.append(class_row)
    table.append(r"\bottomrule")
    table.append(r"\end{tabular}")
    return "\n".join(table)