PyPI - benchmark-reliability - Versions diffs - 0.1.0__py3-none-any.whl - Mend

benchmark-reliability 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

benchmark_reliability-0.1.0.dist-info/METADATA +121 -0
benchmark_reliability-0.1.0.dist-info/RECORD +18 -0
benchmark_reliability-0.1.0.dist-info/WHEEL +5 -0
benchmark_reliability-0.1.0.dist-info/top_level.txt +1 -0
brf/__init__.py +3 -0
brf/analyzer.py +133 -0
brf/metrics/__init__.py +6 -0
brf/metrics/baseline_gap.py +12 -0
brf/metrics/instability.py +11 -0
brf/metrics/metadata.py +30 -0
brf/metrics/null_test.py +25 -0
brf/phase/__init__.py +5 -0
brf/phase/classifier.py +7 -0
brf/phase/embedding.py +12 -0
brf/phase/visualization.py +52 -0
brf/report/__init__.py +4 -0
brf/report/json_export.py +8 -0
brf/report/latex_export.py +23 -0

benchmark_reliability-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,121 @@
+Metadata-Version: 2.1
+Name: benchmark-reliability
+Version: 0.1.0
+Summary: Benchmark Reliability Framework (BRF) — dataset-level reliability auditing for predictive benchmarks
+Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/zhanglizhuo/BenchmarkReliability
+Project-URL: Repository, https://github.com/zhanglizhuo/BenchmarkReliability
+Keywords: benchmark reliability,dataset auditing,educational AI,machine learning
+Classifier: Development Status :: 3 - Alpha
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: numpy (>=1.21)
+Requires-Dist: scikit-learn (>=1.0)
+Requires-Dist: matplotlib (>=3.5)
+# BenchmarkReliability — BRF Python Package
+## Target
+Provide a standardized, pip-installable Python package that computes the Benchmark Reliability Framework (BRF) for any predictive dataset, enabling researchers to run the four-dimension audit protocol with a single API call.
+## Method
+The package wraps the core logic from the BehaviorAudit project into a sklearn-style API:
+```python
+from brf import BRFAnalyzer
+from brf.phase import plot_phase_diagram
+from brf.report import export_json
+analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X, y, groups=groups)
+print(analyzer.brf_vector)   # (B, I, N, M) → (S, E) → class
+# Visualization
+plot_phase_diagram(
+    [analyzer.S], [analyzer.E],
+    labels=[analyzer.class_],
+    classes=[analyzer.class_],
+)
+# Export
+export_json(analyzer.brf_vector, "results.json")
+```
+## Package Structure
+```
+brf/
+├── __init__.py
+├── analyzer.py          ← BRFAnalyzer main class
+├── metrics/
+│   ├── baseline_gap.py  ← B
+│   ├── instability.py   ← I
+│   ├── null_test.py     ← N (permutation test)
+│   └── metadata.py      ← M
+├── phase/
+│   ├── embedding.py     ← S = N - I, E = B + M
+│   ├── classifier.py    ← Reliable / Fragile / Void
+│   └── visualization.py ← phase diagram, clustering plot
+├── report/
+│   ├── json_export.py
+│   └── latex_export.py
+```
+## Steps
+### Phase 1: Package skeleton (1-2 weeks)
+- [x] Initialize Python project with `pyproject.toml`
+- [x] Implement `BRFAnalyzer` main class with fit/predict interface
+- [x] Port `compute_b`, `compute_i`, `compute_n`, `compute_m` from BehaviorAudit
+- [x] Write unit tests for each metric
+### Phase 2: Phase embedding + classification (1 week)
+- [x] Implement `compute_phase(S, E)` and `classify_dataset(S, E)`
+- [x] Build phase diagram visualization (matplotlib)
+- [x] Test on all 7 datasets from BehaviorAudit; verify BRF output matches SR paper results
+### Phase 3: Documentation + distribution (1-2 weeks)
+- [x] Write README with quick-start tutorial and API docs
+- [ ] Publish to TestPyPI → PyPI
+- [ ] Set up ReadTheDocs for auto-generated documentation
+- [ ] Add GitHub Actions CI (test on Python 3.9–3.12)
+### Phase 4: HuggingFace Hub integration (optional, 1 week)
+- [ ] Add HF dataset loading wrapper
+- [ ] Allow `brf.fit(dataset_id="OULAD")` shorthand
+## Dependencies
+- `numpy>=1.21`
+- `scikit-learn>=1.0`
+- `matplotlib>=3.5`
+- No deep learning dependencies required
+## Relationship to Sister Repos
+- `BehaviorAudit/`: source of the audit logic; this package refactors and generalizes it
+- `LLMScoringAudit/`: first applied use case (MM-TBA × multiple LLMs)
+- `BenchmarkPhase/`: large-scale application (30 datasets BRF leaderboard)
+- `llm-annotation/`: cited for complementary MLLM pseudo-label reliability findings
+## Target Journal
+- Journal of Open Source Software (JOSS) — tool paper, lightweight submission
+- Followed by application papers in C&E / BJET
+## Timeline
+- Phase 1–2: 3 weeks
+- Phase 3: 2 weeks
+- Phase 4: optional
+- JOSS submission: after Phase 3

benchmark_reliability-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+brf/__init__.py,sha256=JEXhX4YSatBkKOsNSEkmuwZpWbGW_bUVHJ9rUfloimQ,61
+brf/analyzer.py,sha256=nvb5cqY79ddA9sC7RtyBaLp8QdtuQNNTJ1VNFzQ8xK8,4312
+brf/metrics/__init__.py,sha256=kWx4ikozAbX_0fT0MtyG-wPNbN9Z7d52QkdKb72YC3Q,200
+brf/metrics/baseline_gap.py,sha256=yAgq73ELMXsbWvEtKonV-y2ninY3LRJ9NTootLt65tw,311
+brf/metrics/instability.py,sha256=4UnCzDNl3i26fmXtF1yZY01vmZgTxdsDA9ZFi5vNvBs,302
+brf/metrics/metadata.py,sha256=hRIr8-tKMB-O2LPfKWhRBOG2oVatE2jDxj5RKIgNyS4,939
+brf/metrics/null_test.py,sha256=DQLtYfzj1DKNlVSM3jofKWfAHMiVNiApVVVZAJvLCDY,747
+brf/phase/__init__.py,sha256=nK_0X8XnMChPueDDJwCVzcV9Rw4cAJ9OX21_stNTg2I,213
+brf/phase/classifier.py,sha256=Tb_TU2dLGwh9MjqklIh6v6t6lnfKbMHc-FwGVTfYHAc,212
+brf/phase/embedding.py,sha256=2q3TAso_5UOQhvdweYCXiYk19J4ylj9InUqOHQjUbMU,181
+brf/phase/visualization.py,sha256=U4aqC4slRAaM9fKqjWtEByyXV_oPGry93F-X0onyNKg,1696
+brf/report/__init__.py,sha256=RgBGEAnAF_9t_QfKH7zGwBfzKHC5sdRe8pHYKCGWlEg,119
+brf/report/json_export.py,sha256=ze54oM8La-WfdcMjZJChByRTotQkEiDhkRM-EqZ_B1k,318
+brf/report/latex_export.py,sha256=oDUx9Y-lAwmWPX7EfuN8YXaMUioRidFFyRJlcBwUAT0,1131
+benchmark_reliability-0.1.0.dist-info/METADATA,sha256=RwEddB7oKjgd-DYAxLDHBtCmK7F6NLsOQAFzyHBVu-c,4664
+benchmark_reliability-0.1.0.dist-info/WHEEL,sha256=BNRMDyzLkkcmlv0J8ppDQkk2VED33SesJDynr9ED1gc,91
+benchmark_reliability-0.1.0.dist-info/top_level.txt,sha256=eLsF702sQIv4BWW8eHaxxqdwePWflYHLNpVl5KzW93k,4
+benchmark_reliability-0.1.0.dist-info/RECORD,,

benchmark_reliability-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.3.4)
+Root-Is-Purelib: true
+Tag: py3-none-any

benchmark_reliability-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ brf

brf/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .analyzer import BRFAnalyzer
+__all__ = ["BRFAnalyzer"]

brf/analyzer.py ADDED Viewed

@@ -0,0 +1,133 @@
+import math
+import warnings
+from typing import Optional
+import numpy as np
+from sklearn.base import clone
+from sklearn.linear_model import Ridge
+from sklearn.metrics import r2_score
+from sklearn.preprocessing import StandardScaler
+from .metrics import compute_b, compute_i, compute_m
+from .phase import compute_phase_from_brf, classify_dataset
+class BRFAnalyzer:
+    """Estimate the four BRF dimensions (B, I, N, M) of a regression dataset.
+
+    ``fit`` evaluates clones of ``model`` on ``n_splits`` random 80/20
+    train/test splits and stores:
+
+    * ``B`` - mean gain in R² over a train-mean baseline (``compute_b``),
+    * ``I`` - instability of the per-split R² scores (``compute_i``),
+    * ``N`` - fraction of splits whose R² beats the median R² of models
+      retrained on shuffled training targets,
+    * ``M`` - group-structure score of ``groups`` (``compute_m``),
+    * ``S``, ``E`` - phase coordinates from ``compute_phase_from_brf``,
+    * ``class_`` - the label returned by ``classify_dataset``.
+
+    Args:
+        n_splits: number of random train/test splits (at least 2).
+        n_permutations: total permutation budget; it is spread over the
+            splits, with a minimum of 3 permutations per split.
+        model: estimator that supports ``sklearn.base.clone``, ``fit`` and
+            ``predict``. Defaults to ``Ridge(alpha=1.0)`` when ``None``.
+        seed: seed for the split generator; the permutation generator uses
+            ``seed + 10_007``.
+        scale: whether to standardize ``X`` before splitting.
+
+    Raises:
+        ValueError: if ``n_splits`` is smaller than 2.
+    """
+
+    def __init__(
+        self,
+        n_splits: int = 30,
+        n_permutations: int = 200,
+        model=None,
+        seed: int = 42,
+        scale: bool = True,
+    ):
+        if n_splits < 2:
+            raise ValueError("n_splits must be >= 2")
+        self.n_splits = n_splits
+        self.n_permutations = n_permutations
+        # Compare against None instead of relying on truthiness: estimators
+        # that define __len__ (e.g. unfitted scikit-learn ensembles) cannot be
+        # evaluated as booleans and would fail with `model or Ridge(...)`.
+        self.model = Ridge(alpha=1.0) if model is None else model
+        self.seed = seed
+        self.scale = scale
+        self._fitted = False
+        self.B: Optional[float] = None
+        self.I: Optional[float] = None
+        self.N: Optional[float] = None
+        self.M: Optional[float] = None
+        self.S: Optional[float] = None
+        self.E: Optional[float] = None
+        self.class_: Optional[str] = None
+
+    def _validate_inputs(self, X, y):
+        """Convert ``X`` and ``y`` to float arrays and check their shape.
+
+        Returns:
+            The validated ``(X, y)`` pair as NumPy float arrays.
+
+        Raises:
+            ValueError: if ``X`` is not 2D, ``y`` is not 1D, their lengths
+                differ, there are fewer than 20 samples, or either array
+                contains NaN or Inf.
+        """
+        X = np.asarray(X, dtype=float)
+        y = np.asarray(y, dtype=float)
+        if X.ndim != 2:
+            raise ValueError(f"X must be 2D, got shape {X.shape}")
+        if y.ndim != 1:
+            raise ValueError(f"y must be 1D, got shape {y.shape}")
+        if len(X) != len(y):
+            raise ValueError(f"X and y length mismatch: {len(X)} vs {len(y)}")
+        if len(X) < 20:
+            raise ValueError(f"Need at least 20 samples, got {len(X)}")
+        if not np.all(np.isfinite(X)):
+            raise ValueError("X contains NaN or Inf values")
+        if not np.all(np.isfinite(y)):
+            raise ValueError("y contains NaN or Inf values")
+        unique_y = np.unique(y)
+        # Heuristic: a handful of integer-valued targets looks like class labels.
+        if len(unique_y) <= 12 and np.all(unique_y == unique_y.astype(int)):
+            warnings.warn(
+                "y appears to be integer classification labels "
+                f"({len(unique_y)} unique values). "
+                "BRF is designed for regression targets.",
+                # Attribute the warning to the caller of fit(), not to this helper.
+                stacklevel=3,
+            )
+        return X, y
+
+    def fit(self, X, y, groups=None):
+        """Run the split/permutation protocol and store the BRF results.
+
+        Args:
+            X: 2D array-like of features.
+            y: 1D array-like of regression targets.
+            groups: optional per-sample group labels, used only for ``M``.
+
+        Returns:
+            ``self``, so calls can be chained.
+
+        Raises:
+            ValueError: if the inputs fail validation or ``groups`` does not
+                have one entry per sample.
+        """
+        X, y = self._validate_inputs(X, y)
+        n = len(y)
+        if groups is not None and len(np.atleast_1d(groups)) != n:
+            raise ValueError(
+                f"groups and y length mismatch: {len(np.atleast_1d(groups))} vs {n}"
+            )
+        if self.scale:
+            # NOTE(review): the scaler is fitted on all rows before splitting,
+            # so test rows influence the scaling statistics. Left unchanged so
+            # results stay numerically identical - confirm whether per-split
+            # scaling is wanted.
+            scaler = StandardScaler()
+            X = scaler.fit_transform(X)
+        # Independent generators keep the splits identical regardless of how
+        # many permutations are drawn.
+        rng_cv = np.random.default_rng(self.seed)
+        rng_perm = np.random.default_rng(self.seed + 10_007)
+        r2_scores = []
+        b_gains = []
+        n_per_fold = max(3, math.ceil(self.n_permutations / self.n_splits))
+        exceed_count = 0
+        for _ in range(self.n_splits):
+            idx = rng_cv.permutation(n)
+            split = max(1, int(0.8 * n))
+            train_idx = idx[:split]
+            test_idx = idx[split:]
+            Xtr, Xte = X[train_idx], X[test_idx]
+            ytr, yte = y[train_idx], y[test_idx]
+            # Baseline predictor: the training-set mean for every test row.
+            y_mean = np.full(len(yte), float(np.mean(ytr)))
+            m = clone(self.model)
+            m.fit(Xtr, ytr)
+            y_pred = m.predict(Xte)
+            r2_real = r2_score(yte, y_pred)
+            r2_scores.append(r2_real)
+            b_gains.append(compute_b(yte, y_pred, y_mean))
+            # Null distribution: retrain on shuffled training targets.
+            perm_r2s = []
+            for _ in range(n_per_fold):
+                y_perm = rng_perm.permutation(ytr)
+                m_perm = clone(self.model)
+                m_perm.fit(Xtr, y_perm)
+                y_pred_perm = m_perm.predict(Xte)
+                perm_r2s.append(r2_score(yte, y_pred_perm))
+            if r2_real > float(np.median(perm_r2s)):
+                exceed_count += 1
+        self.B = float(np.mean(b_gains))
+        self.I = compute_i(r2_scores)
+        self.N = exceed_count / self.n_splits
+        self.M = compute_m(groups)
+        self.S, self.E = compute_phase_from_brf(self.B, self.I, self.N, self.M)
+        self.class_ = classify_dataset(self.S, self.E)
+        self._fitted = True
+        return self
+
+    @property
+    def brf_vector(self) -> dict:
+        """Return the fitted results as a dict.
+
+        Keys are ``B``, ``I``, ``N``, ``M``, ``S``, ``E`` and ``class``.
+
+        Raises:
+            RuntimeError: if ``fit`` has not been called yet.
+        """
+        if not self._fitted:
+            raise RuntimeError("call fit() before accessing brf_vector")
+        return {
+            "B": self.B,
+            "I": self.I,
+            "N": self.N,
+            "M": self.M,
+            "S": self.S,
+            "E": self.E,
+            "class": self.class_,
+        }

brf/metrics/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from .baseline_gap import compute_b
+from .instability import compute_i
+from .null_test import compute_n
+from .metadata import compute_m
+__all__ = ["compute_b", "compute_i", "compute_n", "compute_m"]

brf/metrics/baseline_gap.py ADDED Viewed

@@ -0,0 +1,12 @@
+import numpy as np
+from sklearn.metrics import r2_score
+def compute_b(
+    y_true: np.ndarray,
+    y_pred_model: np.ndarray,
+    y_pred_baseline: np.ndarray,
+) -> float:
+    """Return the baseline gain: R² of the model minus R² of the baseline.
+
+    Both scores are computed with ``sklearn.metrics.r2_score`` against the
+    same ``y_true``.
+    """
+    model_score, baseline_score = (
+        r2_score(y_true, prediction)
+        for prediction in (y_pred_model, y_pred_baseline)
+    )
+    return float(model_score - baseline_score)

brf/metrics/instability.py ADDED Viewed

@@ -0,0 +1,11 @@
+from typing import Sequence
+import numpy as np
+def compute_i(r2_values: Sequence[float], eps: float = 1e-8) -> float:
+    """Return the instability of a collection of R² scores.
+
+    The sample standard deviation (``ddof=1``) is divided by the absolute
+    mean, which is floored at ``1e-4`` and offset by ``eps`` so the ratio
+    stays finite when the mean R² is close to zero.
+    """
+    scores = np.array(r2_values)
+    centre = float(np.mean(scores))
+    spread = float(np.std(scores, ddof=1))
+    return spread / (max(abs(centre), 1e-4) + eps)

brf/metrics/metadata.py ADDED Viewed

@@ -0,0 +1,30 @@
+from typing import Optional
+import numpy as np
+def compute_m(groups: Optional[np.ndarray] = None) -> float:
+    """Score how evenly samples are spread across groups, in ``[0, 1]``.
+
+    The score is the average of two terms: the entropy of the group-size
+    distribution divided by ``log(number of groups)``, and one minus the
+    coefficient of variation of the group sizes, clipped to ``[0, 1]``.
+    Returns ``0.0`` when ``groups`` is ``None`` or has at most one distinct
+    value.
+
+    Raises:
+        ValueError: if numeric ``groups`` contains NaN or Inf.
+    """
+    if groups is None:
+        return 0.0
+    codes = np.asarray(groups)
+    if not np.issubdtype(codes.dtype, np.number):
+        # Non-numeric labels (e.g. strings) are replaced by integer codes.
+        codes = np.unique(codes, return_inverse=True)[1]
+    if not np.all(np.isfinite(codes)):
+        raise ValueError("groups contains NaN or Inf values")
+    distinct, sizes = np.unique(codes, return_counts=True)
+    k = len(distinct)
+    if k <= 1:
+        return 0.0
+    shares = sizes / sizes.sum()
+    log_k = np.log(k)
+    raw_entropy = -np.sum(shares * np.log(shares + 1e-10))
+    entropy_term = raw_entropy / log_k if log_k > 0 else 0.0
+    balance_term = 1.0 - float(np.std(sizes) / (np.mean(sizes) + 1e-8))
+    balance_term = max(0.0, min(1.0, balance_term))
+    return float(0.5 * entropy_term + 0.5 * balance_term)

brf/metrics/null_test.py ADDED Viewed

@@ -0,0 +1,25 @@
+import numpy as np
+from sklearn.metrics import r2_score
+def compute_n(
+    y_true: np.ndarray,
+    y_pred_real: np.ndarray,
+    n_permutations: int = 500,
+    seed: int = 42,
+) -> float:
+    """Return the share of label shuffles that the real R² matches or beats.
+
+    ``y_true`` is shuffled ``n_permutations`` times and each shuffle is
+    scored against the same fixed predictions; the model is NOT retrained
+    per permutation (see BRFAnalyzer for the per-fold retrain version used
+    in the full BRF protocol).
+    """
+    rng = np.random.default_rng(seed)
+    observed = r2_score(y_true, y_pred_real)
+    wins = sum(
+        1
+        for _ in range(n_permutations)
+        if observed >= r2_score(rng.permutation(y_true), y_pred_real)
+    )
+    return wins / n_permutations

brf/phase/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .embedding import compute_phase_from_brf
+from .classifier import classify_dataset
+from .visualization import plot_phase_diagram
+__all__ = ["compute_phase_from_brf", "classify_dataset", "plot_phase_diagram"]

brf/phase/classifier.py ADDED Viewed

@@ -0,0 +1,7 @@
+def classify_dataset(S: float, E: float, tau_s: float = 0.0, tau_e: float = 0.5) -> str:
+    """Assign a BRF phase label from the ``(S, E)`` coordinates.
+
+    Args:
+        S: signal identifiability coordinate.
+        E: epistemic completeness coordinate.
+        tau_s: ``S`` values at or below this threshold are "Void".
+        tau_e: for non-void datasets, ``E`` values at or below this
+            threshold are "Fragile".
+
+    Returns:
+        "Void", "Fragile" or "Reliable".
+
+    Raises:
+        ValueError: if ``S`` or ``E`` is NaN. NaN fails both threshold
+            comparisons, so it would otherwise be labelled "Reliable".
+    """
+    import math  # local import: this module has no top-level imports
+
+    if math.isnan(S) or math.isnan(E):
+        raise ValueError(f"S and E must not be NaN, got S={S}, E={E}")
+    if S <= tau_s:
+        return "Void"
+    if E <= tau_e:
+        return "Fragile"
+    return "Reliable"

brf/phase/embedding.py ADDED Viewed

@@ -0,0 +1,12 @@
+from typing import Tuple
+def compute_phase_from_brf(
+    B: float,
+    I: float,
+    N: float,
+    M: float,
+) -> Tuple[float, float]:
+    """Map the four BRF dimensions onto the two phase coordinates.
+
+    Returns:
+        ``(S, E)`` where ``S = N - I`` and ``E = B + M``.
+    """
+    return N - I, B + M

brf/phase/visualization.py ADDED Viewed

@@ -0,0 +1,52 @@
+from typing import List, Optional
+import matplotlib.pyplot as plt
+import numpy as np
+def plot_phase_diagram(
+    S_list: List[float],
+    E_list: List[float],
+    labels: Optional[List[str]] = None,
+    classes: Optional[List[str]] = None,
+    title: str = "BRF Phase Diagram",
+    save_path: Optional[str] = None,
+    tau_s: float = 0.0,
+    tau_e: float = 0.5,
+):
+    """Scatter datasets on the BRF ``(S, E)`` plane and return the figure.
+
+    Args:
+        S_list: x coordinates (``S = N - I``), one per dataset.
+        E_list: y coordinates (``E = B + M``), one per dataset.
+        labels: optional text annotation for each point.
+        classes: optional class name for each point; points are colored
+            per class ("Reliable", "Fragile", "Void", grey otherwise).
+        title: figure title.
+        save_path: if given, the figure is also saved there at 300 dpi.
+        tau_s: position of the vertical "Void boundary" line.
+        tau_e: position of the horizontal "Fragile boundary" line.
+
+    Returns:
+        The matplotlib figure.
+
+    Raises:
+        ValueError: if ``E_list``, ``classes`` or ``labels`` does not have
+            the same length as ``S_list``.
+    """
+    n_points = len(S_list)
+    if len(E_list) != n_points:
+        raise ValueError(
+            f"S_list and E_list length mismatch: {n_points} vs {len(E_list)}"
+        )
+    if classes is not None and len(classes) != n_points:
+        raise ValueError(
+            f"classes must have {n_points} entries, got {len(classes)}"
+        )
+    if labels and len(labels) != n_points:
+        raise ValueError(
+            f"labels must have {n_points} entries, got {len(labels)}"
+        )
+    fig, ax = plt.subplots(figsize=(8, 6))
+    if classes is not None:
+        color_map = {"Reliable": "#2ecc71", "Fragile": "#f39c12", "Void": "#e74c3c"}
+        s_values = np.array(S_list)
+        e_values = np.array(E_list)
+        # dict.fromkeys de-duplicates in first-seen order, so the legend order
+        # is stable between runs (iterating a set is not).
+        for cls in dict.fromkeys(classes):
+            mask = np.array([c == cls for c in classes], dtype=bool)
+            ax.scatter(
+                s_values[mask],
+                e_values[mask],
+                c=color_map.get(cls, "#95a5a6"),
+                label=cls,
+                s=80,
+                edgecolors="black",
+                linewidths=0.5,
+                alpha=0.8,
+            )
+    else:
+        ax.scatter(S_list, E_list, c="#3498db", s=80, edgecolors="black", linewidths=0.5)
+    if labels:
+        for label, s_value, e_value in zip(labels, S_list, E_list):
+            ax.annotate(label, (s_value, e_value), fontsize=8, alpha=0.8)
+    ax.axhline(y=tau_e, color="gray", linestyle="--", alpha=0.4, label=f"E = {tau_e} (Fragile boundary)")
+    ax.axvline(x=tau_s, color="gray", linestyle="--", alpha=0.4, label=f"S = {tau_s} (Void boundary)")
+    # Build the legend only after every labelled artist exists; otherwise the
+    # two threshold lines are left out of it.
+    ax.legend(fontsize=12)
+    ax.set_xlabel("Signal Identifiability (S = N - I)", fontsize=12)
+    ax.set_ylabel("Epistemic Completeness (E = B + M)", fontsize=12)
+    ax.set_title(title, fontsize=14)
+    ax.grid(True, alpha=0.3)
+    if save_path:
+        fig.savefig(save_path, dpi=300, bbox_inches="tight")
+    return fig

brf/report/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .json_export import export_json
+from .latex_export import export_latex
+__all__ = ["export_json", "export_latex"]

brf/report/json_export.py ADDED Viewed

@@ -0,0 +1,8 @@
+import json
+def export_json(brf_vector: dict, filepath: str) -> None:
+    """Write a BRF vector to ``filepath`` as indented UTF-8 JSON.
+
+    Raises:
+        ValueError: if any value in ``brf_vector`` is ``None``; nothing is
+            written in that case.
+    """
+    for value in brf_vector.values():
+        if value is None:
+            raise ValueError("BRF vector contains None values; call fit() first")
+    with open(filepath, "w", encoding="utf-8") as handle:
+        json.dump(brf_vector, handle, indent=2, ensure_ascii=False)

brf/report/latex_export.py ADDED Viewed

@@ -0,0 +1,23 @@
+def export_latex(brf_vector: dict) -> str:
+    """Render a BRF vector as a LaTeX ``tabular`` (requires booktabs package).
+
+    Numeric entries are formatted with three decimals; the class name spans
+    the last two columns.
+
+    Raises:
+        ValueError: if any value in ``brf_vector`` is ``None``.
+    """
+    if any(value is None for value in brf_vector.values()):
+        raise ValueError("BRF vector contains None values; call fit() first")
+
+    def row(name: str, key: str, note: str) -> str:
+        # One "name & value & note \\" table row for a numeric entry.
+        return f"{name} & {brf_vector[key]:.3f} & {note} \\\\"
+
+    rule = r"\midrule"
+    table = [
+        r"\begin{tabular}{lcc}",
+        r"\toprule",
+        r"Dimension & Value & Interpretation \\",
+        rule,
+        row("B (Baseline Gain)", "B", "Model improvement over mean predictor"),
+        row("I (Instability)", "I", "Sensitivity to split choice"),
+        row("N (Null Separability)", "N", "Signal distinguishability from noise"),
+        row("M (Metadata Sufficiency)", "M", "Group structure completeness"),
+        rule,
+        row("S (Signal Identifiability)", "S", "N - I"),
+        row("E (Epistemic Completeness)", "E", "B + M"),
+        rule,
+        f"Class & \\multicolumn{{2}}{{c}}{{{brf_vector['class']}}} \\\\",
+        r"\bottomrule",
+        r"\end{tabular}",
+    ]
+    return "\n".join(table)