benchmark-reliability 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/PKG-INFO +1 -1
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/pyproject.toml +1 -1
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/setup.py +1 -1
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/PKG-INFO +1 -1
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/analyzer.py +85 -61
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/README.md +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/setup.cfg +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/SOURCES.txt +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/dependency_links.txt +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/entry_points.txt +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/requires.txt +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/top_level.txt +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/__init__.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/cli.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/__init__.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/baseline_gap.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/instability.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/metadata.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/null_test.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/phase/__init__.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/phase/classifier.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/phase/embedding.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/phase/visualization.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/__init__.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/cli.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/manifest.yaml +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/__init__.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/assistments.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/college_scorecard.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/colleges_aaup.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/colleges_usnews.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/entrance_exam.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/higher_ed.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/mathe.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/mm_tba.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/oli.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/oulad.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/student_depression.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/student_dropout.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/tae.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/turkiye.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/uci_student.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/xapi_edu.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/verify.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/report/__init__.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/report/json_export.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/report/latex_export.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/tests/test_analyzer.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/tests/test_metrics.py +0 -0
- {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/tests/test_phase.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: benchmark-reliability
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry
|
|
5
5
|
Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "benchmark-reliability"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.1"
|
|
8
8
|
description = "Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: benchmark-reliability
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry
|
|
5
5
|
Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -137,11 +137,15 @@ class BRFAnalyzer:
|
|
|
137
137
|
|
|
138
138
|
# ---- improved reporting (v0.2) ----
|
|
139
139
|
|
|
140
|
-
def diagnose(self
|
|
140
|
+
def diagnose(self, n_samples: Optional[int] = None,
|
|
141
|
+
n_features: Optional[int] = None,
|
|
142
|
+
n_groups: Optional[int] = None) -> Dict[str, str]:
|
|
141
143
|
"""Return structured diagnosis explaining *why* the dataset is in its current state.
|
|
142
144
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
+
Args:
|
|
146
|
+
n_samples: Total sample count (for context-aware suggestions).
|
|
147
|
+
n_features: Feature count.
|
|
148
|
+
n_groups: Group count.
|
|
145
149
|
"""
|
|
146
150
|
if not self._fitted:
|
|
147
151
|
raise RuntimeError("call fit() before accessing diagnose()")
|
|
@@ -149,71 +153,81 @@ class BRFAnalyzer:
|
|
|
149
153
|
issues = {}
|
|
150
154
|
suggestions = {}
|
|
151
155
|
|
|
156
|
+
n = n_samples or 0
|
|
157
|
+
p = n_features or 0
|
|
158
|
+
g = n_groups or 0
|
|
159
|
+
|
|
152
160
|
# --- Predictive signal (B) ---
|
|
153
161
|
if self.B < 0:
|
|
154
162
|
issues["B"] = (f"Model performs WORSE than the mean baseline "
|
|
155
|
-
f"(B={self.B:.3f}).
|
|
156
|
-
|
|
157
|
-
|
|
163
|
+
f"(B={self.B:.3f}). Features carry no useful signal.")
|
|
164
|
+
suggestions["B"] = ("Reconsider feature engineering and target definition. "
|
|
165
|
+
"The chosen features cannot predict this target.")
|
|
158
166
|
elif self.B < 0.05:
|
|
159
|
-
issues["B"] = (f"Marginal
|
|
160
|
-
f"
|
|
161
|
-
suggestions["B"] = "Add more informative features or reframe the task."
|
|
167
|
+
issues["B"] = (f"Marginal signal (B={self.B:.3f}). "
|
|
168
|
+
f"Features explain very little variance.")
|
|
169
|
+
suggestions["B"] = "Add more informative features or reframe the prediction task."
|
|
162
170
|
elif self.B < 0.2:
|
|
163
|
-
issues["B"] = (f"Moderate
|
|
171
|
+
issues["B"] = (f"Moderate signal (B={self.B:.3f}).")
|
|
164
172
|
suggestions["B"] = None
|
|
165
173
|
else:
|
|
166
|
-
issues["B"] = (f"Strong
|
|
174
|
+
issues["B"] = (f"Strong signal (B={self.B:.3f}).")
|
|
167
175
|
suggestions["B"] = None
|
|
168
176
|
|
|
169
177
|
# --- Instability (I) ---
|
|
170
178
|
if self.I > 1.0:
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
179
|
+
per_group = f"~{n//g} per group" if g > 0 and n > 0 else ""
|
|
180
|
+
n_feat_ratio = f" (N/p={n//p})" if p > 0 and n > 0 else ""
|
|
181
|
+
issues["I"] = (f"High instability (I={self.I:.3f}). "
|
|
182
|
+
f"R^2 varies dramatically across data splits.")
|
|
183
|
+
if n > 0 and n < 200:
|
|
184
|
+
suggestions["I"] = (f"Only N={n} samples{per_group}. "
|
|
185
|
+
f"Increase to 300+ for stable estimates.")
|
|
186
|
+
elif p > 0 and n > 0 and n / p < 10:
|
|
187
|
+
suggestions["I"] = (f"N/p={n//p} is low{n_feat_ratio}. "
|
|
188
|
+
f"Increase N or reduce features (currently {p}).")
|
|
189
|
+
else:
|
|
190
|
+
suggestions["I"] = "Increase N, reduce p, or use stronger regularization."
|
|
176
191
|
elif self.I > 0.3:
|
|
177
192
|
issues["I"] = (f"Moderate instability (I={self.I:.3f}).")
|
|
178
|
-
suggestions["I"] = "Consider larger N
|
|
193
|
+
suggestions["I"] = "Consider larger N for more stable estimates."
|
|
179
194
|
else:
|
|
180
|
-
issues["I"] = (f"Low instability (I={self.I:.3f}). "
|
|
181
|
-
f"Model is robust to data split variation.")
|
|
195
|
+
issues["I"] = (f"Low instability (I={self.I:.3f}). Stable across splits.")
|
|
182
196
|
suggestions["I"] = None
|
|
183
197
|
|
|
184
198
|
# --- Null separation (N) ---
|
|
185
199
|
if self.N < 0.5:
|
|
186
|
-
issues["N"] = (f"
|
|
187
|
-
f"(N={self.N:.3f}).
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
200
|
+
issues["N"] = (f"Signal indistinguishable from noise "
|
|
201
|
+
f"(N={self.N:.3f}). Model rarely beats permutation.")
|
|
202
|
+
if self.B <= 0:
|
|
203
|
+
suggestions["N"] = "No predictive relationship detected. Reconsider features/target."
|
|
204
|
+
else:
|
|
205
|
+
suggestions["N"] = "Weak signal. Increase N or simplify the feature set."
|
|
191
206
|
elif self.N < 0.8:
|
|
192
|
-
issues["N"] = (f"
|
|
193
|
-
|
|
194
|
-
suggestions["N"] = "Increase sample size or feature quality for more reliable separation."
|
|
207
|
+
issues["N"] = (f"Inconsistent signal separation (N={self.N:.3f}).")
|
|
208
|
+
suggestions["N"] = "Increase N or improve feature quality."
|
|
195
209
|
else:
|
|
196
|
-
issues["N"] = (f"
|
|
197
|
-
f"(N={self.N:.3f}). Clear signal above noise.")
|
|
210
|
+
issues["N"] = (f"Clear signal above noise (N={self.N:.3f}).")
|
|
198
211
|
suggestions["N"] = None
|
|
199
212
|
|
|
200
213
|
# --- Metadata adequacy (M) ---
|
|
201
214
|
if self.M < 0.1:
|
|
202
|
-
issues["M"] = (f"Insufficient group
|
|
203
|
-
f"Groups are too few,
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
215
|
+
issues["M"] = (f"Insufficient group structure (M={self.M:.3f}). "
|
|
216
|
+
f"Groups are too few, absent, or severely imbalanced.")
|
|
217
|
+
if g < 5:
|
|
218
|
+
suggestions["M"] = (f"Only {g} group(s). Add group annotations "
|
|
219
|
+
f"with >=5 categories for meaningful cross-group evaluation.")
|
|
220
|
+
else:
|
|
221
|
+
suggestions["M"] = (f"{g} groups but highly imbalanced. "
|
|
222
|
+
f"Use a more balanced grouping variable.")
|
|
207
223
|
elif self.M < 0.3:
|
|
208
|
-
issues["M"] = (f"Weak group
|
|
209
|
-
f"Group structure exists but is sparse or imbalanced.")
|
|
224
|
+
issues["M"] = (f"Weak group structure (M={self.M:.3f}).")
|
|
210
225
|
suggestions["M"] = "Use a finer-grained grouping variable if available."
|
|
211
226
|
elif self.M < 0.5:
|
|
212
|
-
issues["M"] = (f"Moderate group
|
|
227
|
+
issues["M"] = (f"Moderate group structure (M={self.M:.3f}).")
|
|
213
228
|
suggestions["M"] = None
|
|
214
229
|
else:
|
|
215
|
-
issues["M"] = (f"Strong group
|
|
216
|
-
f"Group structure is well-defined and balanced.")
|
|
230
|
+
issues["M"] = (f"Strong group structure (M={self.M:.3f}).")
|
|
217
231
|
suggestions["M"] = None
|
|
218
232
|
|
|
219
233
|
# --- Synthesis ---
|
|
@@ -262,31 +276,41 @@ class BRFAnalyzer:
|
|
|
262
276
|
"reference": f"BRF Registry v1.5 ({len(s_vals)} benchmarks)",
|
|
263
277
|
}
|
|
264
278
|
|
|
265
|
-
def recommend(self
|
|
266
|
-
|
|
267
|
-
|
|
279
|
+
def recommend(self, n_samples: Optional[int] = None,
|
|
280
|
+
n_features: Optional[int] = None,
|
|
281
|
+
n_groups: Optional[int] = None) -> str:
|
|
282
|
+
"""Actionable recommendations for benchmark improvement.
|
|
283
|
+
|
|
284
|
+
Args:
|
|
285
|
+
n_samples, n_features, n_groups: Optional context for concrete suggestions
|
|
286
|
+
(e.g., "Only N=151 samples. Increase to 300+").
|
|
287
|
+
"""
|
|
288
|
+
d = self.diagnose(n_samples, n_features, n_groups)
|
|
268
289
|
recs = d["recommendations"]
|
|
269
290
|
if not recs:
|
|
270
|
-
return
|
|
271
|
-
"No specific action recommended.")
|
|
272
|
-
# Prioritize: B < 0 is most critical, then N < 0.5, then I > 1, then M < 0.1
|
|
273
|
-
priority = []
|
|
274
|
-
if self.B < 0:
|
|
275
|
-
priority.append("B")
|
|
276
|
-
if self.N < 0.5:
|
|
277
|
-
priority.append("N")
|
|
278
|
-
if self.I > 1.0:
|
|
279
|
-
priority.append("I")
|
|
280
|
-
if self.M < 0.1:
|
|
281
|
-
priority.append("M")
|
|
282
|
-
if not priority:
|
|
283
|
-
priority = [k for k in recs]
|
|
291
|
+
return "No issues found. Your benchmark metrics are within normal ranges."
|
|
284
292
|
|
|
285
|
-
lines = [
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
return "
|
|
293
|
+
lines = []
|
|
294
|
+
for dim in ["B", "N", "I", "M"]:
|
|
295
|
+
if dim in recs:
|
|
296
|
+
lines.append(f"[{dim}] {recs[dim]}")
|
|
297
|
+
return "\n".join(lines)
|
|
298
|
+
|
|
299
|
+
def recommend_dict(self) -> Dict:
|
|
300
|
+
"""Structured actionable recommendations as a dict.
|
|
301
|
+
|
|
302
|
+
Returns {dimension: {"issue": ..., "action": ..., "value": ...}}.
|
|
303
|
+
"""
|
|
304
|
+
d = self.diagnose()
|
|
305
|
+
out = {}
|
|
306
|
+
for dim in ["B", "N", "I", "M"]:
|
|
307
|
+
if dim in d["details"] and dim in d["recommendations"]:
|
|
308
|
+
out[dim] = {
|
|
309
|
+
"issue": d["details"][dim],
|
|
310
|
+
"action": d["recommendations"][dim],
|
|
311
|
+
"value": getattr(self, dim),
|
|
312
|
+
}
|
|
313
|
+
return out
|
|
290
314
|
|
|
291
315
|
def _load_registry_ref(self) -> Optional[List[Dict]]:
|
|
292
316
|
"""Load Registry reference data for percentile ranking."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/__init__.py
RENAMED
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/assistments.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/higher_ed.py
RENAMED
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/mathe.py
RENAMED
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/mm_tba.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/oulad.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/turkiye.py
RENAMED
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/uci_student.py
RENAMED
|
File without changes
|
{benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/xapi_edu.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|