benchmark-reliability 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/PKG-INFO +1 -1
  2. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/pyproject.toml +1 -1
  3. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/setup.py +1 -1
  4. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/PKG-INFO +1 -1
  5. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/analyzer.py +85 -61
  6. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/README.md +0 -0
  7. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/setup.cfg +0 -0
  8. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/SOURCES.txt +0 -0
  9. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/dependency_links.txt +0 -0
  10. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/entry_points.txt +0 -0
  11. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/requires.txt +0 -0
  12. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/benchmark_reliability.egg-info/top_level.txt +0 -0
  13. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/__init__.py +0 -0
  14. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/cli.py +0 -0
  15. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/__init__.py +0 -0
  16. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/baseline_gap.py +0 -0
  17. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/instability.py +0 -0
  18. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/metadata.py +0 -0
  19. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/metrics/null_test.py +0 -0
  20. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/phase/__init__.py +0 -0
  21. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/phase/classifier.py +0 -0
  22. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/phase/embedding.py +0 -0
  23. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/phase/visualization.py +0 -0
  24. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/__init__.py +0 -0
  25. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/cli.py +0 -0
  26. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/manifest.yaml +0 -0
  27. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/__init__.py +0 -0
  28. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/assistments.py +0 -0
  29. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/college_scorecard.py +0 -0
  30. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/colleges_aaup.py +0 -0
  31. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/colleges_usnews.py +0 -0
  32. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/entrance_exam.py +0 -0
  33. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/higher_ed.py +0 -0
  34. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/mathe.py +0 -0
  35. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/mm_tba.py +0 -0
  36. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/oli.py +0 -0
  37. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/oulad.py +0 -0
  38. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/student_depression.py +0 -0
  39. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/student_dropout.py +0 -0
  40. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/tae.py +0 -0
  41. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/turkiye.py +0 -0
  42. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/uci_student.py +0 -0
  43. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/sources/xapi_edu.py +0 -0
  44. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/registry/verify.py +0 -0
  45. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/report/__init__.py +0 -0
  46. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/report/json_export.py +0 -0
  47. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/src/brf/report/latex_export.py +0 -0
  48. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/tests/test_analyzer.py +0 -0
  49. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/tests/test_metrics.py +0 -0
  50. {benchmark_reliability-0.2.0 → benchmark_reliability-0.2.1}/tests/test_phase.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: benchmark-reliability
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry
5
5
  Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "benchmark-reliability"
7
- version = "0.2.0"
7
+ version = "0.2.1"
8
8
  description = "Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry"
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="benchmark-reliability",
5
- version="0.2.0",
5
+ version="0.2.1",
6
6
  packages=find_packages(where="src"),
7
7
  package_dir={"": "src"},
8
8
  package_data={
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: benchmark-reliability
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing with built-in benchmark registry
5
5
  Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
6
6
  License: MIT
@@ -137,11 +137,15 @@ class BRFAnalyzer:
137
137
 
138
138
  # ---- improved reporting (v0.2) ----
139
139
 
140
- def diagnose(self) -> Dict[str, str]:
140
+ def diagnose(self, n_samples: Optional[int] = None,
141
+ n_features: Optional[int] = None,
142
+ n_groups: Optional[int] = None) -> Dict[str, str]:
141
143
  """Return structured diagnosis explaining *why* the dataset is in its current state.
142
144
 
143
- Replaces the opaque 3-class label with interpretable per-dimension
144
- explanations, enabling benchmark designers to understand what to fix.
145
+ Args:
146
+ n_samples: Total sample count (for context-aware suggestions).
147
+ n_features: Feature count.
148
+ n_groups: Group count.
145
149
  """
146
150
  if not self._fitted:
147
151
  raise RuntimeError("call fit() before accessing diagnose()")
@@ -149,71 +153,81 @@ class BRFAnalyzer:
149
153
  issues = {}
150
154
  suggestions = {}
151
155
 
156
+ n = n_samples or 0
157
+ p = n_features or 0
158
+ g = n_groups or 0
159
+
152
160
  # --- Predictive signal (B) ---
153
161
  if self.B < 0:
154
162
  issues["B"] = (f"Model performs WORSE than the mean baseline "
155
- f"(B={self.B:.3f}). The features carry no useful "
156
- f"predictive signal for this target.")
157
- suggestions["B"] = "Reconsider feature engineering or target definition."
163
+ f"(B={self.B:.3f}). Features carry no useful signal.")
164
+ suggestions["B"] = ("Reconsider feature engineering and target definition. "
165
+ "The chosen features cannot predict this target.")
158
166
  elif self.B < 0.05:
159
- issues["B"] = (f"Marginal improvement over mean baseline "
160
- f"(B={self.B:.3f}). Features explain very little variance.")
161
- suggestions["B"] = "Add more informative features or reframe the task."
167
+ issues["B"] = (f"Marginal signal (B={self.B:.3f}). "
168
+ f"Features explain very little variance.")
169
+ suggestions["B"] = "Add more informative features or reframe the prediction task."
162
170
  elif self.B < 0.2:
163
- issues["B"] = (f"Moderate predictive signal (B={self.B:.3f}).")
171
+ issues["B"] = (f"Moderate signal (B={self.B:.3f}).")
164
172
  suggestions["B"] = None
165
173
  else:
166
- issues["B"] = (f"Strong predictive signal (B={self.B:.3f}).")
174
+ issues["B"] = (f"Strong signal (B={self.B:.3f}).")
167
175
  suggestions["B"] = None
168
176
 
169
177
  # --- Instability (I) ---
170
178
  if self.I > 1.0:
171
- issues["I"] = (f"High cross-split instability (I={self.I:.3f}). "
172
- f"Model R^2 varies dramatically depending on which "
173
- f"samples happen to be in the test set.")
174
- suggestions["I"] = ("Increase sample size (N), reduce feature count (p), "
175
- "or use regularization.")
179
+ per_group = f"~{n//g} per group" if g > 0 and n > 0 else ""
180
+ n_feat_ratio = f" (N/p={n//p})" if p > 0 and n > 0 else ""
181
+ issues["I"] = (f"High instability (I={self.I:.3f}). "
182
+ f"R^2 varies dramatically across data splits.")
183
+ if n > 0 and n < 200:
184
+ suggestions["I"] = (f"Only N={n} samples{per_group}. "
185
+ f"Increase to 300+ for stable estimates.")
186
+ elif p > 0 and n > 0 and n / p < 10:
187
+ suggestions["I"] = (f"N/p={n//p} is low{n_feat_ratio}. "
188
+ f"Increase N or reduce features (currently {p}).")
189
+ else:
190
+ suggestions["I"] = "Increase N, reduce p, or use stronger regularization."
176
191
  elif self.I > 0.3:
177
192
  issues["I"] = (f"Moderate instability (I={self.I:.3f}).")
178
- suggestions["I"] = "Consider larger N or fewer features for more stable estimates."
193
+ suggestions["I"] = "Consider larger N for more stable estimates."
179
194
  else:
180
- issues["I"] = (f"Low instability (I={self.I:.3f}). "
181
- f"Model is robust to data split variation.")
195
+ issues["I"] = (f"Low instability (I={self.I:.3f}). Stable across splits.")
182
196
  suggestions["I"] = None
183
197
 
184
198
  # --- Null separation (N) ---
185
199
  if self.N < 0.5:
186
- issues["N"] = (f"Model rarely beats permutation baseline "
187
- f"(N={self.N:.3f}). The signal is indistinguishable "
188
- f"from random noise.")
189
- suggestions["N"] = ("The model is effectively fitting noise. "
190
- "Consider whether a predictive relationship exists.")
200
+ issues["N"] = (f"Signal indistinguishable from noise "
201
+ f"(N={self.N:.3f}). Model rarely beats permutation.")
202
+ if self.B <= 0:
203
+ suggestions["N"] = "No predictive relationship detected. Reconsider features/target."
204
+ else:
205
+ suggestions["N"] = "Weak signal. Increase N or simplify the feature set."
191
206
  elif self.N < 0.8:
192
- issues["N"] = (f"Model sometimes fails to beat permutation "
193
- f"(N={self.N:.3f}). Signal is present but inconsistent.")
194
- suggestions["N"] = "Increase sample size or feature quality for more reliable separation."
207
+ issues["N"] = (f"Inconsistent signal separation (N={self.N:.3f}).")
208
+ suggestions["N"] = "Increase N or improve feature quality."
195
209
  else:
196
- issues["N"] = (f"Model consistently beats permutation "
197
- f"(N={self.N:.3f}). Clear signal above noise.")
210
+ issues["N"] = (f"Clear signal above noise (N={self.N:.3f}).")
198
211
  suggestions["N"] = None
199
212
 
200
213
  # --- Metadata adequacy (M) ---
201
214
  if self.M < 0.1:
202
- issues["M"] = (f"Insufficient group metadata (M={self.M:.3f}). "
203
- f"Groups are too few, highly imbalanced, or absent.")
204
- suggestions["M"] = ("Add or improve group annotations. "
205
- "Consider whether an alternative grouping variable "
206
- "captures more meaningful structure.")
215
+ issues["M"] = (f"Insufficient group structure (M={self.M:.3f}). "
216
+ f"Groups are too few, absent, or severely imbalanced.")
217
+ if g < 5:
218
+ suggestions["M"] = (f"Only {g} group(s). Add group annotations "
219
+ f"with >=5 categories for meaningful cross-group evaluation.")
220
+ else:
221
+ suggestions["M"] = (f"{g} groups but highly imbalanced. "
222
+ f"Use a more balanced grouping variable.")
207
223
  elif self.M < 0.3:
208
- issues["M"] = (f"Weak group metadata (M={self.M:.3f}). "
209
- f"Group structure exists but is sparse or imbalanced.")
224
+ issues["M"] = (f"Weak group structure (M={self.M:.3f}).")
210
225
  suggestions["M"] = "Use a finer-grained grouping variable if available."
211
226
  elif self.M < 0.5:
212
- issues["M"] = (f"Moderate group metadata (M={self.M:.3f}).")
227
+ issues["M"] = (f"Moderate group structure (M={self.M:.3f}).")
213
228
  suggestions["M"] = None
214
229
  else:
215
- issues["M"] = (f"Strong group metadata (M={self.M:.3f}). "
216
- f"Group structure is well-defined and balanced.")
230
+ issues["M"] = (f"Strong group structure (M={self.M:.3f}).")
217
231
  suggestions["M"] = None
218
232
 
219
233
  # --- Synthesis ---
@@ -262,31 +276,41 @@ class BRFAnalyzer:
262
276
  "reference": f"BRF Registry v1.5 ({len(s_vals)} benchmarks)",
263
277
  }
264
278
 
265
- def recommend(self) -> str:
266
- """One-paragraph actionable recommendation for benchmark improvement."""
267
- d = self.diagnose()
279
+ def recommend(self, n_samples: Optional[int] = None,
280
+ n_features: Optional[int] = None,
281
+ n_groups: Optional[int] = None) -> str:
282
+ """Actionable recommendations for benchmark improvement.
283
+
284
+ Args:
285
+ n_samples, n_features, n_groups: Optional context for concrete suggestions
286
+ (e.g., "Only N=151 samples. Increase to 300+").
287
+ """
288
+ d = self.diagnose(n_samples, n_features, n_groups)
268
289
  recs = d["recommendations"]
269
290
  if not recs:
270
- return ("Benchmark metrics are within normal ranges. "
271
- "No specific action recommended.")
272
- # Prioritize: B < 0 is most critical, then N < 0.5, then I > 1, then M < 0.1
273
- priority = []
274
- if self.B < 0:
275
- priority.append("B")
276
- if self.N < 0.5:
277
- priority.append("N")
278
- if self.I > 1.0:
279
- priority.append("I")
280
- if self.M < 0.1:
281
- priority.append("M")
282
- if not priority:
283
- priority = [k for k in recs]
291
+ return "No issues found. Your benchmark metrics are within normal ranges."
284
292
 
285
- lines = [
286
- f"This benchmark has {len(recs)} dimension(s) needing attention. "
287
- f"Primary concern: {recs[priority[0]]}"
288
- ]
289
- return " ".join(lines)
293
+ lines = []
294
+ for dim in ["B", "N", "I", "M"]:
295
+ if dim in recs:
296
+ lines.append(f"[{dim}] {recs[dim]}")
297
+ return "\n".join(lines)
298
+
299
+ def recommend_dict(self) -> Dict:
300
+ """Structured actionable recommendations as a dict.
301
+
302
+ Returns {dimension: {"issue": ..., "action": ..., "value": ...}}.
303
+ """
304
+ d = self.diagnose()
305
+ out = {}
306
+ for dim in ["B", "N", "I", "M"]:
307
+ if dim in d["details"] and dim in d["recommendations"]:
308
+ out[dim] = {
309
+ "issue": d["details"][dim],
310
+ "action": d["recommendations"][dim],
311
+ "value": getattr(self, dim),
312
+ }
313
+ return out
290
314
 
291
315
  def _load_registry_ref(self) -> Optional[List[Dict]]:
292
316
  """Load Registry reference data for percentile ranking."""