eval-toolkit 0.40.0__tar.gz → 0.42.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/CHANGELOG.md +130 -0
  2. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/PKG-INFO +1 -1
  3. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/pyproject.toml +1 -0
  4. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/__init__.py +1 -0
  5. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/_version.py +1 -1
  6. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/calibration.py +78 -0
  7. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/loaders.py +148 -12
  8. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/public_api/snapshot.json +8 -2
  9. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_calibration_binary_adapters.py +97 -2
  10. eval_toolkit-0.42.0/tests/test_croissant_e2e.py +145 -0
  11. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_loaders_coverage.py +11 -3
  12. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/.gitignore +0 -0
  13. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/LICENSE +0 -0
  14. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/README.md +0 -0
  15. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/STYLE.md +0 -0
  16. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/docs/archive/README.md +0 -0
  17. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/docs/research/README.md +0 -0
  18. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/docs/research/datasets/README.md +0 -0
  19. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/docs/research/papers/data-integrity/README.md +0 -0
  20. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  21. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/docs/research/papers/inference/README.md +0 -0
  22. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/docs/research/papers/prompt-injection/README.md +0 -0
  23. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/docs/source/methodology/README.md +0 -0
  24. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/__main__.py +0 -0
  25. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/_deprecated.py +0 -0
  26. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/_parallel.py +0 -0
  27. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/analysis.py +0 -0
  28. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/artifacts.py +0 -0
  29. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/bootstrap.py +0 -0
  30. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/claims.py +0 -0
  31. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/config.py +0 -0
  32. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/docs.py +0 -0
  33. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/embeddings.py +0 -0
  34. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/evidence.py +0 -0
  35. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/harness.py +0 -0
  36. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/leakage.py +0 -0
  37. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/manifest.py +0 -0
  38. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/metrics.py +0 -0
  39. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/operating_points.py +0 -0
  40. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/paths.py +0 -0
  41. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/plotting.py +0 -0
  42. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/protocols.py +0 -0
  43. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/provenance.py +0 -0
  44. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/py.typed +0 -0
  45. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  46. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  47. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  48. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  49. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  50. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/seeds.py +0 -0
  51. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/splits.py +0 -0
  52. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/text_dedup.py +0 -0
  53. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/src/eval_toolkit/thresholds.py +0 -0
  54. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  55. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  56. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  57. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  58. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  59. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  60. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  61. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  62. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  63. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  64. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/benchmarks/__init__.py +0 -0
  65. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  66. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/conftest.py +0 -0
  67. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  68. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  69. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  70. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  71. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/docs/expected.md +0 -0
  72. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/docs/input.md +0 -0
  73. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/docs/metrics.json +0 -0
  74. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  75. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/strategies.py +0 -0
  76. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_analysis.py +0 -0
  77. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_artifacts.py +0 -0
  78. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  79. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  80. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_bootstrap_edge_cases.py +0 -0
  81. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_bootstrap_golden.py +0 -0
  82. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_bootstrap_njobs.py +0 -0
  83. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_bootstrap_props.py +0 -0
  84. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_bootstrap_research_grounded.py +0 -0
  85. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_bootstrap_unit.py +0 -0
  86. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  87. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_calibration_determinism.py +0 -0
  88. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_calibration_optimization_failures.py +0 -0
  89. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_calibration_props.py +0 -0
  90. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_calibration_research_grounded.py +0 -0
  91. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_calibration_unit.py +0 -0
  92. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_claims.py +0 -0
  93. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_claims_coverage.py +0 -0
  94. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_claims_props.py +0 -0
  95. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_cli.py +0 -0
  96. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_config.py +0 -0
  97. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_coverage_bootstrap.py +0 -0
  98. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_coverage_calibration.py +0 -0
  99. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_coverage_harness.py +0 -0
  100. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_coverage_metrics.py +0 -0
  101. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_coverage_plotting.py +0 -0
  102. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  103. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_deprecations.py +0 -0
  104. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_docs_golden.py +0 -0
  105. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_docs_props.py +0 -0
  106. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_embeddings.py +0 -0
  107. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_evidence_validators.py +0 -0
  108. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_harness_edge_cases.py +0 -0
  109. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_harness_fault_injection.py +0 -0
  110. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_harness_folded.py +0 -0
  111. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_harness_internals.py +0 -0
  112. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_harness_metric_options.py +0 -0
  113. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_harness_parallelism.py +0 -0
  114. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_harness_smoke.py +0 -0
  115. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_import_boundaries.py +0 -0
  116. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  117. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_leakage.py +0 -0
  118. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_leakage_error_paths.py +0 -0
  119. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_leakage_props.py +0 -0
  120. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_loaders.py +0 -0
  121. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_loaders_props.py +0 -0
  122. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_logging.py +0 -0
  123. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_manifest.py +0 -0
  124. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  125. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_manifest_props.py +0 -0
  126. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_manifest_validation.py +0 -0
  127. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_metrics_props.py +0 -0
  128. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_metrics_stratified_subsets.py +0 -0
  129. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_metrics_unit.py +0 -0
  130. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_misc_coverage.py +0 -0
  131. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_numeric_edge_cases.py +0 -0
  132. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_operating_points.py +0 -0
  133. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_operating_points_props.py +0 -0
  134. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_parallel.py +0 -0
  135. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_paths.py +0 -0
  136. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_pipeline_e2e.py +0 -0
  137. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_plotting_edge.py +0 -0
  138. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_plotting_smoke.py +0 -0
  139. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_plotting_visual.py +0 -0
  140. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_protocol_conformance.py +0 -0
  141. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_provenance.py +0 -0
  142. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_public_api.py +0 -0
  143. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_recall_at_fpr.py +0 -0
  144. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_reference_equivalence.py +0 -0
  145. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_reproducibility_integration.py +0 -0
  146. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_schemas.py +0 -0
  147. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_seeds.py +0 -0
  148. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_splits.py +0 -0
  149. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_splits_leakage_integration.py +0 -0
  150. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_splits_props.py +0 -0
  151. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_text_dedup.py +0 -0
  152. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_text_dedup_coverage.py +0 -0
  153. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_text_dedup_props.py +0 -0
  154. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_text_dedup_strategies.py +0 -0
  155. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_thresholds.py +0 -0
  156. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_thresholds_constant_score.py +0 -0
  157. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_thresholds_coverage.py +0 -0
  158. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_thresholds_props.py +0 -0
  159. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_thresholds_research_grounded.py +0 -0
  160. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_tokenization_leakage_check.py +0 -0
  161. {eval_toolkit-0.40.0 → eval_toolkit-0.42.0}/tests/test_v09_contracts.py +0 -0
@@ -7,6 +7,136 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.42.0] — 2026-05-19 — fit_isotonic_binary completes 4-calibrator family (closes #44)
11
+
12
+ Final element of the binary scalar-prob calibrator family started by
13
+ `fit_temperature_binary` (v0.35.0). All four now uniformly return
14
+ `(params, apply)`:
15
+
16
+ | Function | Params | Shipped |
17
+ |---|---|---|
18
+ | `fit_temperature_binary` | `(T,)` — single float | v0.35.0 |
19
+ | `fit_isotonic_binary` | `None` — non-parametric | **v0.42.0** |
20
+ | `fit_platt_binary` | `(a, b)` | v0.40.0 |
21
+ | `fit_beta_binary` | `(a, b, c)` | v0.40.0 |
22
+
23
+ Consumer code can now iterate the family with a single shape and
24
+ distinguish parametric from non-parametric calibrators via
25
+ `if params is not None`:
26
+
27
+ ```text
28
+ CALIBRATORS = {
29
+ "temperature": fit_temperature_binary,
30
+ "isotonic": fit_isotonic_binary,
31
+ "platt": fit_platt_binary,
32
+ "beta": fit_beta_binary,
33
+ }
34
+ for name, fit_fn in CALIBRATORS.items():
35
+ params, apply = fit_fn(y_val, p_val)
36
+ calibrated = apply(p_test)
37
+ if params is not None:
38
+ manifest.record(f"{name}_params", params)
39
+ ```
40
+
41
+ This matches the consumer's calibration-battery pattern in
42
+ `prompt-injection-detection-prototype` (their ADR-056 supersedes
43
+ ADR-023 to adopt the canonical `(params, apply)` shape across the
44
+ full 4-calibrator audit battery).
45
+
46
+ ### Added
47
+
48
+ - **`eval_toolkit.fit_isotonic_binary(y_true, y_score) -> (None,
49
+ apply)`** — thin wrapper over `fit_isotonic_calibrator`. The
50
+ `None` in the params slot encodes "non-parametric" (isotonic
51
+ regression is a monotone step function, no scalar params to log).
52
+ - 6 new unit tests in `tests/test_calibration_binary_adapters.py`
53
+ including a 4-calibrator family-iteration integration test that
54
+ verifies the `None`-vs-tuple convention.
55
+
56
+ ### Protocol stability
57
+
58
+ Additive only. No Tier-2 Protocol shape edits. v0.42 is the third
59
+ consecutive minor without Protocol changes (v0.40 + v0.41 + v0.42). Gate 2
60
+ stays MET.
61
+
62
+ ## [0.41.0] — 2026-05-18 — Croissant end-to-end (closes #42, v1.0 Gate 4 MET)
63
+
64
+ Closes v1.0 readiness Gate 4 — "Croissant interop verified end-to-end."
65
+ `HFDatasetsLoader.describe()` now fetches per-file `sha256` hashes
66
+ from HF Hub and exposes them in `distribution[].sha256`. The
67
+ integration test (`tests/test_croissant_e2e.py`) downloads a real
68
+ parquet shard from `stanfordnlp/sst2` and verifies the bytes hash
69
+ bit-exactly to the value `describe()` reports.
70
+
71
+ ### Added
72
+
73
+ - **`HFDatasetsLoader.describe()` Croissant + tree-API enrichment.**
74
+ When `fetch_remote_metadata=True` (default), the loader fetches from
75
+ two HF Hub endpoints:
76
+ - `/api/datasets/{repo}/croissant` — JSON-LD metadata (name,
77
+ description, license, citeAs, schema).
78
+ - `/api/datasets/{repo}/tree/refs%2Fconvert%2Fparquet?recursive=true`
79
+ — per-file `sha256` (read from each file's `lfs.oid` field — the
80
+ git-LFS content hash, equal to `sha256sum` of the raw bytes).
81
+ Caller-provided fields (`name=`, `cite_as=`, etc.) win over
82
+ Croissant fetches; Croissant fills only gaps. Network failures
83
+ degrade gracefully (warning emitted; sha256 empty as in pre-v0.41).
84
+ - **`fetch_remote_metadata: bool = True`** constructor field on
85
+ `HFDatasetsLoader`. Set `False` for offline / unit-test paths.
86
+ - **`tests/test_croissant_e2e.py`** — 5 integration tests against
87
+ live HF Hub:
88
+ 1. `describe()` returns real `sha256:<64-hex>` per shard.
89
+ 2. **Bit-exact verification**: download shard from `contentUrl`,
90
+ hash bytes, assert equals `describe()`'s sha256. This is the
91
+ literal v1.0 Gate 4 check.
92
+ 3. Croissant metadata enriches name/citeAs/license/description.
93
+ 4. Caller overrides win over remote.
94
+ 5. `fetch_remote_metadata=False` preserves v0.40 behavior.
95
+ All pass against `stanfordnlp/sst2` (~3 MB train shard).
96
+ - **New `integration` pytest marker** for network-dependent tests.
97
+ Excluded from `make coverage` (PR CI); runs explicitly via
98
+ `pytest -m integration`.
99
+
100
+ ### Why dual-sourced
101
+
102
+ HF Hub's Croissant emitter currently fills `distribution[].sha256`
103
+ with a placeholder URL pointing at MLCommons Croissant spec issue
104
+ [#80](https://github.com/mlcommons/croissant/issues/80) ("In
105
+ <Download>, check SHA256 or MD5"), which is **open**. The Croissant
106
+ spec doesn't yet require per-file checksums from emitters; HF Hub is
107
+ honest and punts the field. The authoritative hash IS available via
108
+ HF Hub's tree API: `lfs.oid` is precisely sha256 of the file content
109
+ (verified bit-exact via `sha256sum`).
110
+
111
+ When MLCommons #80 resolves and HF Hub starts populating Croissant
112
+ `sha256` with real values (which will equal the existing `lfs.oid`),
113
+ the loader's source switches in ~5 LOC. Same downstream contract.
114
+
115
+ ### Documentation
116
+
117
+ - `docs/source/methodology/reproducibility.md` §"Croissant
118
+ interoperability": replaces v0.7-era "subset" framing with the
119
+ end-to-end-verified narrative + dual-source rationale.
120
+ - `docs/source/roadmap.md` §"v1.0.0 path":
121
+ - **Gate 2 (Protocol stability) ✅ MET** — v0.41 = minor 2 of 2
122
+ without Protocol shape edits (v0.40 fit_*_binary additives +
123
+ v0.41 HFDatasetsLoader enrichment leave Tier-2 Protocols
124
+ untouched).
125
+ - **Gate 4 (Croissant end-to-end) ✅ MET** — with dual-source caveat
126
+ documented; small (~5 LOC) migration path when MLCommons #80 resolves.
127
+
128
+ ### v1.0 readiness state after v0.41.0
129
+
130
+ - Gate 1 (real consumer ≥1 review cycle on v0.7+): partial — consumer
131
+ pinned to v0.34.0; needs bump + cycle. **External**.
132
+ - Gate 2 ✅ MET (v0.41 is minor 2 of 2 stable).
133
+ - Gate 3 (methodology peer review): not met — needs external reader.
134
+ **External**.
135
+ - Gate 4 ✅ MET — see this release.
136
+
137
+ Two of four gates closed in-repo. The remaining two require external
138
+ coordination (consumer review cycle, methodology peer reviewer).
139
+
10
140
  ## [0.40.0] — 2026-05-18 — fit_platt_binary + fit_beta_binary (closes #43)
11
141
 
12
142
  Completes the binary scalar-prob calibrator family started in v0.35.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.40.0
3
+ Version: 0.42.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -188,6 +188,7 @@ markers = [
188
188
  "slow: Tests > 2s (bootstrap-t studentized, multi-seed K-fold). Opt out with `pytest -m 'not slow'`.",
189
189
  "monte_carlo: Monte Carlo calibration suite (~14 min). Skipped in PR CI; runs only in the nightly-mc workflow via `-m monte_carlo`.",
190
190
  "benchmark: pytest-benchmark perf-regression tests on math kernels. Skipped in PR CI; runs in the nightly-benchmarks workflow via `-m benchmark`. Per v0.29.0 plan Tier γ #1.",
191
+ "integration: Network-dependent integration tests (HF Hub API, Croissant endpoints, etc.). Excluded from PR CI to avoid network-flake; runs in nightly. Opt in via `-m integration`. Added v0.41.0 (#42 Croissant Gate 4).",
191
192
  ]
192
193
 
193
194
  [tool.coverage.run]
@@ -85,6 +85,7 @@ _EXPORTS: dict[str, str] = {
85
85
  "bayes_optimal_threshold": "eval_toolkit.calibration",
86
86
  "fit_beta_binary": "eval_toolkit.calibration",
87
87
  "fit_beta_calibrator": "eval_toolkit.calibration",
88
+ "fit_isotonic_binary": "eval_toolkit.calibration",
88
89
  "fit_isotonic_calibrator": "eval_toolkit.calibration",
89
90
  "fit_platt_binary": "eval_toolkit.calibration",
90
91
  "fit_platt_calibrator": "eval_toolkit.calibration",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.40.0"
5
+ __version__ = "0.42.0"
@@ -55,6 +55,7 @@ __all__ = [
55
55
  "bayes_optimal_threshold",
56
56
  "fit_beta_binary",
57
57
  "fit_beta_calibrator",
58
+ "fit_isotonic_binary",
58
59
  "fit_isotonic_calibrator",
59
60
  "fit_platt_binary",
60
61
  "fit_platt_calibrator",
@@ -1294,6 +1295,83 @@ def fit_beta_binary(
1294
1295
  return (a, b, c), apply
1295
1296
 
1296
1297
 
1298
+ def fit_isotonic_binary(
1299
+ y_true: np.ndarray, y_score: np.ndarray
1300
+ ) -> tuple[None, Callable[[np.ndarray], np.ndarray]]:
1301
+ r"""Binary-probability adapter for :func:`fit_isotonic_calibrator`.
1302
+
1303
+ Mirror of :func:`fit_temperature_binary` / :func:`fit_platt_binary`
1304
+ / :func:`fit_beta_binary`: returns ``(None, apply)``. Isotonic
1305
+ regression is non-parametric — there are no introspectable scalar
1306
+ parameters to log alongside the apply callable — so the params
1307
+ slot is :obj:`None`.
1308
+
1309
+ The ``None``-in-params slot makes "non-parametric" unambiguous
1310
+ while preserving the canonical ``(params_tuple, apply)`` shape
1311
+ shared by the four binary scalar-prob calibrators. Consumer code
1312
+ can iterate over the full family with one idiom:
1313
+
1314
+ .. code-block:: text
1315
+
1316
+ CALIBRATORS = {
1317
+ "temperature": fit_temperature_binary,
1318
+ "isotonic": fit_isotonic_binary,
1319
+ "platt": fit_platt_binary,
1320
+ "beta": fit_beta_binary,
1321
+ }
1322
+ for name, fit_fn in CALIBRATORS.items():
1323
+ params, apply = fit_fn(y_val, p_val)
1324
+ calibrated = apply(p_test)
1325
+ if params is not None:
1326
+ manifest.record(f"{name}_params", params)
1327
+
1328
+ Added v0.42.0 (closes #44) to complete the binary scalar-prob
1329
+ calibrator family started by ``fit_temperature_binary`` (v0.35.0).
1330
+
1331
+ Parameters
1332
+ ----------
1333
+ y_true : np.ndarray, shape (n,)
1334
+ Binary validation labels in ``{0, 1}``.
1335
+ y_score : np.ndarray, shape (n,)
1336
+ Validation predicted probabilities of class 1, in [0, 1].
1337
+
1338
+ Returns
1339
+ -------
1340
+ tuple
1341
+ ``(None, apply)`` — ``None`` in the params slot (isotonic is
1342
+ non-parametric); ``apply`` maps probabilities through the
1343
+ fitted monotonic step function.
1344
+
1345
+ Raises
1346
+ ------
1347
+ ValueError
1348
+ On shape mismatch, empty input, non-finite scores, or
1349
+ single-class ``y_true`` (propagated from
1350
+ :func:`fit_isotonic_calibrator`).
1351
+
1352
+ Examples
1353
+ --------
1354
+ >>> import numpy as np
1355
+ >>> rng = np.random.default_rng(0)
1356
+ >>> n = 500
1357
+ >>> y_val = rng.binomial(1, 0.3, size=n).astype(int)
1358
+ >>> p_val = np.clip(y_val * 0.6 + rng.normal(0, 0.2, n), 0.01, 0.99)
1359
+ >>> params, apply = fit_isotonic_binary(y_val, p_val)
1360
+ >>> params is None
1361
+ True
1362
+ >>> apply(np.array([0.1, 0.5, 0.9])).shape == (3,)
1363
+ True
1364
+
1365
+ See Also
1366
+ --------
1367
+ fit_isotonic_calibrator : underlying non-parametric fitter.
1368
+ fit_temperature_binary : 1-parameter sibling.
1369
+ fit_platt_binary : 2-parameter sibling.
1370
+ fit_beta_binary : 3-parameter sibling.
1371
+ """
1372
+ return None, fit_isotonic_calibrator(y_true, y_score)
1373
+
1374
+
1297
1375
  def fit_temperature_oracle(
1298
1376
  y_true: np.ndarray, y_score: np.ndarray
1299
1377
  ) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
@@ -29,7 +29,9 @@ References
29
29
  from __future__ import annotations
30
30
 
31
31
  import glob as _glob
32
+ import json as _json
32
33
  import logging
34
+ import urllib.request as _urlrequest
33
35
  from collections.abc import Mapping, Sequence
34
36
  from dataclasses import dataclass
35
37
  from pathlib import Path
@@ -38,6 +40,24 @@ from typing import TYPE_CHECKING, Any, Protocol, cast, runtime_checkable
38
40
  from eval_toolkit.harness import EvalSlice
39
41
  from eval_toolkit.provenance import file_sha256
40
42
 
43
+ _HF_HUB_BASE = "https://huggingface.co"
44
+ _HF_FETCH_TIMEOUT_SEC = 15
45
+
46
+
47
+ def _hf_get_json(path: str) -> Any:
48
+ """GET ``https://huggingface.co{path}`` and return parsed JSON.
49
+
50
+ Stdlib-only (no ``requests`` / ``huggingface_hub`` dep). Raises
51
+ ``OSError`` (urllib error) or ``ValueError`` (JSON decode) on
52
+ failure — callers catch both. The 15-second timeout caps any one
53
+ fetch so CI doesn't hang on a slow HF Hub.
54
+ """
55
+ url = f"{_HF_HUB_BASE}{path}"
56
+ req = _urlrequest.Request(url, headers={"User-Agent": "eval-toolkit"})
57
+ with _urlrequest.urlopen(req, timeout=_HF_FETCH_TIMEOUT_SEC) as resp:
58
+ return _json.loads(resp.read().decode("utf-8"))
59
+
60
+
41
61
  _logger = logging.getLogger(__name__)
42
62
 
43
63
  if TYPE_CHECKING:
@@ -367,10 +387,22 @@ class HFDatasetsLoader:
367
387
  raised at :meth:`load_splits` time with a clear install hint. This is
368
388
  intentional — eval-toolkit's core deps are numpy / scipy / sklearn only.
369
389
 
390
+ Since v0.41.0, :meth:`describe` enriches its output with per-file
391
+ ``sha256`` hashes fetched from the HF Hub tree API (the ``lfs.oid``
392
+ field), plus Croissant metadata fetched from HF Hub's Croissant
393
+ endpoint. The dual-source design is documented in
394
+ ``methodology/reproducibility.md`` §"Croissant interoperability";
395
+ in short: HF Hub's Croissant emitter currently punts the
396
+ ``distribution[].sha256`` field per MLCommons Croissant issue #80
397
+ (open), so we read the authoritative sha256 from the tree API's
398
+ ``lfs.oid`` instead. When #80 resolves and HF Hub starts populating
399
+ Croissant ``sha256`` with real values, the implementation collapses
400
+ to a single source.
401
+
370
402
  Parameters
371
403
  ----------
372
404
  repo_id : str
373
- HuggingFace dataset repo, e.g. ``"deepset/prompt-injections"``.
405
+ HuggingFace dataset repo, e.g. ``"stanfordnlp/sst2"``.
374
406
  splits : sequence of str or None, optional
375
407
  Subset of HF splits to load. ``None`` = every split the repo defines.
376
408
  feature_col : str, optional
@@ -381,7 +413,12 @@ class HFDatasetsLoader:
381
413
  config_name : str or None, optional
382
414
  HF dataset config name (some datasets have multiple configs).
383
415
  name, description, cite_as, license, url : str, optional
384
- Croissant metadata fields.
416
+ Croissant metadata overrides. If empty, :meth:`describe` will
417
+ fall back to fetching from HF Hub's Croissant endpoint.
418
+ fetch_remote_metadata : bool, optional
419
+ If ``True`` (default), :meth:`describe` fetches Croissant + tree
420
+ metadata from HF Hub. Set ``False`` to disable network calls
421
+ (useful for offline / unit testing).
385
422
  """
386
423
 
387
424
  repo_id: str
@@ -395,6 +432,7 @@ class HFDatasetsLoader:
395
432
  cite_as: str = ""
396
433
  license: str = ""
397
434
  url: str = ""
435
+ fetch_remote_metadata: bool = True
398
436
 
399
437
  def _load_dataset(self) -> Mapping[str, Any]:
400
438
  """Soft-import ``datasets`` and return the loaded DatasetDict.
@@ -447,20 +485,118 @@ class HFDatasetsLoader:
447
485
  return out
448
486
 
449
487
  def describe(self) -> dict[str, object]:
450
- """Croissant-subset metadata pointing at the HF repo (no file hashes HF caches)."""
451
- return {
452
- "name": self.name or self.repo_id,
453
- "description": self.description,
454
- "citeAs": self.cite_as,
455
- "license": self.license,
456
- "url": self.url or f"https://huggingface.co/datasets/{self.repo_id}",
457
- "distribution": [
488
+ """Croissant-compatible metadata + per-file sha256 from HF Hub.
489
+
490
+ When ``fetch_remote_metadata=True`` (default), enriches the
491
+ baseline metadata with two HF Hub API fetches:
492
+
493
+ - **Croissant endpoint** (``/api/datasets/{repo}/croissant``) —
494
+ provides ``name``, ``description``, ``citeAs``, ``license``,
495
+ ``url`` defaults when the loader's fields are empty.
496
+ - **Tree API** (``/api/datasets/{repo}/tree/...?recursive=true``) —
497
+ provides per-file ``sha256`` (from ``lfs.oid``) and
498
+ ``contentSize`` for each parquet shard under the
499
+ ``refs/convert/parquet`` branch.
500
+
501
+ Network failures degrade gracefully (warning emitted; sha256
502
+ empty as in pre-v0.41 behavior). See class docstring for the
503
+ dual-source rationale (MLCommons Croissant issue #80).
504
+ """
505
+ remote_meta: dict[str, object] = {}
506
+ distribution: list[dict[str, object]] = []
507
+ if self.fetch_remote_metadata:
508
+ remote_meta = self._fetch_croissant_metadata_safe()
509
+ distribution = self._fetch_tree_distribution_safe()
510
+
511
+ # Caller-provided fields win; Croissant fills gaps.
512
+ def _pick(local: str, key: str) -> str:
513
+ if local:
514
+ return local
515
+ val = remote_meta.get(key)
516
+ return val if isinstance(val, str) else ""
517
+
518
+ if not distribution:
519
+ distribution = [
458
520
  {
459
521
  "name": f"hf:{self.repo_id}",
460
522
  "contentUrl": f"https://huggingface.co/datasets/{self.repo_id}",
461
- "sha256": "", # HF cache hash not exposed via the public API
523
+ "sha256": "",
462
524
  "contentSize": 0,
463
525
  }
464
- ],
526
+ ]
527
+
528
+ return {
529
+ "name": _pick(self.name, "name") or self.repo_id,
530
+ "description": _pick(self.description, "description"),
531
+ "citeAs": _pick(self.cite_as, "citeAs"),
532
+ "license": _pick(self.license, "license"),
533
+ "url": self.url or f"https://huggingface.co/datasets/{self.repo_id}",
534
+ "distribution": distribution,
465
535
  "config_name": self.config_name,
466
536
  }
537
+
538
+ def _fetch_croissant_metadata_safe(self) -> dict[str, object]:
539
+ """Fetch HF Hub Croissant JSON-LD; return empty dict on any failure."""
540
+ try:
541
+ data = _hf_get_json(f"/api/datasets/{self.repo_id}/croissant")
542
+ return data if isinstance(data, dict) else {}
543
+ except (OSError, ValueError) as exc: # urllib.URLError, JSONDecodeError, etc.
544
+ _logger.warning(
545
+ "HFDatasetsLoader %s: Croissant fetch failed (%s); proceeding without",
546
+ self.repo_id,
547
+ exc,
548
+ )
549
+ return {}
550
+
551
+ def _fetch_tree_distribution_safe(self) -> list[dict[str, object]]:
552
+ """Fetch HF Hub tree API for the parquet-convert branch; return ``cr:FileObject`` entries.
553
+
554
+ Each entry carries ``sha256`` (from ``lfs.oid`` — the git-LFS
555
+ content hash, equal to ``sha256sum`` of the file content) and
556
+ ``contentSize`` (from the tree response's ``size`` field).
557
+
558
+ Falls back to an empty list on any failure — callers should
559
+ treat empty distribution as "no remote provenance available."
560
+ """
561
+ # HF stores native parquet (or auto-converts) under
562
+ # refs/convert/parquet; that's the canonical hash target.
563
+ path = f"/api/datasets/{self.repo_id}/tree/refs%2Fconvert%2Fparquet?recursive=true"
564
+ try:
565
+ entries = _hf_get_json(path)
566
+ except (OSError, ValueError) as exc:
567
+ _logger.warning(
568
+ "HFDatasetsLoader %s: tree-API fetch failed (%s); sha256 unavailable",
569
+ self.repo_id,
570
+ exc,
571
+ )
572
+ return []
573
+ if not isinstance(entries, list):
574
+ return []
575
+ out: list[dict[str, object]] = []
576
+ for entry in entries:
577
+ if not isinstance(entry, dict):
578
+ continue
579
+ if entry.get("type") != "file":
580
+ continue
581
+ path_val = entry.get("path", "")
582
+ if not isinstance(path_val, str) or not path_val.endswith(".parquet"):
583
+ continue
584
+ lfs = entry.get("lfs")
585
+ sha = ""
586
+ if isinstance(lfs, dict):
587
+ oid = lfs.get("oid")
588
+ if isinstance(oid, str) and len(oid) == 64: # sha256 hex
589
+ sha = f"sha256:{oid}"
590
+ size = entry.get("size", 0)
591
+ out.append(
592
+ {
593
+ "name": path_val,
594
+ "contentUrl": (
595
+ f"https://huggingface.co/datasets/{self.repo_id}"
596
+ f"/resolve/refs%2Fconvert%2Fparquet/{path_val}"
597
+ ),
598
+ "sha256": sha,
599
+ "contentSize": int(size) if isinstance(size, (int, float)) else 0,
600
+ }
601
+ )
602
+ return out
@@ -136,6 +136,7 @@
136
136
  "file_sha256",
137
137
  "fit_beta_binary",
138
138
  "fit_beta_calibrator",
139
+ "fit_isotonic_binary",
139
140
  "fit_isotonic_calibrator",
140
141
  "fit_operating_points",
141
142
  "fit_platt_binary",
@@ -553,7 +554,7 @@
553
554
  ],
554
555
  "doc_first_line": "Load a HuggingFace ``datasets`` repo as ``{split: EvalSlice}``.",
555
556
  "kind": "class",
556
- "signature": "(repo_id: 'str', splits: 'Sequence[str] | None' = None, feature_col: 'str' = 'text', label_col: 'str' = 'label', strata_col: 'str | None' = None, config_name: 'str | None' = None, name: 'str' = '', description: 'str' = '', cite_as: 'str' = '', license: 'str' = '', url: 'str' = '') -> None"
557
+ "signature": "(repo_id: 'str', splits: 'Sequence[str] | None' = None, feature_col: 'str' = 'text', label_col: 'str' = 'label', strata_col: 'str | None' = None, config_name: 'str | None' = None, name: 'str' = '', description: 'str' = '', cite_as: 'str' = '', license: 'str' = '', url: 'str' = '', fetch_remote_metadata: 'bool' = True) -> None"
557
558
  },
558
559
  "HoldoutSplitter": {
559
560
  "bases": [
@@ -1036,7 +1037,7 @@
1036
1037
  "doc_first_line": "str(object='') -> str",
1037
1038
  "kind": "value",
1038
1039
  "type": "str",
1039
- "value": "'0.40.0'"
1040
+ "value": "'0.42.0'"
1040
1041
  },
1041
1042
  "apply_operating_points": {
1042
1043
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1208,6 +1209,11 @@
1208
1209
  "kind": "function",
1209
1210
  "signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray') -> 'Callable[[np.ndarray], np.ndarray]'"
1210
1211
  },
1212
+ "fit_isotonic_binary": {
1213
+ "doc_first_line": "Binary-probability adapter for :func:`fit_isotonic_calibrator`.",
1214
+ "kind": "function",
1215
+ "signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray') -> 'tuple[None, Callable[[np.ndarray], np.ndarray]]'"
1216
+ },
1211
1217
  "fit_isotonic_calibrator": {
1212
1218
  "doc_first_line": "Niculescu-Mizil & Caruana 2005 [#nm05]_ isotonic regression.",
1213
1219
  "kind": "function",
@@ -12,6 +12,8 @@ import pytest
12
12
  from eval_toolkit import (
13
13
  fit_beta_binary,
14
14
  fit_beta_calibrator,
15
+ fit_isotonic_binary,
16
+ fit_isotonic_calibrator,
15
17
  fit_platt_binary,
16
18
  fit_platt_calibrator,
17
19
  fit_temperature_binary,
@@ -160,6 +162,66 @@ def test_beta_binary_apply_rejects_non_finite_test_scores(
160
162
  apply(np.array([0.5, np.nan, 0.8]))
161
163
 
162
164
 
165
+ # ---------- fit_isotonic_binary ----------
166
+
167
+
168
+ @pytest.mark.unit
169
+ def test_isotonic_binary_returns_none_params_and_apply(
170
+ synthetic_binary_data: tuple[np.ndarray, np.ndarray],
171
+ ) -> None:
172
+ """Isotonic is non-parametric → params slot is None."""
173
+ y, p = synthetic_binary_data
174
+ params, apply = fit_isotonic_binary(y, p)
175
+ assert params is None
176
+ assert callable(apply)
177
+
178
+
179
+ @pytest.mark.unit
180
+ def test_isotonic_binary_apply_returns_same_shape(
181
+ synthetic_binary_data: tuple[np.ndarray, np.ndarray],
182
+ ) -> None:
183
+ y, p = synthetic_binary_data
184
+ _, apply = fit_isotonic_binary(y, p)
185
+ test = np.array([0.1, 0.5, 0.9])
186
+ out = apply(test)
187
+ assert out.shape == test.shape
188
+ assert (out >= 0.0).all() and (out <= 1.0).all()
189
+
190
+
191
+ @pytest.mark.unit
192
+ def test_isotonic_binary_apply_matches_underlying_calibrator(
193
+ synthetic_binary_data: tuple[np.ndarray, np.ndarray],
194
+ ) -> None:
195
+ """fit_isotonic_binary apply should match fit_isotonic_calibrator output."""
196
+ y, p = synthetic_binary_data
197
+ _, apply = fit_isotonic_binary(y, p)
198
+ canonical_apply = fit_isotonic_calibrator(y, p)
199
+ test = np.linspace(0.05, 0.95, 20)
200
+ np.testing.assert_allclose(apply(test), canonical_apply(test))
201
+
202
+
203
+ @pytest.mark.unit
204
+ def test_isotonic_binary_rejects_single_class() -> None:
205
+ y_single = np.zeros(50, dtype=int)
206
+ p = np.random.default_rng(0).uniform(0.0, 1.0, 50)
207
+ with pytest.raises(ValueError):
208
+ fit_isotonic_binary(y_single, p)
209
+
210
+
211
+ @pytest.mark.unit
212
+ def test_isotonic_binary_monotone(
213
+ synthetic_binary_data: tuple[np.ndarray, np.ndarray],
214
+ ) -> None:
215
+ """Isotonic regression is monotone non-decreasing in the score."""
216
+ y, p = synthetic_binary_data
217
+ _, apply = fit_isotonic_binary(y, p)
218
+ test = np.linspace(0.05, 0.95, 50)
219
+ out = apply(test)
220
+ # Allow tiny numerical noise but enforce non-decreasing trend
221
+ deltas = np.diff(out)
222
+ assert (deltas >= -1e-12).all(), "isotonic should be non-decreasing"
223
+
224
+
163
225
  # ---------- consistency across the calibrator family ----------
164
226
 
165
227
 
@@ -167,14 +229,15 @@ def test_beta_binary_apply_rejects_non_finite_test_scores(
167
229
  def test_all_four_binary_adapters_have_consistent_shape(
168
230
  synthetic_binary_data: tuple[np.ndarray, np.ndarray],
169
231
  ) -> None:
170
- """temperature_binary, platt_binary, beta_binary all return ``(params, apply)``.
232
+ """temperature, isotonic, platt, beta all return ``(params, apply)``.
171
233
 
172
234
  Documents the audit-battery contract for the 4-calibrator pattern.
173
235
  """
174
236
  y, p = synthetic_binary_data
175
- # All return (params, apply); apply is a callable taking (n,) -> (n,)
237
+ # All return (params, apply); apply is a callable taking (n,) -> (n,).
176
238
  for name, fitter in [
177
239
  ("temperature", fit_temperature_binary),
240
+ ("isotonic", fit_isotonic_binary),
178
241
  ("platt", fit_platt_binary),
179
242
  ("beta", fit_beta_binary),
180
243
  ]:
@@ -185,6 +248,38 @@ def test_all_four_binary_adapters_have_consistent_shape(
185
248
  assert (out >= 0.0).all() and (out <= 1.0).all(), f"{name}: output not in [0,1]"
186
249
 
187
250
 
251
+ @pytest.mark.unit
252
+ def test_consumer_idiom_iterating_all_four_calibrators(
253
+ synthetic_binary_data: tuple[np.ndarray, np.ndarray],
254
+ ) -> None:
255
+ """End-to-end consumer idiom: iterate the 4-element family with one shape.
256
+
257
+ Matches the calibration-battery pattern in
258
+ ``prompt-injection-detection-prototype/src/eval/calibration_battery.py``
259
+ (ADR-056). The ``params is not None`` check distinguishes parametric
260
+ from non-parametric in a single conditional.
261
+ """
262
+ y, p = synthetic_binary_data
263
+ fitters = {
264
+ "temperature": fit_temperature_binary,
265
+ "isotonic": fit_isotonic_binary,
266
+ "platt": fit_platt_binary,
267
+ "beta": fit_beta_binary,
268
+ }
269
+ p_test = np.linspace(0.05, 0.95, 30)
270
+ recorded_params: dict[str, object] = {}
271
+ calibrated: dict[str, np.ndarray] = {}
272
+ for name, fit_fn in fitters.items():
273
+ params, apply = fit_fn(y, p)
274
+ calibrated[name] = apply(p_test)
275
+ if params is not None:
276
+ recorded_params[name] = params
277
+ # Three of four have inspectable params; isotonic is None.
278
+ assert set(recorded_params.keys()) == {"temperature", "platt", "beta"}
279
+ # All four produced calibrated outputs of matching shape.
280
+ assert all(out.shape == p_test.shape for out in calibrated.values())
281
+
282
+
188
283
  @pytest.mark.unit
189
284
  def test_platt_binary_params_are_pair(
190
285
  synthetic_binary_data: tuple[np.ndarray, np.ndarray],