eval-toolkit 1.0.1__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/CHANGELOG.md +49 -0
  2. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/PKG-INFO +3 -2
  3. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/README.md +2 -1
  4. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_version.py +1 -1
  5. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/metrics.py +38 -0
  6. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/public_api/snapshot.json +1 -1
  7. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_folded.py +23 -0
  8. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/.gitignore +0 -0
  9. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/LICENSE +0 -0
  10. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/STYLE.md +0 -0
  11. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/archive/README.md +0 -0
  12. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/README.md +0 -0
  13. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/datasets/README.md +0 -0
  14. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/papers/data-integrity/README.md +0 -0
  15. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/papers/eval-ecosystem/README.md +0 -0
  16. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/papers/inference/README.md +0 -0
  17. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/papers/prompt-injection/README.md +0 -0
  18. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/source/adr/README.md +0 -0
  19. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/source/methodology/README.md +0 -0
  20. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/pyproject.toml +0 -0
  21. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/__init__.py +0 -0
  22. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/__main__.py +0 -0
  23. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_deprecated.py +0 -0
  24. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_parallel.py +0 -0
  25. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_rng.py +0 -0
  26. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_sweep.py +0 -0
  27. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/adversarial.py +0 -0
  28. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/analysis.py +0 -0
  29. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/artifacts.py +0 -0
  30. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  31. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/bootstrap.py +0 -0
  32. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/calibration.py +0 -0
  33. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/claims.py +0 -0
  34. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/config.py +0 -0
  35. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/docs.py +0 -0
  36. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/embeddings.py +0 -0
  37. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/evidence.py +0 -0
  38. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/harness.py +0 -0
  39. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/leakage.py +0 -0
  40. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/loaders.py +0 -0
  41. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/losses.py +0 -0
  42. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/manifest.py +0 -0
  43. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/metric_specs.py +0 -0
  44. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/operating_points.py +0 -0
  45. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/paths.py +0 -0
  46. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/plotting.py +0 -0
  47. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/preprocessing.py +0 -0
  48. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/probes.py +0 -0
  49. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/protocols.py +0 -0
  50. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/provenance.py +0 -0
  51. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/py.typed +0 -0
  52. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  53. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  54. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  55. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  56. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/results.v1.json +0 -0
  57. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  58. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/scorecards.py +0 -0
  59. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/seeds.py +0 -0
  60. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/splits.py +0 -0
  61. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/stacking.py +0 -0
  62. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/text_dedup.py +0 -0
  63. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/thresholds.py +0 -0
  64. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  65. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  66. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  67. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  68. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  69. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  70. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  71. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  72. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  73. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  74. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/benchmarks/__init__.py +0 -0
  75. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  76. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/conftest.py +0 -0
  77. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/bootstrap_ci/cases.json +0 -0
  78. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout.jsonl +0 -0
  79. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout_expected.json +0 -0
  80. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  81. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/docs/expected.md +0 -0
  82. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/docs/input.md +0 -0
  83. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/docs/metrics.json +0 -0
  84. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  85. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/strategies.py +0 -0
  86. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_adversarial.py +0 -0
  87. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_analysis.py +0 -0
  88. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_artifacts.py +0 -0
  89. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_audit_citation_alignment.py +0 -0
  90. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_block_bootstrap_on_folds.py +0 -0
  91. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_calibration_mc.py +0 -0
  92. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_edge_cases.py +0 -0
  93. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_golden.py +0 -0
  94. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_njobs.py +0 -0
  95. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_props.py +0 -0
  96. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_research_grounded.py +0 -0
  97. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_unit.py +0 -0
  98. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_binary_adapters.py +0 -0
  99. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_bootstrap_chain.py +0 -0
  100. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_determinism.py +0 -0
  101. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_optimization_failures.py +0 -0
  102. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_props.py +0 -0
  103. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_research_grounded.py +0 -0
  104. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_unit.py +0 -0
  105. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_claims.py +0 -0
  106. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_claims_coverage.py +0 -0
  107. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_claims_props.py +0 -0
  108. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_cli.py +0 -0
  109. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_config.py +0 -0
  110. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_bootstrap.py +0 -0
  111. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_calibration.py +0 -0
  112. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_harness.py +0 -0
  113. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_metrics.py +0 -0
  114. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_plotting.py +0 -0
  115. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_croissant_e2e.py +0 -0
  116. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_dedup_split_leakage_chain.py +0 -0
  117. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_deprecated_scalars_shim.py +0 -0
  118. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_deprecations.py +0 -0
  119. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_docs_golden.py +0 -0
  120. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_docs_props.py +0 -0
  121. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_embeddings.py +0 -0
  122. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_evidence_validators.py +0 -0
  123. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_edge_cases.py +0 -0
  124. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_fault_injection.py +0 -0
  125. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_internals.py +0 -0
  126. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_metric_options.py +0 -0
  127. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_parallelism.py +0 -0
  128. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_smoke.py +0 -0
  129. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_import_boundaries.py +0 -0
  130. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_is_metric_defined_for_slice.py +0 -0
  131. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_lazy_extras_messages.py +0 -0
  132. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_leakage.py +0 -0
  133. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_leakage_error_paths.py +0 -0
  134. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_leakage_props.py +0 -0
  135. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_loaders.py +0 -0
  136. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_loaders_coverage.py +0 -0
  137. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_loaders_props.py +0 -0
  138. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_logging.py +0 -0
  139. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_losses.py +0 -0
  140. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_manifest.py +0 -0
  141. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_manifest_contamination_round_trip.py +0 -0
  142. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_manifest_props.py +0 -0
  143. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_manifest_validation.py +0 -0
  144. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_metrics_props.py +0 -0
  145. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_metrics_stratified_subsets.py +0 -0
  146. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_metrics_unit.py +0 -0
  147. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_misc_coverage.py +0 -0
  148. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_numeric_edge_cases.py +0 -0
  149. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_ood_loader.py +0 -0
  150. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_operating_points.py +0 -0
  151. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_operating_points_props.py +0 -0
  152. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_parallel.py +0 -0
  153. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_paths.py +0 -0
  154. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_pipeline_e2e.py +0 -0
  155. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_plotting_edge.py +0 -0
  156. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_plotting_smoke.py +0 -0
  157. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_plotting_visual.py +0 -0
  158. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_preprocessing.py +0 -0
  159. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_probes.py +0 -0
  160. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_protocol_conformance.py +0 -0
  161. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_provenance.py +0 -0
  162. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_public_api.py +0 -0
  163. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_recall_at_fpr.py +0 -0
  164. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_reference_equivalence.py +0 -0
  165. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_reproducibility_integration.py +0 -0
  166. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_rng.py +0 -0
  167. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_schemas.py +0 -0
  168. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_scorecard.py +0 -0
  169. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_seeds.py +0 -0
  170. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_splits.py +0 -0
  171. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_splits_leakage_integration.py +0 -0
  172. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_splits_props.py +0 -0
  173. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_stacking.py +0 -0
  174. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_sweep.py +0 -0
  175. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_text_dedup.py +0 -0
  176. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_text_dedup_coverage.py +0 -0
  177. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_text_dedup_props.py +0 -0
  178. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_text_dedup_strategies.py +0 -0
  179. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds.py +0 -0
  180. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds_constant_score.py +0 -0
  181. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds_coverage.py +0 -0
  182. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds_props.py +0 -0
  183. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds_research_grounded.py +0 -0
  184. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_tokenization_leakage_check.py +0 -0
  185. {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,55 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.2] — 2026-05-26 — #76 cleanup batch closes (RC2 + RC3 + F-metrics-1/3/4)
9
+
10
+ Closes the GH #76 v1.0.1 cleanup tracker. All 6 items shipped across
11
+ v1.0.1 (RC4) and v1.0.2 (this release). All P3, all NON-BREAKING.
12
+
13
+ ### Changed (Tier-2 ADDITIVE: contract clarification only)
14
+
15
+ - **RC2** (#76) — `SimilarityStrategy` Protocol promoted from
16
+ "pre-v0.7 internal interface" (prose framing only) to formal
17
+ 10th strict Tier-2 Protocol per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
18
+ Aligns prose surfaces (README, extending.md, strict_tier2_protocols.md,
19
+ api/protocols.md, ADR 0004 §D6, roadmap.md) with the contract
20
+ already locked in `tests/golden/public_api/snapshot.json` +
21
+ `src/eval_toolkit/__init__.py:_EXPORTS` since v1.0.0. **No code
22
+ change — documentation-only reconciliation.** Strict-Tier-2 count
23
+ goes 9 → 10 (+ 1 opt-in `Versioned`).
24
+
25
+ ### Fixed
26
+
27
+ - **RC3** (#76) — `tests/test_harness_folded.py::test_evaluate_folded_reseed_splitter_varies_partitions`
28
+ test hardening. Previous assertions covered count + key existence
29
+ only; a regression silently reusing the splitter (R8-C1 pre-fix
30
+ behavior) could still pass. v1.0.2 adds row-content comparison:
31
+ replays `reseed_splitter` against the splitter for `seed=1` vs
32
+ `seed=2` and asserts fold-0 test partitions differ via feature-text
33
+ set membership (robust to `_slice_subset`'s `reset_index(drop=True)`
34
+ via stable text-column identifiers).
35
+
36
+ - **F-metrics-1** (#76) — `brier_score` docstring input-domain clarity.
37
+ Added explicit "Input domain" Notes subsection clarifying binary
38
+ labels in `{0, 1}` + calibrated probabilities in `[0, 1]` are
39
+ required; raw logits or unbounded ranking scores pass the finiteness
40
+ check but produce out-of-range MSE that misrepresents calibration
41
+ quality. Includes calibration-applying recipe pointer.
42
+
43
+ - **F-metrics-3** (#76) — `expected_calibration_error` docstring
44
+ uniform-scores note. Added explicit Notes subsection documenting
45
+ that constant `y_score` at the base rate returns 0.0 (per-bin formula trivially
46
+ satisfied) but is semantically misleading — uninformative scorers
47
+ look "perfectly calibrated" despite zero discriminative power.
48
+ Callers should filter constant inputs before ECE.
49
+
50
+ - **F-metrics-4** (#76) — `brier_score` docstring single-class
51
+ edge-case explicit. Added Notes subsection with closed-form
52
+ expressions for all-zeros (`BS = mean(p²)`) and all-ones
53
+ (`BS = mean((1-p)²)`) cases. Explicit confirmation that
54
+ per-slice degenerate-class evaluation is supported (unlike
55
+ PR-AUC / ROC-AUC).
56
+
8
57
  ## [1.0.1] — 2026-05-25 — audit_citation_alignment + RC4 docs polish
9
58
 
10
59
  First v1.x patch release. Ships the `audit_citation_alignment` validator
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.0.1
3
+ Version: 1.0.2
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -115,7 +115,8 @@ format changes.
115
115
  ├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
116
116
  │ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
117
117
  │ ThresholdSelector / DatasetLoader / MetricSpec │
118
- │ MetaLearner / Probe / TextTransform (9 strict)
118
+ │ MetaLearner / Probe / TextTransform /
119
+ │ SimilarityStrategy (10 strict) │
119
120
  │ Versioned (opt-in: per-object versions in manifest) │
120
121
  ├─ Tier 1 ─ Functional core ─────────────────────────────┤
121
122
  │ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
@@ -32,7 +32,8 @@ format changes.
32
32
  ├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
33
33
  │ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
34
34
  │ ThresholdSelector / DatasetLoader / MetricSpec │
35
- │ MetaLearner / Probe / TextTransform (9 strict)
35
+ │ MetaLearner / Probe / TextTransform /
36
+ │ SimilarityStrategy (10 strict) │
36
37
  │ Versioned (opt-in: per-object versions in manifest) │
37
38
  ├─ Tier 1 ─ Functional core ─────────────────────────────┤
38
39
  │ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.0.1"
5
+ __version__ = "1.0.2"
@@ -792,6 +792,20 @@ def expected_calibration_error(
792
792
  empirical positive rate in the bin, and :math:`\\mathrm{conf}` is the
793
793
  mean predicted score.
794
794
 
795
+ **Uniform / uninformative scores** (F-metrics-3 v1.0.2 clarity pass):
796
+ when ``y_score`` is constant (e.g., ``[0.5] * n`` — an uninformative
797
+ detector) and equal to the empirical positive rate, this function returns
798
+ ``0.0`` (otherwise ``|base rate - score|``). That's technically correct per the formula —
799
+ :math:`|\\mathrm{acc}(B_m) - \\mathrm{conf}(B_m)|` measures bin-level
800
+ calibration, and a single occupied bin with ``conf = base rate``
801
+ achieves perfect calibration locally. But it is semantically
802
+ misleading: an uninformative scorer looks "perfectly calibrated"
803
+ even though it has zero discriminative power. **Callers should
804
+ detect and filter uninformative inputs before passing to ECE** —
805
+ e.g., reject when ``np.unique(y_score).size == 1`` or when the
806
+ score variance is below a domain-specific threshold. Use
807
+ :func:`brier_score` or :func:`pr_auc` for resolution-aware metrics.
808
+
795
809
  References
796
810
  ----------
797
811
  .. [1] DeGroot, M. H. & Fienberg, S. E. "The comparison and evaluation of
@@ -1240,6 +1254,30 @@ def brier_score(
1240
1254
  -----
1241
1255
  .. math:: \mathrm{BS} = \frac{1}{n} \sum_i (p_i - y_i)^2
1242
1256
 
1257
+ **Input domain** (F-metrics-1 v1.0.2 clarity pass): ``y_true`` must
1258
+ be binary labels in ``{0, 1}`` (other label values raise
1259
+ ``ValueError``). ``y_score`` must be calibrated probabilities in
1260
+ ``[0, 1]`` — raw logits or unbounded ranking scores will pass the
1261
+ finiteness check but produce an out-of-range MSE that misrepresents
1262
+ calibration quality. If your scorer produces logits, apply
1263
+ sigmoid / softmax / a fitted calibrator (see
1264
+ :mod:`eval_toolkit.calibration`) before passing to ``brier_score``.
1265
+
1266
+ **Single-class behavior** (F-metrics-4 v1.0.2 clarity pass): unlike
1267
+ PR-AUC / ROC-AUC, ``brier_score`` is well-defined when ``y_true``
1268
+ is all-zeros or all-ones — it degenerates to the MSE around the
1269
+ constant class label. Specifically:
1270
+
1271
+ - All-zeros: :math:`\mathrm{BS} = \frac{1}{n} \sum_i p_i^2` —
1272
+ forecasting any positive probability incurs squared-error loss.
1273
+ - All-ones: :math:`\mathrm{BS} = \frac{1}{n} \sum_i (1 - p_i)^2`
1274
+ — forecasting low probability incurs squared-error loss.
1275
+
1276
+ This is the deliberate behavior of Brier as a strictly proper scoring
1277
+ rule (Brier 1950). Per-slice degenerate-class evaluation is supported:
1278
+ non-empty single-class slices score normally, and the
1279
+ ``empty_strategy`` parameter applies only to empty (``n=0``) input.
1280
+
1243
1281
  See Also
1244
1282
  --------
1245
1283
  eval_toolkit.metrics.brier_decomposition :
@@ -1373,7 +1373,7 @@
1373
1373
  "doc_first_line": "str(object='') -> str",
1374
1374
  "kind": "value",
1375
1375
  "type": "str",
1376
- "value": "'1.0.1'"
1376
+ "value": "'1.0.2'"
1377
1377
  },
1378
1378
  "apply_operating_points": {
1379
1379
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -162,6 +162,29 @@ def test_evaluate_folded_reseed_splitter_varies_partitions() -> None:
162
162
  assert "seed=1/fold=0" in fold_ids
163
163
  assert "seed=2/fold=0" in fold_ids
164
164
 
165
+ # R10-RC3 v1.0.2 hardening (#76): the previous assertions covered
166
+ # COUNT + key existence but did NOT verify the actual partition
167
+ # indices differ across seeds — a regression that silently reused
168
+ # the splitter (R8-C1 pre-fix behavior) could still pass. Directly
169
+ # verify the reseed_splitter callback yields different partitions
170
+ # by replaying it against the splitter.
171
+ splitter = StratifiedKFoldSplitter(k=2, seed=42)
172
+ splits_seed_1 = list(dataclasses.replace(splitter, seed=1).iter_folds(parent, groups=None))
173
+ splits_seed_2 = list(dataclasses.replace(splitter, seed=2).iter_folds(parent, groups=None))
174
+ # _slice_subset resets the child df index to [0..n-1], so compare
175
+ # the underlying text feature values instead (stable across the
176
+ # reset_index drop). Each child slice's `text` column carries the
177
+ # original row labels.
178
+ fold_0_test_texts_seed_1 = set(splits_seed_1[0]["test"].df["text"].tolist())
179
+ fold_0_test_texts_seed_2 = set(splits_seed_2[0]["test"].df["text"].tolist())
180
+ # Different seeds → different fold-0 test partitions (the whole
181
+ # point of reseed_splitter).
182
+ assert fold_0_test_texts_seed_1 != fold_0_test_texts_seed_2, (
183
+ "reseed_splitter callback failed to vary partitions: "
184
+ f"seed=1 fold=0 texts={sorted(fold_0_test_texts_seed_1)[:5]}... "
185
+ f"seed=2 fold=0 texts={sorted(fold_0_test_texts_seed_2)[:5]}..."
186
+ )
187
+
165
188
 
166
189
  @pytest.mark.unit
167
190
  def test_evaluate_folded_single_seed_no_deprecation_warning() -> None:
File without changes
File without changes
File without changes