eval-toolkit 1.3.0__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/CHANGELOG.md +139 -0
  2. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/PKG-INFO +4 -1
  3. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/source/adr/README.md +1 -0
  4. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/pyproject.toml +8 -0
  5. eval_toolkit-1.5.0/src/eval_toolkit/_narrative.py +425 -0
  6. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_version.py +1 -1
  7. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_citation_alignment.py +188 -8
  8. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_value_bindings.py +23 -383
  9. eval_toolkit-1.5.0/src/eval_toolkit/eda/__init__.py +80 -0
  10. eval_toolkit-1.5.0/src/eval_toolkit/eda/data_audit.py +785 -0
  11. eval_toolkit-1.5.0/src/eval_toolkit/eda/obfuscation.py +622 -0
  12. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/loaders.py +46 -8
  13. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/public_api/snapshot.json +3 -3
  14. eval_toolkit-1.5.0/tests/test_audit_citation_alignment.py +458 -0
  15. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_audit_value_bindings.py +6 -2
  16. eval_toolkit-1.5.0/tests/test_eda.py +330 -0
  17. eval_toolkit-1.5.0/tests/test_eda_obfuscation.py +448 -0
  18. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_loaders.py +107 -0
  19. eval_toolkit-1.3.0/tests/test_audit_citation_alignment.py +0 -242
  20. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/.gitignore +0 -0
  21. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/LICENSE +0 -0
  22. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/README.md +0 -0
  23. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/STYLE.md +0 -0
  24. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/archive/README.md +0 -0
  25. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/README.md +0 -0
  26. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/datasets/README.md +0 -0
  27. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/papers/data-integrity/README.md +0 -0
  28. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  29. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/papers/inference/README.md +0 -0
  30. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/papers/prompt-injection/README.md +0 -0
  31. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/source/methodology/README.md +0 -0
  32. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/__init__.py +0 -0
  33. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/__main__.py +0 -0
  34. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_deprecated.py +0 -0
  35. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_parallel.py +0 -0
  36. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_rng.py +0 -0
  37. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_sweep.py +0 -0
  38. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/adversarial.py +0 -0
  39. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/analysis.py +0 -0
  40. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/artifacts.py +0 -0
  41. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
  42. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/bootstrap.py +0 -0
  43. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/calibration.py +0 -0
  44. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/claims.py +0 -0
  45. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/config.py +0 -0
  46. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/docs.py +0 -0
  47. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/embeddings.py +0 -0
  48. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/evidence.py +0 -0
  49. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/harness.py +0 -0
  50. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/leakage.py +0 -0
  51. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/losses.py +0 -0
  52. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/manifest.py +0 -0
  53. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/metric_specs.py +0 -0
  54. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/metrics.py +0 -0
  55. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/operating_points.py +0 -0
  56. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/paths.py +0 -0
  57. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/plotting.py +0 -0
  58. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/preprocessing.py +0 -0
  59. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/probes.py +0 -0
  60. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/protocols.py +0 -0
  61. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/provenance.py +0 -0
  62. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/py.typed +0 -0
  63. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  64. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  65. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  66. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  67. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  68. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  69. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/scorecards.py +0 -0
  70. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/seeds.py +0 -0
  71. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/splits.py +0 -0
  72. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/stacking.py +0 -0
  73. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/text_dedup.py +0 -0
  74. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/thresholds.py +0 -0
  75. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  76. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  77. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  78. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  79. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  80. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  81. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  82. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  83. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  84. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  85. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/benchmarks/__init__.py +0 -0
  86. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  87. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/conftest.py +0 -0
  88. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  89. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  90. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  91. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  92. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/docs/expected.md +0 -0
  93. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/docs/input.md +0 -0
  94. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/docs/metrics.json +0 -0
  95. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  96. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/strategies.py +0 -0
  97. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_adversarial.py +0 -0
  98. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_analysis.py +0 -0
  99. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_artifacts.py +0 -0
  100. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
  101. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  102. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  103. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_edge_cases.py +0 -0
  104. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_golden.py +0 -0
  105. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_njobs.py +0 -0
  106. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_props.py +0 -0
  107. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_research_grounded.py +0 -0
  108. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_unit.py +0 -0
  109. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_binary_adapters.py +0 -0
  110. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  111. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_determinism.py +0 -0
  112. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_optimization_failures.py +0 -0
  113. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_props.py +0 -0
  114. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_research_grounded.py +0 -0
  115. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_unit.py +0 -0
  116. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_claims.py +0 -0
  117. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_claims_coverage.py +0 -0
  118. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_claims_props.py +0 -0
  119. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_cli.py +0 -0
  120. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_config.py +0 -0
  121. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_bootstrap.py +0 -0
  122. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_calibration.py +0 -0
  123. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_harness.py +0 -0
  124. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_metrics.py +0 -0
  125. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_plotting.py +0 -0
  126. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_croissant_e2e.py +0 -0
  127. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  128. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_deprecated_scalars_shim.py +0 -0
  129. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_deprecations.py +0 -0
  130. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_docs_golden.py +0 -0
  131. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_docs_props.py +0 -0
  132. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_embeddings.py +0 -0
  133. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_evidence_validators.py +0 -0
  134. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_edge_cases.py +0 -0
  135. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_fault_injection.py +0 -0
  136. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_folded.py +0 -0
  137. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_internals.py +0 -0
  138. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_metric_options.py +0 -0
  139. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_parallelism.py +0 -0
  140. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_smoke.py +0 -0
  141. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_import_boundaries.py +0 -0
  142. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  143. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_lazy_extras_messages.py +0 -0
  144. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_leakage.py +0 -0
  145. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_leakage_error_paths.py +0 -0
  146. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_leakage_props.py +0 -0
  147. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_loaders_coverage.py +0 -0
  148. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_loaders_props.py +0 -0
  149. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_logging.py +0 -0
  150. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_losses.py +0 -0
  151. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_manifest.py +0 -0
  152. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  153. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_manifest_props.py +0 -0
  154. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_manifest_validation.py +0 -0
  155. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_metrics_props.py +0 -0
  156. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_metrics_stratified_subsets.py +0 -0
  157. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_metrics_unit.py +0 -0
  158. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_misc_coverage.py +0 -0
  159. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_numeric_edge_cases.py +0 -0
  160. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_ood_loader.py +0 -0
  161. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_operating_points.py +0 -0
  162. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_operating_points_props.py +0 -0
  163. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_parallel.py +0 -0
  164. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_paths.py +0 -0
  165. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_pipeline_e2e.py +0 -0
  166. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_plotting_edge.py +0 -0
  167. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_plotting_smoke.py +0 -0
  168. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_plotting_visual.py +0 -0
  169. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_preprocessing.py +0 -0
  170. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_probes.py +0 -0
  171. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_protocol_conformance.py +0 -0
  172. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_provenance.py +0 -0
  173. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_public_api.py +0 -0
  174. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_recall_at_fpr.py +0 -0
  175. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_reference_equivalence.py +0 -0
  176. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_reproducibility_integration.py +0 -0
  177. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_rng.py +0 -0
  178. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_schemas.py +0 -0
  179. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_scorecard.py +0 -0
  180. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_seeds.py +0 -0
  181. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_splits.py +0 -0
  182. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_splits_leakage_integration.py +0 -0
  183. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_splits_props.py +0 -0
  184. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_stacking.py +0 -0
  185. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_sweep.py +0 -0
  186. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_text_dedup.py +0 -0
  187. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_coverage.py +0 -0
  188. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_props.py +0 -0
  189. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_strategies.py +0 -0
  190. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds.py +0 -0
  191. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds_constant_score.py +0 -0
  192. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds_coverage.py +0 -0
  193. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds_props.py +0 -0
  194. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds_research_grounded.py +0 -0
  195. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_tokenization_leakage_check.py +0 -0
  196. {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,145 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.5.0] — 2026-05-29 — Tier-2 `eda` layer (#83) + schema-aware `HFDatasetsLoader` (#85)
9
+
10
+ Tier-2 / `loaders` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible.
11
+
12
+ - **`eda` Job-1 integrity gate (#83):** `audit_dataset` / `DataAudit` / `SplitSummary` + the
13
+ `class_balance` / `no_cross_split_leakage` / `context_window_fit` gates + the §B2 obfuscation
14
+ prevalence module.
15
+ - **schema-aware `HFDatasetsLoader` (#85):** load real-world dataset schemas without column
16
+ guessing — `feature_cols` + `feature_join` (join multiple columns into one feature; NaN-safe),
17
+ `label_map` (remap raw labels → int; fail-fast `ValueError` lists unmapped values), `revision`
18
+ (pin the HF dataset SHA). All new params default to the prior behavior; a missing feature/label
19
+ column raises `KeyError` listing the observed columns.
20
+
21
+ ## [1.4.0] — 2026-05-26 — `audit_citation_alignment` Layer 2 + Layer 3 (closes #82); shared `_narrative` helpers (ADR 0007)
22
+
23
+ Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
24
+ Closes [#82](https://github.com/brandon-behring/eval-toolkit/issues/82)
25
+ — consumer-feedback follow-on after v1.3.0 closed
26
+ `audit_value_bindings`. The consumer's 188 residual warnings on
27
+ `audit_citation_alignment` were the same architectural-class gap
28
+ (missing Layer 2 + Layer 3 context-awareness) that
29
+ `audit_value_bindings` worked through over v1.1.0 → v1.3.0.
30
+
31
+ Introduces [ADR 0007](docs/source/adr/0007-three-layer-architecture-for-audit-validators.md)
32
+ codifying the **three-layer correctness model** (identity + scope +
33
+ pairing) as the canonical architecture for ALL `audit_*` validators
34
+ in the family. ADR 0005 + ADR 0006 were originally validator-
35
+ specific; ADR 0007 generalizes.
36
+
37
+ ### Added — `audit_citation_alignment` Layer 2 + Layer 3 (closes #82)
38
+
39
+ - **`scope: Literal["all", "narrative"] = "all"`** kwarg on
40
+ `validate_citations(...)`. Default `"all"` preserves v1.0.1 /
41
+ v1.3.x behavior exactly (Tier-1 ADDITIVE; byte-identical legacy
42
+ semantics).
43
+ - **Pattern β (Layer 2)** — under `scope="narrative"`, citations
44
+ inside markdown table rows, bracketed expressions, and fenced
45
+ code blocks are excluded. Mirrors `audit_value_bindings`'s
46
+ Layer 2 from v1.1.0. Closes ~67 of the consumer's residual
47
+ warnings (SPEC_SHEET.md table rows).
48
+ - **Pattern γ (Layer 3)** — the category-keyword extraction window
49
+ for a citation is bounded by the SENTENCE containing the
50
+ citation, not by a ±N-line window. Uses
51
+ `_sentence_boundary_positions` (paragraph-aware, abbreviation-
52
+ guarded) from `_narrative`. Catches the consumer's dense
53
+ multi-clause sentences where keywords from prior clauses pull
54
+ through.
55
+ - **Pattern α (Layer 3)** — when MULTIPLE ADR citations appear in
56
+ the same sentence (e.g.,
57
+ `"per ADR-025 + ADR-021 + ADR-034 + ADR-045"`), the validator
58
+ switches from first-match-wins category check to multi-category
59
+ set membership. Each ADR's actual category is accepted if it's
60
+ in the SET of categories matched by the sentence's keywords —
61
+ not just the dominant first-match. Catches the dense multi-ADR
62
+ list pattern where each ADR addresses a different topic.
63
+
64
+ ### Refactor — Shared `_narrative.py` helpers
65
+
66
+ Per ADR 0007, narrative-prose helpers are extracted to private flat
67
+ module `src/eval_toolkit/_narrative.py` (consistent with ADR 0001's
68
+ `_rng.py` / `_parallel.py` / `_sweep.py` precedent — flat-module
69
+ compliant, private/underscore-prefixed). Both validators import:
70
+
71
+ - Keyword frozensets: `_DELTA_KEYWORDS`, `_FLOOR_KEYWORDS`,
72
+ `_GROUP_SUBJECT_KEYWORDS`, `_ABBREV_BEFORE_DOT`.
73
+ - Compiled patterns: `_DELTA_PATTERN`, `_FLOOR_PATTERN`,
74
+ `_GROUP_SUBJECT_PATTERN`.
75
+ - Helpers: `_build_exclusion_ranges`, `_is_excluded`,
76
+ `_is_sentence_terminator_dot`, `_sentence_boundary_positions`,
77
+ `_sentence_id_of`, `_crosses_sentence_boundary`,
78
+ `_is_signed_value`, `_has_keyword_in_window`,
79
+ `_compile_keyword_pattern`.
80
+
81
+ `audit_value_bindings.py` updated to import these from `_narrative`
82
+ instead of defining inline. **Signature-preserving refactor**: all
83
+ 43 existing `audit_value_bindings` tests pass UNCHANGED. The
84
+ private helpers are non-public, so no Tier-1 STRICT impact.
85
+
86
+ ### Dogfood result
87
+
88
+ | Configuration | Warnings on `prompt-injection-detection-submission` HEAD | Reduction |
89
+ |---|---|---|
90
+ | v1.3.0 (`audit_citation_alignment` with scope='all') | 188 | — (baseline) |
91
+ | **v1.4.0 (`scope='narrative'`)** | **37** | **80%** |
92
+
93
+ Verified locally via `.scratch/dogfood_v1_4_0_citation.py`
94
+ (monkey-patched consumer call with `scope="narrative"`).
95
+
96
+ The residual 37 are a mix of:
97
+ - **Real misalignments** consumer should triage (e.g., `ADR-025`
98
+ cited for a threshold claim when ADR-025 is the cost ADR — could
99
+ be a wrong-ADR bug or a multi-topic ADR not captured by the
100
+ consumer's category-keyword map).
101
+ - **Single-topic sentences** where the first-match category
102
+ inferred from the sentence genuinely differs from the ADR's
103
+ actual category. The multi-topic Pattern α fallback only fires
104
+ when ≥2 categories match; single-topic prose stays on the
105
+ legacy first-match check.
106
+ - **Edge cases** requiring parser-level understanding of how an
107
+ ADR's scope intersects with a multi-clause sentence's topics.
108
+
109
+ The original #82 acceptance criterion was ≤20 warnings (the
110
+ filer's estimate of "genuinely ambiguous citations"). v1.4.0
111
+ hits 37 — above the target but a 5× reduction overall. The
112
+ remaining gap requires either (a) consumer-side expansion of
113
+ `CATEGORY_KEYWORDS` to capture multi-topic ADRs, (b) consumer
114
+ prose adjustments for the real misalignments, or (c) future
115
+ v1.4.x refinements to the validator's heuristic. Consumer
116
+ HARD-gate promotion remains a judgment call — the residual 37
117
+ includes some real misalignments worth fixing.
118
+
119
+ ### Consumer adoption path
120
+
121
+ Consumer (`prompt-injection-detection-submission`):
122
+ 1. Re-pin `eval-toolkit>=1.4.0,<2`.
123
+ 2. Add `scope="narrative"` to their `validate_citations(...)` call
124
+ in `scripts/audit_citation_alignment.py`.
125
+ 3. **Bundled HARD-gate promotion** of BOTH `audit_value_bindings`
126
+ AND `audit_citation_alignment` now credible per the v1.3.8
127
+ plan. Promotes from SOFT to HARD in their next v1.3.X release.
128
+
129
+ ### Tests
130
+
131
+ 61 across the audit-validator suite (43 audit_value_bindings + 18
132
+ audit_citation_alignment; 6 new for v1.4.0 — Pattern α / β / γ /
133
+ scope='all' backward-compat / shared-helpers / combined dogfood).
134
+ All pass. Public API snapshot regenerated for `__version__` bump
135
+ + `validate_citations` signature with new `scope` kwarg.
136
+
137
+ ### Out of scope (deferred)
138
+
139
+ - **`audit_sister_doc_concept_drift` Layer 2 / Layer 3** — embedding-
140
+ based validator (v1.0.4); different false-positive surface. Add
141
+ layers only if consumer demand emerges.
142
+ - **Public helper promotion** (`eval_toolkit.audit_narrative`) —
143
+ YAGNI per ADR 0007 §A2.
144
+ - **Configurable category-keyword-window extension kwargs** —
145
+ YAGNI; add in v1.4.x patch if demand emerges.
146
+
8
147
  ## [1.3.0] — 2026-05-26 — `audit_value_bindings` cross-detector list-grammar pairing rules (closes #81)
9
148
 
10
149
  Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.3.0
3
+ Version: 1.5.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -60,6 +60,9 @@ Requires-Dist: sphinx-autodoc-typehints>=2.0; extra == 'docs'
60
60
  Requires-Dist: sphinx-copybutton>=0.5; extra == 'docs'
61
61
  Requires-Dist: sphinx-design>=0.6; extra == 'docs'
62
62
  Requires-Dist: sphinx>=7.3; extra == 'docs'
63
+ Provides-Extra: eda
64
+ Requires-Dist: matplotlib>=3.8; extra == 'eda'
65
+ Requires-Dist: pandas>=2.0; extra == 'eda'
63
66
  Provides-Extra: embeddings
64
67
  Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
65
68
  Provides-Extra: losses
@@ -79,3 +79,4 @@ What would have to change for this decision to be reopened?
79
79
  | [0004](0004-naming-conventions.md) | Naming conventions for modules, classes, and parameters | Accepted | 2026-05-23 |
80
80
  | [0005](0005-structured-keys-for-audit-validators.md) | Structured keys over positional tuples for canonical-identity types in audit validators | Accepted | 2026-05-26 |
81
81
  | [0006](0006-pairing-rules-for-cross-detector-list-grammar.md) | Pairing rules for cross-detector list-grammar in audit validators | Accepted | 2026-05-26 |
82
+ | [0007](0007-three-layer-architecture-for-audit-validators.md) | Three-layer architecture for audit validators (family-wide) | Accepted | 2026-05-26 |
@@ -74,6 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
74
74
  # (granular extras — losses callers should not have to install the larger
75
75
  # transformers stack). Shares the torch version pin with [probes].
76
76
  losses = ["torch>=2.0"]
77
+ # v1.5.0 (feat/eda-data-audit): eval_toolkit.eda Job-1 integrity-gate layer.
78
+ # Tier-2 surface (ADR 0003) — torch-free by design. pandas powers the
79
+ # DataFrameLoader reuse path; matplotlib is reserved for the EDA layer's
80
+ # future profiling plots. Intentionally NO sentence-transformers / torch:
81
+ # the near-dup / cross-split checks use the lexical TfidfCosineStrategy and
82
+ # token-length quantiles take a caller-supplied tokenizer (no transformers
83
+ # import in this module). NOT folded into [all] / [dev] — opt-in only.
84
+ eda = ["pandas>=2.0", "matplotlib>=3.8"]
77
85
  # NO-OP extra kept for backward compatibility (R3 at v0.49.0).
78
86
  #
79
87
  # jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
@@ -0,0 +1,425 @@
1
+ """Shared narrative-prose helpers for the `audit_*` validator family.
2
+
3
+ This private flat module hosts the Layer 2 (scope) + Layer 3 (pairing-
4
+ rule) building blocks that emerged from the v1.1.0 → v1.3.0 cycle of
5
+ ``audit_value_bindings`` and are reused by ``audit_citation_alignment``
6
+ at v1.4.0+. Per ADR 0007, the three-layer correctness model (identity
7
+ + scope + pairing) applies family-wide; this module is the canonical
8
+ home for the prose-pattern primitives that the scope + pairing layers
9
+ build on.
10
+
11
+ Design notes:
12
+
13
+ - **Private flat module** (underscore-prefixed name): matches ADR 0001's
14
+ `_rng.py` / `_parallel.py` / `_sweep.py` precedent. Not in the
15
+ package's public ``_EXPORTS`` resolver; consumers import via
16
+ ``eval_toolkit.audit_*`` modules, which in turn import from here.
17
+ - **Helpers preserve their exact signatures from audit_value_bindings.py**
18
+ (v1.1.0–v1.3.0 vintage) — extraction is a signature-preserving
19
+ refactor. All 43 existing audit_value_bindings tests continue to
20
+ pass unchanged.
21
+ - **Keyword frozensets are audit_value_bindings-specific** (delta /
22
+ floor / group-subject keywords are about value-binding prose, not
23
+ citation prose). Other validators that need similar lists define
24
+ their own constants. The SHARED parts are the regex-compilation
25
+ utility and the structural helpers (exclusion ranges, sentence
26
+ boundaries) that are validator-agnostic.
27
+
28
+ Cross-references:
29
+ - ADR 0005 — identity layer (BindingKey)
30
+ - ADR 0006 — pairing layer (audit_value_bindings-specific)
31
+ - ADR 0007 — three-layer architecture, family-wide
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ import bisect
37
+ import re
38
+ from collections.abc import Sequence
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Module-level keyword sets and compiled patterns.
42
+ # Specific to audit_value_bindings' Layer 2 filters (T1/T2/C); kept here
43
+ # so the validator can `from eval_toolkit._narrative import ...` rather
44
+ # than maintaining two copies. Other audit_* validators define their
45
+ # own keyword sets analogously.
46
+ # ---------------------------------------------------------------------------
47
+
48
+
49
+ # _DELTA_KEYWORDS: case-insensitive whole-token markers indicating a
50
+ # value is a paired-delta or comparative magnitude, not a binding claim.
51
+ # T1 filter suppresses candidate values when any of these appears within
52
+ # ±30 chars of the value position (under scope="narrative").
53
+ _DELTA_KEYWORDS: frozenset[str] = frozenset(
54
+ {
55
+ # Unambiguous delta nouns/verbs (consumer prose patterns):
56
+ "delta",
57
+ "drop",
58
+ "drops",
59
+ "lift",
60
+ "lifts",
61
+ "gap",
62
+ "margin",
63
+ # Comparison verbs that signal "this is a relative magnitude":
64
+ "regresses",
65
+ "improves",
66
+ "beats",
67
+ "exceeds",
68
+ "trails",
69
+ "underperforms",
70
+ # "vs"/"versus" intentionally INCLUDED — they're the canonical
71
+ # delta separator in consumer prose ("AUPRC 0.556 vs 0.519").
72
+ # The before-only window keeps these tight: "X vs Y" fires on
73
+ # Y (preceded by "vs"), not X. T3 also catches the same-sentence
74
+ # duplicate-binding flag separately.
75
+ "vs",
76
+ "versus",
77
+ # Comparison directions — kept under before-only window so
78
+ # "drops -0.071 below" suppresses -0.071 (sign also catches),
79
+ # but "0.515 (delta -0.132)" doesn't suppress 0.515 ("delta"
80
+ # is AFTER 0.515).
81
+ # Excluded: "against", "above", "ahead", "behind" — too
82
+ # ambiguous; common comparison prepositions that appear in
83
+ # legitimate binding claims.
84
+ "below",
85
+ }
86
+ )
87
+
88
+ # _FLOOR_KEYWORDS: markers indicating a value is a random-baseline or
89
+ # floor reference, not a detector binding. T2 filter suppresses
90
+ # candidate values when any of these appears within −50 / +5 chars
91
+ # (asymmetric: floor mentions canonically precede the value, e.g.,
92
+ # "random AUPRC is 0.374").
93
+ #
94
+ # Intentionally narrow: "baseline", "prior", "majority" are EXCLUDED
95
+ # because they have legitimate non-floor senses ("TF-IDF baseline",
96
+ # "prior work", "majority of detectors"). The consumer's prose
97
+ # patterns with these words ("below the prevalence baseline of 0.374")
98
+ # are caught by T1 via "below"/"above" instead — the comparative
99
+ # preposition is the reliable signal, not the noun.
100
+ _FLOOR_KEYWORDS: frozenset[str] = frozenset(
101
+ {
102
+ "random",
103
+ "floor",
104
+ "chance",
105
+ "trivial",
106
+ }
107
+ )
108
+
109
+ # _ABBREV_BEFORE_DOT: tokens that should NOT trigger a sentence
110
+ # boundary when followed by `.`. The multi-letter pattern (e.g., i.e.,
111
+ # c.f.) is handled separately via letter-dot-letter detection.
112
+ _ABBREV_BEFORE_DOT: frozenset[str] = frozenset(
113
+ {
114
+ "vs",
115
+ "etc",
116
+ "cf",
117
+ "fig",
118
+ "eq",
119
+ "pp",
120
+ "viz",
121
+ "ca",
122
+ }
123
+ )
124
+
125
+
126
+ def _compile_keyword_pattern(keywords: frozenset[str]) -> re.Pattern[str]:
127
+ """Compile case-insensitive word-boundary OR regex matching any keyword."""
128
+ parts = sorted(re.escape(kw) for kw in keywords)
129
+ return re.compile(r"\b(?:" + "|".join(parts) + r")\b", re.IGNORECASE)
130
+
131
+
132
+ _DELTA_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_DELTA_KEYWORDS)
133
+ _FLOOR_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_FLOOR_KEYWORDS)
134
+
135
+
136
+ # Group-subject adjectives that introduce a multi-detector statement.
137
+ # When prose says "for the trained detectors", the following value
138
+ # refers to a GROUP (LoRA + TF-IDF + ... whatever bindings exist),
139
+ # not a single canonical detector. The validator can't infer which
140
+ # specific detectors own the group value with positional heuristics,
141
+ # so v1.3.0 suppresses the candidate rather than attempting multi-
142
+ # detector inference (a v1.4.0+ candidate per ADR 0006).
143
+ _GROUP_SUBJECT_KEYWORDS: frozenset[str] = frozenset(
144
+ {
145
+ "trained",
146
+ "frozen",
147
+ "baseline",
148
+ "all",
149
+ "both",
150
+ "other",
151
+ }
152
+ )
153
+
154
+ # Module-level: detector-independent group-subject regex. Matches
155
+ # "for the {trained|frozen|...} detectors" (with optional "the"; both
156
+ # singular and plural "detector"/"detectors" tolerated).
157
+ _GROUP_SUBJECT_PATTERN: re.Pattern[str] = re.compile(
158
+ r"\bfor\s+(?:the\s+)?(?:"
159
+ + "|".join(sorted(re.escape(kw) for kw in _GROUP_SUBJECT_KEYWORDS))
160
+ + r")\s+detectors?\b",
161
+ re.IGNORECASE,
162
+ )
163
+
164
+
165
+ # ---------------------------------------------------------------------------
166
+ # Layer 2: content-type filtering helpers (`scope="narrative"`).
167
+ # Used by audit_value_bindings (v1.1.0+) and audit_citation_alignment
168
+ # (v1.4.0+) to exclude markdown table rows, bracketed expressions, and
169
+ # fenced code blocks from candidate-value / candidate-citation matching.
170
+ # ---------------------------------------------------------------------------
171
+
172
+
173
+ def _build_exclusion_ranges(
174
+ text: str,
175
+ line_starts: Sequence[int],
176
+ ) -> list[tuple[int, int]]:
177
+ """Compute sorted character ranges that ``scope="narrative"`` excludes.
178
+
179
+ Excluded content types (per the lint-scope design discussion in
180
+ ADR 0005):
181
+
182
+ - **Markdown table rows**: lines starting with optional whitespace
183
+ then ``|``. Tables are structured data audited via different
184
+ mechanisms (e.g., direct results-table verification), not via
185
+ narrative-prose binding-claim checks. Values in cells are
186
+ typically inline statistics (multiple metrics per row), and the
187
+ validator's positional heuristics can't disambiguate them.
188
+ - **Bracketed expressions** ``[...]``: confidence intervals,
189
+ reference markers, ranges. The numeric content inside brackets
190
+ is not a point-estimate claim; the validator should not flag it.
191
+ - **Fenced code blocks**: triple-backtick blocks contain code or
192
+ literal data, not narrative claims.
193
+
194
+ Returns a sorted list of ``(start, end)`` character intervals
195
+ (half-open) for use with :func:`_is_excluded`.
196
+ """
197
+ excluded: list[tuple[int, int]] = []
198
+ in_code_block = False
199
+ code_block_start = 0
200
+ n_lines = len(line_starts)
201
+ for line_idx in range(n_lines):
202
+ line_start = line_starts[line_idx]
203
+ line_end = line_starts[line_idx + 1] if line_idx + 1 < n_lines else len(text)
204
+ line = text[line_start:line_end]
205
+
206
+ # Triple-backtick code-fence toggle. The fence line itself is
207
+ # also part of the excluded range (so values aren't matched
208
+ # from within the fence marker, though that's unlikely).
209
+ stripped = line.lstrip()
210
+ if stripped.startswith("```"):
211
+ if not in_code_block:
212
+ in_code_block = True
213
+ code_block_start = line_start
214
+ else:
215
+ in_code_block = False
216
+ excluded.append((code_block_start, line_end))
217
+ continue
218
+ if in_code_block:
219
+ # Lines inside a code block are folded into the outer
220
+ # range emitted at the closing fence; no per-line emission.
221
+ continue
222
+
223
+ # Markdown table row.
224
+ if stripped.startswith("|"):
225
+ excluded.append((line_start, line_end))
226
+ continue
227
+
228
+ # Bracketed expressions on this line. Multiple `[...]` allowed.
229
+ # Nested brackets are rare in measurement prose; first close
230
+ # wins.
231
+ i = 0
232
+ while True:
233
+ open_idx = line.find("[", i)
234
+ if open_idx == -1:
235
+ break
236
+ close_idx = line.find("]", open_idx + 1)
237
+ if close_idx == -1:
238
+ break
239
+ excluded.append((line_start + open_idx, line_start + close_idx + 1))
240
+ i = close_idx + 1
241
+
242
+ # Handle unterminated code block (defensive: treat rest of file as
243
+ # excluded). Sort by start position.
244
+ if in_code_block:
245
+ excluded.append((code_block_start, len(text)))
246
+ excluded.sort()
247
+ return excluded
248
+
249
+
250
+ def _is_excluded(pos: int, excluded: Sequence[tuple[int, int]]) -> bool:
251
+ """Return True if ``pos`` falls inside any excluded range.
252
+
253
+ Uses binary search on the sorted ranges. Half-open semantics: a
254
+ range ``(start, end)`` excludes positions ``start <= pos < end``.
255
+ """
256
+ if not excluded:
257
+ return False
258
+ # Find rightmost range with start <= pos.
259
+ lo, hi = 0, len(excluded)
260
+ while lo < hi:
261
+ mid = (lo + hi) // 2
262
+ if excluded[mid][0] <= pos:
263
+ lo = mid + 1
264
+ else:
265
+ hi = mid
266
+ if lo == 0:
267
+ return False
268
+ start, end = excluded[lo - 1]
269
+ return start <= pos < end
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Sentence-boundary detection (paragraph-aware, abbreviation-guarded).
274
+ # Used by v1.2.0 T3/T4 in audit_value_bindings and the v1.4.0 Layer 3
275
+ # rule γ in audit_citation_alignment (sentence-boundary respect for
276
+ # category-keyword window).
277
+ # ---------------------------------------------------------------------------
278
+
279
+
280
+ def _is_sentence_terminator_dot(text: str, dot_pos: int) -> bool:
281
+ """Return True if the dot at ``dot_pos`` terminates a sentence.
282
+
283
+ False positives the abbreviation guard catches:
284
+
285
+ - Decimal numbers (digit-dot-digit): ``0.5``, ``§5.2``.
286
+ - Letter-dot-letter-dot patterns: ``e.g.``, ``i.e.``, ``c.f.``.
287
+ - Single-token abbreviations preceding the dot (whitespace- /
288
+ punctuation-separated): ``vs.``, ``etc.``, ``cf.``, ``fig.``,
289
+ ``eq.``, ``pp.``, ``viz.``, ``ca.``. See ``_ABBREV_BEFORE_DOT``.
290
+ """
291
+ n = len(text)
292
+ prev_char = text[dot_pos - 1] if dot_pos > 0 else ""
293
+ next_char = text[dot_pos + 1] if dot_pos + 1 < n else ""
294
+ # Decimal: digit-dot-digit.
295
+ if prev_char.isdigit() and next_char.isdigit():
296
+ return False
297
+ # Letter-dot-letter-dot pattern, dot is the SECOND dot in "x.y."
298
+ if (
299
+ dot_pos >= 3
300
+ and prev_char.isalpha()
301
+ and text[dot_pos - 2] == "."
302
+ and text[dot_pos - 3].isalpha()
303
+ ):
304
+ return False
305
+ # Letter-dot-letter-dot pattern, dot is the FIRST dot in "x.y."
306
+ if dot_pos + 2 < n and next_char.isalpha() and text[dot_pos + 2] == ".":
307
+ return False
308
+ # Single-token abbreviation preceding the dot.
309
+ j = dot_pos - 1
310
+ while j >= 0 and text[j].isalpha():
311
+ j -= 1
312
+ word = text[j + 1 : dot_pos].lower()
313
+ return word not in _ABBREV_BEFORE_DOT
314
+
315
+
316
+ def _sentence_boundary_positions(text: str) -> list[int]:
317
+ """Return sorted character positions where each sentence STARTS.
318
+
319
+ Hard breaks (sentence terminators):
320
+
321
+ - ``!`` and ``?`` always terminate.
322
+ - ``.`` terminates unless the abbreviation guard
323
+ (:func:`_is_sentence_terminator_dot`) returns False.
324
+ - ``\\n\\n`` (paragraph break) terminates.
325
+
326
+ Soft breaks (NOT sentence boundaries):
327
+
328
+ - Single ``\\n`` (markdown line-wrap mid-sentence).
329
+ - ``;`` (semicolons in dense list constructions).
330
+ - ``:`` (colons preceding list items or definitions).
331
+
332
+ The first sentence starts at position 0. Subsequent sentence starts
333
+ are recorded at the first non-whitespace character after a hard
334
+ break. Used by audit_value_bindings T3/T4 and
335
+ audit_citation_alignment Layer 3 rule γ.
336
+ """
337
+ positions = [0]
338
+ n = len(text)
339
+ i = 0
340
+ while i < n:
341
+ ch = text[i]
342
+ boundary = False
343
+ skip = 1
344
+ if ch in "!?" or ch == "." and _is_sentence_terminator_dot(text, i):
345
+ boundary = True
346
+ elif ch == "\n" and i + 1 < n and text[i + 1] == "\n":
347
+ boundary = True
348
+ skip = 2
349
+ if boundary:
350
+ j = i + skip
351
+ while j < n and text[j].isspace():
352
+ j += 1
353
+ if j < n and j > positions[-1]:
354
+ positions.append(j)
355
+ i = max(j, i + skip)
356
+ else:
357
+ i += 1
358
+ return positions
359
+
360
+
361
+ def _sentence_id_of(pos: int, sentence_positions: Sequence[int]) -> int:
362
+ """Return the zero-based sentence index containing ``pos``.
363
+
364
+ Uses binary search over the sorted ``sentence_positions``. Returns
365
+ ``0`` for any position before the first sentence start.
366
+ """
367
+ if not sentence_positions:
368
+ return 0
369
+ idx = bisect.bisect_right(sentence_positions, pos) - 1
370
+ return max(0, idx)
371
+
372
+
373
+ def _crosses_sentence_boundary(pos_a: int, pos_b: int, sentence_positions: Sequence[int]) -> bool:
374
+ """Return True if a sentence boundary lies strictly between ``pos_a`` and ``pos_b``.
375
+
376
+ Sentence-boundary positions are derived from
377
+ :func:`_sentence_boundary_positions`. Used by audit_value_bindings
378
+ T4 (reject (detector, value) pairs across a sentence boundary)
379
+ and audit_citation_alignment Layer 3 rule γ (the category-keyword
380
+ extraction window for an ADR citation must not cross a sentence
381
+ boundary).
382
+ """
383
+ if not sentence_positions:
384
+ return False
385
+ lo = min(pos_a, pos_b)
386
+ hi = max(pos_a, pos_b)
387
+ idx = bisect.bisect_right(sentence_positions, lo)
388
+ return idx < len(sentence_positions) and sentence_positions[idx] <= hi
389
+
390
+
391
+ # ---------------------------------------------------------------------------
392
+ # Value-context helpers (used by audit_value_bindings T1/T2; kept here
393
+ # for any future audit_* validator that wants the same primitives).
394
+ # ---------------------------------------------------------------------------
395
+
396
+
397
+ def _is_signed_value(text: str, val_start: int) -> bool:
398
+ """True if the value at ``val_start`` is immediately preceded by ``+`` or ``-``.
399
+
400
+ The sign marker indicates a paired-delta or comparative magnitude
401
+ (e.g., ``-0.071`` AUPRC delta), not a binding claim. T1 filter
402
+ skips these under ``scope="narrative"``.
403
+ """
404
+ return val_start > 0 and text[val_start - 1] in "+-"
405
+
406
+
407
+ def _has_keyword_in_window(
408
+ text: str,
409
+ val_start: int,
410
+ pattern: re.Pattern[str],
411
+ before_chars: int,
412
+ after_chars: int,
413
+ ) -> bool:
414
+ """True if ``pattern`` matches anywhere in the character window around ``val_start``.
415
+
416
+ Used by audit_value_bindings T1 (delta keywords) and T2 (floor
417
+ keywords) to detect context cues near a candidate value.
418
+ ``before_chars`` and ``after_chars`` control the asymmetric
419
+ window — floor mentions typically PRECEDE the value (e.g.,
420
+ "random AUPRC is 0.374"), while delta mentions can be on either
421
+ side.
422
+ """
423
+ start = max(0, val_start - before_chars)
424
+ end = min(len(text), val_start + after_chars)
425
+ return bool(pattern.search(text, start, end))
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.3.0"
5
+ __version__ = "1.5.0"