eval-toolkit 1.0.3__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/CHANGELOG.md +109 -0
  2. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/PKG-INFO +1 -1
  3. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/__init__.py +8 -0
  4. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/_version.py +1 -1
  5. eval_toolkit-1.0.5/src/eval_toolkit/audit_sister_doc_concept_drift.py +432 -0
  6. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/public_api/snapshot.json +25 -1
  7. eval_toolkit-1.0.5/tests/test_audit_sister_doc_concept_drift.py +337 -0
  8. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/.gitignore +0 -0
  9. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/LICENSE +0 -0
  10. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/README.md +0 -0
  11. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/STYLE.md +0 -0
  12. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/archive/README.md +0 -0
  13. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/research/README.md +0 -0
  14. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/research/datasets/README.md +0 -0
  15. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/research/papers/data-integrity/README.md +0 -0
  16. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/research/papers/eval-ecosystem/README.md +0 -0
  17. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/research/papers/inference/README.md +0 -0
  18. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/research/papers/prompt-injection/README.md +0 -0
  19. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/source/adr/README.md +0 -0
  20. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/docs/source/methodology/README.md +0 -0
  21. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/pyproject.toml +0 -0
  22. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/__main__.py +0 -0
  23. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/_deprecated.py +0 -0
  24. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/_parallel.py +0 -0
  25. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/_rng.py +0 -0
  26. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/_sweep.py +0 -0
  27. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/adversarial.py +0 -0
  28. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/analysis.py +0 -0
  29. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/artifacts.py +0 -0
  30. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  31. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/audit_value_bindings.py +0 -0
  32. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/bootstrap.py +0 -0
  33. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/calibration.py +0 -0
  34. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/claims.py +0 -0
  35. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/config.py +0 -0
  36. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/docs.py +0 -0
  37. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/embeddings.py +0 -0
  38. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/evidence.py +0 -0
  39. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/harness.py +0 -0
  40. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/leakage.py +0 -0
  41. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/loaders.py +0 -0
  42. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/losses.py +0 -0
  43. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/manifest.py +0 -0
  44. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/metric_specs.py +0 -0
  45. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/metrics.py +0 -0
  46. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/operating_points.py +0 -0
  47. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/paths.py +0 -0
  48. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/plotting.py +0 -0
  49. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/preprocessing.py +0 -0
  50. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/probes.py +0 -0
  51. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/protocols.py +0 -0
  52. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/provenance.py +0 -0
  53. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/py.typed +0 -0
  54. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  55. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  56. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  57. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  58. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/schemas/results.v1.json +0 -0
  59. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  60. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/scorecards.py +0 -0
  61. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/seeds.py +0 -0
  62. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/splits.py +0 -0
  63. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/stacking.py +0 -0
  64. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/text_dedup.py +0 -0
  65. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/src/eval_toolkit/thresholds.py +0 -0
  66. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  67. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  68. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  69. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  70. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  71. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  72. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  73. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  74. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  75. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  76. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/benchmarks/__init__.py +0 -0
  77. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  78. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/conftest.py +0 -0
  79. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/bootstrap_ci/cases.json +0 -0
  80. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/data/dedup_holdout.jsonl +0 -0
  81. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/data/dedup_holdout_expected.json +0 -0
  82. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  83. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/docs/expected.md +0 -0
  84. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/docs/input.md +0 -0
  85. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/docs/metrics.json +0 -0
  86. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  87. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/strategies.py +0 -0
  88. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_adversarial.py +0 -0
  89. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_analysis.py +0 -0
  90. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_artifacts.py +0 -0
  91. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_audit_citation_alignment.py +0 -0
  92. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_audit_value_bindings.py +0 -0
  93. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_block_bootstrap_on_folds.py +0 -0
  94. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_bootstrap_calibration_mc.py +0 -0
  95. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_bootstrap_edge_cases.py +0 -0
  96. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_bootstrap_golden.py +0 -0
  97. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_bootstrap_njobs.py +0 -0
  98. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_bootstrap_props.py +0 -0
  99. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_bootstrap_research_grounded.py +0 -0
  100. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_bootstrap_unit.py +0 -0
  101. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_calibration_binary_adapters.py +0 -0
  102. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_calibration_bootstrap_chain.py +0 -0
  103. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_calibration_determinism.py +0 -0
  104. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_calibration_optimization_failures.py +0 -0
  105. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_calibration_props.py +0 -0
  106. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_calibration_research_grounded.py +0 -0
  107. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_calibration_unit.py +0 -0
  108. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_claims.py +0 -0
  109. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_claims_coverage.py +0 -0
  110. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_claims_props.py +0 -0
  111. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_cli.py +0 -0
  112. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_config.py +0 -0
  113. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_coverage_bootstrap.py +0 -0
  114. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_coverage_calibration.py +0 -0
  115. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_coverage_harness.py +0 -0
  116. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_coverage_metrics.py +0 -0
  117. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_coverage_plotting.py +0 -0
  118. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_croissant_e2e.py +0 -0
  119. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_dedup_split_leakage_chain.py +0 -0
  120. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_deprecated_scalars_shim.py +0 -0
  121. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_deprecations.py +0 -0
  122. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_docs_golden.py +0 -0
  123. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_docs_props.py +0 -0
  124. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_embeddings.py +0 -0
  125. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_evidence_validators.py +0 -0
  126. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_harness_edge_cases.py +0 -0
  127. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_harness_fault_injection.py +0 -0
  128. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_harness_folded.py +0 -0
  129. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_harness_internals.py +0 -0
  130. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_harness_metric_options.py +0 -0
  131. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_harness_parallelism.py +0 -0
  132. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_harness_smoke.py +0 -0
  133. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_import_boundaries.py +0 -0
  134. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_is_metric_defined_for_slice.py +0 -0
  135. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_lazy_extras_messages.py +0 -0
  136. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_leakage.py +0 -0
  137. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_leakage_error_paths.py +0 -0
  138. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_leakage_props.py +0 -0
  139. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_loaders.py +0 -0
  140. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_loaders_coverage.py +0 -0
  141. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_loaders_props.py +0 -0
  142. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_logging.py +0 -0
  143. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_losses.py +0 -0
  144. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_manifest.py +0 -0
  145. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_manifest_contamination_round_trip.py +0 -0
  146. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_manifest_props.py +0 -0
  147. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_manifest_validation.py +0 -0
  148. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_metrics_props.py +0 -0
  149. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_metrics_stratified_subsets.py +0 -0
  150. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_metrics_unit.py +0 -0
  151. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_misc_coverage.py +0 -0
  152. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_numeric_edge_cases.py +0 -0
  153. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_ood_loader.py +0 -0
  154. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_operating_points.py +0 -0
  155. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_operating_points_props.py +0 -0
  156. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_parallel.py +0 -0
  157. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_paths.py +0 -0
  158. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_pipeline_e2e.py +0 -0
  159. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_plotting_edge.py +0 -0
  160. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_plotting_smoke.py +0 -0
  161. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_plotting_visual.py +0 -0
  162. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_preprocessing.py +0 -0
  163. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_probes.py +0 -0
  164. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_protocol_conformance.py +0 -0
  165. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_provenance.py +0 -0
  166. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_public_api.py +0 -0
  167. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_recall_at_fpr.py +0 -0
  168. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_reference_equivalence.py +0 -0
  169. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_reproducibility_integration.py +0 -0
  170. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_rng.py +0 -0
  171. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_schemas.py +0 -0
  172. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_scorecard.py +0 -0
  173. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_seeds.py +0 -0
  174. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_splits.py +0 -0
  175. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_splits_leakage_integration.py +0 -0
  176. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_splits_props.py +0 -0
  177. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_stacking.py +0 -0
  178. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_sweep.py +0 -0
  179. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_text_dedup.py +0 -0
  180. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_text_dedup_coverage.py +0 -0
  181. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_text_dedup_props.py +0 -0
  182. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_text_dedup_strategies.py +0 -0
  183. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_thresholds.py +0 -0
  184. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_thresholds_constant_score.py +0 -0
  185. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_thresholds_coverage.py +0 -0
  186. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_thresholds_props.py +0 -0
  187. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_thresholds_research_grounded.py +0 -0
  188. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_tokenization_leakage_check.py +0 -0
  189. {eval_toolkit-1.0.3 → eval_toolkit-1.0.5}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,115 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.5] — 2026-05-26 — publish workflow hardening (infrastructure-only)
9
+
10
+ Tier-3 / infrastructure-only release. **No library code or public API
11
+ changes.** Hardens the release pipeline against the failure mode
12
+ observed at v1.0.4, where a documented GitHub Actions CRITICAL
13
+ incident (codeload action download failure across the platform) left
14
+ the v1.0.4 wheel un-published on PyPI despite a successful tag and
15
+ GitHub release. The wheel for v1.0.5 is functionally identical to
16
+ v1.0.4; this release exists primarily as a dress rehearsal for the
17
+ new verification step.
18
+
19
+ ### Added — `.github/workflows/publish.yml`
20
+
21
+ - **`workflow_dispatch:` trigger** — recovery path for failed
22
+ tag-triggered runs. Manually re-trigger via
23
+ `gh workflow run publish.yml --ref vX.Y.Z` or the Actions UI
24
+ "Run workflow" dropdown. Always uses the workflow file from main
25
+ HEAD, so workflow patches take effect immediately for recovery.
26
+ - **Post-publish `Verify PyPI receipt` step** — polls
27
+ `pypi.org/pypi/eval-toolkit/<version>/json` for HTTP 200 over a
28
+ 6-minute window (12 × 30s backoff); fails loudly if the wheel
29
+ never lands. Catches silent half-releases where
30
+ `pypa/gh-action-pypi-publish` returns success but PyPI never
31
+ receives the wheel.
32
+
33
+ ### Added — `docs/source/RELEASING.md`
34
+
35
+ - **"Tag-triggered publish failed; need to re-publish to PyPI"**
36
+ recovery recipe under Known gotchas. Documents both the
37
+ `gh run rerun` path (when the original run can be retried) and
38
+ the `workflow_dispatch` path (when the workflow has been patched
39
+ on main since the original tag). References the v1.0.4 incident
40
+ as the canonical example.
41
+
42
+ ### Notes
43
+
44
+ - `setup-uv@v8.1.0` pin is intentionally unchanged. The v1.0.4
45
+ failure was a documented GitHub Actions/codeload incident, not
46
+ an action-specific issue; replacing setup-uv with a curl-install
47
+ would lose the cache layer + Python integration + version-from-
48
+ pyproject detection it provides, and would not have prevented the
49
+ observed failure (actions/checkout downloaded successfully in the
50
+ same failing run; codeload was the SPOF, not setup-uv).
51
+ - The other 5 workflows (ci/codeql/docs/nightly-benchmarks/
52
+ nightly-mc) are not patched because they self-heal on the next
53
+ push; the SPOF only matters for one-shot tag-triggered runs.
54
+
55
+ ## [1.0.4] — 2026-05-26 — `audit_sister_doc_concept_drift` module (closes #72)
56
+
57
+ Tier-2 ADDITIVE — third (and final) member of the audit-validator
58
+ family. Flat-module per [ADR 0001](docs/source/adr/0001-flat-module-layout.md).
59
+ Family complete: `audit_citation_alignment` (v1.0.1) + `audit_value_bindings`
60
+ (v1.0.3) + `audit_sister_doc_concept_drift` (this release).
61
+
62
+ ### Added
63
+
64
+ - **`audit_sister_doc_concept_drift` module** exporting
65
+ `validate_sister_doc_concept_drift()` + `DriftCluster` +
66
+ `SisterDocDriftReport` as Tier 1 STRICT (per
67
+ [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
68
+ Catches the bug class where two linked sister docs reference the
69
+ same concept token (e.g., `T1`, `manifest v3`) but the
70
+ surrounding-sentence definitions semantically disagree.
71
+ Cross-doc semantic drift survives lychee (links resolve), anchor
72
+ audits (anchors exist), and numeric audits (qualitative prose).
73
+ - Algorithm: per concept_token, scan all files for occurrences;
74
+ extract surrounding-sentence context (`context_window_sentences`);
75
+ embed each snippet via the supplied `embedder` (default lazily
76
+ routes to `make_minilm_embedder()` — requires `[embeddings]`
77
+ optional extra); cluster via single-linkage cosine similarity at
78
+ `similarity_threshold` (default 0.7); tokens with >1 cluster are
79
+ flagged as `DriftCluster`.
80
+ - The `embedder: Callable[[Sequence[str]], np.ndarray] | None`
81
+ parameter matches the existing
82
+ `EmbeddingCosineStrategy.embedder` Protocol — consumers can pass
83
+ any embedder (BGE, E5, OpenAI, or a mock for tests). Default
84
+ `None` defers `sentence_transformers` import to call-time
85
+ (`[embeddings]` extra is required only when caller doesn't supply
86
+ their own embedder).
87
+ - Motivating bug class: consumer audit found
88
+ `docs/REPRODUCIBILITY.md:85` defines `T1` as "full canonical
89
+ re-eval (GPU; A100 80GB)" while `WRITEUP/reproducibility.md:33`
90
+ defines `T1` as "smoke (laptop, $0, ~10 min)" — the two docs
91
+ cross-link as "Aggregator docs" so a reviewer following the link
92
+ lands on contradictory definitions.
93
+ - 13 tests at `tests/test_audit_sister_doc_concept_drift.py` using a
94
+ deterministic mock embedder (no `sentence_transformers` dependency
95
+ for unit tests). Covers: seed-case T1 drift, consistent definition
96
+ across files, single-occurrence consistency, unreferenced-token
97
+ coverage tracking, multi-token mixed (T0 + T1 + T3), threshold
98
+ sensitivity, whole-word boundary (`T1` vs `T10` vs `t1`), context
99
+ window scope, empty inputs, 3-way drift, frozen-dataclass
100
+ invariants, lazy default-embedder import. Closes #72.
101
+
102
+ ### Audit-validator family complete
103
+
104
+ | Validator | Released | Issue |
105
+ |---|---|---|
106
+ | `audit_citation_alignment` | v1.0.1 | #73 |
107
+ | `audit_value_bindings` | v1.0.3 | #71 |
108
+ | `audit_sister_doc_concept_drift` | v1.0.4 (this release) | #72 |
109
+
110
+ All three follow the flat-module convention (ADR 0001), closed-config
111
+ pattern (consumer supplies the auditable surface; validator owns the
112
+ parsing+matching logic; ADR 0002), and Tier 1 STRICT top-level
113
+ exports per ADR 0003. Consumer adoption pattern is the same across
114
+ all three: thin `scripts/audit_<name>.py` CLI wrapper invoking the
115
+ upstream validator.
116
+
8
117
  ## [1.0.3] — 2026-05-26 — `audit_value_bindings` module (closes #71)
9
118
 
10
119
  Tier-2 ADDITIVE — second member of the audit-validator family
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -67,6 +67,14 @@ _EXPORTS: dict[str, str] = {
67
67
  "ValueBindingsReport": "eval_toolkit.audit_value_bindings",
68
68
  "Violation": "eval_toolkit.audit_value_bindings",
69
69
  "validate_reader_value_bindings": "eval_toolkit.audit_value_bindings",
70
+ # --- audit_sister_doc_concept_drift ---
71
+ # Flat-module per ADR 0001. Closes #72. Motivated by consumer T1
72
+ # definition contradiction across sister reproducibility docs.
73
+ # Needs the [embeddings] extra only for the default embedder (lazy
74
+ # resolution); the embedder kwarg lets consumers swap in any callable.
75
+ "DriftCluster": "eval_toolkit.audit_sister_doc_concept_drift",
76
+ "SisterDocDriftReport": "eval_toolkit.audit_sister_doc_concept_drift",
77
+ "validate_sister_doc_concept_drift": "eval_toolkit.audit_sister_doc_concept_drift",
70
78
  # --- losses ---
71
79
  "RecallAtLowFPR": "eval_toolkit.losses",
72
80
  # --- preprocessing ---
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.0.3"
5
+ __version__ = "1.0.5"
@@ -0,0 +1,432 @@
1
+ r"""Sister-doc concept-drift validator (embedding-similarity-based).
2
+
3
+ Catches the bug class where two linked sister docs both reference the
4
+ same concept token (e.g., ``T1``, ``manifest v3``, ``verified_disjoint``)
5
+ but the **surrounding-sentence definitions disagree**. Cross-doc drift
6
+ survives lychee (links resolve), anchor audits (anchors exist), and
7
+ numeric audits (numbers don't disagree because the prose is qualitative).
8
+
9
+ Motivating test case (from `prompt-injection-detection-submission`
10
+ audit, two reproducibility surfaces)::
11
+
12
+ docs/REPRODUCIBILITY.md:85:
13
+ T1 = "full canonical re-eval (GPU; A100 80GB): make headline-cloud
14
+ re-runs ... ~7h wall-clock; ~$28 GPU spend"
15
+
16
+ WRITEUP/reproducibility.md:33:
17
+ T1 = "smoke (laptop, $0, ~10 min): `make smoke` verifies code health"
18
+
19
+ Both files cross-link as "Aggregator docs"; following the link lands a
20
+ reader on contradictory T1 definitions.
21
+
22
+ Algorithm
23
+ ---------
24
+ 1. For each ``concept_token``, scan all ``files`` for occurrences. Each
25
+ occurrence captures the *surrounding sentence(s)* (configurable
26
+ ``context_window_sentences``) — that's the candidate "definition".
27
+ 2. Embed each surrounding-sentence string via the supplied ``embedder``
28
+ (default: lazy :func:`eval_toolkit.embeddings.make_minilm_embedder`).
29
+ 3. Cluster occurrences by single-linkage: two occurrences belong to the
30
+ same cluster iff their cosine similarity is ``>= similarity_threshold``.
31
+ 4. A concept_token with **>1 cluster** is a :class:`DriftCluster` — its
32
+ occurrences split into semantically distinct definition groups.
33
+ 5. A concept_token with **exactly 1 cluster** is consistent across all
34
+ files.
35
+
36
+ Design (per ADR 0001 flat-module + ADR 0002 closed-config + ADR 0003
37
+ Tier 2 ADDITIVE on the ``[embeddings]`` optional extra surface):
38
+
39
+ - Consumer supplies the concept-token list + file list; validator owns
40
+ parsing + embedding + clustering + report assembly.
41
+ - Embedder is a callable ``Callable[[Sequence[str]], np.ndarray]`` —
42
+ matches the existing :func:`~eval_toolkit.embeddings.make_minilm_embedder`
43
+ factory contract. ``embedder=None`` defers to the canonical MiniLM
44
+ recipe lazily (avoids forcing the ``[embeddings]`` extra import at
45
+ module load time).
46
+ - Flat-module: ``eval_toolkit.audit_sister_doc_concept_drift.*`` (NOT a
47
+ subpackage per ADR 0001 stay-flat-through-v1.x).
48
+
49
+ Closes upstream issue #72. v1.0.4. Completes the audit-validator family
50
+ of 3 (citation_alignment v1.0.1, value_bindings v1.0.3, sister_doc
51
+ concept_drift v1.0.4).
52
+ """
53
+
54
+ from __future__ import annotations
55
+
56
+ import re
57
+ from collections.abc import Callable, Sequence
58
+ from dataclasses import dataclass
59
+ from pathlib import Path
60
+
61
+ import numpy as np
62
+
63
+ __all__ = [
64
+ "DriftCluster",
65
+ "SisterDocDriftReport",
66
+ "validate_sister_doc_concept_drift",
67
+ ]
68
+
69
+
70
+ DEFAULT_SIMILARITY_THRESHOLD: float = 0.7
71
+ DEFAULT_CONTEXT_WINDOW_SENTENCES: int = 1
72
+
73
+ # Sentence-ish splitter — markdown is not formal prose. Splits on
74
+ # ``.``, ``!``, ``?`` + whitespace before an uppercase letter, digit, or backtick. Imperfect but
75
+ # robust enough for cross-doc concept-drift detection (consumers
76
+ # tolerate boundary slop because clustering is the noise-tolerant
77
+ # downstream step).
78
+ _SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\d`])")
79
+
80
+
81
+ @dataclass(frozen=True)
82
+ class DriftCluster:
83
+ """A concept token whose occurrences split into >1 semantic cluster.
84
+
85
+ Attributes
86
+ ----------
87
+ token : str
88
+ The concept token (e.g., ``"T1"``, ``"manifest v3"``).
89
+ sentences : tuple[tuple[Path, int, str], ...]
90
+ Each occurrence as ``(file, line, surrounding_text)`` — line is
91
+ 1-indexed; surrounding_text is the ``context_window_sentences``-sized
92
+ prose snippet that was embedded for clustering.
93
+ divergence_score : float
94
+ ``1 - min_inter_cluster_similarity`` for the worst-case pair
95
+ between any two clusters. Range ``[0.0, 1.0]``; higher = stronger
96
+ drift signal. ``0.0`` means clusters are barely distinguishable;
97
+ ``1.0`` means orthogonal embeddings.
98
+ """
99
+
100
+ token: str
101
+ sentences: tuple[tuple[Path, int, str], ...]
102
+ divergence_score: float
103
+
104
+
105
+ @dataclass(frozen=True)
106
+ class SisterDocDriftReport:
107
+ """Result of :func:`validate_sister_doc_concept_drift`.
108
+
109
+ Attributes
110
+ ----------
111
+ drift_clusters : tuple[DriftCluster, ...]
112
+ Each concept_token whose occurrences split into >1 cluster.
113
+ Empty tuple = all tokens consistent across the scanned files.
114
+ consistent_tokens : tuple[str, ...]
115
+ Concept tokens whose occurrences clustered to a single group
116
+ (or had ≤1 occurrence total). Reported for completeness +
117
+ coverage tracking.
118
+ coverage : float
119
+ Fraction of ``concept_tokens`` that produced ≥1 occurrence in
120
+ the scanned files. Range ``[0.0, 1.0]``. ``1.0`` means every
121
+ token was referenced; lower values flag stale tokens.
122
+ """
123
+
124
+ drift_clusters: tuple[DriftCluster, ...]
125
+ consistent_tokens: tuple[str, ...]
126
+ coverage: float
127
+
128
+
129
+ def validate_sister_doc_concept_drift(
130
+ *,
131
+ files: Sequence[Path | str],
132
+ concept_tokens: Sequence[str],
133
+ embedder: Callable[[Sequence[str]], np.ndarray] | None = None,
134
+ similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
135
+ context_window_sentences: int = DEFAULT_CONTEXT_WINDOW_SENTENCES,
136
+ ) -> SisterDocDriftReport:
137
+ """Validate cross-doc semantic consistency of concept token definitions.
138
+
139
+ For each ``concept_token``, scan ``files`` for occurrences; extract
140
+ the surrounding ``context_window_sentences``; embed each surrounding
141
+ snippet; cluster by single-linkage cosine similarity at
142
+ ``similarity_threshold``. Tokens that produce >1 cluster are flagged
143
+ as drift.
144
+
145
+ Parameters
146
+ ----------
147
+ files : Sequence[Path | str]
148
+ Markdown files to scan. UTF-8 encoded.
149
+ concept_tokens : Sequence[str]
150
+ Seed list of concept tokens (e.g., ``["T0", "T1", "T3",
151
+ "manifest v3", "verified_disjoint"]``). Each token is matched
152
+ case-sensitively as a whole-word boundary regex
153
+ (``\\b<token>\\b``).
154
+ embedder : Callable[[Sequence[str]], np.ndarray] | None, optional
155
+ Embedder callable returning ``(n, d)`` array. ``None`` (default)
156
+ lazily routes to :func:`eval_toolkit.embeddings.make_minilm_embedder`
157
+ — requires the ``[embeddings]`` optional extra
158
+ (``pip install eval-toolkit[embeddings]``). Custom callables let
159
+ consumers swap in any embedder (BGE, E5, OpenAI, mock for tests).
160
+ similarity_threshold : float, optional
161
+ Cosine-similarity threshold for single-linkage clustering.
162
+ Default ``0.7``. Higher = stricter (more clusters; more drift
163
+ flagged); lower = looser. ``0.7`` is the conservative default
164
+ for ``all-MiniLM-L6-v2`` — semantic-near-paraphrase territory.
165
+ context_window_sentences : int, optional
166
+ Number of sentences to extract on each side of the token mention
167
+ as the "definition" snippet (passed to the embedder). Default
168
+ ``1`` (the sentence containing the token; longer windows mute
169
+ token-specific signal with surrounding prose).
170
+
171
+ Returns
172
+ -------
173
+ SisterDocDriftReport
174
+ ``drift_clusters``, ``consistent_tokens``, ``coverage`` per the
175
+ dataclass.
176
+
177
+ Raises
178
+ ------
179
+ ImportError
180
+ If ``embedder=None`` and ``sentence_transformers`` is not
181
+ installed. Install via ``pip install eval-toolkit[embeddings]``.
182
+
183
+ Notes
184
+ -----
185
+ Clustering: single-linkage agglomerative on cosine similarity. Two
186
+ occurrences land in the same cluster iff their similarity is
187
+ ``>= similarity_threshold``. Transitive: ``a~b`` and ``b~c`` →
188
+ ``a, b, c`` in one cluster even if ``cos(a, c) < threshold``. This
189
+ is the canonical SBERT semantic-dedup recipe (see
190
+ :class:`~eval_toolkit.text_dedup.EmbeddingCosineStrategy` for the
191
+ sibling primitive at the inter-text-similarity level).
192
+
193
+ Token matching is case-sensitive whole-word — ``"T1"`` matches
194
+ ``"T1"`` but not ``"t1"`` or ``"T10"``. Adjust by passing
195
+ pre-normalized token strings if case-insensitivity is desired.
196
+
197
+ See Also
198
+ --------
199
+ eval_toolkit.audit_citation_alignment.validate_citations :
200
+ Sibling validator (catches ADR-citation alignment drift).
201
+ eval_toolkit.audit_value_bindings.validate_reader_value_bindings :
202
+ Sibling validator (catches detector→value binding drift).
203
+ eval_toolkit.embeddings.make_minilm_embedder :
204
+ Default embedder factory.
205
+ """
206
+ files_resolved = tuple(Path(f) for f in files)
207
+ tokens = tuple(concept_tokens)
208
+ if not tokens:
209
+ return SisterDocDriftReport(drift_clusters=(), consistent_tokens=(), coverage=0.0)
210
+
211
+ # Resolve embedder lazily — defer the [embeddings] extra import
212
+ # to call time so the module loads even when sentence_transformers
213
+ # isn't installed (matches the EmbeddingCosineStrategy pattern in
214
+ # text_dedup.py).
215
+ if embedder is None:
216
+ embedder = _default_embedder()
217
+
218
+ # Pre-load all files (avoid re-reading per token).
219
+ file_texts: dict[Path, str] = {}
220
+ for path in files_resolved:
221
+ try:
222
+ file_texts[path] = path.read_text(encoding="utf-8")
223
+ except OSError:
224
+ continue
225
+
226
+ drift_clusters: list[DriftCluster] = []
227
+ consistent_tokens: list[str] = []
228
+ tokens_with_hits: set[str] = set()
229
+
230
+ for token in tokens:
231
+ occurrences = _collect_occurrences(token, file_texts, context_window_sentences)
232
+ if not occurrences:
233
+ continue
234
+ tokens_with_hits.add(token)
235
+
236
+ if len(occurrences) == 1:
237
+ consistent_tokens.append(token)
238
+ continue
239
+
240
+ # Embed every surrounding snippet (one batch per token).
241
+ snippets = [occ[2] for occ in occurrences]
242
+ embeddings = np.asarray(embedder(snippets), dtype=np.float64)
243
+ clusters = _single_linkage_clusters(embeddings, similarity_threshold)
244
+
245
+ if len(clusters) == 1:
246
+ consistent_tokens.append(token)
247
+ continue
248
+
249
+ # Compute divergence score from inter-cluster similarity.
250
+ divergence = _divergence_score(embeddings, clusters)
251
+ drift_clusters.append(
252
+ DriftCluster(
253
+ token=token,
254
+ sentences=tuple(occurrences),
255
+ divergence_score=divergence,
256
+ )
257
+ )
258
+
259
+ coverage = len(tokens_with_hits) / len(tokens) if tokens else 0.0
260
+ return SisterDocDriftReport(
261
+ drift_clusters=tuple(drift_clusters),
262
+ consistent_tokens=tuple(consistent_tokens),
263
+ coverage=coverage,
264
+ )
265
+
266
+
267
def _default_embedder() -> Callable[[Sequence[str]], np.ndarray]:
    """Build the default MiniLM embedder, importing its factory on demand.

    The factory is imported inside this function rather than at module
    import time. If that import fails, an ``ImportError`` carrying an
    install hint is raised, chained to the original failure. Errors raised
    by the factory call itself are not intercepted.
    """
    try:
        from eval_toolkit.embeddings import make_minilm_embedder
    except ImportError as import_error:  # pragma: no cover
        raise ImportError(
            "audit_sister_doc_concept_drift requires the [embeddings] optional "
            "extra (sentence_transformers). Install via "
            "`pip install eval-toolkit[embeddings]` OR pass a custom embedder "
            "callable via the embedder= kwarg."
        ) from import_error
    else:
        return make_minilm_embedder()
280
+
281
+
282
def _collect_occurrences(
    token: str, file_texts: dict[Path, str], context_window_sentences: int
) -> list[tuple[Path, int, str]]:
    """Find every whole-word occurrence of ``token`` across the given files.

    Each file's text is split into sentence spans; every span containing
    the token yields one result. A span that mentions the token several
    times still yields a single result.

    Parameters
    ----------
    token : str
        Literal token to look for (regex metacharacters are escaped).
        Matching is case-sensitive and requires that the token is not
        directly preceded or followed by a word character.
    file_texts : dict[Path, str]
        Mapping of file path to its already-loaded text.
    context_window_sentences : int
        Number of neighbouring sentence spans to include on *each side* of
        the matching span. Negative values are treated as ``0``.

    Returns
    -------
    list[tuple[Path, int, str]]
        ``(file, line, surrounding_text)`` tuples, where ``line`` is the
        line recorded on the matching span and ``surrounding_text`` is the
        space-joined text of the window, clipped at the document edges.
    """
    # Lookarounds instead of ``\b``: ``\b`` needs a word character on one
    # side, so a token that starts or ends with punctuation (``C++``,
    # ``[embeddings]``) could never match at whitespace or end of sentence.
    # For tokens with word characters at both edges the two forms agree.
    token_re = re.compile(rf"(?<!\w){re.escape(token)}(?!\w)")
    # A negative radius would invert the window and yield empty snippets.
    radius = max(0, context_window_sentences)

    occurrences: list[tuple[Path, int, str]] = []
    for path, text in file_texts.items():
        sentences = _split_sentences(text)
        for s_idx, sent in enumerate(sentences):
            if not token_re.search(sent.text):
                continue
            # Slicing clips the upper bound at the end of the document.
            window = sentences[max(0, s_idx - radius) : s_idx + radius + 1]
            surrounding = " ".join(span.text for span in window)
            occurrences.append((path, sent.line, surrounding))
    return occurrences
303
+
304
+
305
@dataclass(frozen=True)
class _SentenceSpan:
    """A sentence-like fragment of a document and the line it starts on."""

    text: str  # fragment text with surrounding whitespace stripped
    line: int  # 1-indexed line of the sentence's start


def _split_sentences(text: str) -> list[_SentenceSpan]:
    """Split markdown text into sentence spans with line numbers.

    Imperfect: skips fenced code blocks (```), including the fence
    delimiter lines themselves, but otherwise treats every text region as
    prose. Lines are processed one at a time, so a sentence never spans a
    line break; a hard-wrapped sentence becomes several spans. Good enough
    for concept-drift detection at the sentence-of-context-around-token
    granularity.

    Parameters
    ----------
    text : str
        Full markdown document.

    Returns
    -------
    list[_SentenceSpan]
        Non-empty, whitespace-stripped fragments in document order. Lines
        starting with ``#``, ``- `` or ``* `` are kept whole; other lines
        are split with ``_SENTENCE_SPLIT_RE``. ``line`` is the 1-indexed,
        ``"\\n"``-delimited line the fragment sits on.
    """
    spans: list[_SentenceSpan] = []
    in_fence = False
    # Line numbers count "\n" terminators only, matching what editors show.
    for line_no, physical_line in enumerate(text.split("\n"), start=1):
        # splitlines() also breaks on "\r", form feeds, U+2028, etc. Those
        # pieces are handled as separate segments but share one line number,
        # so fenced content can never shift the numbering of later lines.
        for segment in physical_line.splitlines():
            if segment.lstrip().startswith("```"):
                # The delimiter belongs to the code block: toggle and drop it
                # so "```python" never appears as a sentence or as context.
                in_fence = not in_fence
                continue
            if in_fence:
                continue
            line_text = segment.strip()
            if not line_text:
                continue
            # Headings and bullets are kept whole as standalone sentences.
            if line_text.startswith(("#", "- ", "* ")):
                spans.append(_SentenceSpan(text=line_text, line=line_no))
                continue
            # Everything else is split on sentence-ish delimiters.
            for piece in _SENTENCE_SPLIT_RE.split(line_text):
                piece = piece.strip()
                if piece:
                    spans.append(_SentenceSpan(text=piece, line=line_no))
    return spans
367
+
368
+
369
def _single_linkage_clusters(embeddings: np.ndarray, threshold: float) -> list[list[int]]:
    """Group embedding rows into single-linkage clusters by cosine similarity.

    Rows ``i`` and ``j`` share a cluster exactly when a chain of rows
    connects them in which every consecutive pair has cosine similarity
    ``>= threshold``. Linkage is therefore transitive: two rows may be
    grouped even if their direct similarity is below the threshold.

    Returns a list of clusters, each a list of row indices into
    ``embeddings`` in ascending order; clusters are ordered by their
    smallest member. An input with zero rows yields ``[]``.
    """
    count = embeddings.shape[0]
    if count == 0:
        return []

    # Normalise each row so the Gram matrix holds cosines. All-zero rows are
    # divided by 1 instead of 0, leaving them with similarity 0 to everything.
    lengths = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_rows = embeddings / np.where(lengths == 0, 1.0, lengths)
    cosine = unit_rows @ unit_rows.T

    # Disjoint-set forest: roots[k] points towards k's representative.
    roots = list(range(count))

    def root_of(node: int) -> int:
        while roots[node] != node:
            roots[node] = roots[roots[node]]  # path halving
            node = roots[node]
        return node

    # Strict upper triangle = each unordered pair once, in row-major order.
    linked_pairs = np.argwhere(np.triu(cosine >= threshold, k=1)).tolist()
    for left, right in linked_pairs:
        left_root, right_root = root_of(left), root_of(right)
        if left_root != right_root:
            roots[left_root] = right_root

    # Bucket rows by representative; dict order follows first appearance.
    members: dict[int, list[int]] = {}
    for row in range(count):
        members.setdefault(root_of(row), []).append(row)
    return list(members.values())
410
+
411
+
412
def _divergence_score(embeddings: np.ndarray, clusters: list[list[int]]) -> float:
    """Return ``1 - s``, where ``s`` is the lowest cross-cluster cosine.

    ``s`` is the smallest cosine similarity between any two rows that
    belong to different clusters, clamped below at ``0`` so the score never
    exceeds ``1.0``. Higher means stronger drift: the score is ``1.0`` when
    some cross-cluster pair is orthogonal (or opposed) and approaches
    ``0.0`` as every cross-cluster pair approaches identical direction.
    Fewer than two clusters yields ``0.0``.
    """
    if len(clusters) < 2:
        return 0.0

    # Cosine matrix over all rows; all-zero rows are divided by 1, not 0.
    lengths = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_rows = embeddings / np.where(lengths == 0, 1.0, lengths)
    cosine = unit_rows @ unit_rows.T

    lowest = 1.0
    for position, first in enumerate(clusters):
        for second in clusters[position + 1 :]:
            # Weakest link between any member of `first` and any of `second`.
            cross_min = float(cosine[np.ix_(first, second)].min())
            if cross_min < lowest:
                lowest = cross_min
    return 1.0 - (lowest if lowest > 0.0 else 0.0)
@@ -41,6 +41,7 @@
41
41
  "DedupReport",
42
42
  "DelimitVariant",
43
43
  "DiacriticInjection",
44
+ "DriftCluster",
44
45
  "EmbeddingCosineStrategy",
45
46
  "EncodeVariant",
46
47
  "EvalSlice",
@@ -106,6 +107,7 @@
106
107
  "SimilarityAuditReport",
107
108
  "SimilarityStrategy",
108
109
  "SingleSliceLoader",
110
+ "SisterDocDriftReport",
109
111
  "SliceAwareScorer",
110
112
  "SourceDisjointKFoldSplitter",
111
113
  "SourceRoleRecord",
@@ -247,6 +249,7 @@
247
249
  "validate_prediction_artifact_ref",
248
250
  "validate_reader_value_bindings",
249
251
  "validate_results",
252
+ "validate_sister_doc_concept_drift",
250
253
  "validate_source_roles",
251
254
  "walk_path",
252
255
  "wilson_interval",
@@ -575,6 +578,14 @@
575
578
  "kind": "class",
576
579
  "signature": "(ratio: 'float' = 0.3, seed: 'int' = 42, name: 'str' = 'diacritic') -> None"
577
580
  },
581
+ "DriftCluster": {
582
+ "bases": [
583
+ "object"
584
+ ],
585
+ "doc_first_line": "A concept token whose occurrences split into >1 semantic cluster.",
586
+ "kind": "class",
587
+ "signature": "(token: 'str', sentences: 'tuple[tuple[Path, int, str], ...]', divergence_score: 'float') -> None"
588
+ },
578
589
  "EmbeddingCosineStrategy": {
579
590
  "bases": [
580
591
  "object"
@@ -1152,6 +1163,14 @@
1152
1163
  "kind": "class",
1153
1164
  "signature": "(slice_: 'EvalSlice', name: 'str' = '', description: 'str' = '') -> None"
1154
1165
  },
1166
+ "SisterDocDriftReport": {
1167
+ "bases": [
1168
+ "object"
1169
+ ],
1170
+ "doc_first_line": "Result of :func:`validate_sister_doc_concept_drift`.",
1171
+ "kind": "class",
1172
+ "signature": "(drift_clusters: 'tuple[DriftCluster, ...]', consistent_tokens: 'tuple[str, ...]', coverage: 'float') -> None"
1173
+ },
1155
1174
  "SliceAwareScorer": {
1156
1175
  "bases": [
1157
1176
  "Scorer",
@@ -1401,7 +1420,7 @@
1401
1420
  "doc_first_line": "str(object='') -> str",
1402
1421
  "kind": "value",
1403
1422
  "type": "str",
1404
- "value": "'1.0.3'"
1423
+ "value": "'1.0.5'"
1405
1424
  },
1406
1425
  "apply_operating_points": {
1407
1426
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1958,6 +1977,11 @@
1958
1977
  "kind": "function",
1959
1978
  "signature": "(payload: 'Mapping[str, object]') -> 'None'"
1960
1979
  },
1980
+ "validate_sister_doc_concept_drift": {
1981
+ "doc_first_line": "Validate cross-doc semantic consistency of concept token definitions.",
1982
+ "kind": "function",
1983
+ "signature": "(*, files: 'Sequence[Path | str]', concept_tokens: 'Sequence[str]', embedder: 'Callable[[Sequence[str]], np.ndarray] | None' = None, similarity_threshold: 'float' = 0.7, context_window_sentences: 'int' = 1) -> 'SisterDocDriftReport'"
1984
+ },
1961
1985
  "validate_source_roles": {
1962
1986
  "doc_first_line": "Return validation errors for generic source-role records.",
1963
1987
  "kind": "function",