eval-toolkit 1.0.3__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/CHANGELOG.md +62 -0
  2. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/PKG-INFO +1 -1
  3. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/__init__.py +8 -0
  4. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_version.py +1 -1
  5. eval_toolkit-1.0.4/src/eval_toolkit/audit_sister_doc_concept_drift.py +432 -0
  6. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/public_api/snapshot.json +25 -1
  7. eval_toolkit-1.0.4/tests/test_audit_sister_doc_concept_drift.py +337 -0
  8. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/.gitignore +0 -0
  9. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/LICENSE +0 -0
  10. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/README.md +0 -0
  11. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/STYLE.md +0 -0
  12. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/archive/README.md +0 -0
  13. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/README.md +0 -0
  14. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/datasets/README.md +0 -0
  15. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/papers/data-integrity/README.md +0 -0
  16. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/papers/eval-ecosystem/README.md +0 -0
  17. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/papers/inference/README.md +0 -0
  18. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/papers/prompt-injection/README.md +0 -0
  19. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/source/adr/README.md +0 -0
  20. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/source/methodology/README.md +0 -0
  21. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/pyproject.toml +0 -0
  22. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/__main__.py +0 -0
  23. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_deprecated.py +0 -0
  24. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_parallel.py +0 -0
  25. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_rng.py +0 -0
  26. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_sweep.py +0 -0
  27. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/adversarial.py +0 -0
  28. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/analysis.py +0 -0
  29. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/artifacts.py +0 -0
  30. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  31. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/audit_value_bindings.py +0 -0
  32. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/bootstrap.py +0 -0
  33. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/calibration.py +0 -0
  34. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/claims.py +0 -0
  35. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/config.py +0 -0
  36. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/docs.py +0 -0
  37. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/embeddings.py +0 -0
  38. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/evidence.py +0 -0
  39. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/harness.py +0 -0
  40. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/leakage.py +0 -0
  41. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/loaders.py +0 -0
  42. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/losses.py +0 -0
  43. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/manifest.py +0 -0
  44. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/metric_specs.py +0 -0
  45. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/metrics.py +0 -0
  46. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/operating_points.py +0 -0
  47. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/paths.py +0 -0
  48. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/plotting.py +0 -0
  49. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/preprocessing.py +0 -0
  50. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/probes.py +0 -0
  51. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/protocols.py +0 -0
  52. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/provenance.py +0 -0
  53. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/py.typed +0 -0
  54. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  55. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  56. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  57. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  58. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/results.v1.json +0 -0
  59. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  60. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/scorecards.py +0 -0
  61. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/seeds.py +0 -0
  62. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/splits.py +0 -0
  63. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/stacking.py +0 -0
  64. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/text_dedup.py +0 -0
  65. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/thresholds.py +0 -0
  66. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  67. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  68. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  69. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  70. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  71. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  72. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  73. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  74. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  75. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  76. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/benchmarks/__init__.py +0 -0
  77. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  78. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/conftest.py +0 -0
  79. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/bootstrap_ci/cases.json +0 -0
  80. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout.jsonl +0 -0
  81. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout_expected.json +0 -0
  82. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  83. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/docs/expected.md +0 -0
  84. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/docs/input.md +0 -0
  85. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/docs/metrics.json +0 -0
  86. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  87. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/strategies.py +0 -0
  88. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_adversarial.py +0 -0
  89. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_analysis.py +0 -0
  90. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_artifacts.py +0 -0
  91. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_audit_citation_alignment.py +0 -0
  92. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_audit_value_bindings.py +0 -0
  93. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_block_bootstrap_on_folds.py +0 -0
  94. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_calibration_mc.py +0 -0
  95. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_edge_cases.py +0 -0
  96. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_golden.py +0 -0
  97. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_njobs.py +0 -0
  98. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_props.py +0 -0
  99. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_research_grounded.py +0 -0
  100. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_unit.py +0 -0
  101. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_binary_adapters.py +0 -0
  102. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_bootstrap_chain.py +0 -0
  103. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_determinism.py +0 -0
  104. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_optimization_failures.py +0 -0
  105. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_props.py +0 -0
  106. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_research_grounded.py +0 -0
  107. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_unit.py +0 -0
  108. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_claims.py +0 -0
  109. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_claims_coverage.py +0 -0
  110. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_claims_props.py +0 -0
  111. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_cli.py +0 -0
  112. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_config.py +0 -0
  113. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_bootstrap.py +0 -0
  114. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_calibration.py +0 -0
  115. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_harness.py +0 -0
  116. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_metrics.py +0 -0
  117. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_plotting.py +0 -0
  118. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_croissant_e2e.py +0 -0
  119. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_dedup_split_leakage_chain.py +0 -0
  120. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_deprecated_scalars_shim.py +0 -0
  121. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_deprecations.py +0 -0
  122. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_docs_golden.py +0 -0
  123. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_docs_props.py +0 -0
  124. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_embeddings.py +0 -0
  125. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_evidence_validators.py +0 -0
  126. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_edge_cases.py +0 -0
  127. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_fault_injection.py +0 -0
  128. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_folded.py +0 -0
  129. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_internals.py +0 -0
  130. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_metric_options.py +0 -0
  131. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_parallelism.py +0 -0
  132. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_smoke.py +0 -0
  133. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_import_boundaries.py +0 -0
  134. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_is_metric_defined_for_slice.py +0 -0
  135. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_lazy_extras_messages.py +0 -0
  136. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_leakage.py +0 -0
  137. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_leakage_error_paths.py +0 -0
  138. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_leakage_props.py +0 -0
  139. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_loaders.py +0 -0
  140. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_loaders_coverage.py +0 -0
  141. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_loaders_props.py +0 -0
  142. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_logging.py +0 -0
  143. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_losses.py +0 -0
  144. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_manifest.py +0 -0
  145. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_manifest_contamination_round_trip.py +0 -0
  146. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_manifest_props.py +0 -0
  147. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_manifest_validation.py +0 -0
  148. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_metrics_props.py +0 -0
  149. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_metrics_stratified_subsets.py +0 -0
  150. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_metrics_unit.py +0 -0
  151. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_misc_coverage.py +0 -0
  152. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_numeric_edge_cases.py +0 -0
  153. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_ood_loader.py +0 -0
  154. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_operating_points.py +0 -0
  155. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_operating_points_props.py +0 -0
  156. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_parallel.py +0 -0
  157. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_paths.py +0 -0
  158. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_pipeline_e2e.py +0 -0
  159. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_plotting_edge.py +0 -0
  160. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_plotting_smoke.py +0 -0
  161. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_plotting_visual.py +0 -0
  162. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_preprocessing.py +0 -0
  163. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_probes.py +0 -0
  164. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_protocol_conformance.py +0 -0
  165. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_provenance.py +0 -0
  166. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_public_api.py +0 -0
  167. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_recall_at_fpr.py +0 -0
  168. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_reference_equivalence.py +0 -0
  169. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_reproducibility_integration.py +0 -0
  170. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_rng.py +0 -0
  171. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_schemas.py +0 -0
  172. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_scorecard.py +0 -0
  173. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_seeds.py +0 -0
  174. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_splits.py +0 -0
  175. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_splits_leakage_integration.py +0 -0
  176. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_splits_props.py +0 -0
  177. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_stacking.py +0 -0
  178. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_sweep.py +0 -0
  179. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_text_dedup.py +0 -0
  180. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_text_dedup_coverage.py +0 -0
  181. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_text_dedup_props.py +0 -0
  182. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_text_dedup_strategies.py +0 -0
  183. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds.py +0 -0
  184. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds_constant_score.py +0 -0
  185. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds_coverage.py +0 -0
  186. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds_props.py +0 -0
  187. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds_research_grounded.py +0 -0
  188. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_tokenization_leakage_check.py +0 -0
  189. {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,68 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.4] — 2026-05-26 — `audit_sister_doc_concept_drift` module (closes #72)
9
+
10
+ Tier-2 ADDITIVE — third (and final) member of the audit-validator
11
+ family. Flat-module per [ADR 0001](docs/source/adr/0001-flat-module-layout.md).
12
+ Family complete: `audit_citation_alignment` (v1.0.1) + `audit_value_bindings`
13
+ (v1.0.3) + `audit_sister_doc_concept_drift` (this release).
14
+
15
+ ### Added
16
+
17
+ - **`audit_sister_doc_concept_drift` module** exporting
18
+ `validate_sister_doc_concept_drift()` + `DriftCluster` +
19
+ `SisterDocDriftReport` as Tier 1 STRICT (per
20
+ [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
21
+ Catches the bug class where two linked sister docs reference the
22
+ same concept token (e.g., `T1`, `manifest v3`) but the
23
+ surrounding-sentence definitions semantically disagree.
24
+ Cross-doc semantic drift survives lychee (links resolve), anchor
25
+ audits (anchors exist), and numeric audits (qualitative prose).
26
+ - Algorithm: per concept_token, scan all files for occurrences;
27
+ extract surrounding-sentence context (`context_window_sentences`);
28
+ embed each snippet via the supplied `embedder` (default lazily
29
+ routes to `make_minilm_embedder()` — requires `[embeddings]`
30
+ optional extra); cluster via single-linkage cosine similarity at
31
+ `similarity_threshold` (default 0.7); tokens with >1 cluster are
32
+ flagged as `DriftCluster`.
33
+ - The `embedder: Callable[[Sequence[str]], np.ndarray] | None`
34
+ parameter matches the existing
35
+ `EmbeddingCosineStrategy.embedder` Protocol — consumers can pass
36
+ any embedder (BGE, E5, OpenAI, or a mock for tests). Default
37
+ `None` defers `sentence_transformers` import to call-time
38
+ (`[embeddings]` extra is required only when caller doesn't supply
39
+ their own embedder).
40
+ - Motivating bug class: consumer audit found
41
+ `docs/REPRODUCIBILITY.md:85` defines `T1` as "full canonical
42
+ re-eval (GPU; A100 80GB)" while `WRITEUP/reproducibility.md:33`
43
+ defines `T1` as "smoke (laptop, $0, ~10 min)" — the two docs
44
+ cross-link as "Aggregator docs" so a reviewer following the link
45
+ lands on contradictory definitions.
46
+ - 13 tests at `tests/test_audit_sister_doc_concept_drift.py` using a
47
+ deterministic mock embedder (no `sentence_transformers` dependency
48
+ for unit tests). Covers: seed-case T1 drift, consistent definition
49
+ across files, single-occurrence consistency, unreferenced-token
50
+ coverage tracking, multi-token mixed (T0 + T1 + T3), threshold
51
+ sensitivity, whole-word boundary (`T1` vs `T10` vs `t1`), context
52
+ window scope, empty inputs, 3-way drift, frozen-dataclass
53
+ invariants, lazy default-embedder import. Closes #72.
54
+
55
+ ### Audit-validator family complete
56
+
57
+ | Validator | Released | Issue |
58
+ |---|---|---|
59
+ | `audit_citation_alignment` | v1.0.1 | #73 |
60
+ | `audit_value_bindings` | v1.0.3 | #71 |
61
+ | `audit_sister_doc_concept_drift` | v1.0.4 (this release) | #72 |
62
+
63
+ All three follow the flat-module convention (ADR 0001), closed-config
64
+ pattern (consumer supplies the auditable surface; validator owns the
65
+ parsing+matching logic; ADR 0002), and Tier 1 STRICT top-level
66
+ exports per ADR 0003. Consumer adoption pattern is the same across
67
+ all three: thin `scripts/audit_<name>.py` CLI wrapper invoking the
68
+ upstream validator.
69
+
8
70
  ## [1.0.3] — 2026-05-26 — `audit_value_bindings` module (closes #71)
9
71
 
10
72
  Tier-2 ADDITIVE — second member of the audit-validator family
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.0.3
3
+ Version: 1.0.4
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -67,6 +67,14 @@ _EXPORTS: dict[str, str] = {
67
67
  "ValueBindingsReport": "eval_toolkit.audit_value_bindings",
68
68
  "Violation": "eval_toolkit.audit_value_bindings",
69
69
  "validate_reader_value_bindings": "eval_toolkit.audit_value_bindings",
70
+ # --- audit_sister_doc_concept_drift ---
71
+ # Flat-module per ADR 0001. Closes #72. Motivated by consumer T1
72
+ # definition contradiction across sister reproducibility docs.
73
+ # Requires [embeddings] extra only when no embedder= is passed
74
+ # (lazy resolution; consumers can swap in any callable).
75
+ "DriftCluster": "eval_toolkit.audit_sister_doc_concept_drift",
76
+ "SisterDocDriftReport": "eval_toolkit.audit_sister_doc_concept_drift",
77
+ "validate_sister_doc_concept_drift": "eval_toolkit.audit_sister_doc_concept_drift",
70
78
  # --- losses ---
71
79
  "RecallAtLowFPR": "eval_toolkit.losses",
72
80
  # --- preprocessing ---
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.0.3"
5
+ __version__ = "1.0.4"
@@ -0,0 +1,432 @@
1
+ r"""Sister-doc concept-drift validator (embedding-similarity-based).
2
+
3
+ Catches the bug class where two linked sister docs both reference the
4
+ same concept token (e.g., ``T1``, ``manifest v3``, ``verified_disjoint``)
5
+ but the **surrounding-sentence definitions disagree**. Cross-doc drift
6
+ survives lychee (links resolve), anchor audits (anchors exist), and
7
+ numeric audits (numbers don't disagree because the prose is qualitative).
8
+
9
+ Motivating test case (from `prompt-injection-detection-submission`
10
+ audit, two reproducibility surfaces)::
11
+
12
+ docs/REPRODUCIBILITY.md:85:
13
+ T1 = "full canonical re-eval (GPU; A100 80GB): make headline-cloud
14
+ re-runs ... ~7h wall-clock; ~$28 GPU spend"
15
+
16
+ WRITEUP/reproducibility.md:33:
17
+ T1 = "smoke (laptop, $0, ~10 min): `make smoke` verifies code health"
18
+
19
+ Both files cross-link as "Aggregator docs"; following the link lands a
20
+ reader on contradictory T1 definitions.
21
+
22
+ Algorithm
23
+ ---------
24
+ 1. For each ``concept_token``, scan all ``files`` for occurrences. Each
25
+ occurrence captures the *surrounding sentence(s)* (configurable
26
+ ``context_window_sentences``) — that's the candidate "definition".
27
+ 2. Embed each surrounding-sentence string via the supplied ``embedder``
28
+ (default: lazy :func:`eval_toolkit.embeddings.make_minilm_embedder`).
29
+ 3. Cluster occurrences by single-linkage: two occurrences belong to the
30
+ same cluster if their cosine similarity is ``>= similarity_threshold``, either directly or through a chain of such pairs.
31
+ 4. A concept_token with **>1 cluster** is a :class:`DriftCluster` — its
32
+ occurrences split into semantically distinct definition groups.
33
+ 5. A concept_token with **exactly 1 cluster** is consistent across all
34
+ files.
35
+
36
+ Design (per ADR 0001 flat-module + ADR 0002 closed-config + ADR 0003
37
+ Tier 2 ADDITIVE on the ``[embeddings]`` optional extra surface):
38
+
39
+ - Consumer supplies the concept-token list + file glob; validator owns
40
+ parsing + embedding + clustering + report assembly.
41
+ - Embedder is a callable ``Callable[[Sequence[str]], np.ndarray]`` —
42
+ matches the existing :func:`~eval_toolkit.embeddings.make_minilm_embedder`
43
+ factory contract. ``embedder=None`` defers to the canonical MiniLM
44
+ recipe lazily (avoids forcing the ``[embeddings]`` extra import at
45
+ module load time).
46
+ - Flat-module: ``eval_toolkit.audit_sister_doc_concept_drift.*`` (NOT a
47
+ subpackage per ADR 0001 stay-flat-through-v1.x).
48
+
49
+ Closes upstream issue #72. v1.0.4. Completes the audit-validator family
50
+ of 3 (citation_alignment v1.0.1, value_bindings v1.0.3, sister_doc
51
+ concept_drift v1.0.4).
52
+ """
53
+
54
+ from __future__ import annotations
55
+
56
+ import re
57
+ from collections.abc import Callable, Sequence
58
+ from dataclasses import dataclass
59
+ from pathlib import Path
60
+
61
+ import numpy as np
62
+
63
+ __all__ = [
64
+ "DriftCluster",
65
+ "SisterDocDriftReport",
66
+ "validate_sister_doc_concept_drift",
67
+ ]
68
+
69
+
70
+ DEFAULT_SIMILARITY_THRESHOLD: float = 0.7
71
+ DEFAULT_CONTEXT_WINDOW_SENTENCES: int = 1
72
+
73
+ # Sentence-ish splitter — markdown is not formal prose. Splits on
74
+ # whitespace after ``.``/``!``/``?`` that precedes [A-Z], a digit, or a backtick. Imperfect but
75
+ # robust enough for cross-doc concept-drift detection (consumers
76
+ # tolerate boundary slop because clustering is the noise-tolerant
77
+ # downstream step).
78
+ _SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\d`])")
79
+
80
+
81
+ @dataclass(frozen=True)
82
+ class DriftCluster:
83
+ """A concept token whose occurrences split into >1 semantic cluster.
84
+
85
+ Attributes
86
+ ----------
87
+ token : str
88
+ The concept token (e.g., ``"T1"``, ``"manifest v3"``).
89
+ sentences : tuple[tuple[Path, int, str], ...]
90
+ Each occurrence as ``(file, line, surrounding_text)`` — line is
91
+ 1-indexed; surrounding_text is the ``context_window_sentences``-sized
92
+ prose snippet that was embedded for clustering.
93
+ divergence_score : float
94
+ ``1 - min_inter_cluster_similarity`` for the worst-case pair
95
+ between any two clusters. Range ``[0.0, 1.0]``; higher = stronger
96
+ drift signal. ``0.0`` means clusters are barely distinguishable;
97
+ ``1.0`` means orthogonal embeddings.
98
+ """
99
+
100
+ token: str
101
+ sentences: tuple[tuple[Path, int, str], ...]
102
+ divergence_score: float
103
+
104
+
105
+ @dataclass(frozen=True)
106
+ class SisterDocDriftReport:
107
+ """Result of :func:`validate_sister_doc_concept_drift`.
108
+
109
+ Attributes
110
+ ----------
111
+ drift_clusters : tuple[DriftCluster, ...]
112
+ Each concept_token whose occurrences split into >1 cluster.
113
+ Empty tuple = all tokens consistent across the scanned files.
114
+ consistent_tokens : tuple[str, ...]
115
+ Concept tokens whose occurrences clustered to a single group
116
+ (or had ≤1 occurrence total). Reported for completeness +
117
+ coverage tracking.
118
+ coverage : float
119
+ Fraction of ``concept_tokens`` that produced ≥1 occurrence in
120
+ the scanned files. Range ``[0.0, 1.0]``. ``1.0`` means every
121
+ token was referenced; lower values flag stale tokens.
122
+ """
123
+
124
+ drift_clusters: tuple[DriftCluster, ...]
125
+ consistent_tokens: tuple[str, ...]
126
+ coverage: float
127
+
128
+
129
+ def validate_sister_doc_concept_drift(
130
+ *,
131
+ files: Sequence[Path | str],
132
+ concept_tokens: Sequence[str],
133
+ embedder: Callable[[Sequence[str]], np.ndarray] | None = None,
134
+ similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
135
+ context_window_sentences: int = DEFAULT_CONTEXT_WINDOW_SENTENCES,
136
+ ) -> SisterDocDriftReport:
137
+ """Validate cross-doc semantic consistency of concept token definitions.
138
+
139
+ For each ``concept_token``, scan ``files`` for occurrences; extract
140
+ the surrounding ``context_window_sentences``; embed each surrounding
141
+ snippet; cluster by single-linkage cosine similarity at
142
+ ``similarity_threshold``. Tokens that produce >1 cluster are flagged
143
+ as drift.
144
+
145
+ Parameters
146
+ ----------
147
+ files : Sequence[Path | str]
148
+ Markdown files to scan. UTF-8 encoded.
149
+ concept_tokens : Sequence[str]
150
+ Seed list of concept tokens (e.g., ``["T0", "T1", "T3",
151
+ "manifest v3", "verified_disjoint"]``). Each token is matched
152
+ case-sensitively as a whole-word boundary regex
153
+ (``\\b<token>\\b``).
154
+ embedder : Callable[[Sequence[str]], np.ndarray] | None, optional
155
+ Embedder callable returning ``(n, d)`` array. ``None`` (default)
156
+ lazily routes to :func:`eval_toolkit.embeddings.make_minilm_embedder`
157
+ — requires the ``[embeddings]`` optional extra
158
+ (``pip install eval-toolkit[embeddings]``). Custom callables let
159
+ consumers swap in any embedder (BGE, E5, OpenAI, mock for tests).
160
+ similarity_threshold : float, optional
161
+ Cosine-similarity threshold for single-linkage clustering.
162
+ Default ``0.7``. Higher = stricter (more clusters; more drift
163
+ flagged); lower = looser. ``0.7`` is the conservative default
164
+ for ``all-MiniLM-L6-v2`` — semantic-near-paraphrase territory.
165
+ context_window_sentences : int, optional
166
+ Number of sentences to extract on each side of the token mention
167
+ as the "definition" snippet (passed to the embedder). Default
168
+ ``1`` (the sentence containing the token; longer windows mute
169
+ token-specific signal with surrounding prose).
170
+
171
+ Returns
172
+ -------
173
+ SisterDocDriftReport
174
+ ``drift_clusters``, ``consistent_tokens``, ``coverage`` per the
175
+ dataclass.
176
+
177
+ Raises
178
+ ------
179
+ ImportError
180
+ If ``embedder=None`` and ``sentence_transformers`` is not
181
+ installed. Install via ``pip install eval-toolkit[embeddings]``.
182
+
183
+ Notes
184
+ -----
185
+ Clustering: single-linkage agglomerative on cosine similarity. Two
186
+ occurrences are linked directly when their similarity is
187
+ ``>= similarity_threshold``. Transitive: ``a~b`` and ``b~c`` →
188
+ ``a, b, c`` in one cluster even if ``cos(a, c) < threshold``. This
189
+ is the canonical SBERT semantic-dedup recipe (see
190
+ :class:`~eval_toolkit.text_dedup.EmbeddingCosineStrategy` for the
191
+ sibling primitive at the inter-text-similarity level).
192
+
193
+ Token matching is case-sensitive whole-word — ``"T1"`` matches
194
+ ``"T1"`` but not ``"t1"`` or ``"T10"``. Adjust by passing
195
+ pre-normalized token strings if case-insensitivity is desired.
196
+
197
+ See Also
198
+ --------
199
+ eval_toolkit.audit_citation_alignment.validate_citations :
200
+ Sibling validator (catches ADR-citation alignment drift).
201
+ eval_toolkit.audit_value_bindings.validate_reader_value_bindings :
202
+ Sibling validator (catches detector→value binding drift).
203
+ eval_toolkit.embeddings.make_minilm_embedder :
204
+ Default embedder factory.
205
+ """
206
+ files_resolved = tuple(Path(f) for f in files)
207
+ tokens = tuple(concept_tokens)
208
+ if not tokens:
209
+ return SisterDocDriftReport(drift_clusters=(), consistent_tokens=(), coverage=0.0)
210
+
211
+ # Resolve embedder lazily — defer the [embeddings] extra import
212
+ # to call time so the module loads even when sentence_transformers
213
+ # isn't installed (matches the EmbeddingCosineStrategy pattern in
214
+ # text_dedup.py).
215
+ if embedder is None:
216
+ embedder = _default_embedder()
217
+
218
+ # Pre-load all files (avoid re-reading per token).
219
+ file_texts: dict[Path, str] = {}
220
+ for path in files_resolved:
221
+ try:
222
+ file_texts[path] = path.read_text(encoding="utf-8")
223
+ except OSError:
224
+ continue
225
+
226
+ drift_clusters: list[DriftCluster] = []
227
+ consistent_tokens: list[str] = []
228
+ tokens_with_hits: set[str] = set()
229
+
230
+ for token in tokens:
231
+ occurrences = _collect_occurrences(token, file_texts, context_window_sentences)
232
+ if not occurrences:
233
+ continue
234
+ tokens_with_hits.add(token)
235
+
236
+ if len(occurrences) == 1:
237
+ consistent_tokens.append(token)
238
+ continue
239
+
240
+ # Embed every surrounding snippet (one batch per token).
241
+ snippets = [occ[2] for occ in occurrences]
242
+ embeddings = np.asarray(embedder(snippets), dtype=np.float64)
243
+ clusters = _single_linkage_clusters(embeddings, similarity_threshold)
244
+
245
+ if len(clusters) == 1:
246
+ consistent_tokens.append(token)
247
+ continue
248
+
249
+ # Compute divergence score from inter-cluster similarity.
250
+ divergence = _divergence_score(embeddings, clusters)
251
+ drift_clusters.append(
252
+ DriftCluster(
253
+ token=token,
254
+ sentences=tuple(occurrences),
255
+ divergence_score=divergence,
256
+ )
257
+ )
258
+
259
+ coverage = len(tokens_with_hits) / len(tokens) if tokens else 0.0
260
+ return SisterDocDriftReport(
261
+ drift_clusters=tuple(drift_clusters),
262
+ consistent_tokens=tuple(consistent_tokens),
263
+ coverage=coverage,
264
+ )
265
+
266
+
267
+ def _default_embedder() -> Callable[[Sequence[str]], np.ndarray]:
268
+ """Lazy MiniLM embedder factory; raises ImportError with install hint."""
269
+ try:
270
+ from eval_toolkit.embeddings import make_minilm_embedder
271
+ except ImportError as exc: # pragma: no cover
272
+ msg = (
273
+ "audit_sister_doc_concept_drift requires the [embeddings] optional "
274
+ "extra (sentence_transformers). Install via "
275
+ "`pip install eval-toolkit[embeddings]` OR pass a custom embedder "
276
+ "callable via the embedder= kwarg."
277
+ )
278
+ raise ImportError(msg) from exc
279
+ return make_minilm_embedder()
280
+
281
+
282
+ def _collect_occurrences(
283
+ token: str, file_texts: dict[Path, str], context_window_sentences: int
284
+ ) -> list[tuple[Path, int, str]]:
285
+ """Find every occurrence of ``token`` (whole-word) across files.
286
+
287
+ Returns list of ``(file, line, surrounding_text)`` tuples where
288
+ ``surrounding_text`` is the ``context_window_sentences`` window
289
+ centered on the sentence containing the token.
290
+ """
291
+ occurrences: list[tuple[Path, int, str]] = []
292
+ token_re = re.compile(rf"\b{re.escape(token)}\b")
293
+ for path, text in file_texts.items():
294
+ sentences = _split_sentences(text)
295
+ for s_idx, sent in enumerate(sentences):
296
+ if not token_re.search(sent.text):
297
+ continue
298
+ window_lo = max(0, s_idx - context_window_sentences)
299
+ window_hi = min(len(sentences), s_idx + context_window_sentences + 1)
300
+ surrounding = " ".join(sentences[i].text for i in range(window_lo, window_hi))
301
+ occurrences.append((path, sent.line, surrounding))
302
+ return occurrences
303
+
304
+
305
+ @dataclass(frozen=True)
306
+ class _SentenceSpan:
307
+ text: str
308
+ line: int # 1-indexed line of the sentence's start
309
+
310
+
311
+ def _split_sentences(text: str) -> list[_SentenceSpan]:
312
+ """Split markdown text into sentence spans with line numbers.
313
+
314
+ Imperfect: skips fenced code blocks (```) but otherwise treats every
315
+ text region as prose. Good enough for concept-drift detection at the
316
+ sentence-of-context-around-token granularity.
317
+ """
318
+ # Strip fenced code blocks (replace with spaces preserving newlines so
319
+ # line numbers stay accurate).
320
+ in_fence = False
321
+ stripped_lines = []
322
+ for line in text.splitlines(keepends=True):
323
+ if line.lstrip().startswith("```"):
324
+ in_fence = not in_fence
325
+ stripped_lines.append(line) # keep newline for line-number alignment
326
+ continue
327
+ stripped_lines.append(line if not in_fence else "\n")
328
+ cleaned = "".join(stripped_lines)
329
+
330
+ # Compute (line_start_pos -> line_no) map
331
+ line_starts = [0]
332
+ for i, ch in enumerate(cleaned):
333
+ if ch == "\n":
334
+ line_starts.append(i + 1)
335
+
336
+ def pos_to_line(pos: int) -> int:
337
+ lo, hi = 0, len(line_starts) - 1
338
+ while lo < hi:
339
+ mid = (lo + hi + 1) // 2
340
+ if line_starts[mid] <= pos:
341
+ lo = mid
342
+ else:
343
+ hi = mid - 1
344
+ return lo + 1
345
+
346
+ # Split into rough sentences. Markdown headings + lists are
347
+ # treated as standalone sentences.
348
+ spans: list[_SentenceSpan] = []
349
+ # Process line-by-line first so headings/bullets stay isolated.
350
+ pos = 0
351
+ for raw_line in cleaned.splitlines(keepends=True):
352
+ line_text = raw_line.rstrip("\n").strip()
353
+ line_start_pos = pos
354
+ pos += len(raw_line)
355
+ if not line_text:
356
+ continue
357
+ # If line starts with #, treat as a sentence on its own
358
+ if line_text.startswith("#") or line_text.startswith("- ") or line_text.startswith("* "):
359
+ spans.append(_SentenceSpan(text=line_text, line=pos_to_line(line_start_pos)))
360
+ continue
361
+ # Else split on sentence-ish delimiters
362
+ for piece in _SENTENCE_SPLIT_RE.split(line_text):
363
+ piece = piece.strip()
364
+ if piece:
365
+ spans.append(_SentenceSpan(text=piece, line=pos_to_line(line_start_pos)))
366
+ return spans
367
+
368
+
369
+ def _single_linkage_clusters(embeddings: np.ndarray, threshold: float) -> list[list[int]]:
370
+ """Single-linkage agglomerative clustering on cosine similarity.
371
+
372
+ Returns list of clusters, each a list of row indices into ``embeddings``.
373
+ Two rows i, j are in the same cluster iff there exists a chain
374
+ i = k_0 ~ k_1 ~ ... ~ k_n = j where each adjacent pair has
375
+ ``cosine(k_i, k_{i+1}) >= threshold``.
376
+ """
377
+ n = embeddings.shape[0]
378
+ if n == 0:
379
+ return []
380
+ # Cosine similarity matrix
381
+ norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
382
+ safe_norms = np.where(norms == 0, 1.0, norms)
383
+ normed = embeddings / safe_norms
384
+ sim = normed @ normed.T
385
+
386
+ # Union-find on edges where sim >= threshold
387
+ parent = list(range(n))
388
+
389
+ def find(x: int) -> int:
390
+ while parent[x] != x:
391
+ parent[x] = parent[parent[x]]
392
+ x = parent[x]
393
+ return x
394
+
395
+ def union(a: int, b: int) -> None:
396
+ ra, rb = find(a), find(b)
397
+ if ra != rb:
398
+ parent[ra] = rb
399
+
400
+ for i in range(n):
401
+ for j in range(i + 1, n):
402
+ if sim[i, j] >= threshold:
403
+ union(i, j)
404
+
405
+ # Group by root
406
+ groups: dict[int, list[int]] = {}
407
+ for i in range(n):
408
+ groups.setdefault(find(i), []).append(i)
409
+ return list(groups.values())
410
+
411
+
412
+ def _divergence_score(embeddings: np.ndarray, clusters: list[list[int]]) -> float:
413
+ """``1 - min_inter_cluster_similarity`` across all cluster pairs.
414
+
415
+ Higher = stronger drift. ``0.0`` means clusters are barely separated;
416
+ ``1.0`` means orthogonal embeddings.
417
+ """
418
+ if len(clusters) < 2:
419
+ return 0.0
420
+ norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
421
+ safe_norms = np.where(norms == 0, 1.0, norms)
422
+ normed = embeddings / safe_norms
423
+ sim = normed @ normed.T
424
+ min_sim = 1.0
425
+ for a_idx in range(len(clusters)):
426
+ for b_idx in range(a_idx + 1, len(clusters)):
427
+ a, b = clusters[a_idx], clusters[b_idx]
428
+ # Min similarity between any pair across the two clusters
429
+ sub = sim[np.ix_(a, b)]
430
+ pair_min = float(sub.min())
431
+ min_sim = min(min_sim, pair_min)
432
+ return 1.0 - max(0.0, min_sim)
@@ -41,6 +41,7 @@
41
41
  "DedupReport",
42
42
  "DelimitVariant",
43
43
  "DiacriticInjection",
44
+ "DriftCluster",
44
45
  "EmbeddingCosineStrategy",
45
46
  "EncodeVariant",
46
47
  "EvalSlice",
@@ -106,6 +107,7 @@
106
107
  "SimilarityAuditReport",
107
108
  "SimilarityStrategy",
108
109
  "SingleSliceLoader",
110
+ "SisterDocDriftReport",
109
111
  "SliceAwareScorer",
110
112
  "SourceDisjointKFoldSplitter",
111
113
  "SourceRoleRecord",
@@ -247,6 +249,7 @@
247
249
  "validate_prediction_artifact_ref",
248
250
  "validate_reader_value_bindings",
249
251
  "validate_results",
252
+ "validate_sister_doc_concept_drift",
250
253
  "validate_source_roles",
251
254
  "walk_path",
252
255
  "wilson_interval",
@@ -575,6 +578,14 @@
575
578
  "kind": "class",
576
579
  "signature": "(ratio: 'float' = 0.3, seed: 'int' = 42, name: 'str' = 'diacritic') -> None"
577
580
  },
581
+ "DriftCluster": {
582
+ "bases": [
583
+ "object"
584
+ ],
585
+ "doc_first_line": "A concept token whose occurrences split into >1 semantic cluster.",
586
+ "kind": "class",
587
+ "signature": "(token: 'str', sentences: 'tuple[tuple[Path, int, str], ...]', divergence_score: 'float') -> None"
588
+ },
578
589
  "EmbeddingCosineStrategy": {
579
590
  "bases": [
580
591
  "object"
@@ -1152,6 +1163,14 @@
1152
1163
  "kind": "class",
1153
1164
  "signature": "(slice_: 'EvalSlice', name: 'str' = '', description: 'str' = '') -> None"
1154
1165
  },
1166
+ "SisterDocDriftReport": {
1167
+ "bases": [
1168
+ "object"
1169
+ ],
1170
+ "doc_first_line": "Result of :func:`validate_sister_doc_concept_drift`.",
1171
+ "kind": "class",
1172
+ "signature": "(drift_clusters: 'tuple[DriftCluster, ...]', consistent_tokens: 'tuple[str, ...]', coverage: 'float') -> None"
1173
+ },
1155
1174
  "SliceAwareScorer": {
1156
1175
  "bases": [
1157
1176
  "Scorer",
@@ -1401,7 +1420,7 @@
1401
1420
  "doc_first_line": "str(object='') -> str",
1402
1421
  "kind": "value",
1403
1422
  "type": "str",
1404
- "value": "'1.0.3'"
1423
+ "value": "'1.0.4'"
1405
1424
  },
1406
1425
  "apply_operating_points": {
1407
1426
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1958,6 +1977,11 @@
1958
1977
  "kind": "function",
1959
1978
  "signature": "(payload: 'Mapping[str, object]') -> 'None'"
1960
1979
  },
1980
+ "validate_sister_doc_concept_drift": {
1981
+ "doc_first_line": "Validate cross-doc semantic consistency of concept token definitions.",
1982
+ "kind": "function",
1983
+ "signature": "(*, files: 'Sequence[Path | str]', concept_tokens: 'Sequence[str]', embedder: 'Callable[[Sequence[str]], np.ndarray] | None' = None, similarity_threshold: 'float' = 0.7, context_window_sentences: 'int' = 1) -> 'SisterDocDriftReport'"
1984
+ },
1961
1985
  "validate_source_roles": {
1962
1986
  "doc_first_line": "Return validation errors for generic source-role records.",
1963
1987
  "kind": "function",