eval-toolkit 1.0.2__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/CHANGELOG.md +99 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/PKG-INFO +1 -1
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/__init__.py +15 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-1.0.4/src/eval_toolkit/audit_sister_doc_concept_drift.py +432 -0
- eval_toolkit-1.0.4/src/eval_toolkit/audit_value_bindings.py +448 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/public_api/snapshot.json +58 -1
- eval_toolkit-1.0.4/tests/test_audit_sister_doc_concept_drift.py +337 -0
- eval_toolkit-1.0.4/tests/test_audit_value_bindings.py +338 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/.gitignore +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/LICENSE +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/STYLE.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/archive/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/research/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/pyproject.toml +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/conftest.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/strategies.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_claims.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_cli.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_config.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_logging.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_losses.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_paths.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_probes.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_rng.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_splits.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.4}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,105 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.4] — 2026-05-26 — `audit_sister_doc_concept_drift` module (closes #72)
|
|
9
|
+
|
|
10
|
+
Tier-2 ADDITIVE — third (and final) member of the audit-validator
|
|
11
|
+
family. Flat-module per [ADR 0001](docs/source/adr/0001-flat-module-layout.md).
|
|
12
|
+
Family complete: `audit_citation_alignment` (v1.0.1) + `audit_value_bindings`
|
|
13
|
+
(v1.0.3) + `audit_sister_doc_concept_drift` (this release).
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
|
|
17
|
+
- **`audit_sister_doc_concept_drift` module** exporting
|
|
18
|
+
`validate_sister_doc_concept_drift()` + `DriftCluster` +
|
|
19
|
+
`SisterDocDriftReport` as Tier 1 STRICT (per
|
|
20
|
+
[ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
|
|
21
|
+
Catches the bug class where two linked sister docs reference the
|
|
22
|
+
same concept token (e.g., `T1`, `manifest v3`) but the
|
|
23
|
+
surrounding-sentence definitions semantically disagree.
|
|
24
|
+
Cross-doc semantic drift survives lychee (links resolve), anchor
|
|
25
|
+
audits (anchors exist), and numeric audits (qualitative prose).
|
|
26
|
+
- Algorithm: per concept_token, scan all files for occurrences;
|
|
27
|
+
extract surrounding-sentence context (`context_window_sentences`);
|
|
28
|
+
embed each snippet via the supplied `embedder` (default lazily
|
|
29
|
+
routes to `make_minilm_embedder()` — requires `[embeddings]`
|
|
30
|
+
optional extra); cluster via single-linkage cosine similarity at
|
|
31
|
+
`similarity_threshold` (default 0.7); tokens with >1 cluster are
|
|
32
|
+
flagged as `DriftCluster`.
|
|
33
|
+
- The `embedder: Callable[[Sequence[str]], np.ndarray] | None`
|
|
34
|
+
parameter matches the existing
|
|
35
|
+
`EmbeddingCosineStrategy.embedder` Protocol — consumers can pass
|
|
36
|
+
any embedder (BGE, E5, OpenAI, or a mock for tests). Default
|
|
37
|
+
`None` defers `sentence_transformers` import to call-time
|
|
38
|
+
(`[embeddings]` extra is required only when caller doesn't supply
|
|
39
|
+
their own embedder).
|
|
40
|
+
- Motivating bug class: consumer audit found
|
|
41
|
+
`docs/REPRODUCIBILITY.md:85` defines `T1` as "full canonical
|
|
42
|
+
re-eval (GPU; A100 80GB)" while `WRITEUP/reproducibility.md:33`
|
|
43
|
+
defines `T1` as "smoke (laptop, $0, ~10 min)" — the two docs
|
|
44
|
+
cross-link as "Aggregator docs" so a reviewer following the link
|
|
45
|
+
lands on contradictory definitions.
|
|
46
|
+
- 13 tests at `tests/test_audit_sister_doc_concept_drift.py` using a
|
|
47
|
+
deterministic mock embedder (no `sentence_transformers` dependency
|
|
48
|
+
for unit tests). Covers: seed-case T1 drift, consistent definition
|
|
49
|
+
across files, single-occurrence consistency, unreferenced-token
|
|
50
|
+
coverage tracking, multi-token mixed (T0 + T1 + T3), threshold
|
|
51
|
+
sensitivity, whole-word boundary (`T1` vs `T10` vs `t1`), context
|
|
52
|
+
window scope, empty inputs, 3-way drift, frozen-dataclass
|
|
53
|
+
invariants, lazy default-embedder import. Closes #72.
|
|
54
|
+
|
|
55
|
+
### Audit-validator family complete
|
|
56
|
+
|
|
57
|
+
| Validator | Released | Issue |
|
|
58
|
+
|---|---|---|
|
|
59
|
+
| `audit_citation_alignment` | v1.0.1 | #73 |
|
|
60
|
+
| `audit_value_bindings` | v1.0.3 | #71 |
|
|
61
|
+
| `audit_sister_doc_concept_drift` | v1.0.4 (this release) | #72 |
|
|
62
|
+
|
|
63
|
+
All three follow the flat-module convention (ADR 0001), closed-config
|
|
64
|
+
pattern (consumer supplies the auditable surface; validator owns the
|
|
65
|
+
parsing+matching logic; ADR 0002), and Tier 1 STRICT top-level
|
|
66
|
+
exports per ADR 0003. Consumer adoption pattern is the same across
|
|
67
|
+
all three: thin `scripts/audit_<name>.py` CLI wrapper invoking the
|
|
68
|
+
upstream validator.
|
|
69
|
+
|
|
70
|
+
## [1.0.3] — 2026-05-26 — `audit_value_bindings` module (closes #71)
|
|
71
|
+
|
|
72
|
+
Tier-2 ADDITIVE — second member of the audit-validator family
|
|
73
|
+
following `audit_citation_alignment` (v1.0.1). Flat-module per
|
|
74
|
+
[ADR 0001](docs/source/adr/0001-flat-module-layout.md).
|
|
75
|
+
|
|
76
|
+
### Added
|
|
77
|
+
|
|
78
|
+
- **`audit_value_bindings` module** exporting
|
|
79
|
+
`validate_reader_value_bindings()` + `Match` + `Violation` +
|
|
80
|
+
`ValueBindingsReport` as Tier 1 STRICT (per
|
|
81
|
+
[ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
|
|
82
|
+
Catches the bug class where a markdown surface pairs a detector name
|
|
83
|
+
with the **wrong** canonical value — both values exist in the
|
|
84
|
+
source-of-truth table but the binding is misordered. Motivated by
|
|
85
|
+
the consumer V1.3.1 ADR-080 audit-fix patch closure (2026-05-22)
|
|
86
|
+
where `WRITEUP_NARRATIVE.md:38` said "TF-IDF + logistic regression
|
|
87
|
+
baseline reaches 0.974 AUPRC" but canonical TF-IDF direct val AUPRC
|
|
88
|
+
is 0.971 (0.974 was LoRA's value). The existing `audit_numbers.py`
|
|
89
|
+
validates VALUES against source data but not BINDINGS — this
|
|
90
|
+
validator closes that gap.
|
|
91
|
+
- Cross-detector disambiguation: when multiple detectors and values
|
|
92
|
+
appear in the same paragraph (e.g., "TF-IDF achieves 0.971, while
|
|
93
|
+
LoRA reaches 0.974"), each value pairs with the LAST detector
|
|
94
|
+
appearing before it in text order (falling back to first detector
|
|
95
|
+
after if no before-detector is in range). Avoids false-positive
|
|
96
|
+
bindings across closely-spaced detector mentions.
|
|
97
|
+
- Coverage metric: `ValueBindingsReport.coverage` reports the fraction
|
|
98
|
+
of `(detector, metric)` keys in the canonical `bindings` dict that
|
|
99
|
+
produced at least one `Match` — useful for detecting stale or
|
|
100
|
+
unreferenced bindings in reader prose.
|
|
101
|
+
- 13 tests at `tests/test_audit_value_bindings.py` including the
|
|
102
|
+
verbatim WRITEUP_NARRATIVE seed-case regression, alias resolution
|
|
103
|
+
(detector + metric), distance-window edge, value-without-metric
|
|
104
|
+
skip, coverage fraction, tolerance band, multi-detector
|
|
105
|
+
disambiguation, frozen-dataclass invariants. Closes #71.
|
|
106
|
+
|
|
8
107
|
## [1.0.2] — 2026-05-26 — #76 cleanup batch closes (RC2 + RC3 + F-metrics-1/3/4)
|
|
9
108
|
|
|
10
109
|
Closes the GH #76 v1.0.1 cleanup tracker. All 6 items shipped across
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -60,6 +60,21 @@ _EXPORTS: dict[str, str] = {
|
|
|
60
60
|
"CitationMisalignment": "eval_toolkit.audit_citation_alignment",
|
|
61
61
|
"extract_adr_subject_category": "eval_toolkit.audit_citation_alignment",
|
|
62
62
|
"validate_citations": "eval_toolkit.audit_citation_alignment",
|
|
63
|
+
# --- audit_value_bindings ---
|
|
64
|
+
# Flat-module per ADR 0001. Closes #71. Motivated by consumer V1.3.1
|
|
65
|
+
# ADR-080 audit-fix finding (TF-IDF / LoRA 0.974 value mis-binding).
|
|
66
|
+
"Match": "eval_toolkit.audit_value_bindings",
|
|
67
|
+
"ValueBindingsReport": "eval_toolkit.audit_value_bindings",
|
|
68
|
+
"Violation": "eval_toolkit.audit_value_bindings",
|
|
69
|
+
"validate_reader_value_bindings": "eval_toolkit.audit_value_bindings",
|
|
70
|
+
# --- audit_sister_doc_concept_drift ---
|
|
71
|
+
# Flat-module per ADR 0001. Closes #72. Motivated by consumer T1
|
|
72
|
+
# definition contradiction across sister reproducibility docs.
|
|
73
|
+
# Default embedder requires the [embeddings] extra (lazy resolution; embedder kwarg
|
|
74
|
+
# lets consumers swap in any callable).
|
|
75
|
+
"DriftCluster": "eval_toolkit.audit_sister_doc_concept_drift",
|
|
76
|
+
"SisterDocDriftReport": "eval_toolkit.audit_sister_doc_concept_drift",
|
|
77
|
+
"validate_sister_doc_concept_drift": "eval_toolkit.audit_sister_doc_concept_drift",
|
|
63
78
|
# --- losses ---
|
|
64
79
|
"RecallAtLowFPR": "eval_toolkit.losses",
|
|
65
80
|
# --- preprocessing ---
|
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
r"""Sister-doc concept-drift validator (embedding-similarity-based).
|
|
2
|
+
|
|
3
|
+
Catches the bug class where two linked sister docs both reference the
|
|
4
|
+
same concept token (e.g., ``T1``, ``manifest v3``, ``verified_disjoint``)
|
|
5
|
+
but the **surrounding-sentence definitions disagree**. Cross-doc drift
|
|
6
|
+
survives lychee (links resolve), anchor audits (anchors exist), and
|
|
7
|
+
numeric audits (numbers don't disagree because the prose is qualitative).
|
|
8
|
+
|
|
9
|
+
Motivating test case (from `prompt-injection-detection-submission`
|
|
10
|
+
audit, two reproducibility surfaces)::
|
|
11
|
+
|
|
12
|
+
docs/REPRODUCIBILITY.md:85:
|
|
13
|
+
T1 = "full canonical re-eval (GPU; A100 80GB): make headline-cloud
|
|
14
|
+
re-runs ... ~7h wall-clock; ~$28 GPU spend"
|
|
15
|
+
|
|
16
|
+
WRITEUP/reproducibility.md:33:
|
|
17
|
+
T1 = "smoke (laptop, $0, ~10 min): `make smoke` verifies code health"
|
|
18
|
+
|
|
19
|
+
Both files cross-link as "Aggregator docs"; following the link lands a
|
|
20
|
+
reader on contradictory T1 definitions.
|
|
21
|
+
|
|
22
|
+
Algorithm
|
|
23
|
+
---------
|
|
24
|
+
1. For each ``concept_token``, scan all ``files`` for occurrences. Each
|
|
25
|
+
occurrence captures the *surrounding sentence(s)* (configurable
|
|
26
|
+
``context_window_sentences``) — that's the candidate "definition".
|
|
27
|
+
2. Embed each surrounding-sentence string via the supplied ``embedder``
|
|
28
|
+
(default: lazy :func:`eval_toolkit.embeddings.make_minilm_embedder`).
|
|
29
|
+
3. Cluster occurrences by single-linkage: two occurrences belong to the
|
|
30
|
+
same cluster iff their cosine similarity is ``>= similarity_threshold``.
|
|
31
|
+
4. A concept_token with **>1 cluster** is a :class:`DriftCluster` — its
|
|
32
|
+
occurrences split into semantically distinct definition groups.
|
|
33
|
+
5. A concept_token with **exactly 1 cluster** is consistent across all
|
|
34
|
+
files.
|
|
35
|
+
|
|
36
|
+
Design (per ADR 0001 flat-module + ADR 0002 closed-config + ADR 0003
|
|
37
|
+
Tier 2 ADDITIVE on the ``[embeddings]`` optional extra surface):
|
|
38
|
+
|
|
39
|
+
- Consumer supplies the concept-token list + file paths; validator owns
|
|
40
|
+
parsing + embedding + clustering + report assembly.
|
|
41
|
+
- Embedder is a callable ``Callable[[Sequence[str]], np.ndarray]`` —
|
|
42
|
+
matches the existing :func:`~eval_toolkit.embeddings.make_minilm_embedder`
|
|
43
|
+
factory contract. ``embedder=None`` defers to the canonical MiniLM
|
|
44
|
+
recipe lazily (avoids forcing the ``[embeddings]`` extra import at
|
|
45
|
+
module load time).
|
|
46
|
+
- Flat-module: ``eval_toolkit.audit_sister_doc_concept_drift.*`` (NOT a
|
|
47
|
+
subpackage per ADR 0001 stay-flat-through-v1.x).
|
|
48
|
+
|
|
49
|
+
Closes upstream issue #72. v1.0.4. Completes the audit-validator family
|
|
50
|
+
of 3 (citation_alignment v1.0.1, value_bindings v1.0.3, sister_doc
|
|
51
|
+
concept_drift v1.0.4).
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
from __future__ import annotations
|
|
55
|
+
|
|
56
|
+
import re
|
|
57
|
+
from collections.abc import Callable, Sequence
|
|
58
|
+
from dataclasses import dataclass
|
|
59
|
+
from pathlib import Path
|
|
60
|
+
|
|
61
|
+
import numpy as np
|
|
62
|
+
|
|
63
|
+
__all__ = [
|
|
64
|
+
"DriftCluster",
|
|
65
|
+
"SisterDocDriftReport",
|
|
66
|
+
"validate_sister_doc_concept_drift",
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
DEFAULT_SIMILARITY_THRESHOLD: float = 0.7
|
|
71
|
+
DEFAULT_CONTEXT_WINDOW_SENTENCES: int = 1
|
|
72
|
+
|
|
73
|
+
# Sentence-ish splitter — markdown is not formal prose. Splits on
|
|
74
|
+
# whitespace that follows ``.``, ``!``, or ``?`` and precedes A-Z, a digit, or a backtick. Imperfect but
|
|
75
|
+
# robust enough for cross-doc concept-drift detection (consumers
|
|
76
|
+
# tolerate boundary slop because clustering is the noise-tolerant
|
|
77
|
+
# downstream step).
|
|
78
|
+
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\d`])")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass(frozen=True)
|
|
82
|
+
class DriftCluster:
|
|
83
|
+
"""A concept token whose occurrences split into >1 semantic cluster.
|
|
84
|
+
|
|
85
|
+
Attributes
|
|
86
|
+
----------
|
|
87
|
+
token : str
|
|
88
|
+
The concept token (e.g., ``"T1"``, ``"manifest v3"``).
|
|
89
|
+
sentences : tuple[tuple[Path, int, str], ...]
|
|
90
|
+
Each occurrence as ``(file, line, surrounding_text)`` — line is
|
|
91
|
+
1-indexed; surrounding_text is the ``context_window_sentences``-sized
|
|
92
|
+
prose snippet that was embedded for clustering.
|
|
93
|
+
divergence_score : float
|
|
94
|
+
``1 - min_inter_cluster_similarity`` for the worst-case pair
|
|
95
|
+
between any two clusters. Range ``[0.0, 1.0]``; higher = stronger
|
|
96
|
+
drift signal. ``0.0`` means clusters are barely distinguishable;
|
|
97
|
+
``1.0`` means orthogonal embeddings.
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
token: str
|
|
101
|
+
sentences: tuple[tuple[Path, int, str], ...]
|
|
102
|
+
divergence_score: float
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass(frozen=True)
|
|
106
|
+
class SisterDocDriftReport:
|
|
107
|
+
"""Result of :func:`validate_sister_doc_concept_drift`.
|
|
108
|
+
|
|
109
|
+
Attributes
|
|
110
|
+
----------
|
|
111
|
+
drift_clusters : tuple[DriftCluster, ...]
|
|
112
|
+
Each concept_token whose occurrences split into >1 cluster.
|
|
113
|
+
Empty tuple = all tokens consistent across the scanned files.
|
|
114
|
+
consistent_tokens : tuple[str, ...]
|
|
115
|
+
Concept tokens whose occurrences clustered to a single group
|
|
116
|
+
(or had ≤1 occurrence total). Reported for completeness +
|
|
117
|
+
coverage tracking.
|
|
118
|
+
coverage : float
|
|
119
|
+
Fraction of ``concept_tokens`` that produced ≥1 occurrence in
|
|
120
|
+
the scanned files. Range ``[0.0, 1.0]``. ``1.0`` means every
|
|
121
|
+
token was referenced; lower values flag stale tokens.
|
|
122
|
+
"""
|
|
123
|
+
|
|
124
|
+
drift_clusters: tuple[DriftCluster, ...]
|
|
125
|
+
consistent_tokens: tuple[str, ...]
|
|
126
|
+
coverage: float
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def validate_sister_doc_concept_drift(
    *,
    files: Sequence[Path | str],
    concept_tokens: Sequence[str],
    embedder: Callable[[Sequence[str]], np.ndarray] | None = None,
    similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
    context_window_sentences: int = DEFAULT_CONTEXT_WINDOW_SENTENCES,
) -> SisterDocDriftReport:
    """Validate cross-doc semantic consistency of concept token definitions.

    For each ``concept_token``, scan ``files`` for occurrences; extract
    the surrounding ``context_window_sentences``; embed each surrounding
    snippet; cluster by single-linkage cosine similarity at
    ``similarity_threshold``. Tokens that produce >1 cluster are flagged
    as drift.

    Parameters
    ----------
    files : Sequence[Path | str]
        Markdown files to scan. UTF-8 encoded. Files that cannot be read
        (``OSError``) are skipped.
    concept_tokens : Sequence[str]
        Seed list of concept tokens (e.g., ``["T0", "T1", "T3",
        "manifest v3", "verified_disjoint"]``). Each token is matched
        case-sensitively as a whole word (regex ``\\b`` boundaries around
        the escaped token; see :func:`_collect_occurrences`). Repeated
        entries are collapsed to their first occurrence so that they are
        neither embedded twice nor counted twice in ``coverage``.
    embedder : Callable[[Sequence[str]], np.ndarray] | None, optional
        Embedder callable returning ``(n, d)`` array. ``None`` (default)
        lazily routes to :func:`eval_toolkit.embeddings.make_minilm_embedder`
        — requires the ``[embeddings]`` optional extra
        (``pip install eval-toolkit[embeddings]``). Custom callables let
        consumers swap in any embedder (BGE, E5, OpenAI, mock for tests).
    similarity_threshold : float, optional
        Cosine-similarity threshold for single-linkage clustering.
        Default ``0.7``. Higher = stricter (more clusters; more drift
        flagged); lower = looser. ``0.7`` is the conservative default
        for ``all-MiniLM-L6-v2`` — semantic-near-paraphrase territory.
    context_window_sentences : int, optional
        Number of sentences taken on each side of the sentence that
        mentions the token. The snippet handed to the embedder is that
        sentence plus up to this many neighbours before and after it
        (``0`` keeps the sentence alone). Defaults to
        ``DEFAULT_CONTEXT_WINDOW_SENTENCES``; wider windows dilute the
        token-specific signal with surrounding prose.

    Returns
    -------
    SisterDocDriftReport
        ``drift_clusters``, ``consistent_tokens``, ``coverage`` per the
        dataclass. ``coverage`` is computed over the distinct tokens.
        An empty ``concept_tokens`` yields an empty report with
        ``coverage=0.0``.

    Raises
    ------
    ImportError
        If ``embedder=None`` and ``sentence_transformers`` is not
        installed. Install via ``pip install eval-toolkit[embeddings]``.
    ValueError
        If the embedder does not return a 2-D array with exactly one row
        per snippet it was given.

    Notes
    -----
    Clustering: single-linkage agglomerative on cosine similarity. Two
    occurrences are linked directly when their similarity is
    ``>= similarity_threshold``, and links are transitive: ``a~b`` and
    ``b~c`` → ``a, b, c`` in one cluster even if ``cos(a, c) < threshold``.
    This is the canonical SBERT semantic-dedup recipe (see
    :class:`~eval_toolkit.text_dedup.EmbeddingCosineStrategy` for the
    sibling primitive at the inter-text-similarity level).

    Token matching is case-sensitive whole-word — ``"T1"`` matches
    ``"T1"`` but not ``"t1"`` or ``"T10"``. Adjust by passing
    pre-normalized token strings if case-insensitivity is desired.

    See Also
    --------
    eval_toolkit.audit_citation_alignment.validate_citations :
        Sibling validator (catches ADR-citation alignment drift).
    eval_toolkit.audit_value_bindings.validate_reader_value_bindings :
        Sibling validator (catches detector→value binding drift).
    eval_toolkit.embeddings.make_minilm_embedder :
        Default embedder factory.
    """
    files_resolved = tuple(Path(f) for f in files)
    # De-duplicate while keeping first-seen order: hits are tracked in a set,
    # so a repeated token would otherwise deflate coverage (and be embedded
    # and reported more than once).
    tokens = tuple(dict.fromkeys(concept_tokens))
    if not tokens:
        return SisterDocDriftReport(drift_clusters=(), consistent_tokens=(), coverage=0.0)

    # Resolve embedder lazily — defer the [embeddings] extra import
    # to call time so the module loads even when sentence_transformers
    # isn't installed (matches the EmbeddingCosineStrategy pattern in
    # text_dedup.py).
    if embedder is None:
        embedder = _default_embedder()

    # Pre-load all files (avoid re-reading per token).
    file_texts: dict[Path, str] = {}
    for path in files_resolved:
        try:
            file_texts[path] = path.read_text(encoding="utf-8")
        except OSError:
            # Best-effort scan: an unreadable/missing file contributes no
            # occurrences instead of aborting the whole audit.
            continue

    drift_clusters: list[DriftCluster] = []
    consistent_tokens: list[str] = []
    tokens_with_hits: set[str] = set()

    for token in tokens:
        occurrences = _collect_occurrences(token, file_texts, context_window_sentences)
        if not occurrences:
            # Never mentioned: counts against coverage only.
            continue
        tokens_with_hits.add(token)

        if len(occurrences) == 1:
            # A single mention cannot disagree with itself; skip embedding.
            consistent_tokens.append(token)
            continue

        # Embed every surrounding snippet (one batch per token).
        snippets = [occ[2] for occ in occurrences]
        embeddings = np.asarray(embedder(snippets), dtype=np.float64)
        if embeddings.ndim != 2 or embeddings.shape[0] != len(snippets):
            # Cluster indices are row indices; a mismatched shape would
            # silently attribute clusters to the wrong snippets.
            msg = (
                f"embedder returned an array of shape {embeddings.shape} for "
                f"{len(snippets)} snippets; expected ({len(snippets)}, d)."
            )
            raise ValueError(msg)
        clusters = _single_linkage_clusters(embeddings, similarity_threshold)

        if len(clusters) == 1:
            consistent_tokens.append(token)
            continue

        # Compute divergence score from inter-cluster similarity.
        divergence = _divergence_score(embeddings, clusters)
        drift_clusters.append(
            DriftCluster(
                token=token,
                sentences=tuple(occurrences),
                divergence_score=divergence,
            )
        )

    # ``tokens`` is non-empty here (early return above), so no zero division.
    coverage = len(tokens_with_hits) / len(tokens)
    return SisterDocDriftReport(
        drift_clusters=tuple(drift_clusters),
        consistent_tokens=tuple(consistent_tokens),
        coverage=coverage,
    )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _default_embedder() -> Callable[[Sequence[str]], np.ndarray]:
    """Build the stock MiniLM embedder, importing its factory on demand.

    The import happens inside the function so that this module stays
    importable without the ``[embeddings]`` extra.

    Raises
    ------
    ImportError
        When ``eval_toolkit.embeddings`` cannot be imported; the message
        carries the install hint and points at the ``embedder=`` escape
        hatch.
    """
    try:
        from eval_toolkit.embeddings import make_minilm_embedder as build_embedder
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "audit_sister_doc_concept_drift requires the [embeddings] optional extra "
            "(sentence_transformers). Install via `pip install eval-toolkit[embeddings]` "
            "OR pass a custom embedder callable via the embedder= kwarg."
        ) from exc
    else:
        return build_embedder()
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _collect_occurrences(
|
|
283
|
+
token: str, file_texts: dict[Path, str], context_window_sentences: int
|
|
284
|
+
) -> list[tuple[Path, int, str]]:
|
|
285
|
+
"""Find every occurrence of ``token`` (whole-word) across files.
|
|
286
|
+
|
|
287
|
+
Returns list of ``(file, line, surrounding_text)`` tuples where
|
|
288
|
+
``surrounding_text`` is the ``context_window_sentences`` window
|
|
289
|
+
centered on the sentence containing the token.
|
|
290
|
+
"""
|
|
291
|
+
occurrences: list[tuple[Path, int, str]] = []
|
|
292
|
+
token_re = re.compile(rf"\b{re.escape(token)}\b")
|
|
293
|
+
for path, text in file_texts.items():
|
|
294
|
+
sentences = _split_sentences(text)
|
|
295
|
+
for s_idx, sent in enumerate(sentences):
|
|
296
|
+
if not token_re.search(sent.text):
|
|
297
|
+
continue
|
|
298
|
+
window_lo = max(0, s_idx - context_window_sentences)
|
|
299
|
+
window_hi = min(len(sentences), s_idx + context_window_sentences + 1)
|
|
300
|
+
surrounding = " ".join(sentences[i].text for i in range(window_lo, window_hi))
|
|
301
|
+
occurrences.append((path, sent.line, surrounding))
|
|
302
|
+
return occurrences
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
@dataclass(frozen=True)
|
|
306
|
+
class _SentenceSpan:
|
|
307
|
+
text: str
|
|
308
|
+
line: int # 1-indexed line of the sentence's start
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _split_sentences(text: str) -> list[_SentenceSpan]:
    """Split markdown text into sentence spans with line numbers.

    Imperfect by design: fenced code blocks (delimited by lines whose
    first non-blank characters are three backticks) are dropped entirely,
    delimiter lines included; every other region is treated as prose.
    Headings (``#``) and ``- `` / ``* `` bullets each become one span;
    any other non-blank line is cut with ``_SENTENCE_SPLIT_RE``. Good
    enough for concept-drift detection at the
    sentence-of-context-around-token granularity.

    Parameters
    ----------
    text : str
        Full markdown document.

    Returns
    -------
    list[_SentenceSpan]
        Spans in document order. Every span produced from a given line
        carries that line's 1-indexed number.
    """
    spans: list[_SentenceSpan] = []
    in_fence = False
    line_no = 1  # 1-indexed number of the line currently being examined
    for raw_line in text.splitlines(keepends=True):
        current_line = line_no

        is_delimiter = raw_line.lstrip().startswith("```")
        if is_delimiter or in_fence:
            if is_delimiter:
                in_fence = not in_fence
            # Code (and the fence markers themselves, e.g. "```python")
            # must not surface as prose sentences, but the line still
            # occupies one line for numbering purposes.
            line_no += 1
            continue

        # Only "\n" advances the counter, so "\r\n" counts once and exotic
        # separators that ``splitlines`` also honours do not shift numbers.
        line_no += raw_line.count("\n")

        line_text = raw_line.strip()
        if not line_text:
            continue

        # Headings and bullets stay isolated as a single span each.
        if line_text.startswith(("#", "- ", "* ")):
            spans.append(_SentenceSpan(text=line_text, line=current_line))
            continue

        # Everything else: split on sentence-ish delimiters.
        for piece in _SENTENCE_SPLIT_RE.split(line_text):
            piece = piece.strip()
            if piece:
                spans.append(_SentenceSpan(text=piece, line=current_line))
    return spans
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _single_linkage_clusters(embeddings: np.ndarray, threshold: float) -> list[list[int]]:
|
|
370
|
+
"""Single-linkage agglomerative clustering on cosine similarity.
|
|
371
|
+
|
|
372
|
+
Returns list of clusters, each a list of row indices into ``embeddings``.
|
|
373
|
+
Two rows i, j are in the same cluster iff there exists a chain
|
|
374
|
+
i = k_0 ~ k_1 ~ ... ~ k_n = j where each adjacent pair has
|
|
375
|
+
``cosine(k_i, k_{i+1}) >= threshold``.
|
|
376
|
+
"""
|
|
377
|
+
n = embeddings.shape[0]
|
|
378
|
+
if n == 0:
|
|
379
|
+
return []
|
|
380
|
+
# Cosine similarity matrix
|
|
381
|
+
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
|
382
|
+
safe_norms = np.where(norms == 0, 1.0, norms)
|
|
383
|
+
normed = embeddings / safe_norms
|
|
384
|
+
sim = normed @ normed.T
|
|
385
|
+
|
|
386
|
+
# Union-find on edges where sim >= threshold
|
|
387
|
+
parent = list(range(n))
|
|
388
|
+
|
|
389
|
+
def find(x: int) -> int:
|
|
390
|
+
while parent[x] != x:
|
|
391
|
+
parent[x] = parent[parent[x]]
|
|
392
|
+
x = parent[x]
|
|
393
|
+
return x
|
|
394
|
+
|
|
395
|
+
def union(a: int, b: int) -> None:
|
|
396
|
+
ra, rb = find(a), find(b)
|
|
397
|
+
if ra != rb:
|
|
398
|
+
parent[ra] = rb
|
|
399
|
+
|
|
400
|
+
for i in range(n):
|
|
401
|
+
for j in range(i + 1, n):
|
|
402
|
+
if sim[i, j] >= threshold:
|
|
403
|
+
union(i, j)
|
|
404
|
+
|
|
405
|
+
# Group by root
|
|
406
|
+
groups: dict[int, list[int]] = {}
|
|
407
|
+
for i in range(n):
|
|
408
|
+
groups.setdefault(find(i), []).append(i)
|
|
409
|
+
return list(groups.values())
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _divergence_score(embeddings: np.ndarray, clusters: list[list[int]]) -> float:
|
|
413
|
+
"""``1 - min_inter_cluster_similarity`` across all cluster pairs.
|
|
414
|
+
|
|
415
|
+
Higher = stronger drift. ``0.0`` means clusters are barely separated;
|
|
416
|
+
``1.0`` means orthogonal embeddings.
|
|
417
|
+
"""
|
|
418
|
+
if len(clusters) < 2:
|
|
419
|
+
return 0.0
|
|
420
|
+
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
|
421
|
+
safe_norms = np.where(norms == 0, 1.0, norms)
|
|
422
|
+
normed = embeddings / safe_norms
|
|
423
|
+
sim = normed @ normed.T
|
|
424
|
+
min_sim = 1.0
|
|
425
|
+
for a_idx in range(len(clusters)):
|
|
426
|
+
for b_idx in range(a_idx + 1, len(clusters)):
|
|
427
|
+
a, b = clusters[a_idx], clusters[b_idx]
|
|
428
|
+
# Min similarity between any pair across the two clusters
|
|
429
|
+
sub = sim[np.ix_(a, b)]
|
|
430
|
+
pair_min = float(sub.min())
|
|
431
|
+
min_sim = min(min_sim, pair_min)
|
|
432
|
+
return 1.0 - max(0.0, min_sim)
|