eval-toolkit 1.0.3__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/CHANGELOG.md +62 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/PKG-INFO +1 -1
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/__init__.py +8 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-1.0.4/src/eval_toolkit/audit_sister_doc_concept_drift.py +432 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/public_api/snapshot.json +25 -1
- eval_toolkit-1.0.4/tests/test_audit_sister_doc_concept_drift.py +337 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/.gitignore +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/LICENSE +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/STYLE.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/archive/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/pyproject.toml +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/audit_value_bindings.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/conftest.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/strategies.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_audit_value_bindings.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_claims.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_cli.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_config.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_logging.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_losses.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_paths.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_probes.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_rng.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_splits.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.0.3 → eval_toolkit-1.0.4}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,68 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.4] — 2026-05-26 — `audit_sister_doc_concept_drift` module (closes #72)
|
|
9
|
+
|
|
10
|
+
Tier-2 ADDITIVE — third (and final) member of the audit-validator
|
|
11
|
+
family. Flat-module per [ADR 0001](docs/source/adr/0001-flat-module-layout.md).
|
|
12
|
+
Family complete: `audit_citation_alignment` (v1.0.1) + `audit_value_bindings`
|
|
13
|
+
(v1.0.3) + `audit_sister_doc_concept_drift` (this release).
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
|
|
17
|
+
- **`audit_sister_doc_concept_drift` module** exporting
|
|
18
|
+
`validate_sister_doc_concept_drift()` + `DriftCluster` +
|
|
19
|
+
`SisterDocDriftReport` as Tier 1 STRICT (per
|
|
20
|
+
[ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
|
|
21
|
+
Catches the bug class where two linked sister docs reference the
|
|
22
|
+
same concept token (e.g., `T1`, `manifest v3`) but the
|
|
23
|
+
surrounding-sentence definitions semantically disagree.
|
|
24
|
+
Cross-doc semantic drift survives lychee (links resolve), anchor
|
|
25
|
+
audits (anchors exist), and numeric audits (qualitative prose).
|
|
26
|
+
- Algorithm: per concept_token, scan all files for occurrences;
|
|
27
|
+
extract surrounding-sentence context (`context_window_sentences`);
|
|
28
|
+
embed each snippet via the supplied `embedder` (default lazily
|
|
29
|
+
routes to `make_minilm_embedder()` — requires `[embeddings]`
|
|
30
|
+
optional extra); cluster via single-linkage cosine similarity at
|
|
31
|
+
`similarity_threshold` (default 0.7); tokens with >1 cluster are
|
|
32
|
+
flagged as `DriftCluster`.
|
|
33
|
+
- The `embedder: Callable[[Sequence[str]], np.ndarray] | None`
|
|
34
|
+
parameter matches the existing
|
|
35
|
+
`EmbeddingCosineStrategy.embedder` Protocol — consumers can pass
|
|
36
|
+
any embedder (BGE, E5, OpenAI, or a mock for tests). Default
|
|
37
|
+
`None` defers `sentence_transformers` import to call-time
|
|
38
|
+
(`[embeddings]` extra is required only when caller doesn't supply
|
|
39
|
+
their own embedder).
|
|
40
|
+
- Motivating bug class: consumer audit found
|
|
41
|
+
`docs/REPRODUCIBILITY.md:85` defines `T1` as "full canonical
|
|
42
|
+
re-eval (GPU; A100 80GB)" while `WRITEUP/reproducibility.md:33`
|
|
43
|
+
defines `T1` as "smoke (laptop, $0, ~10 min)" — the two docs
|
|
44
|
+
cross-link as "Aggregator docs" so a reviewer following the link
|
|
45
|
+
lands on contradictory definitions.
|
|
46
|
+
- 13 tests at `tests/test_audit_sister_doc_concept_drift.py` using a
|
|
47
|
+
deterministic mock embedder (no `sentence_transformers` dependency
|
|
48
|
+
for unit tests). Covers: seed-case T1 drift, consistent definition
|
|
49
|
+
across files, single-occurrence consistency, unreferenced-token
|
|
50
|
+
coverage tracking, multi-token mixed (T0 + T1 + T3), threshold
|
|
51
|
+
sensitivity, whole-word boundary (`T1` vs `T10` vs `t1`), context
|
|
52
|
+
window scope, empty inputs, 3-way drift, frozen-dataclass
|
|
53
|
+
invariants, lazy default-embedder import. Closes #72.
|
|
54
|
+
|
|
55
|
+
### Audit-validator family complete
|
|
56
|
+
|
|
57
|
+
| Validator | Released | Issue |
|
|
58
|
+
|---|---|---|
|
|
59
|
+
| `audit_citation_alignment` | v1.0.1 | #73 |
|
|
60
|
+
| `audit_value_bindings` | v1.0.3 | #71 |
|
|
61
|
+
| `audit_sister_doc_concept_drift` | v1.0.4 (this release) | #72 |
|
|
62
|
+
|
|
63
|
+
All three follow the flat-module convention (ADR 0001), closed-config
|
|
64
|
+
pattern (consumer supplies the auditable surface; validator owns the
|
|
65
|
+
parsing+matching logic; ADR 0002), and Tier 1 STRICT top-level
|
|
66
|
+
exports per ADR 0003. Consumer adoption pattern is the same across
|
|
67
|
+
all three: thin `scripts/audit_<name>.py` CLI wrapper invoking the
|
|
68
|
+
upstream validator.
|
|
69
|
+
|
|
8
70
|
## [1.0.3] — 2026-05-26 — `audit_value_bindings` module (closes #71)
|
|
9
71
|
|
|
10
72
|
Tier-2 ADDITIVE — second member of the audit-validator family
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.0.3
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -67,6 +67,14 @@ _EXPORTS: dict[str, str] = {
|
|
|
67
67
|
"ValueBindingsReport": "eval_toolkit.audit_value_bindings",
|
|
68
68
|
"Violation": "eval_toolkit.audit_value_bindings",
|
|
69
69
|
"validate_reader_value_bindings": "eval_toolkit.audit_value_bindings",
|
|
70
|
+
# --- audit_sister_doc_concept_drift ---
|
|
71
|
+
# Flat-module per ADR 0001. Closes #72. Motivated by consumer T1
|
|
72
|
+
# definition contradiction across sister reproducibility docs.
|
|
73
|
+
# Requires [embeddings] extra (lazy resolution; embedder kwarg
|
|
74
|
+
# lets consumers swap in any callable).
|
|
75
|
+
"DriftCluster": "eval_toolkit.audit_sister_doc_concept_drift",
|
|
76
|
+
"SisterDocDriftReport": "eval_toolkit.audit_sister_doc_concept_drift",
|
|
77
|
+
"validate_sister_doc_concept_drift": "eval_toolkit.audit_sister_doc_concept_drift",
|
|
70
78
|
# --- losses ---
|
|
71
79
|
"RecallAtLowFPR": "eval_toolkit.losses",
|
|
72
80
|
# --- preprocessing ---
|
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
r"""Sister-doc concept-drift validator (embedding-similarity-based).
|
|
2
|
+
|
|
3
|
+
Catches the bug class where two linked sister docs both reference the
|
|
4
|
+
same concept token (e.g., ``T1``, ``manifest v3``, ``verified_disjoint``)
|
|
5
|
+
but the **surrounding-sentence definitions disagree**. Cross-doc drift
|
|
6
|
+
survives lychee (links resolve), anchor audits (anchors exist), and
|
|
7
|
+
numeric audits (numbers don't disagree because the prose is qualitative).
|
|
8
|
+
|
|
9
|
+
Motivating test case (from `prompt-injection-detection-submission`
|
|
10
|
+
audit, two reproducibility surfaces)::
|
|
11
|
+
|
|
12
|
+
docs/REPRODUCIBILITY.md:85:
|
|
13
|
+
T1 = "full canonical re-eval (GPU; A100 80GB): make headline-cloud
|
|
14
|
+
re-runs ... ~7h wall-clock; ~$28 GPU spend"
|
|
15
|
+
|
|
16
|
+
WRITEUP/reproducibility.md:33:
|
|
17
|
+
T1 = "smoke (laptop, $0, ~10 min): `make smoke` verifies code health"
|
|
18
|
+
|
|
19
|
+
Both files cross-link as "Aggregator docs"; following the link lands a
|
|
20
|
+
reader on contradictory T1 definitions.
|
|
21
|
+
|
|
22
|
+
Algorithm
|
|
23
|
+
---------
|
|
24
|
+
1. For each ``concept_token``, scan all ``files`` for occurrences. Each
|
|
25
|
+
occurrence captures the *surrounding sentence(s)* (configurable
|
|
26
|
+
``context_window_sentences``) — that's the candidate "definition".
|
|
27
|
+
2. Embed each surrounding-sentence string via the supplied ``embedder``
|
|
28
|
+
(default: lazy :func:`eval_toolkit.embeddings.make_minilm_embedder`).
|
|
29
|
+
3. Cluster occurrences by single-linkage: two occurrences belong to the
|
|
30
|
+
same cluster iff their cosine similarity is ``>= similarity_threshold``.
|
|
31
|
+
4. A concept_token with **>1 cluster** is a :class:`DriftCluster` — its
|
|
32
|
+
occurrences split into semantically distinct definition groups.
|
|
33
|
+
5. A concept_token with **exactly 1 cluster** is consistent across all
|
|
34
|
+
files.
|
|
35
|
+
|
|
36
|
+
Design (per ADR 0001 flat-module + ADR 0002 closed-config + ADR 0003
|
|
37
|
+
Tier 2 ADDITIVE on the ``[embeddings]`` optional extra surface):
|
|
38
|
+
|
|
39
|
+
- Consumer supplies the concept-token list + explicit file list; validator owns
|
|
40
|
+
parsing + embedding + clustering + report assembly.
|
|
41
|
+
- Embedder is a callable ``Callable[[Sequence[str]], np.ndarray]`` —
|
|
42
|
+
matches the existing :func:`~eval_toolkit.embeddings.make_minilm_embedder`
|
|
43
|
+
factory contract. ``embedder=None`` defers to the canonical MiniLM
|
|
44
|
+
recipe lazily (avoids forcing the ``[embeddings]`` extra import at
|
|
45
|
+
module load time).
|
|
46
|
+
- Flat-module: ``eval_toolkit.audit_sister_doc_concept_drift.*`` (NOT a
|
|
47
|
+
subpackage per ADR 0001 stay-flat-through-v1.x).
|
|
48
|
+
|
|
49
|
+
Closes upstream issue #72. v1.0.4. Completes the audit-validator family
|
|
50
|
+
of 3 (citation_alignment v1.0.1, value_bindings v1.0.3, sister_doc
|
|
51
|
+
concept_drift v1.0.4).
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
from __future__ import annotations
|
|
55
|
+
|
|
56
|
+
import re
|
|
57
|
+
from collections.abc import Callable, Sequence
|
|
58
|
+
from dataclasses import dataclass
|
|
59
|
+
from pathlib import Path
|
|
60
|
+
|
|
61
|
+
import numpy as np
|
|
62
|
+
|
|
63
|
+
__all__ = [
|
|
64
|
+
"DriftCluster",
|
|
65
|
+
"SisterDocDriftReport",
|
|
66
|
+
"validate_sister_doc_concept_drift",
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
DEFAULT_SIMILARITY_THRESHOLD: float = 0.7
|
|
71
|
+
DEFAULT_CONTEXT_WINDOW_SENTENCES: int = 1
|
|
72
|
+
|
|
73
|
+
# Sentence-ish splitter — markdown is not formal prose. Splits on
|
|
74
|
+
# ``.``, ``!``, ``?`` followed by whitespace (newlines included) and then an uppercase letter, digit, or backtick. Imperfect but
|
|
75
|
+
# robust enough for cross-doc concept-drift detection (consumers
|
|
76
|
+
# tolerate boundary slop because clustering is the noise-tolerant
|
|
77
|
+
# downstream step).
|
|
78
|
+
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\d`])")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass(frozen=True)
class DriftCluster:
    """Drift finding for one concept token with competing definitions.

    Emitted by :func:`validate_sister_doc_concept_drift` for a token whose
    embedded context snippets fall into more than one similarity cluster.
    Instances are immutable (frozen dataclass).

    Attributes
    ----------
    token : str
        The drifting concept token (e.g., ``"T1"``, ``"manifest v3"``).
    sentences : tuple[tuple[Path, int, str], ...]
        Every occurrence of the token, across all clusters, as
        ``(file, line, surrounding_text)``. ``line`` is 1-indexed;
        ``surrounding_text`` is the ``context_window_sentences``-sized
        snippet that was handed to the embedder.
    divergence_score : float
        Strength of the drift signal, defined as
        ``1 - min_inter_cluster_similarity`` over the worst-case pair
        between any two clusters. Documented range ``[0.0, 1.0]``:
        ``0.0`` = clusters barely distinguishable, ``1.0`` = orthogonal
        embeddings; larger values indicate stronger drift.
    """

    token: str
    sentences: tuple[tuple[Path, int, str], ...]
    divergence_score: float
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass(frozen=True)
class SisterDocDriftReport:
    """Immutable outcome of :func:`validate_sister_doc_concept_drift`.

    Attributes
    ----------
    drift_clusters : tuple[DriftCluster, ...]
        One entry per concept token whose occurrences separated into
        more than one cluster. An empty tuple means no drift was found
        in the scanned files.
    consistent_tokens : tuple[str, ...]
        Concept tokens that were found and judged consistent: either
        all occurrences landed in a single cluster, or the token
        occurred exactly once. Tokens with no occurrence at all appear
        in neither tuple; they only lower ``coverage``.
    coverage : float
        Share of ``concept_tokens`` with at least one occurrence in the
        scanned files, in ``[0.0, 1.0]``. ``1.0`` = every token was
        referenced somewhere; smaller values point at stale tokens.
    """

    drift_clusters: tuple[DriftCluster, ...]
    consistent_tokens: tuple[str, ...]
    coverage: float
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def validate_sister_doc_concept_drift(
    *,
    files: Sequence[Path | str],
    concept_tokens: Sequence[str],
    embedder: Callable[[Sequence[str]], np.ndarray] | None = None,
    similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
    context_window_sentences: int = DEFAULT_CONTEXT_WINDOW_SENTENCES,
) -> SisterDocDriftReport:
    """Validate cross-doc semantic consistency of concept token definitions.

    For each ``concept_token``, scan ``files`` for occurrences; extract
    the surrounding ``context_window_sentences``; embed each surrounding
    snippet; cluster by single-linkage cosine similarity at
    ``similarity_threshold``. Tokens that produce >1 cluster are flagged
    as drift.

    Parameters
    ----------
    files : Sequence[Path | str]
        Markdown files to scan. UTF-8 encoded. Files that cannot be
        read (``OSError``, e.g. a missing path) are skipped silently.
    concept_tokens : Sequence[str]
        Seed list of concept tokens (e.g., ``["T0", "T1", "T3",
        "manifest v3", "verified_disjoint"]``). Each token is matched
        case-sensitively on whole-word boundaries. Duplicate entries
        are collapsed (first-seen order is kept) so a repeated token is
        reported once and does not distort ``coverage``.
    embedder : Callable[[Sequence[str]], np.ndarray] | None, optional
        Embedder callable returning ``(n, d)`` array. ``None`` (default)
        lazily routes to :func:`eval_toolkit.embeddings.make_minilm_embedder`
        — requires the ``[embeddings]`` optional extra
        (``pip install eval-toolkit[embeddings]``). Custom callables let
        consumers swap in any embedder (BGE, E5, OpenAI, mock for tests).
    similarity_threshold : float, optional
        Cosine-similarity threshold for single-linkage clustering.
        Default ``0.7``. Higher = stricter (more clusters; more drift
        flagged); lower = looser. ``0.7`` is the conservative default
        for ``all-MiniLM-L6-v2`` — semantic-near-paraphrase territory.
    context_window_sentences : int, optional
        Number of neighbouring sentences included on *each side* of the
        sentence that mentions the token when building the "definition"
        snippet passed to the embedder. ``0`` embeds only the mentioning
        sentence; the default ``1`` embeds up to three sentences
        (previous, mentioning, next), clipped at the start/end of the
        file. Longer windows mute token-specific signal with
        surrounding prose.

    Returns
    -------
    SisterDocDriftReport
        ``drift_clusters``, ``consistent_tokens``, ``coverage`` per the
        dataclass. With no concept tokens the report is empty and
        ``coverage`` is ``0.0``.

    Raises
    ------
    ImportError
        If ``embedder=None`` and ``sentence_transformers`` is not
        installed. Install via ``pip install eval-toolkit[embeddings]``.
        The default embedder is resolved before any file is scanned.

    Notes
    -----
    Clustering: single-linkage agglomerative on cosine similarity. Two
    occurrences land in the same cluster iff their similarity is
    ``>= similarity_threshold``. Transitive: ``a~b`` and ``b~c`` →
    ``a, b, c`` in one cluster even if ``cos(a, c) < threshold``. This
    is the canonical SBERT semantic-dedup recipe (see
    :class:`~eval_toolkit.text_dedup.EmbeddingCosineStrategy` for the
    sibling primitive at the inter-text-similarity level).

    Token matching is case-sensitive whole-word — ``"T1"`` matches
    ``"T1"`` but not ``"t1"`` or ``"T10"``. Adjust by passing
    pre-normalized token strings if case-insensitivity is desired.

    See Also
    --------
    eval_toolkit.audit_citation_alignment.validate_citations :
        Sibling validator (catches ADR-citation alignment drift).
    eval_toolkit.audit_value_bindings.validate_reader_value_bindings :
        Sibling validator (catches detector→value binding drift).
    eval_toolkit.embeddings.make_minilm_embedder :
        Default embedder factory.
    """
    files_resolved = tuple(Path(f) for f in files)
    # De-duplicate while keeping first-seen order. Without this, a repeated
    # token is counted twice in the coverage denominator but only once in
    # the hit set, and shows up twice in the report.
    tokens = tuple(dict.fromkeys(concept_tokens))
    if not tokens:
        return SisterDocDriftReport(drift_clusters=(), consistent_tokens=(), coverage=0.0)

    # Resolve embedder lazily — defer the [embeddings] extra import
    # to call time so the module loads even when sentence_transformers
    # isn't installed (matches the EmbeddingCosineStrategy pattern in
    # text_dedup.py).
    if embedder is None:
        embedder = _default_embedder()

    # Pre-load all files (avoid re-reading per token).
    file_texts: dict[Path, str] = {}
    for path in files_resolved:
        try:
            file_texts[path] = path.read_text(encoding="utf-8")
        except OSError:
            # Best-effort scan: an unreadable file contributes no occurrences.
            continue

    drift_clusters: list[DriftCluster] = []
    consistent_tokens: list[str] = []
    tokens_with_hits: set[str] = set()

    for token in tokens:
        occurrences = _collect_occurrences(token, file_texts, context_window_sentences)
        if not occurrences:
            # Unreferenced token: lowers coverage, reported in neither list.
            continue
        tokens_with_hits.add(token)

        # A single occurrence cannot disagree with itself; skip embedding.
        if len(occurrences) == 1:
            consistent_tokens.append(token)
            continue

        # Embed every surrounding snippet (one batch per token).
        snippets = [occ[2] for occ in occurrences]
        embeddings = np.asarray(embedder(snippets), dtype=np.float64)
        clusters = _single_linkage_clusters(embeddings, similarity_threshold)

        if len(clusters) == 1:
            consistent_tokens.append(token)
            continue

        # Compute divergence score from inter-cluster similarity.
        divergence = _divergence_score(embeddings, clusters)
        drift_clusters.append(
            DriftCluster(
                token=token,
                sentences=tuple(occurrences),
                divergence_score=divergence,
            )
        )

    # ``tokens`` is non-empty here (empty input returned early above).
    coverage = len(tokens_with_hits) / len(tokens)
    return SisterDocDriftReport(
        drift_clusters=tuple(drift_clusters),
        consistent_tokens=tuple(consistent_tokens),
        coverage=coverage,
    )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _default_embedder() -> Callable[[Sequence[str]], np.ndarray]:
|
|
268
|
+
"""Lazy MiniLM embedder factory; raises ImportError with install hint."""
|
|
269
|
+
try:
|
|
270
|
+
from eval_toolkit.embeddings import make_minilm_embedder
|
|
271
|
+
except ImportError as exc: # pragma: no cover
|
|
272
|
+
msg = (
|
|
273
|
+
"audit_sister_doc_concept_drift requires the [embeddings] optional "
|
|
274
|
+
"extra (sentence_transformers). Install via "
|
|
275
|
+
"`pip install eval-toolkit[embeddings]` OR pass a custom embedder "
|
|
276
|
+
"callable via the embedder= kwarg."
|
|
277
|
+
)
|
|
278
|
+
raise ImportError(msg) from exc
|
|
279
|
+
return make_minilm_embedder()
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _collect_occurrences(
    token: str, file_texts: dict[Path, str], context_window_sentences: int
) -> list[tuple[Path, int, str]]:
    """Find every whole-token occurrence of ``token`` across files.

    Args:
        token: Literal concept token to look for (regex-escaped here). An
            empty token yields no occurrences.
        file_texts: Mapping of file path to that file's full text.
        context_window_sentences: Number of neighbouring sentences to
            include on each side of the matching sentence. Negative values
            are treated as ``0``.

    Returns:
        A list of ``(file, line, surrounding_text)`` tuples, one per
        sentence that contains the token. ``line`` is the line reported by
        ``_split_sentences`` for the matching sentence and
        ``surrounding_text`` is the space-joined window of sentences
        centred on it (truncated at the start/end of the file).
    """
    if not token:
        # An empty pattern would "match" every sentence; treat as no hits.
        return []

    # ``\b`` only matches between a word and a non-word character, so a
    # token that begins or ends with punctuation (``C++``, ``.NET``) fails
    # to match when it is followed/preceded by whitespace or punctuation.
    # The lookarounds are equivalent to ``\b`` for ordinary word tokens.
    token_re = re.compile(rf"(?<!\w){re.escape(token)}(?!\w)")
    # A negative window would produce an empty snippet for every hit.
    radius = max(0, context_window_sentences)

    occurrences: list[tuple[Path, int, str]] = []
    for path, text in file_texts.items():
        sentences = _split_sentences(text)
        for s_idx, sent in enumerate(sentences):
            if not token_re.search(sent.text):
                continue
            # Slicing clamps the upper bound; only the lower needs a guard.
            window = sentences[max(0, s_idx - radius) : s_idx + radius + 1]
            surrounding = " ".join(span.text for span in window)
            occurrences.append((path, sent.line, surrounding))
    return occurrences
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
@dataclass(frozen=True)
|
|
306
|
+
class _SentenceSpan:
|
|
307
|
+
text: str
|
|
308
|
+
line: int # 1-indexed line of the sentence's start
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _split_sentences(text: str) -> list[_SentenceSpan]:
    """Split markdown text into sentence spans with 1-indexed line numbers.

    Deliberately rough: everything outside a fenced code block is treated
    as prose. Headings (``#``) and bullets (``- `` / ``* ``) become one
    span per line; any other non-blank line is split with
    ``_SENTENCE_SPLIT_RE``. Every span produced from a line carries that
    line's number.

    Fenced code blocks (delimited by lines starting with three backticks)
    are skipped entirely, *including the delimiter lines themselves*, so a
    fence info string such as ``python`` is never reported as prose. An
    unclosed fence hides the remainder of the document.

    Args:
        text: Full document text.

    Returns:
        Spans in document order; blank lines and fenced content yield none.
    """
    spans: list[_SentenceSpan] = []
    in_fence = False
    next_line_no = 1
    for raw_line in text.splitlines(keepends=True):
        line_no = next_line_no
        # ``splitlines`` also breaks on separators such as form feed; only
        # a "\n" terminator (which covers "\r\n") advances the line count.
        if raw_line.endswith("\n"):
            next_line_no += 1

        line_text = raw_line.strip()
        if line_text.startswith("```"):
            # Toggle fence state and drop the delimiter line itself.
            in_fence = not in_fence
            continue
        if in_fence or not line_text:
            continue

        # Headings and bullets stay isolated as a single span.
        if line_text.startswith(("#", "- ", "* ")):
            spans.append(_SentenceSpan(text=line_text, line=line_no))
            continue

        # Otherwise split on sentence-ish delimiters.
        for piece in _SENTENCE_SPLIT_RE.split(line_text):
            piece = piece.strip()
            if piece:
                spans.append(_SentenceSpan(text=piece, line=line_no))
    return spans
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _single_linkage_clusters(embeddings: np.ndarray, threshold: float) -> list[list[int]]:
|
|
370
|
+
"""Single-linkage agglomerative clustering on cosine similarity.
|
|
371
|
+
|
|
372
|
+
Returns list of clusters, each a list of row indices into ``embeddings``.
|
|
373
|
+
Two rows i, j are in the same cluster iff there exists a chain
|
|
374
|
+
i = k_0 ~ k_1 ~ ... ~ k_n = j where each adjacent pair has
|
|
375
|
+
``cosine(k_i, k_{i+1}) >= threshold``.
|
|
376
|
+
"""
|
|
377
|
+
n = embeddings.shape[0]
|
|
378
|
+
if n == 0:
|
|
379
|
+
return []
|
|
380
|
+
# Cosine similarity matrix
|
|
381
|
+
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
|
382
|
+
safe_norms = np.where(norms == 0, 1.0, norms)
|
|
383
|
+
normed = embeddings / safe_norms
|
|
384
|
+
sim = normed @ normed.T
|
|
385
|
+
|
|
386
|
+
# Union-find on edges where sim >= threshold
|
|
387
|
+
parent = list(range(n))
|
|
388
|
+
|
|
389
|
+
def find(x: int) -> int:
|
|
390
|
+
while parent[x] != x:
|
|
391
|
+
parent[x] = parent[parent[x]]
|
|
392
|
+
x = parent[x]
|
|
393
|
+
return x
|
|
394
|
+
|
|
395
|
+
def union(a: int, b: int) -> None:
|
|
396
|
+
ra, rb = find(a), find(b)
|
|
397
|
+
if ra != rb:
|
|
398
|
+
parent[ra] = rb
|
|
399
|
+
|
|
400
|
+
for i in range(n):
|
|
401
|
+
for j in range(i + 1, n):
|
|
402
|
+
if sim[i, j] >= threshold:
|
|
403
|
+
union(i, j)
|
|
404
|
+
|
|
405
|
+
# Group by root
|
|
406
|
+
groups: dict[int, list[int]] = {}
|
|
407
|
+
for i in range(n):
|
|
408
|
+
groups.setdefault(find(i), []).append(i)
|
|
409
|
+
return list(groups.values())
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _divergence_score(embeddings: np.ndarray, clusters: list[list[int]]) -> float:
|
|
413
|
+
"""``1 - min_inter_cluster_similarity`` across all cluster pairs.
|
|
414
|
+
|
|
415
|
+
Higher = stronger drift. ``0.0`` means clusters are barely separated;
|
|
416
|
+
``1.0`` means orthogonal embeddings.
|
|
417
|
+
"""
|
|
418
|
+
if len(clusters) < 2:
|
|
419
|
+
return 0.0
|
|
420
|
+
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
|
421
|
+
safe_norms = np.where(norms == 0, 1.0, norms)
|
|
422
|
+
normed = embeddings / safe_norms
|
|
423
|
+
sim = normed @ normed.T
|
|
424
|
+
min_sim = 1.0
|
|
425
|
+
for a_idx in range(len(clusters)):
|
|
426
|
+
for b_idx in range(a_idx + 1, len(clusters)):
|
|
427
|
+
a, b = clusters[a_idx], clusters[b_idx]
|
|
428
|
+
# Min similarity between any pair across the two clusters
|
|
429
|
+
sub = sim[np.ix_(a, b)]
|
|
430
|
+
pair_min = float(sub.min())
|
|
431
|
+
min_sim = min(min_sim, pair_min)
|
|
432
|
+
return 1.0 - max(0.0, min_sim)
|
|
@@ -41,6 +41,7 @@
|
|
|
41
41
|
"DedupReport",
|
|
42
42
|
"DelimitVariant",
|
|
43
43
|
"DiacriticInjection",
|
|
44
|
+
"DriftCluster",
|
|
44
45
|
"EmbeddingCosineStrategy",
|
|
45
46
|
"EncodeVariant",
|
|
46
47
|
"EvalSlice",
|
|
@@ -106,6 +107,7 @@
|
|
|
106
107
|
"SimilarityAuditReport",
|
|
107
108
|
"SimilarityStrategy",
|
|
108
109
|
"SingleSliceLoader",
|
|
110
|
+
"SisterDocDriftReport",
|
|
109
111
|
"SliceAwareScorer",
|
|
110
112
|
"SourceDisjointKFoldSplitter",
|
|
111
113
|
"SourceRoleRecord",
|
|
@@ -247,6 +249,7 @@
|
|
|
247
249
|
"validate_prediction_artifact_ref",
|
|
248
250
|
"validate_reader_value_bindings",
|
|
249
251
|
"validate_results",
|
|
252
|
+
"validate_sister_doc_concept_drift",
|
|
250
253
|
"validate_source_roles",
|
|
251
254
|
"walk_path",
|
|
252
255
|
"wilson_interval",
|
|
@@ -575,6 +578,14 @@
|
|
|
575
578
|
"kind": "class",
|
|
576
579
|
"signature": "(ratio: 'float' = 0.3, seed: 'int' = 42, name: 'str' = 'diacritic') -> None"
|
|
577
580
|
},
|
|
581
|
+
"DriftCluster": {
|
|
582
|
+
"bases": [
|
|
583
|
+
"object"
|
|
584
|
+
],
|
|
585
|
+
"doc_first_line": "A concept token whose occurrences split into >1 semantic cluster.",
|
|
586
|
+
"kind": "class",
|
|
587
|
+
"signature": "(token: 'str', sentences: 'tuple[tuple[Path, int, str], ...]', divergence_score: 'float') -> None"
|
|
588
|
+
},
|
|
578
589
|
"EmbeddingCosineStrategy": {
|
|
579
590
|
"bases": [
|
|
580
591
|
"object"
|
|
@@ -1152,6 +1163,14 @@
|
|
|
1152
1163
|
"kind": "class",
|
|
1153
1164
|
"signature": "(slice_: 'EvalSlice', name: 'str' = '', description: 'str' = '') -> None"
|
|
1154
1165
|
},
|
|
1166
|
+
"SisterDocDriftReport": {
|
|
1167
|
+
"bases": [
|
|
1168
|
+
"object"
|
|
1169
|
+
],
|
|
1170
|
+
"doc_first_line": "Result of :func:`validate_sister_doc_concept_drift`.",
|
|
1171
|
+
"kind": "class",
|
|
1172
|
+
"signature": "(drift_clusters: 'tuple[DriftCluster, ...]', consistent_tokens: 'tuple[str, ...]', coverage: 'float') -> None"
|
|
1173
|
+
},
|
|
1155
1174
|
"SliceAwareScorer": {
|
|
1156
1175
|
"bases": [
|
|
1157
1176
|
"Scorer",
|
|
@@ -1401,7 +1420,7 @@
|
|
|
1401
1420
|
"doc_first_line": "str(object='') -> str",
|
|
1402
1421
|
"kind": "value",
|
|
1403
1422
|
"type": "str",
|
|
1404
|
-
"value": "'1.0.
|
|
1423
|
+
"value": "'1.0.4'"
|
|
1405
1424
|
},
|
|
1406
1425
|
"apply_operating_points": {
|
|
1407
1426
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1958,6 +1977,11 @@
|
|
|
1958
1977
|
"kind": "function",
|
|
1959
1978
|
"signature": "(payload: 'Mapping[str, object]') -> 'None'"
|
|
1960
1979
|
},
|
|
1980
|
+
"validate_sister_doc_concept_drift": {
|
|
1981
|
+
"doc_first_line": "Validate cross-doc semantic consistency of concept token definitions.",
|
|
1982
|
+
"kind": "function",
|
|
1983
|
+
"signature": "(*, files: 'Sequence[Path | str]', concept_tokens: 'Sequence[str]', embedder: 'Callable[[Sequence[str]], np.ndarray] | None' = None, similarity_threshold: 'float' = 0.7, context_window_sentences: 'int' = 1) -> 'SisterDocDriftReport'"
|
|
1984
|
+
},
|
|
1961
1985
|
"validate_source_roles": {
|
|
1962
1986
|
"doc_first_line": "Return validation errors for generic source-role records.",
|
|
1963
1987
|
"kind": "function",
|