eval-toolkit 0.27.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_toolkit-0.27.1/.gitignore +42 -0
- eval_toolkit-0.27.1/CHANGELOG.md +2001 -0
- eval_toolkit-0.27.1/LICENSE +21 -0
- eval_toolkit-0.27.1/PKG-INFO +314 -0
- eval_toolkit-0.27.1/README.md +251 -0
- eval_toolkit-0.27.1/STYLE.md +265 -0
- eval_toolkit-0.27.1/docs/methodology/README.md +90 -0
- eval_toolkit-0.27.1/docs/research/README.md +107 -0
- eval_toolkit-0.27.1/docs/research/datasets/README.md +83 -0
- eval_toolkit-0.27.1/docs/research/papers/data-integrity/README.md +108 -0
- eval_toolkit-0.27.1/docs/research/papers/eval-ecosystem/README.md +84 -0
- eval_toolkit-0.27.1/docs/research/papers/inference/README.md +128 -0
- eval_toolkit-0.27.1/docs/research/papers/prompt-injection/README.md +90 -0
- eval_toolkit-0.27.1/pyproject.toml +166 -0
- eval_toolkit-0.27.1/src/eval_toolkit/__init__.py +238 -0
- eval_toolkit-0.27.1/src/eval_toolkit/__main__.py +156 -0
- eval_toolkit-0.27.1/src/eval_toolkit/_version.py +5 -0
- eval_toolkit-0.27.1/src/eval_toolkit/analysis.py +196 -0
- eval_toolkit-0.27.1/src/eval_toolkit/artifacts.py +376 -0
- eval_toolkit-0.27.1/src/eval_toolkit/bootstrap.py +1344 -0
- eval_toolkit-0.27.1/src/eval_toolkit/calibration.py +1143 -0
- eval_toolkit-0.27.1/src/eval_toolkit/claims.py +670 -0
- eval_toolkit-0.27.1/src/eval_toolkit/config.py +112 -0
- eval_toolkit-0.27.1/src/eval_toolkit/docs.py +305 -0
- eval_toolkit-0.27.1/src/eval_toolkit/evidence.py +90 -0
- eval_toolkit-0.27.1/src/eval_toolkit/harness.py +1193 -0
- eval_toolkit-0.27.1/src/eval_toolkit/leakage.py +1052 -0
- eval_toolkit-0.27.1/src/eval_toolkit/loaders.py +424 -0
- eval_toolkit-0.27.1/src/eval_toolkit/manifest.py +622 -0
- eval_toolkit-0.27.1/src/eval_toolkit/metrics.py +1720 -0
- eval_toolkit-0.27.1/src/eval_toolkit/operating_points.py +192 -0
- eval_toolkit-0.27.1/src/eval_toolkit/paths.py +125 -0
- eval_toolkit-0.27.1/src/eval_toolkit/plotting.py +991 -0
- eval_toolkit-0.27.1/src/eval_toolkit/protocols.py +98 -0
- eval_toolkit-0.27.1/src/eval_toolkit/provenance.py +255 -0
- eval_toolkit-0.27.1/src/eval_toolkit/py.typed +0 -0
- eval_toolkit-0.27.1/src/eval_toolkit/schemas/manifest.v1.json +155 -0
- eval_toolkit-0.27.1/src/eval_toolkit/schemas/manifest.v2.json +186 -0
- eval_toolkit-0.27.1/src/eval_toolkit/schemas/manifest.v3.json +186 -0
- eval_toolkit-0.27.1/src/eval_toolkit/schemas/results.v1.json +87 -0
- eval_toolkit-0.27.1/src/eval_toolkit/schemas/results_full.v1.json +83 -0
- eval_toolkit-0.27.1/src/eval_toolkit/seeds.py +119 -0
- eval_toolkit-0.27.1/src/eval_toolkit/splits.py +520 -0
- eval_toolkit-0.27.1/src/eval_toolkit/text_dedup.py +1403 -0
- eval_toolkit-0.27.1/src/eval_toolkit/thresholds.py +819 -0
- eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- eval_toolkit-0.27.1/tests/conftest.py +106 -0
- eval_toolkit-0.27.1/tests/golden/docs/expected.md +22 -0
- eval_toolkit-0.27.1/tests/golden/docs/input.md +22 -0
- eval_toolkit-0.27.1/tests/golden/docs/metrics.json +17 -0
- eval_toolkit-0.27.1/tests/strategies.py +59 -0
- eval_toolkit-0.27.1/tests/test_analysis.py +255 -0
- eval_toolkit-0.27.1/tests/test_artifacts.py +444 -0
- eval_toolkit-0.27.1/tests/test_bootstrap_edge_cases.py +128 -0
- eval_toolkit-0.27.1/tests/test_bootstrap_props.py +153 -0
- eval_toolkit-0.27.1/tests/test_bootstrap_research_grounded.py +287 -0
- eval_toolkit-0.27.1/tests/test_bootstrap_unit.py +482 -0
- eval_toolkit-0.27.1/tests/test_calibration_bootstrap_chain.py +165 -0
- eval_toolkit-0.27.1/tests/test_calibration_optimization_failures.py +105 -0
- eval_toolkit-0.27.1/tests/test_calibration_props.py +167 -0
- eval_toolkit-0.27.1/tests/test_calibration_research_grounded.py +471 -0
- eval_toolkit-0.27.1/tests/test_calibration_unit.py +363 -0
- eval_toolkit-0.27.1/tests/test_claims.py +197 -0
- eval_toolkit-0.27.1/tests/test_claims_coverage.py +477 -0
- eval_toolkit-0.27.1/tests/test_claims_props.py +214 -0
- eval_toolkit-0.27.1/tests/test_cli.py +312 -0
- eval_toolkit-0.27.1/tests/test_config.py +105 -0
- eval_toolkit-0.27.1/tests/test_coverage_gap.py +1252 -0
- eval_toolkit-0.27.1/tests/test_dedup_split_leakage_chain.py +149 -0
- eval_toolkit-0.27.1/tests/test_docs_golden.py +142 -0
- eval_toolkit-0.27.1/tests/test_docs_props.py +167 -0
- eval_toolkit-0.27.1/tests/test_evidence_validators.py +69 -0
- eval_toolkit-0.27.1/tests/test_harness_edge_cases.py +262 -0
- eval_toolkit-0.27.1/tests/test_harness_internals.py +187 -0
- eval_toolkit-0.27.1/tests/test_harness_smoke.py +187 -0
- eval_toolkit-0.27.1/tests/test_harness_v07.py +231 -0
- eval_toolkit-0.27.1/tests/test_harness_v22.py +250 -0
- eval_toolkit-0.27.1/tests/test_import_boundaries.py +72 -0
- eval_toolkit-0.27.1/tests/test_leakage.py +510 -0
- eval_toolkit-0.27.1/tests/test_leakage_error_paths.py +161 -0
- eval_toolkit-0.27.1/tests/test_leakage_props.py +209 -0
- eval_toolkit-0.27.1/tests/test_loaders.py +155 -0
- eval_toolkit-0.27.1/tests/test_loaders_coverage.py +170 -0
- eval_toolkit-0.27.1/tests/test_loaders_props.py +123 -0
- eval_toolkit-0.27.1/tests/test_manifest.py +259 -0
- eval_toolkit-0.27.1/tests/test_manifest_contamination_round_trip.py +108 -0
- eval_toolkit-0.27.1/tests/test_manifest_props.py +177 -0
- eval_toolkit-0.27.1/tests/test_manifest_validation.py +259 -0
- eval_toolkit-0.27.1/tests/test_metrics_props.py +164 -0
- eval_toolkit-0.27.1/tests/test_metrics_stratified_subsets.py +129 -0
- eval_toolkit-0.27.1/tests/test_metrics_unit.py +407 -0
- eval_toolkit-0.27.1/tests/test_misc_coverage.py +187 -0
- eval_toolkit-0.27.1/tests/test_numeric_edge_cases.py +176 -0
- eval_toolkit-0.27.1/tests/test_operating_points.py +136 -0
- eval_toolkit-0.27.1/tests/test_operating_points_props.py +212 -0
- eval_toolkit-0.27.1/tests/test_paths.py +87 -0
- eval_toolkit-0.27.1/tests/test_plotting_edge.py +322 -0
- eval_toolkit-0.27.1/tests/test_plotting_smoke.py +232 -0
- eval_toolkit-0.27.1/tests/test_plotting_visual.py +141 -0
- eval_toolkit-0.27.1/tests/test_protocol_conformance.py +499 -0
- eval_toolkit-0.27.1/tests/test_provenance.py +175 -0
- eval_toolkit-0.27.1/tests/test_reference_equivalence.py +256 -0
- eval_toolkit-0.27.1/tests/test_reproducibility_integration.py +201 -0
- eval_toolkit-0.27.1/tests/test_schemas.py +209 -0
- eval_toolkit-0.27.1/tests/test_seeds.py +167 -0
- eval_toolkit-0.27.1/tests/test_splits.py +162 -0
- eval_toolkit-0.27.1/tests/test_splits_leakage_integration.py +164 -0
- eval_toolkit-0.27.1/tests/test_splits_props.py +196 -0
- eval_toolkit-0.27.1/tests/test_text_dedup.py +170 -0
- eval_toolkit-0.27.1/tests/test_text_dedup_coverage.py +356 -0
- eval_toolkit-0.27.1/tests/test_text_dedup_props.py +186 -0
- eval_toolkit-0.27.1/tests/test_text_dedup_strategies.py +498 -0
- eval_toolkit-0.27.1/tests/test_thresholds.py +155 -0
- eval_toolkit-0.27.1/tests/test_thresholds_constant_score.py +127 -0
- eval_toolkit-0.27.1/tests/test_thresholds_coverage.py +260 -0
- eval_toolkit-0.27.1/tests/test_thresholds_props.py +219 -0
- eval_toolkit-0.27.1/tests/test_thresholds_research_grounded.py +183 -0
- eval_toolkit-0.27.1/tests/test_v09_contracts.py +212 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Virtual environments
|
|
2
|
+
.venv/
|
|
3
|
+
venv/
|
|
4
|
+
env/
|
|
5
|
+
|
|
6
|
+
# Python bytecode
|
|
7
|
+
__pycache__/
|
|
8
|
+
*.py[cod]
|
|
9
|
+
*$py.class
|
|
10
|
+
*.so
|
|
11
|
+
|
|
12
|
+
# Distribution / packaging
|
|
13
|
+
build/
|
|
14
|
+
dist/
|
|
15
|
+
*.egg-info/
|
|
16
|
+
*.egg
|
|
17
|
+
wheels/
|
|
18
|
+
|
|
19
|
+
# Testing / coverage
|
|
20
|
+
.pytest_cache/
|
|
21
|
+
.coverage
|
|
22
|
+
.coverage.*
|
|
23
|
+
htmlcov/
|
|
24
|
+
coverage.xml
|
|
25
|
+
.hypothesis/
|
|
26
|
+
|
|
27
|
+
# Type-checker / linter caches
|
|
28
|
+
.mypy_cache/
|
|
29
|
+
.ruff_cache/
|
|
30
|
+
|
|
31
|
+
# Editors
|
|
32
|
+
.vscode/
|
|
33
|
+
.idea/
|
|
34
|
+
*.swp
|
|
35
|
+
*.swo
|
|
36
|
+
.DS_Store
|
|
37
|
+
|
|
38
|
+
# Logs
|
|
39
|
+
*.log
|
|
40
|
+
|
|
41
|
+
# Claude Code project settings (machine-local)
|
|
42
|
+
.claude/
|