eval-toolkit 0.27.1__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. eval_toolkit-0.27.1/.gitignore +42 -0
  2. eval_toolkit-0.27.1/CHANGELOG.md +2001 -0
  3. eval_toolkit-0.27.1/LICENSE +21 -0
  4. eval_toolkit-0.27.1/PKG-INFO +314 -0
  5. eval_toolkit-0.27.1/README.md +251 -0
  6. eval_toolkit-0.27.1/STYLE.md +265 -0
  7. eval_toolkit-0.27.1/docs/methodology/README.md +90 -0
  8. eval_toolkit-0.27.1/docs/research/README.md +107 -0
  9. eval_toolkit-0.27.1/docs/research/datasets/README.md +83 -0
  10. eval_toolkit-0.27.1/docs/research/papers/data-integrity/README.md +108 -0
  11. eval_toolkit-0.27.1/docs/research/papers/eval-ecosystem/README.md +84 -0
  12. eval_toolkit-0.27.1/docs/research/papers/inference/README.md +128 -0
  13. eval_toolkit-0.27.1/docs/research/papers/prompt-injection/README.md +90 -0
  14. eval_toolkit-0.27.1/pyproject.toml +166 -0
  15. eval_toolkit-0.27.1/src/eval_toolkit/__init__.py +238 -0
  16. eval_toolkit-0.27.1/src/eval_toolkit/__main__.py +156 -0
  17. eval_toolkit-0.27.1/src/eval_toolkit/_version.py +5 -0
  18. eval_toolkit-0.27.1/src/eval_toolkit/analysis.py +196 -0
  19. eval_toolkit-0.27.1/src/eval_toolkit/artifacts.py +376 -0
  20. eval_toolkit-0.27.1/src/eval_toolkit/bootstrap.py +1344 -0
  21. eval_toolkit-0.27.1/src/eval_toolkit/calibration.py +1143 -0
  22. eval_toolkit-0.27.1/src/eval_toolkit/claims.py +670 -0
  23. eval_toolkit-0.27.1/src/eval_toolkit/config.py +112 -0
  24. eval_toolkit-0.27.1/src/eval_toolkit/docs.py +305 -0
  25. eval_toolkit-0.27.1/src/eval_toolkit/evidence.py +90 -0
  26. eval_toolkit-0.27.1/src/eval_toolkit/harness.py +1193 -0
  27. eval_toolkit-0.27.1/src/eval_toolkit/leakage.py +1052 -0
  28. eval_toolkit-0.27.1/src/eval_toolkit/loaders.py +424 -0
  29. eval_toolkit-0.27.1/src/eval_toolkit/manifest.py +622 -0
  30. eval_toolkit-0.27.1/src/eval_toolkit/metrics.py +1720 -0
  31. eval_toolkit-0.27.1/src/eval_toolkit/operating_points.py +192 -0
  32. eval_toolkit-0.27.1/src/eval_toolkit/paths.py +125 -0
  33. eval_toolkit-0.27.1/src/eval_toolkit/plotting.py +991 -0
  34. eval_toolkit-0.27.1/src/eval_toolkit/protocols.py +98 -0
  35. eval_toolkit-0.27.1/src/eval_toolkit/provenance.py +255 -0
  36. eval_toolkit-0.27.1/src/eval_toolkit/py.typed +0 -0
  37. eval_toolkit-0.27.1/src/eval_toolkit/schemas/manifest.v1.json +155 -0
  38. eval_toolkit-0.27.1/src/eval_toolkit/schemas/manifest.v2.json +186 -0
  39. eval_toolkit-0.27.1/src/eval_toolkit/schemas/manifest.v3.json +186 -0
  40. eval_toolkit-0.27.1/src/eval_toolkit/schemas/results.v1.json +87 -0
  41. eval_toolkit-0.27.1/src/eval_toolkit/schemas/results_full.v1.json +83 -0
  42. eval_toolkit-0.27.1/src/eval_toolkit/seeds.py +119 -0
  43. eval_toolkit-0.27.1/src/eval_toolkit/splits.py +520 -0
  44. eval_toolkit-0.27.1/src/eval_toolkit/text_dedup.py +1403 -0
  45. eval_toolkit-0.27.1/src/eval_toolkit/thresholds.py +819 -0
  46. eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  47. eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  48. eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  49. eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  50. eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  51. eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  52. eval_toolkit-0.27.1/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  53. eval_toolkit-0.27.1/tests/conftest.py +106 -0
  54. eval_toolkit-0.27.1/tests/golden/docs/expected.md +22 -0
  55. eval_toolkit-0.27.1/tests/golden/docs/input.md +22 -0
  56. eval_toolkit-0.27.1/tests/golden/docs/metrics.json +17 -0
  57. eval_toolkit-0.27.1/tests/strategies.py +59 -0
  58. eval_toolkit-0.27.1/tests/test_analysis.py +255 -0
  59. eval_toolkit-0.27.1/tests/test_artifacts.py +444 -0
  60. eval_toolkit-0.27.1/tests/test_bootstrap_edge_cases.py +128 -0
  61. eval_toolkit-0.27.1/tests/test_bootstrap_props.py +153 -0
  62. eval_toolkit-0.27.1/tests/test_bootstrap_research_grounded.py +287 -0
  63. eval_toolkit-0.27.1/tests/test_bootstrap_unit.py +482 -0
  64. eval_toolkit-0.27.1/tests/test_calibration_bootstrap_chain.py +165 -0
  65. eval_toolkit-0.27.1/tests/test_calibration_optimization_failures.py +105 -0
  66. eval_toolkit-0.27.1/tests/test_calibration_props.py +167 -0
  67. eval_toolkit-0.27.1/tests/test_calibration_research_grounded.py +471 -0
  68. eval_toolkit-0.27.1/tests/test_calibration_unit.py +363 -0
  69. eval_toolkit-0.27.1/tests/test_claims.py +197 -0
  70. eval_toolkit-0.27.1/tests/test_claims_coverage.py +477 -0
  71. eval_toolkit-0.27.1/tests/test_claims_props.py +214 -0
  72. eval_toolkit-0.27.1/tests/test_cli.py +312 -0
  73. eval_toolkit-0.27.1/tests/test_config.py +105 -0
  74. eval_toolkit-0.27.1/tests/test_coverage_gap.py +1252 -0
  75. eval_toolkit-0.27.1/tests/test_dedup_split_leakage_chain.py +149 -0
  76. eval_toolkit-0.27.1/tests/test_docs_golden.py +142 -0
  77. eval_toolkit-0.27.1/tests/test_docs_props.py +167 -0
  78. eval_toolkit-0.27.1/tests/test_evidence_validators.py +69 -0
  79. eval_toolkit-0.27.1/tests/test_harness_edge_cases.py +262 -0
  80. eval_toolkit-0.27.1/tests/test_harness_internals.py +187 -0
  81. eval_toolkit-0.27.1/tests/test_harness_smoke.py +187 -0
  82. eval_toolkit-0.27.1/tests/test_harness_v07.py +231 -0
  83. eval_toolkit-0.27.1/tests/test_harness_v22.py +250 -0
  84. eval_toolkit-0.27.1/tests/test_import_boundaries.py +72 -0
  85. eval_toolkit-0.27.1/tests/test_leakage.py +510 -0
  86. eval_toolkit-0.27.1/tests/test_leakage_error_paths.py +161 -0
  87. eval_toolkit-0.27.1/tests/test_leakage_props.py +209 -0
  88. eval_toolkit-0.27.1/tests/test_loaders.py +155 -0
  89. eval_toolkit-0.27.1/tests/test_loaders_coverage.py +170 -0
  90. eval_toolkit-0.27.1/tests/test_loaders_props.py +123 -0
  91. eval_toolkit-0.27.1/tests/test_manifest.py +259 -0
  92. eval_toolkit-0.27.1/tests/test_manifest_contamination_round_trip.py +108 -0
  93. eval_toolkit-0.27.1/tests/test_manifest_props.py +177 -0
  94. eval_toolkit-0.27.1/tests/test_manifest_validation.py +259 -0
  95. eval_toolkit-0.27.1/tests/test_metrics_props.py +164 -0
  96. eval_toolkit-0.27.1/tests/test_metrics_stratified_subsets.py +129 -0
  97. eval_toolkit-0.27.1/tests/test_metrics_unit.py +407 -0
  98. eval_toolkit-0.27.1/tests/test_misc_coverage.py +187 -0
  99. eval_toolkit-0.27.1/tests/test_numeric_edge_cases.py +176 -0
  100. eval_toolkit-0.27.1/tests/test_operating_points.py +136 -0
  101. eval_toolkit-0.27.1/tests/test_operating_points_props.py +212 -0
  102. eval_toolkit-0.27.1/tests/test_paths.py +87 -0
  103. eval_toolkit-0.27.1/tests/test_plotting_edge.py +322 -0
  104. eval_toolkit-0.27.1/tests/test_plotting_smoke.py +232 -0
  105. eval_toolkit-0.27.1/tests/test_plotting_visual.py +141 -0
  106. eval_toolkit-0.27.1/tests/test_protocol_conformance.py +499 -0
  107. eval_toolkit-0.27.1/tests/test_provenance.py +175 -0
  108. eval_toolkit-0.27.1/tests/test_reference_equivalence.py +256 -0
  109. eval_toolkit-0.27.1/tests/test_reproducibility_integration.py +201 -0
  110. eval_toolkit-0.27.1/tests/test_schemas.py +209 -0
  111. eval_toolkit-0.27.1/tests/test_seeds.py +167 -0
  112. eval_toolkit-0.27.1/tests/test_splits.py +162 -0
  113. eval_toolkit-0.27.1/tests/test_splits_leakage_integration.py +164 -0
  114. eval_toolkit-0.27.1/tests/test_splits_props.py +196 -0
  115. eval_toolkit-0.27.1/tests/test_text_dedup.py +170 -0
  116. eval_toolkit-0.27.1/tests/test_text_dedup_coverage.py +356 -0
  117. eval_toolkit-0.27.1/tests/test_text_dedup_props.py +186 -0
  118. eval_toolkit-0.27.1/tests/test_text_dedup_strategies.py +498 -0
  119. eval_toolkit-0.27.1/tests/test_thresholds.py +155 -0
  120. eval_toolkit-0.27.1/tests/test_thresholds_constant_score.py +127 -0
  121. eval_toolkit-0.27.1/tests/test_thresholds_coverage.py +260 -0
  122. eval_toolkit-0.27.1/tests/test_thresholds_props.py +219 -0
  123. eval_toolkit-0.27.1/tests/test_thresholds_research_grounded.py +183 -0
  124. eval_toolkit-0.27.1/tests/test_v09_contracts.py +212 -0
@@ -0,0 +1,42 @@
1
+ # Virtual environments
2
+ .venv/
3
+ venv/
4
+ env/
5
+
6
+ # Python bytecode
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ build/
14
+ dist/
15
+ *.egg-info/
16
+ *.egg
17
+ wheels/
18
+
19
+ # Testing / coverage
20
+ .pytest_cache/
21
+ .coverage
22
+ .coverage.*
23
+ htmlcov/
24
+ coverage.xml
25
+ .hypothesis/
26
+
27
+ # Type-checker / linter caches
28
+ .mypy_cache/
29
+ .ruff_cache/
30
+
31
+ # Editors
32
+ .vscode/
33
+ .idea/
34
+ *.swp
35
+ *.swo
36
+ .DS_Store
37
+
38
+ # Logs
39
+ *.log
40
+
41
+ # Claude Code project settings (machine-local)
42
+ .claude/