driftless 0.2.1__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. {driftless-0.2.1 → driftless-0.2.4}/CHANGELOG.md +45 -2
  2. {driftless-0.2.1 → driftless-0.2.4}/PKG-INFO +7 -2
  3. {driftless-0.2.1 → driftless-0.2.4}/README.md +6 -1
  4. {driftless-0.2.1 → driftless-0.2.4}/docs/RELEASE.md +44 -4
  5. {driftless-0.2.1 → driftless-0.2.4}/site/docs.html +5 -1
  6. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/__init__.py +1 -1
  7. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/cli.py +161 -0
  8. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/compare.py +6 -0
  9. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/contract.py +12 -0
  10. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/engine.py +39 -0
  11. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/judges.py +52 -0
  12. driftless-0.2.4/src/driftless/label_audit.py +290 -0
  13. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/report.py +29 -0
  14. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/templates.py +2 -0
  15. driftless-0.2.4/tests/fixtures/live_eval_baseline.json +38 -0
  16. driftless-0.2.4/tests/regression_metrics.py +141 -0
  17. driftless-0.2.4/tests/scenarios.py +880 -0
  18. driftless-0.2.4/tests/test_cli.py +268 -0
  19. {driftless-0.2.1 → driftless-0.2.4}/tests/test_github.py +65 -0
  20. driftless-0.2.4/tests/test_grading_loop.py +40 -0
  21. {driftless-0.2.1 → driftless-0.2.4}/tests/test_judge.py +63 -0
  22. driftless-0.2.4/tests/test_judge_loop.py +69 -0
  23. driftless-0.2.4/tests/test_label_audit.py +183 -0
  24. driftless-0.2.4/tests/test_migration_live.py +131 -0
  25. {driftless-0.2.1 → driftless-0.2.4}/tests/test_migration_regression.py +114 -0
  26. driftless-0.2.4/tests/test_regression_metrics.py +85 -0
  27. {driftless-0.2.1 → driftless-0.2.4}/tests/test_report.py +23 -0
  28. driftless-0.2.1/tests/scenarios.py +0 -387
  29. driftless-0.2.1/tests/test_cli.py +0 -72
  30. driftless-0.2.1/tests/test_grading_loop.py +0 -103
  31. driftless-0.2.1/tests/test_judge_loop.py +0 -93
  32. driftless-0.2.1/tests/test_migration_live.py +0 -40
  33. {driftless-0.2.1 → driftless-0.2.4}/.gitignore +0 -0
  34. {driftless-0.2.1 → driftless-0.2.4}/LICENSE +0 -0
  35. {driftless-0.2.1 → driftless-0.2.4}/docs/repair-and-generators.md +0 -0
  36. {driftless-0.2.1 → driftless-0.2.4}/pyproject.toml +0 -0
  37. {driftless-0.2.1 → driftless-0.2.4}/site/assets/app.js +0 -0
  38. {driftless-0.2.1 → driftless-0.2.4}/site/assets/hero-workflow.png +0 -0
  39. {driftless-0.2.1 → driftless-0.2.4}/site/assets/landing.css +0 -0
  40. {driftless-0.2.1 → driftless-0.2.4}/site/assets/runs.css +0 -0
  41. {driftless-0.2.1 → driftless-0.2.4}/site/assets/runs.js +0 -0
  42. {driftless-0.2.1 → driftless-0.2.4}/site/assets/sample-run.json +0 -0
  43. {driftless-0.2.1 → driftless-0.2.4}/site/assets/styles.css +0 -0
  44. {driftless-0.2.1 → driftless-0.2.4}/site/index.html +0 -0
  45. {driftless-0.2.1 → driftless-0.2.4}/site/runs.html +0 -0
  46. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/calibrate.py +0 -0
  47. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/configure.py +0 -0
  48. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/data/model_lifecycle.json +0 -0
  49. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/datasource.py +0 -0
  50. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/datastate.py +0 -0
  51. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/discovery.py +0 -0
  52. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/errors.py +0 -0
  53. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/evaluation.py +0 -0
  54. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/generators.py +0 -0
  55. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/github.py +0 -0
  56. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/harness.py +0 -0
  57. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/init_ci.py +0 -0
  58. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/lifecycle.py +0 -0
  59. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/policy.py +0 -0
  60. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/preflight.py +0 -0
  61. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/progress.py +0 -0
  62. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/scanner.py +0 -0
  63. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/splits.py +0 -0
  64. {driftless-0.2.1 → driftless-0.2.4}/src/driftless/view.py +0 -0
  65. {driftless-0.2.1 → driftless-0.2.4}/tests/fixtures/smoke/driftless.yml +0 -0
  66. {driftless-0.2.1 → driftless-0.2.4}/tests/fixtures/smoke/inputs.jsonl +0 -0
  67. {driftless-0.2.1 → driftless-0.2.4}/tests/fixtures/smoke/labels.jsonl +0 -0
  68. {driftless-0.2.1 → driftless-0.2.4}/tests/test_compare.py +0 -0
  69. {driftless-0.2.1 → driftless-0.2.4}/tests/test_contract.py +0 -0
  70. {driftless-0.2.1 → driftless-0.2.4}/tests/test_data_change_gate.py +0 -0
  71. {driftless-0.2.1 → driftless-0.2.4}/tests/test_data_change_regression.py +0 -0
  72. {driftless-0.2.1 → driftless-0.2.4}/tests/test_datasource.py +0 -0
  73. {driftless-0.2.1 → driftless-0.2.4}/tests/test_datastate.py +0 -0
  74. {driftless-0.2.1 → driftless-0.2.4}/tests/test_discovery.py +0 -0
  75. {driftless-0.2.1 → driftless-0.2.4}/tests/test_endpoint.py +0 -0
  76. {driftless-0.2.1 → driftless-0.2.4}/tests/test_engine.py +0 -0
  77. {driftless-0.2.1 → driftless-0.2.4}/tests/test_evaluation.py +0 -0
  78. {driftless-0.2.1 → driftless-0.2.4}/tests/test_extraction.py +0 -0
  79. {driftless-0.2.1 → driftless-0.2.4}/tests/test_generators.py +0 -0
  80. {driftless-0.2.1 → driftless-0.2.4}/tests/test_harness.py +0 -0
  81. {driftless-0.2.1 → driftless-0.2.4}/tests/test_init_ci.py +0 -0
  82. {driftless-0.2.1 → driftless-0.2.4}/tests/test_lifecycle.py +0 -0
  83. {driftless-0.2.1 → driftless-0.2.4}/tests/test_plan_act.py +0 -0
  84. {driftless-0.2.1 → driftless-0.2.4}/tests/test_policy.py +0 -0
  85. {driftless-0.2.1 → driftless-0.2.4}/tests/test_poll_act.py +0 -0
  86. {driftless-0.2.1 → driftless-0.2.4}/tests/test_preflight.py +0 -0
  87. {driftless-0.2.1 → driftless-0.2.4}/tests/test_progress.py +0 -0
  88. {driftless-0.2.1 → driftless-0.2.4}/tests/test_refine.py +0 -0
  89. {driftless-0.2.1 → driftless-0.2.4}/tests/test_refresh_catalog.py +0 -0
  90. {driftless-0.2.1 → driftless-0.2.4}/tests/test_repair_prompt.py +0 -0
  91. {driftless-0.2.1 → driftless-0.2.4}/tests/test_scanner.py +0 -0
  92. {driftless-0.2.1 → driftless-0.2.4}/tests/test_view.py +0 -0
@@ -17,6 +17,46 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
17
17
 
18
18
  ---
19
19
 
20
+ ## [0.2.4] - 2026-07-01
21
+
22
+ ### Fixed
23
+
24
+ - **`judge-check` gate output under CI** — emit gate status via plain stdout so Rich
25
+ TTY highlighting (when `GITHUB_ACTIONS=true`) does not break publish workflow tests.
26
+
27
+ ---
28
+
29
+ ## [0.2.3] - 2026-07-01
30
+
31
+ ### Fixed
32
+
33
+ - **`judge-check` gate output** — print gate status with Rich markup disabled so
34
+ publish CI can assert on `max_mae` / `min_correlation` lines reliably.
35
+
36
+ ---
37
+
38
+ ## [0.2.2] - 2026-07-01
39
+
40
+ ### Added
41
+
42
+ - **`driftless judge-check`** — measure judge↔human agreement on a calibration set;
43
+ `--enforce` applies the same gates as `migrate` / `compare`.
44
+ - **`driftless audit-labels`** — find duplicate/near-duplicate inputs with disagreeing
45
+ gold labels; `--fail` for CI.
46
+ - **Judge trust hardening** — optional `max_mae` / `min_correlation` gates on
47
+ judge-graded workflows; judge reliability and scoring evidence in migration reports.
48
+ - **P0.1 expansion** — judge-graded regression scenario; live eval CI baseline
49
+ checks with `--require-all` and job summaries.
50
+ - **`open-pr --create` integration tests** — mocked git/gh execution path coverage.
51
+ - **`migrate` / `refine` label-audit preflight** — warn on label conflicts by default;
52
+ `--strict-label-audit` blocks; `--skip-label-audit` to silence.
53
+
54
+ ### Changed
55
+
56
+ - Live eval workflow sets `DRIFTLESS_REGRESSION_METRICS` explicitly.
57
+
58
+ ---
59
+
20
60
  ## [0.2.1] - 2026-07-01
21
61
 
22
62
  ### Fixed
@@ -80,8 +120,11 @@ First public release on [PyPI](https://pypi.org/project/driftless/0.1.0/).
80
120
  - **Docs** — project overview, repair algorithm spec, 2×2 migration methodology,
81
121
  Poetry + Dependabot product framing.
82
122
 
83
- [Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.1...HEAD
123
+ [Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.4...HEAD
124
+ [0.2.4]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.4
125
+ [0.2.3]: https://github.com/driftless-dev/driftless/compare/v0.2.2...v0.2.3
126
+ [0.2.2]: https://github.com/driftless-dev/driftless/compare/v0.2.1...v0.2.2
84
127
  [0.2.1]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.1
85
- [0.2.0]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.0
128
+ [0.2.0]: https://github.com/driftless-dev/driftless/compare/v0.1.1...v0.2.0
86
129
  [0.1.1]: https://github.com/driftless-dev/driftless/releases/tag/v0.1.1
87
130
  [0.1.0]: https://github.com/driftless-dev/driftless/releases/tag/v0.1.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: driftless
3
- Version: 0.2.1
3
+ Version: 0.2.4
4
4
  Summary: Keep prompts in sync when model or eval data changes — Poetry-style lock regeneration, Dependabot-style PRs.
5
5
  Project-URL: Homepage, https://github.com/driftless-dev/driftless
6
6
  Project-URL: Repository, https://github.com/driftless-dev/driftless
@@ -87,6 +87,8 @@ optimizes against it, with your team owning the definition of "good":
87
87
  precision/recall/F1 against the gold record.
88
88
  - **`eval.judge`** — an LLM judge grades each free-form output against a rubric
89
89
  (with an optional human-scored calibration set for a judge-agreement check).
90
+ Run `driftless judge-check -w <workflow>` before optimizing; set
91
+ `max_mae` / `min_correlation` in the contract to gate `migrate` / `compare`.
90
92
 
91
93
  ## CLI
92
94
 
@@ -102,9 +104,12 @@ optimizes against it, with your team owning the definition of "good":
102
104
  | `calibrate -w <w>` | Measure the baseline and suggest starting thresholds. |
103
105
  | `compare -w <w> --to <model>` | Baseline vs target scorecard. |
104
106
  | `migrate -w <w> --to <model>` | Repair + validate + produce migrated files. |
107
+ | | Label-audit preflight warns on duplicate-label conflicts by default; `--strict-label-audit` blocks, `--skip-label-audit` silences. |
105
108
  | `refine -w <w>` | Re-optimize the prompt for a changed eval dataset (model pinned). |
106
109
  | `poll [--act]` | Detect external eval-dataset changes and refine on a meaningful change. |
107
110
  | `validate -w <w>` | Check the contract parses and the harness runs. |
111
+ | `judge-check -w <w>` | Measure judge↔human agreement on a calibration set (`--enforce` to gate). |
112
+ | `audit-labels -w <w>` | Find duplicate inputs with disagreeing gold labels (`--fail` for CI). |
108
113
  | `report` | Render the latest migration report. |
109
114
  | `view` | Open the optimization run viewer (charts + attempt log). |
110
115
  | `open-pr -w <w>` | Open a PR (or issue) from the latest migration result. |
@@ -128,7 +133,7 @@ can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
128
133
  manually-triggered migration that opens a PR (or an issue when blocked).
129
134
 
130
135
  ```yaml
131
- - uses: driftless-dev/driftless@v0.2.1
136
+ - uses: driftless-dev/driftless@v0.2.4
132
137
  with:
133
138
  command: scan
134
139
  ```
@@ -48,6 +48,8 @@ optimizes against it, with your team owning the definition of "good":
48
48
  precision/recall/F1 against the gold record.
49
49
  - **`eval.judge`** — an LLM judge grades each free-form output against a rubric
50
50
  (with an optional human-scored calibration set for a judge-agreement check).
51
+ Run `driftless judge-check -w <workflow>` before optimizing; set
52
+ `max_mae` / `min_correlation` in the contract to gate `migrate` / `compare`.
51
53
 
52
54
  ## CLI
53
55
 
@@ -63,9 +65,12 @@ optimizes against it, with your team owning the definition of "good":
63
65
  | `calibrate -w <w>` | Measure the baseline and suggest starting thresholds. |
64
66
  | `compare -w <w> --to <model>` | Baseline vs target scorecard. |
65
67
  | `migrate -w <w> --to <model>` | Repair + validate + produce migrated files. |
68
+ | | Label-audit preflight warns on duplicate-label conflicts by default; `--strict-label-audit` blocks, `--skip-label-audit` silences. |
66
69
  | `refine -w <w>` | Re-optimize the prompt for a changed eval dataset (model pinned). |
67
70
  | `poll [--act]` | Detect external eval-dataset changes and refine on a meaningful change. |
68
71
  | `validate -w <w>` | Check the contract parses and the harness runs. |
72
+ | `judge-check -w <w>` | Measure judge↔human agreement on a calibration set (`--enforce` to gate). |
73
+ | `audit-labels -w <w>` | Find duplicate inputs with disagreeing gold labels (`--fail` for CI). |
69
74
  | `report` | Render the latest migration report. |
70
75
  | `view` | Open the optimization run viewer (charts + attempt log). |
71
76
  | `open-pr -w <w>` | Open a PR (or issue) from the latest migration result. |
@@ -89,7 +94,7 @@ can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
89
94
  manually-triggered migration that opens a PR (or an issue when blocked).
90
95
 
91
96
  ```yaml
92
- - uses: driftless-dev/driftless@v0.2.1
97
+ - uses: driftless-dev/driftless@v0.2.4
93
98
  with:
94
99
  command: scan
95
100
  ```
@@ -153,7 +153,7 @@ After a release, users can pin the composite Action by release tag
153
153
  (`action.yml` lives at the repo root — no `/action` path segment):
154
154
 
155
155
  ```yaml
156
- - uses: driftless-dev/driftless@v0.2.1
156
+ - uses: driftless-dev/driftless@v0.2.4
157
157
  with:
158
158
  command: scan
159
159
  ```
@@ -161,13 +161,19 @@ After a release, users can pin the composite Action by release tag
161
161
  Or pin the PyPI package in the Action input:
162
162
 
163
163
  ```yaml
164
- - uses: driftless-dev/driftless@v0.2.1
164
+ - uses: driftless-dev/driftless@v0.2.4
165
165
  with:
166
- version: "==0.2.1"
166
+ version: "==0.2.4"
167
167
  command: migrate
168
168
  ```
169
169
 
170
- Optionally maintain a floating **`v1`** tag on the latest stable minor release.
170
+ Optionally maintain a floating **`v1`** tag on the latest stable minor release
171
+ (point it at the current release tag after each publish):
172
+
173
+ ```bash
174
+ git tag -f v1 v0.2.4 && git push origin v1 --force
175
+ ```
176
+
171
177
  Update [`action.yml`](../action.yml) default `version` input when cutting releases.
172
178
 
173
179
  ---
@@ -188,3 +194,37 @@ Update [`action.yml`](../action.yml) default `version` input when cutting releas
188
194
  `0.1.0` was uploaded manually before Trusted Publishing was wired. Tags and
189
195
  GitHub Release for `v0.1.0` can be added retroactively for a clean history; PyPI
190
196
  already hosts that version.
197
+
198
+ ---
199
+
200
+ ## Maintainer: live optimizer eval (P0.1)
201
+
202
+ The **migration-regression** workflow runs deterministic regression on every
203
+ push/PR and a **live** LLM optimizer eval nightly (or on manual dispatch). The
204
+ live job costs tokens and is opt-in via repository secrets.
205
+
206
+ ### Required secrets
207
+
208
+ In **Settings → Secrets and variables → Actions**, add:
209
+
210
+ | Secret | Used by |
211
+ |---|---|
212
+ | `OPENAI_API_KEY` | Live eval matrix job (`provider: openai`) |
213
+ | `ANTHROPIC_API_KEY` | Live eval matrix job (`provider: anthropic`) |
214
+
215
+ If a secret is missing, that provider job exits cleanly with a warning (CI stays
216
+ green). When both are set, nightly runs append to
217
+ `.driftless/regression-metrics.jsonl` and check against
218
+ `tests/fixtures/live_eval_baseline.json` with `--require-all`.
219
+
220
+ ### Local reproduction
221
+
222
+ ```bash
223
+ export DRIFTLESS_LIVE_EVAL=1
224
+ export OPENAI_API_KEY=...
225
+ pytest tests/test_migration_live.py -v -k openai
226
+ python scripts/check_live_eval_metrics.py --provider openai --require-all
227
+ ```
228
+
229
+ After a few stable nightly runs, tighten the bounds in `live_eval_baseline.json`
230
+ (iterations ceiling, min F1/score).
@@ -308,8 +308,11 @@ driftless open-pr -w support_classifier --create</code></pre>
308
308
  <tr><td><code>plan</code></td><td>Discover at-risk workflows and apply the migration policy (CI triage).</td></tr>
309
309
  <tr><td><code>configure &lt;workflow&gt;</code></td><td>Turn a detected workflow into a migration-ready contract.</td></tr>
310
310
  <tr><td><code>validate -w &lt;w&gt;</code></td><td>Check the contract parses and the harness runs.</td></tr>
311
+ <tr><td><code>audit-labels -w &lt;w&gt;</code></td><td>Find duplicate inputs with disagreeing gold labels (<code>--fail</code> for CI).</td></tr>
312
+ <tr><td><code>judge-check -w &lt;w&gt;</code></td><td>Measure judge↔human agreement on a calibration set (<code>--enforce</code> to gate).</td></tr>
311
313
  <tr><td><code>compare -w &lt;w&gt; --to &lt;model&gt;</code></td><td>Baseline vs. target scorecard + threshold checks.</td></tr>
312
314
  <tr><td><code>migrate -w &lt;w&gt; --to &lt;model&gt;</code></td><td>Repair + validate + produce migrated files.</td></tr>
315
+ <tr><td><code>refine -w &lt;w&gt;</code></td><td>Re-optimize the prompt for a changed dataset (model pinned).</td></tr>
313
316
  <tr><td><code>report [-w &lt;w&gt;]</code></td><td>Render the latest migration report(s).</td></tr>
314
317
  <tr><td><code>open-pr -w &lt;w&gt;</code></td><td>Open a PR (or issue) whose body is the evidence report: summary, scorecard, unified diffs, attempt log, holdout checks.</td></tr>
315
318
  </tbody>
@@ -318,6 +321,7 @@ driftless open-pr -w support_classifier --create</code></pre>
318
321
  <ul>
319
322
  <li><code class="inline">--generator llm|none</code> — the repair strategy (LLM-backed by default; <code class="inline">none</code> turns the loop into a dry analysis).</li>
320
323
  <li><code class="inline">--to &lt;model&gt;</code> — the target model to migrate to (otherwise the contract's candidates are used).</li>
324
+ <li><code class="inline">--strict-label-audit</code> — block when duplicate/near-duplicate inputs disagree on gold labels (warns by default).</li>
321
325
  </ul>
322
326
  </section>
323
327
 
@@ -424,7 +428,7 @@ driftless view -w support_classifier</code></pre>
424
428
  <span class="tok-k">runs-on</span>: ubuntu-latest
425
429
  <span class="tok-k">steps</span>:
426
430
  - <span class="tok-k">uses</span>: actions/checkout@v4
427
- - <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.1
431
+ - <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.4
428
432
  <span class="tok-k">with</span>:
429
433
  <span class="tok-k">command</span>: <span class="tok-s">plan</span></code></pre>
430
434
  <p>A scheduled <code class="inline">plan</code> gates CI when a deprecated model needs attention; a manually-triggered <code class="inline">migrate</code> opens a PR (or an issue when blocked) with the evidence attached.</p>
@@ -1,3 +1,3 @@
1
1
  """driftless: Dependabot for LLM models."""
2
2
 
3
- __version__ = "0.2.1"
3
+ __version__ = "0.2.4"
@@ -355,6 +355,30 @@ def _preflight(wf: Workflow, target_model: str) -> None:
355
355
  err_console.print(f"[yellow]warning:[/] {pf.warning}")
356
356
 
357
357
 
358
+ def _label_audit_preflight(
359
+ workflow_name: str,
360
+ wf: Workflow,
361
+ *,
362
+ skip: bool,
363
+ strict: bool,
364
+ ) -> None:
365
+ """Warn or block when duplicate inputs carry disagreeing gold labels."""
366
+ if skip or wf.eval.grading != "label" or not wf.eval.labels_path:
367
+ return
368
+ from .label_audit import audit_labels, format_audit_report
369
+
370
+ report = audit_labels(workflow_name, wf, cwd=Path.cwd())
371
+ if not report.has_conflicts:
372
+ return
373
+ text = format_audit_report(report)
374
+ if strict:
375
+ err_console.print(text)
376
+ raise typer.Exit(code=1)
377
+ err_console.print(f"[yellow]Label audit warning[/] — {report.conflict_groups[0].kind} conflicts detected")
378
+ err_console.print(f"[dim]{text}[/]")
379
+ err_console.print("[dim]re-run with --strict-label-audit to block, or --skip-label-audit to silence[/]")
380
+
381
+
358
382
  def _fmt(value: float | None, *, pct: bool = False) -> str:
359
383
  if value is None:
360
384
  return "[dim]n/a[/]"
@@ -812,6 +836,14 @@ def migrate(
812
836
  2, "--candidates", help="Candidate patches to propose per iteration "
813
837
  "(widened automatically when an iteration stalls).",
814
838
  ),
839
+ skip_label_audit: bool = typer.Option(
840
+ False, "--skip-label-audit", help="Skip duplicate-label preflight check."
841
+ ),
842
+ strict_label_audit: bool = typer.Option(
843
+ False,
844
+ "--strict-label-audit",
845
+ help="Block when duplicate/near-duplicate inputs disagree on gold labels.",
846
+ ),
815
847
  ) -> None:
816
848
  """Attempt a migration: repair editable files, validate on holdout, report."""
817
849
  from .engine import MigrationStatus, run_migration
@@ -820,6 +852,9 @@ def migrate(
820
852
  try:
821
853
  contract = load_contract(contract_path)
822
854
  wf = contract.workflow(workflow)
855
+ _label_audit_preflight(
856
+ workflow, wf, skip=skip_label_audit, strict=strict_label_audit
857
+ )
823
858
  _preflight(wf, to)
824
859
  gen = build_generator(
825
860
  generator,
@@ -916,6 +951,14 @@ def refine(
916
951
  2, "--candidates", help="Candidate patches to propose per iteration "
917
952
  "(widened automatically when an iteration stalls).",
918
953
  ),
954
+ skip_label_audit: bool = typer.Option(
955
+ False, "--skip-label-audit", help="Skip duplicate-label preflight check."
956
+ ),
957
+ strict_label_audit: bool = typer.Option(
958
+ False,
959
+ "--strict-label-audit",
960
+ help="Block when duplicate/near-duplicate inputs disagree on gold labels.",
961
+ ),
919
962
  ) -> None:
920
963
  """Re-optimize a prompt for a changed eval dataset (model stays pinned).
921
964
 
@@ -933,6 +976,9 @@ def refine(
933
976
  try:
934
977
  contract = load_contract(contract_path)
935
978
  wf = contract.workflow(workflow)
979
+ _label_audit_preflight(
980
+ workflow, wf, skip=skip_label_audit, strict=strict_label_audit
981
+ )
936
982
  gen = build_generator(
937
983
  generator,
938
984
  provider=generator_provider,
@@ -1191,6 +1237,121 @@ def open_pr(
1191
1237
  )
1192
1238
 
1193
1239
 
1240
+ @app.command(name="judge-check")
1241
+ def judge_check(
1242
+ workflow: str = typer.Option(..., "--workflow", "-w"),
1243
+ contract_path: Path = typer.Option(None, "--contract", help="Path to driftless.yml."),
1244
+ enforce: bool = typer.Option(
1245
+ False,
1246
+ "--enforce",
1247
+ help="Apply eval.judge max_mae/min_correlation gates (same as migrate/compare).",
1248
+ ),
1249
+ ) -> None:
1250
+ """Measure LLM-judge agreement against a human calibration set."""
1251
+ from .judges import build_judge, judge_agreement, require_judge_agreement
1252
+
1253
+ try:
1254
+ contract = load_contract(contract_path)
1255
+ wf = contract.workflow(workflow)
1256
+ except DriftlessError as exc:
1257
+ _fail(exc)
1258
+ return
1259
+
1260
+ if wf.eval.grading != "judge" or wf.eval.judge is None:
1261
+ _fail(
1262
+ DriftlessError(
1263
+ f"{workflow!r} is not judge-graded",
1264
+ hint="add eval.judge to the workflow in driftless.yml",
1265
+ )
1266
+ )
1267
+ return
1268
+
1269
+ spec = wf.eval.judge
1270
+ if not spec.calibration_path:
1271
+ _fail(
1272
+ DriftlessError(
1273
+ "eval.judge.calibration_path is not set",
1274
+ hint="add a human-scored JSONL file for judge agreement",
1275
+ )
1276
+ )
1277
+ return
1278
+
1279
+ judge = build_judge(spec)
1280
+ try:
1281
+ agreement = (
1282
+ require_judge_agreement(judge, spec)
1283
+ if enforce
1284
+ else judge_agreement(judge, spec)
1285
+ )
1286
+ except DriftlessError as exc:
1287
+ _fail(exc)
1288
+ return
1289
+
1290
+ if agreement is None:
1291
+ _fail(DriftlessError("calibration set is empty or produced no scores"))
1292
+ return
1293
+
1294
+ console.print(f"[bold]{workflow}[/] — judge calibration check\n")
1295
+ console.print(f" records: {agreement.n}")
1296
+ console.print(f" MAE: {agreement.mean_abs_error:.3f}")
1297
+ corr = f"{agreement.correlation:.3f}" if agreement.correlation is not None else "n/a"
1298
+ console.print(f" correlation: {corr}")
1299
+
1300
+ gate_bits: list[str] = []
1301
+ if spec.max_mae is not None:
1302
+ ok = agreement.mean_abs_error <= spec.max_mae
1303
+ gate_bits.append(f"max_mae={spec.max_mae:g} ({'ok' if ok else 'FAIL'})")
1304
+ if spec.min_correlation is not None:
1305
+ ok = agreement.correlation is not None and agreement.correlation >= spec.min_correlation
1306
+ gate_bits.append(f"min_correlation={spec.min_correlation:g} ({'ok' if ok else 'FAIL'})")
1307
+ if gate_bits:
1308
+ # Plain stdout — Rich highlight/markup breaks publish CI assertions on the
1309
+ # gate status line when GITHUB_ACTIONS forces a TTY console.
1310
+ typer.echo(" gates: " + ", ".join(gate_bits))
1311
+
1312
+ if enforce:
1313
+ console.print(f"\n[green]gates passed[/] — {agreement.summary}")
1314
+ else:
1315
+ console.print(f"\n[dim]{agreement.summary}[/]")
1316
+ if spec.max_mae is not None or spec.min_correlation is not None:
1317
+ console.print("[dim]re-run with --enforce to apply contract gates[/]")
1318
+
1319
+
1320
+ @app.command(name="audit-labels")
1321
+ def audit_labels_cmd(
1322
+ workflow: str = typer.Option(..., "--workflow", "-w"),
1323
+ contract_path: Path = typer.Option(None, "--contract", help="Path to driftless.yml."),
1324
+ near_threshold: float = typer.Option(
1325
+ 0.85, "--near-threshold", min=0.5, max=1.0,
1326
+ help="Token Jaccard threshold for near-duplicate detection.",
1327
+ ),
1328
+ fail: bool = typer.Option(
1329
+ False, "--fail", help="Exit non-zero when label conflicts are found.",
1330
+ ),
1331
+ ) -> None:
1332
+ """Audit gold labels for duplicate inputs with disagreeing labels."""
1333
+ from .label_audit import audit_labels, format_audit_report
1334
+
1335
+ try:
1336
+ contract = load_contract(contract_path)
1337
+ wf = contract.workflow(workflow)
1338
+ report = audit_labels(
1339
+ workflow, wf, cwd=Path.cwd(), near_threshold=near_threshold
1340
+ )
1341
+ except DriftlessError as exc:
1342
+ _fail(exc)
1343
+ return
1344
+
1345
+ text = format_audit_report(report)
1346
+ if report.has_conflicts:
1347
+ err_console.print(text)
1348
+ else:
1349
+ console.print(text)
1350
+
1351
+ if fail and report.has_conflicts:
1352
+ raise typer.Exit(code=1)
1353
+
1354
+
1194
1355
  @app.command()
1195
1356
  def report(
1196
1357
  workflow: str = typer.Option(None, "--workflow", "-w", help="Workflow to show (default: all)."),
@@ -11,6 +11,7 @@ from __future__ import annotations
11
11
  import json
12
12
  from dataclasses import asdict, dataclass, field
13
13
  from pathlib import Path
14
+ from typing import cast
14
15
 
15
16
  from .contract import ThresholdsSpec, Workflow
16
17
  from .errors import DriftlessError
@@ -195,6 +196,11 @@ def compare_models(
195
196
  )
196
197
  judge = build_judge(judge_spec)
197
198
 
199
+ if judge is not None and workflow.eval.judge is not None:
200
+ from .judges import Judge, require_judge_agreement
201
+
202
+ require_judge_agreement(cast(Judge, judge), workflow.eval.judge, cwd=cwd)
203
+
198
204
  progress_log(f"compare: baseline run ({current})...")
199
205
  baseline_run = run_workflow(workflow, current, cwd=cwd)
200
206
  baseline_metrics = evaluate(workflow, baseline_run, judge=judge, cwd=cwd)
@@ -157,6 +157,18 @@ class JudgeSpec(StrictModel):
157
157
  # Optional path to human-scored records (carrying a numeric ``score``) for a
158
158
  # judge-reliability agreement check.
159
159
  calibration_path: str | None = None
160
+ # Optional gates (require ``calibration_path``). When set, ``migrate`` /
161
+ # ``compare`` / ``refine`` refuse to optimize against an untrusted judge.
162
+ max_mae: float | None = None
163
+ min_correlation: float | None = None
164
+
165
+ @model_validator(mode="after")
166
+ def _gates_need_calibration(self) -> "JudgeSpec":
167
+ if (self.max_mae is not None or self.min_correlation is not None) and not self.calibration_path:
168
+ raise ValueError(
169
+ "eval.judge.max_mae/min_correlation require calibration_path"
170
+ )
171
+ return self
160
172
 
161
173
  @field_validator("rubric")
162
174
  @classmethod
@@ -327,6 +327,9 @@ class MigrationResult:
327
327
  experiment_log: list[AttemptRecord] = field(default_factory=list)
328
328
  cluster_history: list[list[FailureCluster]] = field(default_factory=list)
329
329
  warnings: list[str] = field(default_factory=list)
330
+ # Judge-graded workflows: calibration agreement + low-score rationales for reviewers.
331
+ judge_agreement: Any | None = None
332
+ judge_evidence: list[dict[str, Any]] = field(default_factory=list)
330
333
  # refine-only: thresholds derived from the achieved holdout metrics, for the
331
334
  # customer to accept/edit (the old dataset's thresholds are stale).
332
335
  suggested_thresholds: dict[str, float] = field(default_factory=dict)
@@ -478,6 +481,27 @@ def run_migration(
478
481
  )
479
482
  judge = build_judge(judge_spec)
480
483
 
484
+ judge_agreement_info = None
485
+ if judge is not None and workflow.eval.judge is not None:
486
+ from .judges import require_judge_agreement
487
+
488
+ try:
489
+ judge_agreement_info = require_judge_agreement(
490
+ judge, workflow.eval.judge, cwd=cwd
491
+ )
492
+ except DriftlessError as exc:
493
+ return MigrationResult(
494
+ workflow=workflow_name,
495
+ current_model=current,
496
+ target_model=target_model,
497
+ status=MigrationStatus.BLOCKED,
498
+ iterations=0,
499
+ baseline=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
500
+ naive_target=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
501
+ final=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
502
+ message=str(exc),
503
+ )
504
+
481
505
  if not workflow.model.has_override():
482
506
  return MigrationResult(
483
507
  workflow=workflow_name,
@@ -500,6 +524,13 @@ def run_migration(
500
524
 
501
525
  use_ids = bool(workflow.eval.id_field) and split.gold is not None
502
526
 
527
+ def _judge_evidence(rows: list[RecordRow]) -> list[dict[str, Any]]:
528
+ if workflow.eval.grading != "judge":
529
+ return []
530
+ from .judges import judge_evidence_samples
531
+
532
+ return judge_evidence_samples(rows)
533
+
503
534
  def evaluate_on(
504
535
  model: str, idx: list[int], files: dict[str, str] | None = None
505
536
  ) -> RunAnalysis:
@@ -566,6 +597,8 @@ def run_migration(
566
597
  holdout_checks=holdout_checks,
567
598
  tuning_checks=naive_checks,
568
599
  warnings=size_warnings,
600
+ judge_agreement=judge_agreement_info,
601
+ judge_evidence=_judge_evidence(naive_analysis.rows),
569
602
  message="naive model swap passes thresholds; only the model ID changes",
570
603
  )
571
604
 
@@ -753,6 +786,8 @@ def run_migration(
753
786
  experiment_log=experiment_log,
754
787
  cluster_history=cluster_history,
755
788
  warnings=size_warnings,
789
+ judge_agreement=judge_agreement_info,
790
+ judge_evidence=_judge_evidence(best_analysis.rows),
756
791
  original_editable_files=original_editable,
757
792
  message="migration passed tuning and holdout thresholds",
758
793
  )
@@ -821,6 +856,8 @@ def run_migration(
821
856
  cluster_history=cluster_history,
822
857
  warnings=size_warnings,
823
858
  suggested_thresholds=suggested,
859
+ judge_agreement=judge_agreement_info,
860
+ judge_evidence=_judge_evidence(best_analysis.rows),
824
861
  original_editable_files=original_editable,
825
862
  message=message,
826
863
  )
@@ -850,6 +887,8 @@ def run_migration(
850
887
  experiment_log=experiment_log,
851
888
  cluster_history=cluster_history,
852
889
  warnings=size_warnings,
890
+ judge_agreement=judge_agreement_info,
891
+ judge_evidence=_judge_evidence(best_analysis.rows),
853
892
  original_editable_files=original_editable,
854
893
  message=message,
855
894
  )
@@ -183,3 +183,55 @@ def judge_agreement(
183
183
  return None
184
184
  mae = sum(abs(m - h) for m, h in zip(model, human)) / len(human)
185
185
  return JudgeAgreement(n=len(human), mean_abs_error=mae, correlation=_pearson(model, human))
186
+
187
+
188
+ def require_judge_agreement(
189
+ judge: Judge, spec: JudgeSpec, *, cwd: Path | None = None
190
+ ) -> JudgeAgreement | None:
191
+ """Run ``judge_agreement`` and enforce optional ``max_mae`` / ``min_correlation`` gates."""
192
+ agreement = judge_agreement(judge, spec, cwd=cwd)
193
+ if spec.max_mae is None and spec.min_correlation is None:
194
+ return agreement
195
+ if agreement is None:
196
+ raise DriftlessError(
197
+ "judge agreement gate requires a non-empty calibration set",
198
+ hint=f"add human-scored records to {spec.calibration_path}",
199
+ )
200
+ if spec.max_mae is not None and agreement.mean_abs_error > spec.max_mae:
201
+ raise DriftlessError(
202
+ f"judge mean absolute error {agreement.mean_abs_error:.3f} exceeds "
203
+ f"max_mae={spec.max_mae:g}",
204
+ hint=agreement.summary,
205
+ )
206
+ if spec.min_correlation is not None:
207
+ if agreement.correlation is None:
208
+ raise DriftlessError(
209
+ f"judge correlation is undefined on {agreement.n} calibration records; "
210
+ f"need min_correlation={spec.min_correlation:g}",
211
+ hint=agreement.summary,
212
+ )
213
+ if agreement.correlation < spec.min_correlation:
214
+ raise DriftlessError(
215
+ f"judge correlation {agreement.correlation:.3f} below "
216
+ f"min_correlation={spec.min_correlation:g}",
217
+ hint=agreement.summary,
218
+ )
219
+ return agreement
220
+
221
+
222
+ def judge_evidence_samples(
223
+ rows: list[Any], *, max_samples: int = 5
224
+ ) -> list[dict[str, Any]]:
225
+ """Lowest-scoring judge-graded rows with rationale for PR reports."""
226
+ low = [r for r in rows if getattr(r, "is_low_score", False) and getattr(r, "rationale", None)]
227
+ low.sort(key=lambda r: getattr(r, "score", 0.0) or 0.0)
228
+ out: list[dict[str, Any]] = []
229
+ for row in low[:max_samples]:
230
+ out.append(
231
+ {
232
+ "index": row.index,
233
+ "score": row.score,
234
+ "rationale": row.rationale,
235
+ }
236
+ )
237
+ return out