driftless 0.2.1__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. {driftless-0.2.1 → driftless-0.2.5}/CHANGELOG.md +58 -2
  2. {driftless-0.2.1 → driftless-0.2.5}/PKG-INFO +8 -3
  3. {driftless-0.2.1 → driftless-0.2.5}/README.md +7 -2
  4. {driftless-0.2.1 → driftless-0.2.5}/docs/RELEASE.md +44 -4
  5. {driftless-0.2.1 → driftless-0.2.5}/site/docs.html +5 -1
  6. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/__init__.py +1 -1
  7. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/cli.py +173 -0
  8. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/compare.py +6 -0
  9. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/contract.py +12 -0
  10. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/engine.py +39 -0
  11. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/init_ci.py +247 -2
  12. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/judges.py +52 -0
  13. driftless-0.2.5/src/driftless/label_audit.py +290 -0
  14. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/report.py +29 -0
  15. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/templates.py +2 -0
  16. driftless-0.2.5/tests/fixtures/live_eval_baseline.json +38 -0
  17. driftless-0.2.5/tests/regression_metrics.py +141 -0
  18. driftless-0.2.5/tests/scenarios.py +880 -0
  19. driftless-0.2.5/tests/test_cli.py +268 -0
  20. {driftless-0.2.1 → driftless-0.2.5}/tests/test_github.py +65 -0
  21. driftless-0.2.5/tests/test_grading_loop.py +40 -0
  22. driftless-0.2.5/tests/test_init_ci.py +314 -0
  23. {driftless-0.2.1 → driftless-0.2.5}/tests/test_judge.py +63 -0
  24. driftless-0.2.5/tests/test_judge_loop.py +69 -0
  25. driftless-0.2.5/tests/test_label_audit.py +183 -0
  26. driftless-0.2.5/tests/test_migration_live.py +131 -0
  27. {driftless-0.2.1 → driftless-0.2.5}/tests/test_migration_regression.py +114 -0
  28. driftless-0.2.5/tests/test_regression_metrics.py +85 -0
  29. {driftless-0.2.1 → driftless-0.2.5}/tests/test_report.py +23 -0
  30. driftless-0.2.1/tests/scenarios.py +0 -387
  31. driftless-0.2.1/tests/test_cli.py +0 -72
  32. driftless-0.2.1/tests/test_grading_loop.py +0 -103
  33. driftless-0.2.1/tests/test_init_ci.py +0 -128
  34. driftless-0.2.1/tests/test_judge_loop.py +0 -93
  35. driftless-0.2.1/tests/test_migration_live.py +0 -40
  36. {driftless-0.2.1 → driftless-0.2.5}/.gitignore +0 -0
  37. {driftless-0.2.1 → driftless-0.2.5}/LICENSE +0 -0
  38. {driftless-0.2.1 → driftless-0.2.5}/docs/repair-and-generators.md +0 -0
  39. {driftless-0.2.1 → driftless-0.2.5}/pyproject.toml +0 -0
  40. {driftless-0.2.1 → driftless-0.2.5}/site/assets/app.js +0 -0
  41. {driftless-0.2.1 → driftless-0.2.5}/site/assets/hero-workflow.png +0 -0
  42. {driftless-0.2.1 → driftless-0.2.5}/site/assets/landing.css +0 -0
  43. {driftless-0.2.1 → driftless-0.2.5}/site/assets/runs.css +0 -0
  44. {driftless-0.2.1 → driftless-0.2.5}/site/assets/runs.js +0 -0
  45. {driftless-0.2.1 → driftless-0.2.5}/site/assets/sample-run.json +0 -0
  46. {driftless-0.2.1 → driftless-0.2.5}/site/assets/styles.css +0 -0
  47. {driftless-0.2.1 → driftless-0.2.5}/site/index.html +0 -0
  48. {driftless-0.2.1 → driftless-0.2.5}/site/runs.html +0 -0
  49. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/calibrate.py +0 -0
  50. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/configure.py +0 -0
  51. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/data/model_lifecycle.json +0 -0
  52. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/datasource.py +0 -0
  53. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/datastate.py +0 -0
  54. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/discovery.py +0 -0
  55. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/errors.py +0 -0
  56. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/evaluation.py +0 -0
  57. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/generators.py +0 -0
  58. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/github.py +0 -0
  59. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/harness.py +0 -0
  60. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/lifecycle.py +0 -0
  61. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/policy.py +0 -0
  62. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/preflight.py +0 -0
  63. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/progress.py +0 -0
  64. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/scanner.py +0 -0
  65. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/splits.py +0 -0
  66. {driftless-0.2.1 → driftless-0.2.5}/src/driftless/view.py +0 -0
  67. {driftless-0.2.1 → driftless-0.2.5}/tests/fixtures/smoke/driftless.yml +0 -0
  68. {driftless-0.2.1 → driftless-0.2.5}/tests/fixtures/smoke/inputs.jsonl +0 -0
  69. {driftless-0.2.1 → driftless-0.2.5}/tests/fixtures/smoke/labels.jsonl +0 -0
  70. {driftless-0.2.1 → driftless-0.2.5}/tests/test_compare.py +0 -0
  71. {driftless-0.2.1 → driftless-0.2.5}/tests/test_contract.py +0 -0
  72. {driftless-0.2.1 → driftless-0.2.5}/tests/test_data_change_gate.py +0 -0
  73. {driftless-0.2.1 → driftless-0.2.5}/tests/test_data_change_regression.py +0 -0
  74. {driftless-0.2.1 → driftless-0.2.5}/tests/test_datasource.py +0 -0
  75. {driftless-0.2.1 → driftless-0.2.5}/tests/test_datastate.py +0 -0
  76. {driftless-0.2.1 → driftless-0.2.5}/tests/test_discovery.py +0 -0
  77. {driftless-0.2.1 → driftless-0.2.5}/tests/test_endpoint.py +0 -0
  78. {driftless-0.2.1 → driftless-0.2.5}/tests/test_engine.py +0 -0
  79. {driftless-0.2.1 → driftless-0.2.5}/tests/test_evaluation.py +0 -0
  80. {driftless-0.2.1 → driftless-0.2.5}/tests/test_extraction.py +0 -0
  81. {driftless-0.2.1 → driftless-0.2.5}/tests/test_generators.py +0 -0
  82. {driftless-0.2.1 → driftless-0.2.5}/tests/test_harness.py +0 -0
  83. {driftless-0.2.1 → driftless-0.2.5}/tests/test_lifecycle.py +0 -0
  84. {driftless-0.2.1 → driftless-0.2.5}/tests/test_plan_act.py +0 -0
  85. {driftless-0.2.1 → driftless-0.2.5}/tests/test_policy.py +0 -0
  86. {driftless-0.2.1 → driftless-0.2.5}/tests/test_poll_act.py +0 -0
  87. {driftless-0.2.1 → driftless-0.2.5}/tests/test_preflight.py +0 -0
  88. {driftless-0.2.1 → driftless-0.2.5}/tests/test_progress.py +0 -0
  89. {driftless-0.2.1 → driftless-0.2.5}/tests/test_refine.py +0 -0
  90. {driftless-0.2.1 → driftless-0.2.5}/tests/test_refresh_catalog.py +0 -0
  91. {driftless-0.2.1 → driftless-0.2.5}/tests/test_repair_prompt.py +0 -0
  92. {driftless-0.2.1 → driftless-0.2.5}/tests/test_scanner.py +0 -0
  93. {driftless-0.2.1 → driftless-0.2.5}/tests/test_view.py +0 -0
@@ -17,6 +17,58 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
17
17
 
18
18
  ---
19
19
 
20
+ ## [0.2.5] - 2026-07-01
21
+
22
+ ### Added
23
+
24
+ - **`init-ci` label-audit workflow** — scaffold `driftless-label-audit.yml` (or
25
+ `-all` matrix) with `audit-labels --fail` on eval dataset path changes.
26
+ - **`init-ci` judge-check workflow** — scaffold `driftless-judge-check.yml` when
27
+ `eval.judge.calibration_path` is set; uses `--enforce` when gate thresholds
28
+ are configured.
29
+
30
+ ---
31
+
32
+ ## [0.2.4] - 2026-07-01
33
+
34
+ ### Fixed
35
+
36
+ - **`judge-check` gate output under CI** — emit gate status via plain stdout so Rich
37
+ TTY highlighting (when `GITHUB_ACTIONS=true`) does not break publish workflow tests.
38
+
39
+ ---
40
+
41
+ ## [0.2.3] - 2026-07-01
42
+
43
+ ### Fixed
44
+
45
+ - **`judge-check` gate output** — print gate status with Rich markup disabled so
46
+ publish CI can assert on `max_mae` / `min_correlation` lines reliably.
47
+
48
+ ---
49
+
50
+ ## [0.2.2] - 2026-07-01
51
+
52
+ ### Added
53
+
54
+ - **`driftless judge-check`** — measure judge↔human agreement on a calibration set;
55
+ `--enforce` applies the same gates as `migrate` / `compare`.
56
+ - **`driftless audit-labels`** — find duplicate/near-duplicate inputs with disagreeing
57
+ gold labels; `--fail` for CI.
58
+ - **Judge trust hardening** — optional `max_mae` / `min_correlation` gates on
59
+ judge-graded workflows; judge reliability and scoring evidence in migration reports.
60
+ - **P0.1 expansion** — judge-graded regression scenario; live eval CI baseline
61
+ checks with `--require-all` and job summaries.
62
+ - **`open-pr --create` integration tests** — mocked git/gh execution path coverage.
63
+ - **`migrate` / `refine` label-audit preflight** — warn on label conflicts by default;
64
+ `--strict-label-audit` blocks; `--skip-label-audit` to silence.
65
+
66
+ ### Changed
67
+
68
+ - Live eval workflow sets `DRIFTLESS_REGRESSION_METRICS` explicitly.
69
+
70
+ ---
71
+
20
72
  ## [0.2.1] - 2026-07-01
21
73
 
22
74
  ### Fixed
@@ -80,8 +132,12 @@ First public release on [PyPI](https://pypi.org/project/driftless/0.1.0/).
80
132
  - **Docs** — project overview, repair algorithm spec, 2×2 migration methodology,
81
133
  Poetry + Dependabot product framing.
82
134
 
83
- [Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.1...HEAD
135
+ [Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.5...HEAD
136
+ [0.2.5]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.5
137
+ [0.2.4]: https://github.com/driftless-dev/driftless/compare/v0.2.3...v0.2.4
138
+ [0.2.3]: https://github.com/driftless-dev/driftless/compare/v0.2.2...v0.2.3
139
+ [0.2.2]: https://github.com/driftless-dev/driftless/compare/v0.2.1...v0.2.2
84
140
  [0.2.1]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.1
85
- [0.2.0]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.0
141
+ [0.2.0]: https://github.com/driftless-dev/driftless/compare/v0.1.1...v0.2.0
86
142
  [0.1.1]: https://github.com/driftless-dev/driftless/releases/tag/v0.1.1
87
143
  [0.1.0]: https://github.com/driftless-dev/driftless/releases/tag/v0.1.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: driftless
3
- Version: 0.2.1
3
+ Version: 0.2.5
4
4
  Summary: Keep prompts in sync when model or eval data changes — Poetry-style lock regeneration, Dependabot-style PRs.
5
5
  Project-URL: Homepage, https://github.com/driftless-dev/driftless
6
6
  Project-URL: Repository, https://github.com/driftless-dev/driftless
@@ -87,6 +87,8 @@ optimizes against it, with your team owning the definition of "good":
87
87
  precision/recall/F1 against the gold record.
88
88
  - **`eval.judge`** — an LLM judge grades each free-form output against a rubric
89
89
  (with an optional human-scored calibration set for a judge-agreement check).
90
+ Run `driftless judge-check -w <workflow>` before optimizing; set
91
+ `max_mae` / `min_correlation` in the contract to gate `migrate` / `compare`.
90
92
 
91
93
  ## CLI
92
94
 
@@ -94,7 +96,7 @@ optimizes against it, with your team owning the definition of "good":
94
96
  |---|---|
95
97
  | `init` | Scaffold a `driftless.yml`. |
96
98
  | `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
97
- | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, and poll. |
99
+ | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, label audit, and judge check. |
98
100
  | `scan` | Find probable LLM usage and at-risk models. |
99
101
  | `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
100
102
  | `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
@@ -102,9 +104,12 @@ optimizes against it, with your team owning the definition of "good":
102
104
  | `calibrate -w <w>` | Measure the baseline and suggest starting thresholds. |
103
105
  | `compare -w <w> --to <model>` | Baseline vs target scorecard. |
104
106
  | `migrate -w <w> --to <model>` | Repair + validate + produce migrated files. |
107
+ | | Warns on duplicate-label conflicts by default; `--strict-label-audit` blocks, `--skip-label-audit` silences. |
105
108
  | `refine -w <w>` | Re-optimize the prompt for a changed eval dataset (model pinned). |
106
109
  | `poll [--act]` | Detect external eval-dataset changes and refine on a meaningful change. |
107
110
  | `validate -w <w>` | Check the contract parses and the harness runs. |
111
+ | `judge-check -w <w>` | Measure judge↔human agreement on a calibration set (`--enforce` to gate). |
112
+ | `audit-labels -w <w>` | Find duplicate inputs with disagreeing gold labels (`--fail` for CI). |
108
113
  | `report` | Render the latest migration report. |
109
114
  | `view` | Open the optimization run viewer (charts + attempt log). |
110
115
  | `open-pr -w <w>` | Open a PR (or issue) from the latest migration result. |
@@ -128,7 +133,7 @@ can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
128
133
  manually-triggered migration that opens a PR (or an issue when blocked).
129
134
 
130
135
  ```yaml
131
- - uses: driftless-dev/driftless@v0.2.1
136
+ - uses: driftless-dev/driftless@v0.2.5
132
137
  with:
133
138
  command: scan
134
139
  ```
@@ -48,6 +48,8 @@ optimizes against it, with your team owning the definition of "good":
48
48
  precision/recall/F1 against the gold record.
49
49
  - **`eval.judge`** — an LLM judge grades each free-form output against a rubric
50
50
  (with an optional human-scored calibration set for a judge-agreement check).
51
+ Run `driftless judge-check -w <workflow>` before optimizing; set
52
+ `max_mae` / `min_correlation` in the contract to gate `migrate` / `compare`.
51
53
 
52
54
  ## CLI
53
55
 
@@ -55,7 +57,7 @@ optimizes against it, with your team owning the definition of "good":
55
57
  |---|---|
56
58
  | `init` | Scaffold a `driftless.yml`. |
57
59
  | `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
58
- | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, and poll. |
60
+ | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, label audit, and judge check. |
59
61
  | `scan` | Find probable LLM usage and at-risk models. |
60
62
  | `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
61
63
  | `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
@@ -63,9 +65,12 @@ optimizes against it, with your team owning the definition of "good":
63
65
  | `calibrate -w <w>` | Measure the baseline and suggest starting thresholds. |
64
66
  | `compare -w <w> --to <model>` | Baseline vs target scorecard. |
65
67
  | `migrate -w <w> --to <model>` | Repair + validate + produce migrated files. |
68
+ | | Warns on duplicate-label conflicts by default; `--strict-label-audit` blocks, `--skip-label-audit` silences. |
66
69
  | `refine -w <w>` | Re-optimize the prompt for a changed eval dataset (model pinned). |
67
70
  | `poll [--act]` | Detect external eval-dataset changes and refine on a meaningful change. |
68
71
  | `validate -w <w>` | Check the contract parses and the harness runs. |
72
+ | `judge-check -w <w>` | Measure judge↔human agreement on a calibration set (`--enforce` to gate). |
73
+ | `audit-labels -w <w>` | Find duplicate inputs with disagreeing gold labels (`--fail` for CI). |
69
74
  | `report` | Render the latest migration report. |
70
75
  | `view` | Open the optimization run viewer (charts + attempt log). |
71
76
  | `open-pr -w <w>` | Open a PR (or issue) from the latest migration result. |
@@ -89,7 +94,7 @@ can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
89
94
  manually-triggered migration that opens a PR (or an issue when blocked).
90
95
 
91
96
  ```yaml
92
- - uses: driftless-dev/driftless@v0.2.1
97
+ - uses: driftless-dev/driftless@v0.2.5
93
98
  with:
94
99
  command: scan
95
100
  ```
@@ -153,7 +153,7 @@ After a release, users can pin the composite Action by release tag
153
153
  (`action.yml` lives at the repo root — no `/action` path segment):
154
154
 
155
155
  ```yaml
156
- - uses: driftless-dev/driftless@v0.2.1
156
+ - uses: driftless-dev/driftless@v0.2.5
157
157
  with:
158
158
  command: scan
159
159
  ```
@@ -161,13 +161,19 @@ After a release, users can pin the composite Action by release tag
161
161
  Or pin the PyPI package in the Action input:
162
162
 
163
163
  ```yaml
164
- - uses: driftless-dev/driftless@v0.2.1
164
+ - uses: driftless-dev/driftless@v0.2.5
165
165
  with:
166
- version: "==0.2.1"
166
+ version: "==0.2.5"
167
167
  command: migrate
168
168
  ```
169
169
 
170
- Optionally maintain a floating **`v1`** tag on the latest stable minor release.
170
+ Optionally maintain a floating **`v1`** tag on the latest stable minor release
171
+ (point it at the current release tag after each publish):
172
+
173
+ ```bash
174
+ git tag -f v1 v0.2.5 && git push origin v1 --force
175
+ ```
176
+
171
177
  Update [`action.yml`](../action.yml) default `version` input when cutting releases.
172
178
 
173
179
  ---
@@ -188,3 +194,37 @@ Update [`action.yml`](../action.yml) default `version` input when cutting releas
188
194
  `0.1.0` was uploaded manually before Trusted Publishing was wired. Tags and
189
195
  GitHub Release for `v0.1.0` can be added retroactively for a clean history; PyPI
190
196
  already hosts that version.
197
+
198
+ ---
199
+
200
+ ## Maintainer: live optimizer eval (P0.1)
201
+
202
+ The **migration-regression** workflow runs deterministic regression on every
203
+ push/PR and a **live** LLM optimizer eval nightly (or on manual dispatch). The
204
+ live job costs tokens and is opt-in via repository secrets.
205
+
206
+ ### Required secrets
207
+
208
+ In **Settings → Secrets and variables → Actions**, add:
209
+
210
+ | Secret | Used by |
211
+ |---|---|
212
+ | `OPENAI_API_KEY` | Live eval matrix job (`provider: openai`) |
213
+ | `ANTHROPIC_API_KEY` | Live eval matrix job (`provider: anthropic`) |
214
+
215
+ If a secret is missing, that provider job exits cleanly with a warning (CI stays
216
+ green). When both are set, nightly runs append to
217
+ `.driftless/regression-metrics.jsonl` and check against
218
+ `tests/fixtures/live_eval_baseline.json` with `--require-all`.
219
+
220
+ ### Local reproduction
221
+
222
+ ```bash
223
+ export DRIFTLESS_LIVE_EVAL=1
224
+ export OPENAI_API_KEY=...
225
+ pytest tests/test_migration_live.py -v -k openai
226
+ python scripts/check_live_eval_metrics.py --provider openai --require-all
227
+ ```
228
+
229
+ After a few stable nightly runs, tighten floors in `live_eval_baseline.json`
230
+ (iterations ceiling, min F1/score).
@@ -308,8 +308,11 @@ driftless open-pr -w support_classifier --create</code></pre>
308
308
  <tr><td><code>plan</code></td><td>Discover at-risk workflows and apply the migration policy (CI triage).</td></tr>
309
309
  <tr><td><code>configure &lt;workflow&gt;</code></td><td>Turn a detected workflow into a migration-ready contract.</td></tr>
310
310
  <tr><td><code>validate -w &lt;w&gt;</code></td><td>Check the contract parses and the harness runs.</td></tr>
311
+ <tr><td><code>audit-labels -w &lt;w&gt;</code></td><td>Find duplicate inputs with disagreeing gold labels (<code>--fail</code> for CI).</td></tr>
312
+ <tr><td><code>judge-check -w &lt;w&gt;</code></td><td>Measure judge↔human agreement on a calibration set (<code>--enforce</code> to gate).</td></tr>
311
313
  <tr><td><code>compare -w &lt;w&gt; --to &lt;model&gt;</code></td><td>Baseline vs. target scorecard + threshold checks.</td></tr>
312
314
  <tr><td><code>migrate -w &lt;w&gt; --to &lt;model&gt;</code></td><td>Repair + validate + produce migrated files.</td></tr>
315
+ <tr><td><code>refine -w &lt;w&gt;</code></td><td>Re-optimize the prompt for a changed dataset (model pinned).</td></tr>
313
316
  <tr><td><code>report [-w &lt;w&gt;]</code></td><td>Render the latest migration report(s).</td></tr>
314
317
  <tr><td><code>open-pr -w &lt;w&gt;</code></td><td>Open a PR (or issue) whose body is the evidence report: summary, scorecard, unified diffs, attempt log, holdout checks.</td></tr>
315
318
  </tbody>
@@ -318,6 +321,7 @@ driftless open-pr -w support_classifier --create</code></pre>
318
321
  <ul>
319
322
  <li><code class="inline">--generator llm|none</code> — the repair strategy (LLM-backed by default; <code class="inline">none</code> turns the loop into a dry analysis).</li>
320
323
  <li><code class="inline">--to &lt;model&gt;</code> — the target model to migrate to (otherwise the contract's candidates are used).</li>
324
+ <li><code class="inline">--strict-label-audit</code> — block when duplicate/near-duplicate inputs disagree on gold labels (warns by default).</li>
321
325
  </ul>
322
326
  </section>
323
327
 
@@ -424,7 +428,7 @@ driftless view -w support_classifier</code></pre>
424
428
  <span class="tok-k">runs-on</span>: ubuntu-latest
425
429
  <span class="tok-k">steps</span>:
426
430
  - <span class="tok-k">uses</span>: actions/checkout@v4
427
- - <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.1
431
+ - <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.5
428
432
  <span class="tok-k">with</span>:
429
433
  <span class="tok-k">command</span>: <span class="tok-s">plan</span></code></pre>
430
434
  <p>A scheduled <code class="inline">plan</code> gates CI when a deprecated model needs attention; a manually-triggered <code class="inline">migrate</code> opens a PR (or an issue when blocked) with the evidence attached.</p>
@@ -1,3 +1,3 @@
1
1
  """driftless: Dependabot for LLM models."""
2
2
 
3
- __version__ = "0.2.1"
3
+ __version__ = "0.2.5"
@@ -136,6 +136,16 @@ def init_ci(
136
136
  plan: bool = typer.Option(
137
137
  False, "--plan/--no-plan", help="Scaffold scheduled plan --act workflow."
138
138
  ),
139
+ audit_labels: bool | None = typer.Option(
140
+ None,
141
+ "--audit-labels/--no-audit-labels",
142
+ help="Scaffold label-audit CI workflow (default: on if labels_path is set).",
143
+ ),
144
+ judge_check: bool | None = typer.Option(
145
+ None,
146
+ "--judge-check/--no-judge-check",
147
+ help="Scaffold judge-calibration CI workflow (default: on if calibration_path is set).",
148
+ ),
139
149
  ) -> None:
140
150
  """Scaffold GitHub Actions workflows wired to the driftless composite Action."""
141
151
  from .init_ci import CHECKLIST, scaffold_ci_from_path
@@ -151,6 +161,8 @@ def init_ci(
151
161
  include_refine=refine,
152
162
  include_poll=poll,
153
163
  include_plan=plan,
164
+ include_audit_labels=audit_labels,
165
+ include_judge_check=judge_check,
154
166
  )
155
167
  except DriftlessError as exc:
156
168
  _fail(exc)
@@ -355,6 +367,30 @@ def _preflight(wf: Workflow, target_model: str) -> None:
355
367
  err_console.print(f"[yellow]warning:[/] {pf.warning}")
356
368
 
357
369
 
370
+ def _label_audit_preflight(
371
+ workflow_name: str,
372
+ wf: Workflow,
373
+ *,
374
+ skip: bool,
375
+ strict: bool,
376
+ ) -> None:
377
+ """Warn or block when duplicate inputs carry disagreeing gold labels."""
378
+ if skip or wf.eval.grading != "label" or not wf.eval.labels_path:
379
+ return
380
+ from .label_audit import audit_labels, format_audit_report
381
+
382
+ report = audit_labels(workflow_name, wf, cwd=Path.cwd())
383
+ if not report.has_conflicts:
384
+ return
385
+ text = format_audit_report(report)
386
+ if strict:
387
+ err_console.print(text)
388
+ raise typer.Exit(code=1)
389
+ err_console.print(f"[yellow]Label audit warning[/] — {report.conflict_groups[0].kind} conflicts detected")
390
+ err_console.print(f"[dim]{text}[/]")
391
+ err_console.print("[dim]re-run with --strict-label-audit to block, or --skip-label-audit to silence[/]")
392
+
393
+
358
394
  def _fmt(value: float | None, *, pct: bool = False) -> str:
359
395
  if value is None:
360
396
  return "[dim]n/a[/]"
@@ -812,6 +848,14 @@ def migrate(
812
848
  2, "--candidates", help="Candidate patches to propose per iteration "
813
849
  "(widened automatically when an iteration stalls).",
814
850
  ),
851
+ skip_label_audit: bool = typer.Option(
852
+ False, "--skip-label-audit", help="Skip duplicate-label preflight check."
853
+ ),
854
+ strict_label_audit: bool = typer.Option(
855
+ False,
856
+ "--strict-label-audit",
857
+ help="Block when duplicate/near-duplicate inputs disagree on gold labels.",
858
+ ),
815
859
  ) -> None:
816
860
  """Attempt a migration: repair editable files, validate on holdout, report."""
817
861
  from .engine import MigrationStatus, run_migration
@@ -820,6 +864,9 @@ def migrate(
820
864
  try:
821
865
  contract = load_contract(contract_path)
822
866
  wf = contract.workflow(workflow)
867
+ _label_audit_preflight(
868
+ workflow, wf, skip=skip_label_audit, strict=strict_label_audit
869
+ )
823
870
  _preflight(wf, to)
824
871
  gen = build_generator(
825
872
  generator,
@@ -916,6 +963,14 @@ def refine(
916
963
  2, "--candidates", help="Candidate patches to propose per iteration "
917
964
  "(widened automatically when an iteration stalls).",
918
965
  ),
966
+ skip_label_audit: bool = typer.Option(
967
+ False, "--skip-label-audit", help="Skip duplicate-label preflight check."
968
+ ),
969
+ strict_label_audit: bool = typer.Option(
970
+ False,
971
+ "--strict-label-audit",
972
+ help="Block when duplicate/near-duplicate inputs disagree on gold labels.",
973
+ ),
919
974
  ) -> None:
920
975
  """Re-optimize a prompt for a changed eval dataset (model stays pinned).
921
976
 
@@ -933,6 +988,9 @@ def refine(
933
988
  try:
934
989
  contract = load_contract(contract_path)
935
990
  wf = contract.workflow(workflow)
991
+ _label_audit_preflight(
992
+ workflow, wf, skip=skip_label_audit, strict=strict_label_audit
993
+ )
936
994
  gen = build_generator(
937
995
  generator,
938
996
  provider=generator_provider,
@@ -1191,6 +1249,121 @@ def open_pr(
1191
1249
  )
1192
1250
 
1193
1251
 
1252
+ @app.command(name="judge-check")
1253
+ def judge_check(
1254
+ workflow: str = typer.Option(..., "--workflow", "-w"),
1255
+ contract_path: Path = typer.Option(None, "--contract", help="Path to driftless.yml."),
1256
+ enforce: bool = typer.Option(
1257
+ False,
1258
+ "--enforce",
1259
+ help="Apply eval.judge max_mae/min_correlation gates (same as migrate/compare).",
1260
+ ),
1261
+ ) -> None:
1262
+ """Measure LLM-judge agreement against a human calibration set."""
1263
+ from .judges import build_judge, judge_agreement, require_judge_agreement
1264
+
1265
+ try:
1266
+ contract = load_contract(contract_path)
1267
+ wf = contract.workflow(workflow)
1268
+ except DriftlessError as exc:
1269
+ _fail(exc)
1270
+ return
1271
+
1272
+ if wf.eval.grading != "judge" or wf.eval.judge is None:
1273
+ _fail(
1274
+ DriftlessError(
1275
+ f"{workflow!r} is not judge-graded",
1276
+ hint="add eval.judge to the workflow in driftless.yml",
1277
+ )
1278
+ )
1279
+ return
1280
+
1281
+ spec = wf.eval.judge
1282
+ if not spec.calibration_path:
1283
+ _fail(
1284
+ DriftlessError(
1285
+ "eval.judge.calibration_path is not set",
1286
+ hint="add a human-scored JSONL file for judge agreement",
1287
+ )
1288
+ )
1289
+ return
1290
+
1291
+ judge = build_judge(spec)
1292
+ try:
1293
+ agreement = (
1294
+ require_judge_agreement(judge, spec)
1295
+ if enforce
1296
+ else judge_agreement(judge, spec)
1297
+ )
1298
+ except DriftlessError as exc:
1299
+ _fail(exc)
1300
+ return
1301
+
1302
+ if agreement is None:
1303
+ _fail(DriftlessError("calibration set is empty or produced no scores"))
1304
+ return
1305
+
1306
+ console.print(f"[bold]{workflow}[/] — judge calibration check\n")
1307
+ console.print(f" records: {agreement.n}")
1308
+ console.print(f" MAE: {agreement.mean_abs_error:.3f}")
1309
+ corr = f"{agreement.correlation:.3f}" if agreement.correlation is not None else "n/a"
1310
+ console.print(f" correlation: {corr}")
1311
+
1312
+ gate_bits: list[str] = []
1313
+ if spec.max_mae is not None:
1314
+ ok = agreement.mean_abs_error <= spec.max_mae
1315
+ gate_bits.append(f"max_mae={spec.max_mae:g} ({'ok' if ok else 'FAIL'})")
1316
+ if spec.min_correlation is not None:
1317
+ ok = agreement.correlation is not None and agreement.correlation >= spec.min_correlation
1318
+ gate_bits.append(f"min_correlation={spec.min_correlation:g} ({'ok' if ok else 'FAIL'})")
1319
+ if gate_bits:
1320
+ # Plain stdout — Rich highlight/markup breaks publish CI assertions on the
1321
+ # gate status line when GITHUB_ACTIONS forces a TTY console.
1322
+ typer.echo(" gates: " + ", ".join(gate_bits))
1323
+
1324
+ if enforce:
1325
+ console.print(f"\n[green]gates passed[/] — {agreement.summary}")
1326
+ else:
1327
+ console.print(f"\n[dim]{agreement.summary}[/]")
1328
+ if spec.max_mae is not None or spec.min_correlation is not None:
1329
+ console.print("[dim]re-run with --enforce to apply contract gates[/]")
1330
+
1331
+
1332
+ @app.command(name="audit-labels")
1333
+ def audit_labels_cmd(
1334
+ workflow: str = typer.Option(..., "--workflow", "-w"),
1335
+ contract_path: Path = typer.Option(None, "--contract", help="Path to driftless.yml."),
1336
+ near_threshold: float = typer.Option(
1337
+ 0.85, "--near-threshold", min=0.5, max=1.0,
1338
+ help="Token Jaccard threshold for near-duplicate detection.",
1339
+ ),
1340
+ fail: bool = typer.Option(
1341
+ False, "--fail", help="Exit non-zero when label conflicts are found.",
1342
+ ),
1343
+ ) -> None:
1344
+ """Audit gold labels for duplicate inputs with disagreeing labels."""
1345
+ from .label_audit import audit_labels, format_audit_report
1346
+
1347
+ try:
1348
+ contract = load_contract(contract_path)
1349
+ wf = contract.workflow(workflow)
1350
+ report = audit_labels(
1351
+ workflow, wf, cwd=Path.cwd(), near_threshold=near_threshold
1352
+ )
1353
+ except DriftlessError as exc:
1354
+ _fail(exc)
1355
+ return
1356
+
1357
+ text = format_audit_report(report)
1358
+ if report.has_conflicts:
1359
+ err_console.print(text)
1360
+ else:
1361
+ console.print(text)
1362
+
1363
+ if fail and report.has_conflicts:
1364
+ raise typer.Exit(code=1)
1365
+
1366
+
1194
1367
  @app.command()
1195
1368
  def report(
1196
1369
  workflow: str = typer.Option(None, "--workflow", "-w", help="Workflow to show (default: all)."),
@@ -11,6 +11,7 @@ from __future__ import annotations
11
11
  import json
12
12
  from dataclasses import asdict, dataclass, field
13
13
  from pathlib import Path
14
+ from typing import cast
14
15
 
15
16
  from .contract import ThresholdsSpec, Workflow
16
17
  from .errors import DriftlessError
@@ -195,6 +196,11 @@ def compare_models(
195
196
  )
196
197
  judge = build_judge(judge_spec)
197
198
 
199
+ if judge is not None and workflow.eval.judge is not None:
200
+ from .judges import Judge, require_judge_agreement
201
+
202
+ require_judge_agreement(cast(Judge, judge), workflow.eval.judge, cwd=cwd)
203
+
198
204
  progress_log(f"compare: baseline run ({current})...")
199
205
  baseline_run = run_workflow(workflow, current, cwd=cwd)
200
206
  baseline_metrics = evaluate(workflow, baseline_run, judge=judge, cwd=cwd)
@@ -157,6 +157,18 @@ class JudgeSpec(StrictModel):
157
157
  # Optional path to human-scored records (carrying a numeric ``score``) for a
158
158
  # judge-reliability agreement check.
159
159
  calibration_path: str | None = None
160
+ # Optional gates (require ``calibration_path``). When set, ``migrate`` /
161
+ # ``compare`` / ``refine`` refuse to optimize against an untrusted judge.
162
+ max_mae: float | None = None
163
+ min_correlation: float | None = None
164
+
165
+ @model_validator(mode="after")
166
+ def _gates_need_calibration(self) -> "JudgeSpec":
167
+ if (self.max_mae is not None or self.min_correlation is not None) and not self.calibration_path:
168
+ raise ValueError(
169
+ "eval.judge.max_mae/min_correlation require calibration_path"
170
+ )
171
+ return self
160
172
 
161
173
  @field_validator("rubric")
162
174
  @classmethod
@@ -327,6 +327,9 @@ class MigrationResult:
327
327
  experiment_log: list[AttemptRecord] = field(default_factory=list)
328
328
  cluster_history: list[list[FailureCluster]] = field(default_factory=list)
329
329
  warnings: list[str] = field(default_factory=list)
330
+ # Judge-graded workflows: calibration agreement + low-score rationales for reviewers.
331
+ judge_agreement: Any | None = None
332
+ judge_evidence: list[dict[str, Any]] = field(default_factory=list)
330
333
  # refine-only: thresholds derived from the achieved holdout metrics, for the
331
334
  # customer to accept/edit (the old dataset's thresholds are stale).
332
335
  suggested_thresholds: dict[str, float] = field(default_factory=dict)
@@ -478,6 +481,27 @@ def run_migration(
478
481
  )
479
482
  judge = build_judge(judge_spec)
480
483
 
484
+ judge_agreement_info = None
485
+ if judge is not None and workflow.eval.judge is not None:
486
+ from .judges import require_judge_agreement
487
+
488
+ try:
489
+ judge_agreement_info = require_judge_agreement(
490
+ judge, workflow.eval.judge, cwd=cwd
491
+ )
492
+ except DriftlessError as exc:
493
+ return MigrationResult(
494
+ workflow=workflow_name,
495
+ current_model=current,
496
+ target_model=target_model,
497
+ status=MigrationStatus.BLOCKED,
498
+ iterations=0,
499
+ baseline=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
500
+ naive_target=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
501
+ final=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
502
+ message=str(exc),
503
+ )
504
+
481
505
  if not workflow.model.has_override():
482
506
  return MigrationResult(
483
507
  workflow=workflow_name,
@@ -500,6 +524,13 @@ def run_migration(
500
524
 
501
525
  use_ids = bool(workflow.eval.id_field) and split.gold is not None
502
526
 
527
+ def _judge_evidence(rows: list[RecordRow]) -> list[dict[str, Any]]:
528
+ if workflow.eval.grading != "judge":
529
+ return []
530
+ from .judges import judge_evidence_samples
531
+
532
+ return judge_evidence_samples(rows)
533
+
503
534
  def evaluate_on(
504
535
  model: str, idx: list[int], files: dict[str, str] | None = None
505
536
  ) -> RunAnalysis:
@@ -566,6 +597,8 @@ def run_migration(
566
597
  holdout_checks=holdout_checks,
567
598
  tuning_checks=naive_checks,
568
599
  warnings=size_warnings,
600
+ judge_agreement=judge_agreement_info,
601
+ judge_evidence=_judge_evidence(naive_analysis.rows),
569
602
  message="naive model swap passes thresholds; only the model ID changes",
570
603
  )
571
604
 
@@ -753,6 +786,8 @@ def run_migration(
753
786
  experiment_log=experiment_log,
754
787
  cluster_history=cluster_history,
755
788
  warnings=size_warnings,
789
+ judge_agreement=judge_agreement_info,
790
+ judge_evidence=_judge_evidence(best_analysis.rows),
756
791
  original_editable_files=original_editable,
757
792
  message="migration passed tuning and holdout thresholds",
758
793
  )
@@ -821,6 +856,8 @@ def run_migration(
821
856
  cluster_history=cluster_history,
822
857
  warnings=size_warnings,
823
858
  suggested_thresholds=suggested,
859
+ judge_agreement=judge_agreement_info,
860
+ judge_evidence=_judge_evidence(best_analysis.rows),
824
861
  original_editable_files=original_editable,
825
862
  message=message,
826
863
  )
@@ -850,6 +887,8 @@ def run_migration(
850
887
  experiment_log=experiment_log,
851
888
  cluster_history=cluster_history,
852
889
  warnings=size_warnings,
890
+ judge_agreement=judge_agreement_info,
891
+ judge_evidence=_judge_evidence(best_analysis.rows),
853
892
  original_editable_files=original_editable,
854
893
  message=message,
855
894
  )