driftless 0.2.1__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {driftless-0.2.1 → driftless-0.2.5}/CHANGELOG.md +58 -2
- {driftless-0.2.1 → driftless-0.2.5}/PKG-INFO +8 -3
- {driftless-0.2.1 → driftless-0.2.5}/README.md +7 -2
- {driftless-0.2.1 → driftless-0.2.5}/docs/RELEASE.md +44 -4
- {driftless-0.2.1 → driftless-0.2.5}/site/docs.html +5 -1
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/__init__.py +1 -1
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/cli.py +173 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/compare.py +6 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/contract.py +12 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/engine.py +39 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/init_ci.py +247 -2
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/judges.py +52 -0
- driftless-0.2.5/src/driftless/label_audit.py +290 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/report.py +29 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/templates.py +2 -0
- driftless-0.2.5/tests/fixtures/live_eval_baseline.json +38 -0
- driftless-0.2.5/tests/regression_metrics.py +141 -0
- driftless-0.2.5/tests/scenarios.py +880 -0
- driftless-0.2.5/tests/test_cli.py +268 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_github.py +65 -0
- driftless-0.2.5/tests/test_grading_loop.py +40 -0
- driftless-0.2.5/tests/test_init_ci.py +314 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_judge.py +63 -0
- driftless-0.2.5/tests/test_judge_loop.py +69 -0
- driftless-0.2.5/tests/test_label_audit.py +183 -0
- driftless-0.2.5/tests/test_migration_live.py +131 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_migration_regression.py +114 -0
- driftless-0.2.5/tests/test_regression_metrics.py +85 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_report.py +23 -0
- driftless-0.2.1/tests/scenarios.py +0 -387
- driftless-0.2.1/tests/test_cli.py +0 -72
- driftless-0.2.1/tests/test_grading_loop.py +0 -103
- driftless-0.2.1/tests/test_init_ci.py +0 -128
- driftless-0.2.1/tests/test_judge_loop.py +0 -93
- driftless-0.2.1/tests/test_migration_live.py +0 -40
- {driftless-0.2.1 → driftless-0.2.5}/.gitignore +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/LICENSE +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/docs/repair-and-generators.md +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/pyproject.toml +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/assets/app.js +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/assets/hero-workflow.png +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/assets/landing.css +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/assets/runs.css +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/assets/runs.js +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/assets/sample-run.json +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/assets/styles.css +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/index.html +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/site/runs.html +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/calibrate.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/configure.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/data/model_lifecycle.json +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/datasource.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/datastate.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/discovery.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/errors.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/evaluation.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/generators.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/github.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/harness.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/lifecycle.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/policy.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/preflight.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/progress.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/scanner.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/splits.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/src/driftless/view.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/fixtures/smoke/driftless.yml +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/fixtures/smoke/inputs.jsonl +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/fixtures/smoke/labels.jsonl +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_compare.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_contract.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_data_change_gate.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_data_change_regression.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_datasource.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_datastate.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_discovery.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_endpoint.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_engine.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_evaluation.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_extraction.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_generators.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_harness.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_lifecycle.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_plan_act.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_policy.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_poll_act.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_preflight.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_progress.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_refine.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_refresh_catalog.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_repair_prompt.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_scanner.py +0 -0
- {driftless-0.2.1 → driftless-0.2.5}/tests/test_view.py +0 -0
|
@@ -17,6 +17,58 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
17
17
|
|
|
18
18
|
---
|
|
19
19
|
|
|
20
|
+
## [0.2.5] - 2026-07-01
|
|
21
|
+
|
|
22
|
+
### Added
|
|
23
|
+
|
|
24
|
+
- **`init-ci` label-audit workflow** — scaffold `driftless-label-audit.yml` (or
|
|
25
|
+
`-all` matrix) with `audit-labels --fail` on eval dataset path changes.
|
|
26
|
+
- **`init-ci` judge-check workflow** — scaffold `driftless-judge-check.yml` when
|
|
27
|
+
`eval.judge.calibration_path` is set; uses `--enforce` when gate thresholds
|
|
28
|
+
are configured.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## [0.2.4] - 2026-07-01
|
|
33
|
+
|
|
34
|
+
### Fixed
|
|
35
|
+
|
|
36
|
+
- **`judge-check` gate output under CI** — emit gate status via plain stdout so Rich
|
|
37
|
+
TTY highlighting (when `GITHUB_ACTIONS=true`) does not break publish workflow tests.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## [0.2.3] - 2026-07-01
|
|
42
|
+
|
|
43
|
+
### Fixed
|
|
44
|
+
|
|
45
|
+
- **`judge-check` gate output** — print gate status with Rich markup disabled so
|
|
46
|
+
publish CI can assert on `max_mae` / `min_correlation` lines reliably.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## [0.2.2] - 2026-07-01
|
|
51
|
+
|
|
52
|
+
### Added
|
|
53
|
+
|
|
54
|
+
- **`driftless judge-check`** — measure judge↔human agreement on a calibration set;
|
|
55
|
+
`--enforce` applies the same gates as `migrate` / `compare`.
|
|
56
|
+
- **`driftless audit-labels`** — find duplicate/near-duplicate inputs with disagreeing
|
|
57
|
+
gold labels; `--fail` for CI.
|
|
58
|
+
- **Judge trust hardening** — optional `max_mae` / `min_correlation` gates on
|
|
59
|
+
judge-graded workflows; judge reliability and scoring evidence in migration reports.
|
|
60
|
+
- **P0.1 expansion** — judge-graded regression scenario; live eval CI baseline
|
|
61
|
+
checks with `--require-all` and job summaries.
|
|
62
|
+
- **`open-pr --create` integration tests** — mocked git/gh execution path coverage.
|
|
63
|
+
- **`migrate` / `refine` label-audit preflight** — warn on label conflicts by default;
|
|
64
|
+
`--strict-label-audit` blocks; `--skip-label-audit` to silence.
|
|
65
|
+
|
|
66
|
+
### Changed
|
|
67
|
+
|
|
68
|
+
- Live eval workflow sets `DRIFTLESS_REGRESSION_METRICS` explicitly.
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
20
72
|
## [0.2.1] - 2026-07-01
|
|
21
73
|
|
|
22
74
|
### Fixed
|
|
@@ -80,8 +132,12 @@ First public release on [PyPI](https://pypi.org/project/driftless/0.1.0/).
|
|
|
80
132
|
- **Docs** — project overview, repair algorithm spec, 2×2 migration methodology,
|
|
81
133
|
Poetry + Dependabot product framing.
|
|
82
134
|
|
|
83
|
-
[Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.1...HEAD
|
|
135
|
+
[Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.5...HEAD
|
|
136
|
+
[0.2.5]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.5
|
|
137
|
+
[0.2.4]: https://github.com/driftless-dev/driftless/compare/v0.2.3...v0.2.4
|
|
138
|
+
[0.2.3]: https://github.com/driftless-dev/driftless/compare/v0.2.2...v0.2.3
|
|
139
|
+
[0.2.2]: https://github.com/driftless-dev/driftless/compare/v0.2.1...v0.2.2
|
|
84
140
|
[0.2.1]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.1
|
|
85
|
-
[0.2.0]: https://github.com/driftless-dev/driftless/
|
|
141
|
+
[0.2.0]: https://github.com/driftless-dev/driftless/compare/v0.1.1...v0.2.0
|
|
86
142
|
[0.1.1]: https://github.com/driftless-dev/driftless/releases/tag/v0.1.1
|
|
87
143
|
[0.1.0]: https://github.com/driftless-dev/driftless/releases/tag/v0.1.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: driftless
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Keep prompts in sync when model or eval data changes — Poetry-style lock regeneration, Dependabot-style PRs.
|
|
5
5
|
Project-URL: Homepage, https://github.com/driftless-dev/driftless
|
|
6
6
|
Project-URL: Repository, https://github.com/driftless-dev/driftless
|
|
@@ -87,6 +87,8 @@ optimizes against it, with your team owning the definition of "good":
|
|
|
87
87
|
precision/recall/F1 against the gold record.
|
|
88
88
|
- **`eval.judge`** — an LLM judge grades each free-form output against a rubric
|
|
89
89
|
(with an optional human-scored calibration set for a judge-agreement check).
|
|
90
|
+
Run `driftless judge-check -w <workflow>` before optimizing; set
|
|
91
|
+
`max_mae` / `min_correlation` in the contract to gate `migrate` / `compare`.
|
|
90
92
|
|
|
91
93
|
## CLI
|
|
92
94
|
|
|
@@ -94,7 +96,7 @@ optimizes against it, with your team owning the definition of "good":
|
|
|
94
96
|
|---|---|
|
|
95
97
|
| `init` | Scaffold a `driftless.yml`. |
|
|
96
98
|
| `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
|
|
97
|
-
| `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, and
|
|
99
|
+
| `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, label audit, and judge check. |
|
|
98
100
|
| `scan` | Find probable LLM usage and at-risk models. |
|
|
99
101
|
| `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
|
|
100
102
|
| `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
|
|
@@ -102,9 +104,12 @@ optimizes against it, with your team owning the definition of "good":
|
|
|
102
104
|
| `calibrate -w <w>` | Measure the baseline and suggest starting thresholds. |
|
|
103
105
|
| `compare -w <w> --to <model>` | Baseline vs target scorecard. |
|
|
104
106
|
| `migrate -w <w> --to <model>` | Repair + validate + produce migrated files. |
|
|
107
|
+
| | Warns on duplicate-label conflicts by default; `--strict-label-audit` blocks, `--skip-label-audit` silences. |
|
|
105
108
|
| `refine -w <w>` | Re-optimize the prompt for a changed eval dataset (model pinned). |
|
|
106
109
|
| `poll [--act]` | Detect external eval-dataset changes and refine on a meaningful change. |
|
|
107
110
|
| `validate -w <w>` | Check the contract parses and the harness runs. |
|
|
111
|
+
| `judge-check -w <w>` | Measure judge↔human agreement on a calibration set (`--enforce` to gate). |
|
|
112
|
+
| `audit-labels -w <w>` | Find duplicate inputs with disagreeing gold labels (`--fail` for CI). |
|
|
108
113
|
| `report` | Render the latest migration report. |
|
|
109
114
|
| `view` | Open the optimization run viewer (charts + attempt log). |
|
|
110
115
|
| `open-pr -w <w>` | Open a PR (or issue) from the latest migration result. |
|
|
@@ -128,7 +133,7 @@ can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
|
|
|
128
133
|
manually-triggered migration that opens a PR (or an issue when blocked).
|
|
129
134
|
|
|
130
135
|
```yaml
|
|
131
|
-
- uses: driftless-dev/driftless@v0.2.
|
|
136
|
+
- uses: driftless-dev/driftless@v0.2.5
|
|
132
137
|
with:
|
|
133
138
|
command: scan
|
|
134
139
|
```
|
|
@@ -48,6 +48,8 @@ optimizes against it, with your team owning the definition of "good":
|
|
|
48
48
|
precision/recall/F1 against the gold record.
|
|
49
49
|
- **`eval.judge`** — an LLM judge grades each free-form output against a rubric
|
|
50
50
|
(with an optional human-scored calibration set for a judge-agreement check).
|
|
51
|
+
Run `driftless judge-check -w <workflow>` before optimizing; set
|
|
52
|
+
`max_mae` / `min_correlation` in the contract to gate `migrate` / `compare`.
|
|
51
53
|
|
|
52
54
|
## CLI
|
|
53
55
|
|
|
@@ -55,7 +57,7 @@ optimizes against it, with your team owning the definition of "good":
|
|
|
55
57
|
|---|---|
|
|
56
58
|
| `init` | Scaffold a `driftless.yml`. |
|
|
57
59
|
| `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
|
|
58
|
-
| `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, and
|
|
60
|
+
| `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, label audit, and judge check. |
|
|
59
61
|
| `scan` | Find probable LLM usage and at-risk models. |
|
|
60
62
|
| `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
|
|
61
63
|
| `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
|
|
@@ -63,9 +65,12 @@ optimizes against it, with your team owning the definition of "good":
|
|
|
63
65
|
| `calibrate -w <w>` | Measure the baseline and suggest starting thresholds. |
|
|
64
66
|
| `compare -w <w> --to <model>` | Baseline vs target scorecard. |
|
|
65
67
|
| `migrate -w <w> --to <model>` | Repair + validate + produce migrated files. |
|
|
68
|
+
| | Warns on duplicate-label conflicts by default; `--strict-label-audit` blocks, `--skip-label-audit` silences. |
|
|
66
69
|
| `refine -w <w>` | Re-optimize the prompt for a changed eval dataset (model pinned). |
|
|
67
70
|
| `poll [--act]` | Detect external eval-dataset changes and refine on a meaningful change. |
|
|
68
71
|
| `validate -w <w>` | Check the contract parses and the harness runs. |
|
|
72
|
+
| `judge-check -w <w>` | Measure judge↔human agreement on a calibration set (`--enforce` to gate). |
|
|
73
|
+
| `audit-labels -w <w>` | Find duplicate inputs with disagreeing gold labels (`--fail` for CI). |
|
|
69
74
|
| `report` | Render the latest migration report. |
|
|
70
75
|
| `view` | Open the optimization run viewer (charts + attempt log). |
|
|
71
76
|
| `open-pr -w <w>` | Open a PR (or issue) from the latest migration result. |
|
|
@@ -89,7 +94,7 @@ can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
|
|
|
89
94
|
manually-triggered migration that opens a PR (or an issue when blocked).
|
|
90
95
|
|
|
91
96
|
```yaml
|
|
92
|
-
- uses: driftless-dev/driftless@v0.2.
|
|
97
|
+
- uses: driftless-dev/driftless@v0.2.5
|
|
93
98
|
with:
|
|
94
99
|
command: scan
|
|
95
100
|
```
|
|
@@ -153,7 +153,7 @@ After a release, users can pin the composite Action by release tag
|
|
|
153
153
|
(`action.yml` lives at the repo root — no `/action` path segment):
|
|
154
154
|
|
|
155
155
|
```yaml
|
|
156
|
-
- uses: driftless-dev/driftless@v0.2.
|
|
156
|
+
- uses: driftless-dev/driftless@v0.2.5
|
|
157
157
|
with:
|
|
158
158
|
command: scan
|
|
159
159
|
```
|
|
@@ -161,13 +161,19 @@ After a release, users can pin the composite Action by release tag
|
|
|
161
161
|
Or pin the PyPI package in the Action input:
|
|
162
162
|
|
|
163
163
|
```yaml
|
|
164
|
-
- uses: driftless-dev/driftless@v0.2.
|
|
164
|
+
- uses: driftless-dev/driftless@v0.2.5
|
|
165
165
|
with:
|
|
166
|
-
version: "==0.2.
|
|
166
|
+
version: "==0.2.5"
|
|
167
167
|
command: migrate
|
|
168
168
|
```
|
|
169
169
|
|
|
170
|
-
Optionally maintain a floating **`v1`** tag on the latest stable minor release
|
|
170
|
+
Optionally maintain a floating **`v1`** tag on the latest stable minor release
|
|
171
|
+
(point it at the current release tag after each publish):
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
git tag -f v1 v0.2.5 && git push origin v1 --force
|
|
175
|
+
```
|
|
176
|
+
|
|
171
177
|
Update [`action.yml`](../action.yml) default `version` input when cutting releases.
|
|
172
178
|
|
|
173
179
|
---
|
|
@@ -188,3 +194,37 @@ Update [`action.yml`](../action.yml) default `version` input when cutting releas
|
|
|
188
194
|
`0.1.0` was uploaded manually before Trusted Publishing was wired. Tags and
|
|
189
195
|
GitHub Release for `v0.1.0` can be added retroactively for a clean history; PyPI
|
|
190
196
|
already hosts that version.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Maintainer: live optimizer eval (P0.1)
|
|
201
|
+
|
|
202
|
+
The **migration-regression** workflow runs deterministic regression on every
|
|
203
|
+
push/PR and a **live** LLM optimizer eval nightly (or on manual dispatch). The
|
|
204
|
+
live job costs tokens and is opt-in via repository secrets.
|
|
205
|
+
|
|
206
|
+
### Required secrets
|
|
207
|
+
|
|
208
|
+
In **Settings → Secrets and variables → Actions**, add:
|
|
209
|
+
|
|
210
|
+
| Secret | Used by |
|
|
211
|
+
|---|---|
|
|
212
|
+
| `OPENAI_API_KEY` | Live eval matrix job (`provider: openai`) |
|
|
213
|
+
| `ANTHROPIC_API_KEY` | Live eval matrix job (`provider: anthropic`) |
|
|
214
|
+
|
|
215
|
+
If a secret is missing, that provider job exits cleanly with a warning (CI stays
|
|
216
|
+
green). When both are set, nightly runs append to
|
|
217
|
+
`.driftless/regression-metrics.jsonl` and check against
|
|
218
|
+
`tests/fixtures/live_eval_baseline.json` with `--require-all`.
|
|
219
|
+
|
|
220
|
+
### Local reproduction
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
export DRIFTLESS_LIVE_EVAL=1
|
|
224
|
+
export OPENAI_API_KEY=...
|
|
225
|
+
pytest tests/test_migration_live.py -v -k openai
|
|
226
|
+
python scripts/check_live_eval_metrics.py --provider openai --require-all
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
After a few stable nightly runs, tighten the thresholds in `tests/fixtures/live_eval_baseline.json`
|
|
230
|
+
(iterations ceiling, min F1/score).
|
|
@@ -308,8 +308,11 @@ driftless open-pr -w support_classifier --create</code></pre>
|
|
|
308
308
|
<tr><td><code>plan</code></td><td>Discover at-risk workflows and apply the migration policy (CI triage).</td></tr>
|
|
309
309
|
<tr><td><code>configure <workflow></code></td><td>Turn a detected workflow into a migration-ready contract.</td></tr>
|
|
310
310
|
<tr><td><code>validate -w <w></code></td><td>Check the contract parses and the harness runs.</td></tr>
|
|
311
|
+
<tr><td><code>audit-labels -w <w></code></td><td>Find duplicate inputs with disagreeing gold labels (<code>--fail</code> for CI).</td></tr>
|
|
312
|
+
<tr><td><code>judge-check -w <w></code></td><td>Measure judge↔human agreement on a calibration set (<code>--enforce</code> to gate).</td></tr>
|
|
311
313
|
<tr><td><code>compare -w <w> --to <model></code></td><td>Baseline vs. target scorecard + threshold checks.</td></tr>
|
|
312
314
|
<tr><td><code>migrate -w <w> --to <model></code></td><td>Repair + validate + produce migrated files.</td></tr>
|
|
315
|
+
<tr><td><code>refine -w <w></code></td><td>Re-optimize the prompt for a changed dataset (model pinned).</td></tr>
|
|
313
316
|
<tr><td><code>report [-w <w>]</code></td><td>Render the latest migration report(s).</td></tr>
|
|
314
317
|
<tr><td><code>open-pr -w <w></code></td><td>Open a PR (or issue) whose body is the evidence report: summary, scorecard, unified diffs, attempt log, holdout checks.</td></tr>
|
|
315
318
|
</tbody>
|
|
@@ -318,6 +321,7 @@ driftless open-pr -w support_classifier --create</code></pre>
|
|
|
318
321
|
<ul>
|
|
319
322
|
<li><code class="inline">--generator llm|none</code> — the repair strategy (LLM-backed by default; <code class="inline">none</code> turns the loop into a dry analysis).</li>
|
|
320
323
|
<li><code class="inline">--to <model></code> — the target model to migrate to (otherwise the contract's candidates are used).</li>
|
|
324
|
+
<li><code class="inline">--strict-label-audit</code> — block when duplicate/near-duplicate inputs disagree on gold labels (warns by default).</li>
|
|
321
325
|
</ul>
|
|
322
326
|
</section>
|
|
323
327
|
|
|
@@ -424,7 +428,7 @@ driftless view -w support_classifier</code></pre>
|
|
|
424
428
|
<span class="tok-k">runs-on</span>: ubuntu-latest
|
|
425
429
|
<span class="tok-k">steps</span>:
|
|
426
430
|
- <span class="tok-k">uses</span>: actions/checkout@v4
|
|
427
|
-
- <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.
|
|
431
|
+
- <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.5
|
|
428
432
|
<span class="tok-k">with</span>:
|
|
429
433
|
<span class="tok-k">command</span>: <span class="tok-s">plan</span></code></pre>
|
|
430
434
|
<p>A scheduled <code class="inline">plan</code> gates CI when a deprecated model needs attention; a manually-triggered <code class="inline">migrate</code> opens a PR (or an issue when blocked) with the evidence attached.</p>
|
|
@@ -136,6 +136,16 @@ def init_ci(
|
|
|
136
136
|
plan: bool = typer.Option(
|
|
137
137
|
False, "--plan/--no-plan", help="Scaffold scheduled plan --act workflow."
|
|
138
138
|
),
|
|
139
|
+
audit_labels: bool | None = typer.Option(
|
|
140
|
+
None,
|
|
141
|
+
"--audit-labels/--no-audit-labels",
|
|
142
|
+
help="Scaffold label-audit CI workflow (default: on if labels_path is set).",
|
|
143
|
+
),
|
|
144
|
+
judge_check: bool | None = typer.Option(
|
|
145
|
+
None,
|
|
146
|
+
"--judge-check/--no-judge-check",
|
|
147
|
+
help="Scaffold judge-calibration CI workflow (default: on if calibration_path is set).",
|
|
148
|
+
),
|
|
139
149
|
) -> None:
|
|
140
150
|
"""Scaffold GitHub Actions workflows wired to the driftless composite Action."""
|
|
141
151
|
from .init_ci import CHECKLIST, scaffold_ci_from_path
|
|
@@ -151,6 +161,8 @@ def init_ci(
|
|
|
151
161
|
include_refine=refine,
|
|
152
162
|
include_poll=poll,
|
|
153
163
|
include_plan=plan,
|
|
164
|
+
include_audit_labels=audit_labels,
|
|
165
|
+
include_judge_check=judge_check,
|
|
154
166
|
)
|
|
155
167
|
except DriftlessError as exc:
|
|
156
168
|
_fail(exc)
|
|
@@ -355,6 +367,30 @@ def _preflight(wf: Workflow, target_model: str) -> None:
|
|
|
355
367
|
err_console.print(f"[yellow]warning:[/] {pf.warning}")
|
|
356
368
|
|
|
357
369
|
|
|
370
|
+
def _label_audit_preflight(
    workflow_name: str,
    wf: Workflow,
    *,
    skip: bool,
    strict: bool,
) -> None:
    """Run the duplicate-label audit ahead of an optimization command.

    Nothing happens when ``skip`` is set, when the workflow's grading mode is
    not ``"label"``, or when it has no ``labels_path``. Otherwise the audit
    runs against the current working directory. If it reports conflicts, the
    formatted report goes to the error console: with ``strict`` the command
    exits with status 1, without it the report is shown as a warning and the
    caller continues.
    """
    if skip:
        return
    if wf.eval.grading != "label" or not wf.eval.labels_path:
        return

    # Imported here, after the guards, so the audit module is only loaded when
    # an audit is actually going to run.
    from .label_audit import audit_labels, format_audit_report

    report = audit_labels(workflow_name, wf, cwd=Path.cwd())
    if not report.has_conflicts:
        return

    rendered = format_audit_report(report)
    if strict:
        err_console.print(rendered)
        raise typer.Exit(code=1)

    # Non-strict mode: headline (named after the first conflict group's kind),
    # the dimmed report, then a hint about the two flags that change this.
    first_kind = report.conflict_groups[0].kind
    err_console.print(f"[yellow]Label audit warning[/] — {first_kind} conflicts detected")
    err_console.print(f"[dim]{rendered}[/]")
    err_console.print("[dim]re-run with --strict-label-audit to block, or --skip-label-audit to silence[/]")
|
|
392
|
+
|
|
393
|
+
|
|
358
394
|
def _fmt(value: float | None, *, pct: bool = False) -> str:
|
|
359
395
|
if value is None:
|
|
360
396
|
return "[dim]n/a[/]"
|
|
@@ -812,6 +848,14 @@ def migrate(
|
|
|
812
848
|
2, "--candidates", help="Candidate patches to propose per iteration "
|
|
813
849
|
"(widened automatically when an iteration stalls).",
|
|
814
850
|
),
|
|
851
|
+
skip_label_audit: bool = typer.Option(
|
|
852
|
+
False, "--skip-label-audit", help="Skip duplicate-label preflight check."
|
|
853
|
+
),
|
|
854
|
+
strict_label_audit: bool = typer.Option(
|
|
855
|
+
False,
|
|
856
|
+
"--strict-label-audit",
|
|
857
|
+
help="Block when duplicate/near-duplicate inputs disagree on gold labels.",
|
|
858
|
+
),
|
|
815
859
|
) -> None:
|
|
816
860
|
"""Attempt a migration: repair editable files, validate on holdout, report."""
|
|
817
861
|
from .engine import MigrationStatus, run_migration
|
|
@@ -820,6 +864,9 @@ def migrate(
|
|
|
820
864
|
try:
|
|
821
865
|
contract = load_contract(contract_path)
|
|
822
866
|
wf = contract.workflow(workflow)
|
|
867
|
+
_label_audit_preflight(
|
|
868
|
+
workflow, wf, skip=skip_label_audit, strict=strict_label_audit
|
|
869
|
+
)
|
|
823
870
|
_preflight(wf, to)
|
|
824
871
|
gen = build_generator(
|
|
825
872
|
generator,
|
|
@@ -916,6 +963,14 @@ def refine(
|
|
|
916
963
|
2, "--candidates", help="Candidate patches to propose per iteration "
|
|
917
964
|
"(widened automatically when an iteration stalls).",
|
|
918
965
|
),
|
|
966
|
+
skip_label_audit: bool = typer.Option(
|
|
967
|
+
False, "--skip-label-audit", help="Skip duplicate-label preflight check."
|
|
968
|
+
),
|
|
969
|
+
strict_label_audit: bool = typer.Option(
|
|
970
|
+
False,
|
|
971
|
+
"--strict-label-audit",
|
|
972
|
+
help="Block when duplicate/near-duplicate inputs disagree on gold labels.",
|
|
973
|
+
),
|
|
919
974
|
) -> None:
|
|
920
975
|
"""Re-optimize a prompt for a changed eval dataset (model stays pinned).
|
|
921
976
|
|
|
@@ -933,6 +988,9 @@ def refine(
|
|
|
933
988
|
try:
|
|
934
989
|
contract = load_contract(contract_path)
|
|
935
990
|
wf = contract.workflow(workflow)
|
|
991
|
+
_label_audit_preflight(
|
|
992
|
+
workflow, wf, skip=skip_label_audit, strict=strict_label_audit
|
|
993
|
+
)
|
|
936
994
|
gen = build_generator(
|
|
937
995
|
generator,
|
|
938
996
|
provider=generator_provider,
|
|
@@ -1191,6 +1249,121 @@ def open_pr(
|
|
|
1191
1249
|
)
|
|
1192
1250
|
|
|
1193
1251
|
|
|
1252
|
+
@app.command(name="judge-check")
def judge_check(
    workflow: str = typer.Option(..., "--workflow", "-w"),
    contract_path: Path = typer.Option(None, "--contract", help="Path to driftless.yml."),
    enforce: bool = typer.Option(
        False,
        "--enforce",
        help="Apply eval.judge max_mae/min_correlation gates (same as migrate/compare).",
    ),
) -> None:
    """Measure LLM-judge agreement against a human calibration set."""
    # Flow: load the workflow, check it is judge-graded and has a calibration
    # set, score judge-vs-human agreement (through the gated helper when
    # --enforce is given), then print the metrics and a per-gate ok/FAIL line.
    from .judges import build_judge, judge_agreement, require_judge_agreement

    try:
        contract = load_contract(contract_path)
        wf = contract.workflow(workflow)
    except DriftlessError as exc:
        _fail(exc)
        return

    if wf.eval.grading != "judge" or wf.eval.judge is None:
        _fail(
            DriftlessError(
                f"{workflow!r} is not judge-graded",
                hint="add eval.judge to the workflow in driftless.yml",
            )
        )
        return

    spec = wf.eval.judge
    if not spec.calibration_path:
        _fail(
            DriftlessError(
                "eval.judge.calibration_path is not set",
                hint="add a human-scored JSONL file for judge agreement",
            )
        )
        return

    try:
        # Building the judge sits inside the try so that a DriftlessError
        # raised during construction is reported through _fail like every
        # other failure in this command, not as an unhandled exception.
        judge = build_judge(spec)
        agreement = (
            require_judge_agreement(judge, spec)
            if enforce
            else judge_agreement(judge, spec)
        )
    except DriftlessError as exc:
        _fail(exc)
        return

    if agreement is None:
        _fail(DriftlessError("calibration set is empty or produced no scores"))
        return

    console.print(f"[bold]{workflow}[/] — judge calibration check\n")
    console.print(f" records: {agreement.n}")
    console.print(f" MAE: {agreement.mean_abs_error:.3f}")
    corr = f"{agreement.correlation:.3f}" if agreement.correlation is not None else "n/a"
    console.print(f" correlation: {corr}")

    # One "name=threshold (ok|FAIL)" entry per gate configured on the spec.
    gate_bits: list[str] = []
    if spec.max_mae is not None:
        ok = agreement.mean_abs_error <= spec.max_mae
        gate_bits.append(f"max_mae={spec.max_mae:g} ({'ok' if ok else 'FAIL'})")
    if spec.min_correlation is not None:
        # A missing correlation counts as a failed gate.
        ok = agreement.correlation is not None and agreement.correlation >= spec.min_correlation
        gate_bits.append(f"min_correlation={spec.min_correlation:g} ({'ok' if ok else 'FAIL'})")
    if gate_bits:
        # Plain stdout — Rich highlight/markup breaks publish CI assertions on the
        # gate status line when GITHUB_ACTIONS forces a TTY console.
        typer.echo(" gates: " + ", ".join(gate_bits))

    if enforce:
        # NOTE(review): this prints "gates passed" even when the spec has no
        # gates configured — confirm that wording is intended for that case.
        console.print(f"\n[green]gates passed[/] — {agreement.summary}")
    else:
        console.print(f"\n[dim]{agreement.summary}[/]")
        if spec.max_mae is not None or spec.min_correlation is not None:
            console.print("[dim]re-run with --enforce to apply contract gates[/]")
|
|
1330
|
+
|
|
1331
|
+
|
|
1332
|
+
@app.command(name="audit-labels")
def audit_labels_cmd(
    workflow: str = typer.Option(..., "--workflow", "-w"),
    contract_path: Path = typer.Option(None, "--contract", help="Path to driftless.yml."),
    near_threshold: float = typer.Option(
        0.85, "--near-threshold", min=0.5, max=1.0,
        help="Token Jaccard threshold for near-duplicate detection.",
    ),
    fail: bool = typer.Option(
        False, "--fail", help="Exit non-zero when label conflicts are found.",
    ),
) -> None:
    """Audit gold labels for duplicate inputs with disagreeing labels."""
    from .label_audit import audit_labels, format_audit_report

    # Loading the contract and running the audit share one error path.
    try:
        contract = load_contract(contract_path)
        wf = contract.workflow(workflow)
        report = audit_labels(
            workflow,
            wf,
            cwd=Path.cwd(),
            near_threshold=near_threshold,
        )
    except DriftlessError as exc:
        _fail(exc)
        return

    rendered = format_audit_report(report)

    # A clean report goes to the regular console and the command is done.
    if not report.has_conflicts:
        console.print(rendered)
        return

    # Conflicts go to the error console; --fail makes them a non-zero exit.
    err_console.print(rendered)
    if fail:
        raise typer.Exit(code=1)
|
|
1365
|
+
|
|
1366
|
+
|
|
1194
1367
|
@app.command()
|
|
1195
1368
|
def report(
|
|
1196
1369
|
workflow: str = typer.Option(None, "--workflow", "-w", help="Workflow to show (default: all)."),
|
|
@@ -11,6 +11,7 @@ from __future__ import annotations
|
|
|
11
11
|
import json
|
|
12
12
|
from dataclasses import asdict, dataclass, field
|
|
13
13
|
from pathlib import Path
|
|
14
|
+
from typing import cast
|
|
14
15
|
|
|
15
16
|
from .contract import ThresholdsSpec, Workflow
|
|
16
17
|
from .errors import DriftlessError
|
|
@@ -195,6 +196,11 @@ def compare_models(
|
|
|
195
196
|
)
|
|
196
197
|
judge = build_judge(judge_spec)
|
|
197
198
|
|
|
199
|
+
if judge is not None and workflow.eval.judge is not None:
|
|
200
|
+
from .judges import Judge, require_judge_agreement
|
|
201
|
+
|
|
202
|
+
require_judge_agreement(cast(Judge, judge), workflow.eval.judge, cwd=cwd)
|
|
203
|
+
|
|
198
204
|
progress_log(f"compare: baseline run ({current})...")
|
|
199
205
|
baseline_run = run_workflow(workflow, current, cwd=cwd)
|
|
200
206
|
baseline_metrics = evaluate(workflow, baseline_run, judge=judge, cwd=cwd)
|
|
@@ -157,6 +157,18 @@ class JudgeSpec(StrictModel):
|
|
|
157
157
|
# Optional path to human-scored records (carrying a numeric ``score``) for a
|
|
158
158
|
# judge-reliability agreement check.
|
|
159
159
|
calibration_path: str | None = None
|
|
160
|
+
# Optional gates (require ``calibration_path``). When set, ``migrate`` /
|
|
161
|
+
# ``compare`` / ``refine`` refuse to optimize against an untrusted judge.
|
|
162
|
+
max_mae: float | None = None
|
|
163
|
+
min_correlation: float | None = None
|
|
164
|
+
|
|
165
|
+
@model_validator(mode="after")
def _gates_need_calibration(self) -> "JudgeSpec":
    # A gate (max_mae or min_correlation) can only be checked against
    # human-scored calibration records, so reject any spec that sets a gate
    # without a calibration_path.
    has_gate = self.max_mae is not None or self.min_correlation is not None
    if has_gate and not self.calibration_path:
        raise ValueError("eval.judge.max_mae/min_correlation require calibration_path")
    return self
|
|
160
172
|
|
|
161
173
|
@field_validator("rubric")
|
|
162
174
|
@classmethod
|
|
@@ -327,6 +327,9 @@ class MigrationResult:
|
|
|
327
327
|
experiment_log: list[AttemptRecord] = field(default_factory=list)
|
|
328
328
|
cluster_history: list[list[FailureCluster]] = field(default_factory=list)
|
|
329
329
|
warnings: list[str] = field(default_factory=list)
|
|
330
|
+
# Judge-graded workflows: calibration agreement + low-score rationales for reviewers.
|
|
331
|
+
judge_agreement: Any | None = None
|
|
332
|
+
judge_evidence: list[dict[str, Any]] = field(default_factory=list)
|
|
330
333
|
# refine-only: thresholds derived from the achieved holdout metrics, for the
|
|
331
334
|
# customer to accept/edit (the old dataset's thresholds are stale).
|
|
332
335
|
suggested_thresholds: dict[str, float] = field(default_factory=dict)
|
|
@@ -478,6 +481,27 @@ def run_migration(
|
|
|
478
481
|
)
|
|
479
482
|
judge = build_judge(judge_spec)
|
|
480
483
|
|
|
484
|
+
judge_agreement_info = None
|
|
485
|
+
if judge is not None and workflow.eval.judge is not None:
|
|
486
|
+
from .judges import require_judge_agreement
|
|
487
|
+
|
|
488
|
+
try:
|
|
489
|
+
judge_agreement_info = require_judge_agreement(
|
|
490
|
+
judge, workflow.eval.judge, cwd=cwd
|
|
491
|
+
)
|
|
492
|
+
except DriftlessError as exc:
|
|
493
|
+
return MigrationResult(
|
|
494
|
+
workflow=workflow_name,
|
|
495
|
+
current_model=current,
|
|
496
|
+
target_model=target_model,
|
|
497
|
+
status=MigrationStatus.BLOCKED,
|
|
498
|
+
iterations=0,
|
|
499
|
+
baseline=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
|
|
500
|
+
naive_target=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
|
|
501
|
+
final=Metrics(n=0, schema_error_rate=None, refusal_rate=0.0),
|
|
502
|
+
message=str(exc),
|
|
503
|
+
)
|
|
504
|
+
|
|
481
505
|
if not workflow.model.has_override():
|
|
482
506
|
return MigrationResult(
|
|
483
507
|
workflow=workflow_name,
|
|
@@ -500,6 +524,13 @@ def run_migration(
|
|
|
500
524
|
|
|
501
525
|
use_ids = bool(workflow.eval.id_field) and split.gold is not None
|
|
502
526
|
|
|
527
|
+
def _judge_evidence(rows: list[RecordRow]) -> list[dict[str, Any]]:
    """Return judge evidence samples for ``rows``, or ``[]`` when the workflow is not judge-graded."""
    if workflow.eval.grading == "judge":
        # Imported lazily: only judge-graded workflows need the judges module here.
        from .judges import judge_evidence_samples

        return judge_evidence_samples(rows)
    return []
|
|
533
|
+
|
|
503
534
|
def evaluate_on(
|
|
504
535
|
model: str, idx: list[int], files: dict[str, str] | None = None
|
|
505
536
|
) -> RunAnalysis:
|
|
@@ -566,6 +597,8 @@ def run_migration(
|
|
|
566
597
|
holdout_checks=holdout_checks,
|
|
567
598
|
tuning_checks=naive_checks,
|
|
568
599
|
warnings=size_warnings,
|
|
600
|
+
judge_agreement=judge_agreement_info,
|
|
601
|
+
judge_evidence=_judge_evidence(naive_analysis.rows),
|
|
569
602
|
message="naive model swap passes thresholds; only the model ID changes",
|
|
570
603
|
)
|
|
571
604
|
|
|
@@ -753,6 +786,8 @@ def run_migration(
|
|
|
753
786
|
experiment_log=experiment_log,
|
|
754
787
|
cluster_history=cluster_history,
|
|
755
788
|
warnings=size_warnings,
|
|
789
|
+
judge_agreement=judge_agreement_info,
|
|
790
|
+
judge_evidence=_judge_evidence(best_analysis.rows),
|
|
756
791
|
original_editable_files=original_editable,
|
|
757
792
|
message="migration passed tuning and holdout thresholds",
|
|
758
793
|
)
|
|
@@ -821,6 +856,8 @@ def run_migration(
|
|
|
821
856
|
cluster_history=cluster_history,
|
|
822
857
|
warnings=size_warnings,
|
|
823
858
|
suggested_thresholds=suggested,
|
|
859
|
+
judge_agreement=judge_agreement_info,
|
|
860
|
+
judge_evidence=_judge_evidence(best_analysis.rows),
|
|
824
861
|
original_editable_files=original_editable,
|
|
825
862
|
message=message,
|
|
826
863
|
)
|
|
@@ -850,6 +887,8 @@ def run_migration(
|
|
|
850
887
|
experiment_log=experiment_log,
|
|
851
888
|
cluster_history=cluster_history,
|
|
852
889
|
warnings=size_warnings,
|
|
890
|
+
judge_agreement=judge_agreement_info,
|
|
891
|
+
judge_evidence=_judge_evidence(best_analysis.rows),
|
|
853
892
|
original_editable_files=original_editable,
|
|
854
893
|
message=message,
|
|
855
894
|
)
|