driftless 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {driftless-0.2.5 → driftless-0.2.7}/CHANGELOG.md +24 -2
- {driftless-0.2.5 → driftless-0.2.7}/PKG-INFO +5 -5
- {driftless-0.2.5 → driftless-0.2.7}/README.md +4 -4
- {driftless-0.2.5 → driftless-0.2.7}/docs/RELEASE.md +7 -5
- {driftless-0.2.5 → driftless-0.2.7}/site/docs.html +1 -1
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/__init__.py +1 -1
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/cli.py +5 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/compare.py +12 -1
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/contract.py +10 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/engine.py +51 -8
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/evaluation.py +60 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/report.py +1 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_contract.py +11 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_engine.py +11 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_evaluation.py +30 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_init_ci.py +40 -0
- driftless-0.2.7/tests/test_splits.py +27 -0
- {driftless-0.2.5 → driftless-0.2.7}/.gitignore +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/LICENSE +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/docs/repair-and-generators.md +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/pyproject.toml +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/assets/app.js +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/assets/hero-workflow.png +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/assets/landing.css +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/assets/runs.css +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/assets/runs.js +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/assets/sample-run.json +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/assets/styles.css +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/index.html +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/site/runs.html +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/calibrate.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/configure.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/data/model_lifecycle.json +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/datasource.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/datastate.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/discovery.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/errors.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/generators.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/github.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/harness.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/init_ci.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/judges.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/label_audit.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/lifecycle.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/policy.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/preflight.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/progress.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/scanner.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/splits.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/templates.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/src/driftless/view.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/fixtures/live_eval_baseline.json +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/fixtures/smoke/driftless.yml +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/fixtures/smoke/inputs.jsonl +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/fixtures/smoke/labels.jsonl +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/regression_metrics.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/scenarios.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_cli.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_compare.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_data_change_gate.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_data_change_regression.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_datasource.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_datastate.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_discovery.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_endpoint.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_extraction.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_generators.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_github.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_grading_loop.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_harness.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_judge.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_judge_loop.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_label_audit.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_lifecycle.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_migration_live.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_migration_regression.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_plan_act.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_policy.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_poll_act.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_preflight.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_progress.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_refine.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_refresh_catalog.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_regression_metrics.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_repair_prompt.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_report.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_scanner.py +0 -0
- {driftless-0.2.5 → driftless-0.2.7}/tests/test_view.py +0 -0
|
@@ -17,6 +17,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
17
17
|
|
|
18
18
|
---
|
|
19
19
|
|
|
20
|
+
## [0.2.7] - 2026-07-01
|
|
21
|
+
|
|
22
|
+
### Added
|
|
23
|
+
|
|
24
|
+
- **P0.3 per-class support floors** — warn when any class has fewer than five gold
|
|
25
|
+
examples on a split (`assess_class_support`); surfaced on `migrate` (tuning +
|
|
26
|
+
holdout), `compare` (baseline + target), CLI "Confidence caveats", and saved
|
|
27
|
+
compare JSON.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## [0.2.6] - 2026-07-01
|
|
32
|
+
|
|
33
|
+
### Added
|
|
34
|
+
|
|
35
|
+
- **P0.3 multi-seed tuning selection** — optional `migration.split_seed_count`
|
|
36
|
+
(1–5) averages tuning-split metrics across shuffle seeds when scoring repair
|
|
37
|
+
candidates; holdout validation still uses the primary `--seed` only.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
20
41
|
## [0.2.5] - 2026-07-01
|
|
21
42
|
|
|
22
43
|
### Added
|
|
@@ -132,8 +153,9 @@ First public release on [PyPI](https://pypi.org/project/driftless/0.1.0/).
|
|
|
132
153
|
- **Docs** — project overview, repair algorithm spec, 2×2 migration methodology,
|
|
133
154
|
Poetry + Dependabot product framing.
|
|
134
155
|
|
|
135
|
-
[Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.5...HEAD
|
|
136
|
-
[0.2.5]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.5
|
|
156
|
+
[Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.7...HEAD
|
|
157
|
+
[0.2.7]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.7
|
|
158
|
+
[0.2.6]: https://github.com/driftless-dev/driftless/compare/v0.2.6...v0.2.7
|
|
137
159
|
[0.2.4]: https://github.com/driftless-dev/driftless/compare/v0.2.4...v0.2.5
|
|
138
160
|
[0.2.3]: https://github.com/driftless-dev/driftless/compare/v0.2.3...v0.2.4
|
|
139
161
|
[0.2.2]: https://github.com/driftless-dev/driftless/compare/v0.2.2...v0.2.3
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: driftless
|
|
3
|
-
Version: 0.2.5
|
|
3
|
+
Version: 0.2.7
|
|
4
4
|
Summary: Keep prompts in sync when model or eval data changes — Poetry-style lock regeneration, Dependabot-style PRs.
|
|
5
5
|
Project-URL: Homepage, https://github.com/driftless-dev/driftless
|
|
6
6
|
Project-URL: Repository, https://github.com/driftless-dev/driftless
|
|
@@ -96,7 +96,7 @@ optimizes against it, with your team owning the definition of "good":
|
|
|
96
96
|
|---|---|
|
|
97
97
|
| `init` | Scaffold a `driftless.yml`. |
|
|
98
98
|
| `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
|
|
99
|
-
| `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, label audit, and judge check. |
|
|
99
|
+
| `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, plan, label audit, and judge check. |
|
|
100
100
|
| `scan` | Find probable LLM usage and at-risk models. |
|
|
101
101
|
| `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
|
|
102
102
|
| `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
|
|
@@ -129,11 +129,11 @@ propose it.
|
|
|
129
129
|
## GitHub-native usage
|
|
130
130
|
|
|
131
131
|
A composite GitHub Action (`action.yml`) wraps the CLI so scans and migrations
|
|
132
|
-
can run in CI. See `.github/workflows/` for a scheduled deprecation scan
|
|
133
|
-
manually-triggered migration
|
|
132
|
+
can run in CI. See `.github/workflows/` for a scheduled deprecation scan, weekly
|
|
133
|
+
`plan --act` triage, and manually-triggered migration workflows.
|
|
134
134
|
|
|
135
135
|
```yaml
|
|
136
|
-
- uses: driftless-dev/driftless@v0.2.
|
|
136
|
+
- uses: driftless-dev/driftless@v0.2.7
|
|
137
137
|
with:
|
|
138
138
|
command: scan
|
|
139
139
|
```
|
|
@@ -57,7 +57,7 @@ optimizes against it, with your team owning the definition of "good":
|
|
|
57
57
|
|---|---|
|
|
58
58
|
| `init` | Scaffold a `driftless.yml`. |
|
|
59
59
|
| `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
|
|
60
|
-
| `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, label audit, and judge check. |
|
|
60
|
+
| `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, plan, label audit, and judge check. |
|
|
61
61
|
| `scan` | Find probable LLM usage and at-risk models. |
|
|
62
62
|
| `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
|
|
63
63
|
| `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
|
|
@@ -90,11 +90,11 @@ propose it.
|
|
|
90
90
|
## GitHub-native usage
|
|
91
91
|
|
|
92
92
|
A composite GitHub Action (`action.yml`) wraps the CLI so scans and migrations
|
|
93
|
-
can run in CI. See `.github/workflows/` for a scheduled deprecation scan
|
|
94
|
-
manually-triggered migration
|
|
93
|
+
can run in CI. See `.github/workflows/` for a scheduled deprecation scan, weekly
|
|
94
|
+
`plan --act` triage, and manually-triggered migration workflows.
|
|
95
95
|
|
|
96
96
|
```yaml
|
|
97
|
-
- uses: driftless-dev/driftless@v0.2.
|
|
97
|
+
- uses: driftless-dev/driftless@v0.2.7
|
|
98
98
|
with:
|
|
99
99
|
command: scan
|
|
100
100
|
```
|
|
@@ -153,7 +153,7 @@ After a release, users can pin the composite Action by release tag
|
|
|
153
153
|
(`action.yml` lives at the repo root — no `/action` path segment):
|
|
154
154
|
|
|
155
155
|
```yaml
|
|
156
|
-
- uses: driftless-dev/driftless@v0.2.
|
|
156
|
+
- uses: driftless-dev/driftless@v0.2.7
|
|
157
157
|
with:
|
|
158
158
|
command: scan
|
|
159
159
|
```
|
|
@@ -161,9 +161,9 @@ After a release, users can pin the composite Action by release tag
|
|
|
161
161
|
Or pin the PyPI package in the Action input:
|
|
162
162
|
|
|
163
163
|
```yaml
|
|
164
|
-
- uses: driftless-dev/driftless@v0.2.
|
|
164
|
+
- uses: driftless-dev/driftless@v0.2.7
|
|
165
165
|
with:
|
|
166
|
-
version: "==0.2.
|
|
166
|
+
version: "==0.2.7"
|
|
167
167
|
command: migrate
|
|
168
168
|
```
|
|
169
169
|
|
|
@@ -171,7 +171,7 @@ Optionally maintain a floating **`v1`** tag on the latest stable minor release
|
|
|
171
171
|
(point it at the current release tag after each publish):
|
|
172
172
|
|
|
173
173
|
```bash
|
|
174
|
-
git tag -f v1 v0.2.
|
|
174
|
+
git tag -f v1 v0.2.7 && git push origin v1 --force
|
|
175
175
|
```
|
|
176
176
|
|
|
177
177
|
Update [`action.yml`](../action.yml) default `version` input when cutting releases.
|
|
@@ -213,7 +213,9 @@ In **Settings → Secrets and variables → Actions**, add:
|
|
|
213
213
|
| `ANTHROPIC_API_KEY` | Live eval matrix job (`provider: anthropic`) |
|
|
214
214
|
|
|
215
215
|
If a secret is missing, that provider job exits cleanly with a warning (CI stays
|
|
216
|
-
green).
|
|
216
|
+
green). On scheduled or manual runs, the **secrets-preflight** job writes a
|
|
217
|
+
summary table to the workflow run so you can see which keys are configured.
|
|
218
|
+
When both are set, nightly runs append to
|
|
217
219
|
`.driftless/regression-metrics.jsonl` and check against
|
|
218
220
|
`tests/fixtures/live_eval_baseline.json` with `--require-all`.
|
|
219
221
|
|
|
@@ -428,7 +428,7 @@ driftless view -w support_classifier</code></pre>
|
|
|
428
428
|
<span class="tok-k">runs-on</span>: ubuntu-latest
|
|
429
429
|
<span class="tok-k">steps</span>:
|
|
430
430
|
- <span class="tok-k">uses</span>: actions/checkout@v4
|
|
431
|
-
- <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.
|
|
431
|
+
- <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.7
|
|
432
432
|
<span class="tok-k">with</span>:
|
|
433
433
|
<span class="tok-k">command</span>: <span class="tok-s">plan</span></code></pre>
|
|
434
434
|
<p>A scheduled <code class="inline">plan</code> gates CI when a deprecated model needs attention; a manually-triggered <code class="inline">migrate</code> opens a PR (or an issue when blocked) with the evidence attached.</p>
|
|
@@ -446,6 +446,11 @@ def compare(
|
|
|
446
446
|
|
|
447
447
|
console.print(_scorecard(comparison))
|
|
448
448
|
|
|
449
|
+
if comparison.warnings:
|
|
450
|
+
console.print("\n[bold yellow]Confidence caveats[/]:")
|
|
451
|
+
for w in comparison.warnings:
|
|
452
|
+
console.print(f" • {w}")
|
|
453
|
+
|
|
449
454
|
console.print("\n[bold]Thresholds[/] (target vs contract):")
|
|
450
455
|
if not comparison.checks:
|
|
451
456
|
console.print(" [dim]no thresholds configured[/]")
|
|
@@ -15,7 +15,7 @@ from typing import cast
|
|
|
15
15
|
|
|
16
16
|
from .contract import ThresholdsSpec, Workflow
|
|
17
17
|
from .errors import DriftlessError
|
|
18
|
-
from .evaluation import Metrics, evaluate
|
|
18
|
+
from .evaluation import Metrics, assess_class_support, evaluate
|
|
19
19
|
from .harness import run_workflow
|
|
20
20
|
from .progress import log as progress_log
|
|
21
21
|
|
|
@@ -35,6 +35,7 @@ class Comparison:
|
|
|
35
35
|
baseline: Metrics
|
|
36
36
|
target: Metrics
|
|
37
37
|
checks: list[ThresholdCheck] = field(default_factory=list)
|
|
38
|
+
warnings: list[str] = field(default_factory=list)
|
|
38
39
|
|
|
39
40
|
@property
|
|
40
41
|
def passed(self) -> bool:
|
|
@@ -218,6 +219,14 @@ def compare_models(
|
|
|
218
219
|
)
|
|
219
220
|
|
|
220
221
|
checks = check_thresholds(workflow.thresholds, baseline_metrics, target_metrics)
|
|
222
|
+
warnings: list[str] = []
|
|
223
|
+
for metrics, label in (
|
|
224
|
+
(baseline_metrics, "baseline"),
|
|
225
|
+
(target_metrics, "target"),
|
|
226
|
+
):
|
|
227
|
+
for w in assess_class_support(metrics, context=f"{label} eval"):
|
|
228
|
+
if w not in warnings:
|
|
229
|
+
warnings.append(w)
|
|
221
230
|
|
|
222
231
|
return Comparison(
|
|
223
232
|
workflow=workflow_name,
|
|
@@ -226,6 +235,7 @@ def compare_models(
|
|
|
226
235
|
baseline=baseline_metrics,
|
|
227
236
|
target=target_metrics,
|
|
228
237
|
checks=checks,
|
|
238
|
+
warnings=warnings,
|
|
229
239
|
)
|
|
230
240
|
|
|
231
241
|
|
|
@@ -241,6 +251,7 @@ def save_comparison(comparison: Comparison, cwd: Path | None = None) -> Path:
|
|
|
241
251
|
"baseline": asdict(comparison.baseline),
|
|
242
252
|
"target": asdict(comparison.target),
|
|
243
253
|
"checks": [asdict(c) for c in comparison.checks],
|
|
254
|
+
"warnings": comparison.warnings,
|
|
244
255
|
"passed": comparison.passed,
|
|
245
256
|
}
|
|
246
257
|
out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
|
@@ -324,6 +324,16 @@ class MigrationSpec(StrictModel):
|
|
|
324
324
|
allow_business_logic_edits: bool = False
|
|
325
325
|
max_iterations: int = 8
|
|
326
326
|
holdout_required: bool = True
|
|
327
|
+
# When >1, average tuning-split metrics across this many shuffle seeds
|
|
328
|
+
# (seed, seed+1, …) when scoring repair candidates. Holdout uses ``seed`` only.
|
|
329
|
+
split_seed_count: int = 1
|
|
330
|
+
|
|
331
|
+
@field_validator("split_seed_count")
|
|
332
|
+
@classmethod
|
|
333
|
+
def _split_seed_count_range(cls, v: int) -> int:
|
|
334
|
+
if v < 1 or v > 5:
|
|
335
|
+
raise ValueError("migration.split_seed_count must be between 1 and 5")
|
|
336
|
+
return v
|
|
327
337
|
|
|
328
338
|
|
|
329
339
|
class RepairSpec(StrictModel):
|
|
@@ -30,10 +30,10 @@ from .calibrate import suggest_thresholds
|
|
|
30
30
|
from .compare import ThresholdCheck, check_thresholds
|
|
31
31
|
from .contract import ThresholdsSpec, Workflow
|
|
32
32
|
from .errors import DriftlessError
|
|
33
|
-
from .evaluation import Metrics, RecordRow, RunAnalysis, analyze
|
|
33
|
+
from .evaluation import Metrics, RecordRow, RunAnalysis, analyze, average_metrics, assess_class_support
|
|
34
34
|
from .harness import run_workflow
|
|
35
35
|
from .progress import log as progress_log
|
|
36
|
-
from .splits import make_splits, materialize_inputs
|
|
36
|
+
from .splits import Split, make_splits, materialize_inputs
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
# --------------------------------------------------------------------------- #
|
|
@@ -336,6 +336,8 @@ class MigrationResult:
|
|
|
336
336
|
message: str = ""
|
|
337
337
|
# Frozen editable files at loop start — baseline for per-candidate diffs in reports/UI.
|
|
338
338
|
original_editable_files: dict[str, str] = field(default_factory=dict)
|
|
339
|
+
# Shuffle seeds used for tuning (primary ``seed`` only when split_seed_count==1).
|
|
340
|
+
split_seeds_used: list[int] = field(default_factory=list)
|
|
339
341
|
|
|
340
342
|
@property
|
|
341
343
|
def succeeded(self) -> bool:
|
|
@@ -516,11 +518,19 @@ def run_migration(
|
|
|
516
518
|
)
|
|
517
519
|
|
|
518
520
|
split = make_splits(workflow, cwd=cwd, seed=seed)
|
|
521
|
+
split_seeds_used = list(range(seed, seed + mig.split_seed_count))
|
|
519
522
|
size_warnings = assess_split_sizes(
|
|
520
523
|
len(split.input_lines),
|
|
521
524
|
len(split.holdout_idx),
|
|
522
525
|
holdout_required=mig.holdout_required,
|
|
523
526
|
)
|
|
527
|
+
if mig.split_seed_count > 1:
|
|
528
|
+
size_warnings.append(
|
|
529
|
+
f"Multi-seed tuning: candidate selection averages metrics across "
|
|
530
|
+
f"{mig.split_seed_count} shuffle seeds ({split_seeds_used[0]}.."
|
|
531
|
+
f"{split_seeds_used[-1]}); each candidate scoring multiplies tuning "
|
|
532
|
+
"workflow runs."
|
|
533
|
+
)
|
|
524
534
|
|
|
525
535
|
use_ids = bool(workflow.eval.id_field) and split.gold is not None
|
|
526
536
|
|
|
@@ -532,18 +542,23 @@ def run_migration(
|
|
|
532
542
|
return judge_evidence_samples(rows)
|
|
533
543
|
|
|
534
544
|
def evaluate_on(
|
|
535
|
-
model: str,
|
|
545
|
+
model: str,
|
|
546
|
+
idx: list[int],
|
|
547
|
+
files: dict[str, str] | None = None,
|
|
548
|
+
*,
|
|
549
|
+
split_ref: Split | None = None,
|
|
536
550
|
) -> RunAnalysis:
|
|
551
|
+
sp = split_ref or split
|
|
537
552
|
file_ctx = apply_files(files, cwd=cwd) if files else nullcontext()
|
|
538
|
-
idx_lines =
|
|
553
|
+
idx_lines = sp.lines_for(idx)
|
|
539
554
|
with materialize_inputs(workflow, idx_lines, cwd=cwd):
|
|
540
555
|
with file_ctx:
|
|
541
556
|
run = run_workflow(workflow, model, cwd=cwd)
|
|
542
|
-
if use_ids:
|
|
557
|
+
if use_ids and sp.gold_ids is not None:
|
|
543
558
|
return analyze(
|
|
544
559
|
workflow,
|
|
545
560
|
run,
|
|
546
|
-
gold_by_id=
|
|
561
|
+
gold_by_id=sp.gold_by_id_for(idx),
|
|
547
562
|
inputs=idx_lines,
|
|
548
563
|
judge=judge,
|
|
549
564
|
cwd=cwd,
|
|
@@ -551,18 +566,34 @@ def run_migration(
|
|
|
551
566
|
return analyze(
|
|
552
567
|
workflow,
|
|
553
568
|
run,
|
|
554
|
-
gold_labels=
|
|
569
|
+
gold_labels=sp.gold_for(idx),
|
|
555
570
|
inputs=idx_lines,
|
|
556
571
|
judge=judge,
|
|
557
572
|
cwd=cwd,
|
|
558
573
|
)
|
|
559
574
|
|
|
575
|
+
def evaluate_tuning(
|
|
576
|
+
model: str, files: dict[str, str] | None = None
|
|
577
|
+
) -> RunAnalysis:
|
|
578
|
+
"""Score on the tuning split; average across seeds when configured."""
|
|
579
|
+
if mig.split_seed_count <= 1:
|
|
580
|
+
return evaluate_on(model, split.tuning_idx, files)
|
|
581
|
+
tuning_splits = [make_splits(workflow, cwd=cwd, seed=s) for s in split_seeds_used]
|
|
582
|
+
analyses = [
|
|
583
|
+
evaluate_on(model, sp.tuning_idx, files, split_ref=sp) for sp in tuning_splits
|
|
584
|
+
]
|
|
585
|
+
return RunAnalysis(
|
|
586
|
+
metrics=average_metrics([a.metrics for a in analyses]),
|
|
587
|
+
rows=analyses[0].rows,
|
|
588
|
+
)
|
|
589
|
+
|
|
560
590
|
progress_log(
|
|
561
591
|
f"migration: phase 1/3 — initial eval "
|
|
562
592
|
f"({len(split.tuning_idx)} tuning examples, model={current})"
|
|
563
593
|
)
|
|
564
594
|
progress_log("migration: phase 1/3 — baseline prompt on tuning split...")
|
|
565
595
|
baseline_tuning = evaluate_on(current, split.tuning_idx).metrics
|
|
596
|
+
size_warnings.extend(assess_class_support(baseline_tuning, context="tuning split"))
|
|
566
597
|
progress_log(f"migration: phase 1/3 — baseline F1={_fmt_f1(baseline_tuning.f1)}")
|
|
567
598
|
progress_log("migration: phase 1/3 — current prompt on tuning split...")
|
|
568
599
|
naive_analysis = evaluate_on(target_model, split.tuning_idx)
|
|
@@ -575,8 +606,15 @@ def run_migration(
|
|
|
575
606
|
baseline_holdout = evaluate_on(current, split.holdout_idx).metrics
|
|
576
607
|
holdout_metrics = evaluate_on(target_model, split.holdout_idx, files=files).metrics
|
|
577
608
|
checks = check_thresholds(thresholds, baseline_holdout, holdout_metrics)
|
|
609
|
+
append_holdout_class_warnings(holdout_metrics)
|
|
578
610
|
return all(c.passed for c in checks), holdout_metrics, checks
|
|
579
611
|
|
|
612
|
+
def append_holdout_class_warnings(holdout_metrics: Metrics | None) -> None:
|
|
613
|
+
if holdout_metrics is not None:
|
|
614
|
+
size_warnings.extend(
|
|
615
|
+
assess_class_support(holdout_metrics, context="holdout split")
|
|
616
|
+
)
|
|
617
|
+
|
|
580
618
|
# Step: naive target already good? (migrate only -- in refine the model is
|
|
581
619
|
# pinned, so the "naive target" is just the current prompt and there's no
|
|
582
620
|
# model-only change to short-circuit on.)
|
|
@@ -597,6 +635,7 @@ def run_migration(
|
|
|
597
635
|
holdout_checks=holdout_checks,
|
|
598
636
|
tuning_checks=naive_checks,
|
|
599
637
|
warnings=size_warnings,
|
|
638
|
+
split_seeds_used=split_seeds_used,
|
|
600
639
|
judge_agreement=judge_agreement_info,
|
|
601
640
|
judge_evidence=_judge_evidence(naive_analysis.rows),
|
|
602
641
|
message="naive model swap passes thresholds; only the model ID changes",
|
|
@@ -691,7 +730,7 @@ def run_migration(
|
|
|
691
730
|
cand_size = _patch_diff_size(patch.files, original_editable)
|
|
692
731
|
try:
|
|
693
732
|
validate_patch_scope(patch, workflow, cwd)
|
|
694
|
-
analysis =
|
|
733
|
+
analysis = evaluate_tuning(target_model, files=patch.files)
|
|
695
734
|
except DriftlessError as exc:
|
|
696
735
|
experiment_log.append(
|
|
697
736
|
AttemptRecord(
|
|
@@ -786,6 +825,7 @@ def run_migration(
|
|
|
786
825
|
experiment_log=experiment_log,
|
|
787
826
|
cluster_history=cluster_history,
|
|
788
827
|
warnings=size_warnings,
|
|
828
|
+
split_seeds_used=split_seeds_used,
|
|
789
829
|
judge_agreement=judge_agreement_info,
|
|
790
830
|
judge_evidence=_judge_evidence(best_analysis.rows),
|
|
791
831
|
original_editable_files=original_editable,
|
|
@@ -826,6 +866,7 @@ def run_migration(
|
|
|
826
866
|
refine_holdout_checks = check_thresholds(
|
|
827
867
|
ThresholdsSpec(), baseline_holdout, refine_holdout_metrics
|
|
828
868
|
)
|
|
869
|
+
append_holdout_class_warnings(refine_holdout_metrics)
|
|
829
870
|
basis = refine_holdout_metrics if refine_holdout_metrics is not None else best_metrics
|
|
830
871
|
suggested = suggest_thresholds(basis)
|
|
831
872
|
|
|
@@ -855,6 +896,7 @@ def run_migration(
|
|
|
855
896
|
experiment_log=experiment_log,
|
|
856
897
|
cluster_history=cluster_history,
|
|
857
898
|
warnings=size_warnings,
|
|
899
|
+
split_seeds_used=split_seeds_used,
|
|
858
900
|
suggested_thresholds=suggested,
|
|
859
901
|
judge_agreement=judge_agreement_info,
|
|
860
902
|
judge_evidence=_judge_evidence(best_analysis.rows),
|
|
@@ -887,6 +929,7 @@ def run_migration(
|
|
|
887
929
|
experiment_log=experiment_log,
|
|
888
930
|
cluster_history=cluster_history,
|
|
889
931
|
warnings=size_warnings,
|
|
932
|
+
split_seeds_used=split_seeds_used,
|
|
890
933
|
judge_agreement=judge_agreement_info,
|
|
891
934
|
judge_evidence=_judge_evidence(best_analysis.rows),
|
|
892
935
|
original_editable_files=original_editable,
|
|
@@ -74,6 +74,33 @@ class ClassMetrics:
|
|
|
74
74
|
f1: float
|
|
75
75
|
|
|
76
76
|
|
|
77
|
+
# Warn when macro-F1 aggregates classes with very few gold examples on a split.
|
|
78
|
+
MIN_CLASS_SUPPORT = 5
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def assess_class_support(
|
|
82
|
+
metrics: Metrics,
|
|
83
|
+
*,
|
|
84
|
+
context: str,
|
|
85
|
+
min_support: int = MIN_CLASS_SUPPORT,
|
|
86
|
+
) -> list[str]:
|
|
87
|
+
"""Low-confidence warnings for rare classes in classification metrics."""
|
|
88
|
+
if metrics.f1 is None or not metrics.per_class or min_support <= 0:
|
|
89
|
+
return []
|
|
90
|
+
low = [
|
|
91
|
+
(name, cm.support)
|
|
92
|
+
for name, cm in sorted(metrics.per_class.items())
|
|
93
|
+
if 0 < cm.support < min_support
|
|
94
|
+
]
|
|
95
|
+
if not low:
|
|
96
|
+
return []
|
|
97
|
+
bits = ", ".join(f"{name} ({n})" for name, n in low)
|
|
98
|
+
return [
|
|
99
|
+
f"Low per-class support on {context}: {bits} — each below {min_support} gold "
|
|
100
|
+
"examples. Macro-F1 may not reflect rare-class performance."
|
|
101
|
+
]
|
|
102
|
+
|
|
103
|
+
|
|
77
104
|
@dataclass
|
|
78
105
|
class Metrics:
|
|
79
106
|
n: int
|
|
@@ -96,6 +123,39 @@ class Metrics:
|
|
|
96
123
|
scored: int = 0
|
|
97
124
|
|
|
98
125
|
|
|
126
|
+
def average_metrics(items: list[Metrics]) -> Metrics:
|
|
127
|
+
"""Mean of headline metrics across multiple eval runs (multi-seed tuning)."""
|
|
128
|
+
if not items:
|
|
129
|
+
raise ValueError("average_metrics requires at least one Metrics")
|
|
130
|
+
if len(items) == 1:
|
|
131
|
+
return items[0]
|
|
132
|
+
|
|
133
|
+
def _mean(vals: list[float | None]) -> float | None:
|
|
134
|
+
nums = [v for v in vals if v is not None]
|
|
135
|
+
return sum(nums) / len(nums) if nums else None
|
|
136
|
+
|
|
137
|
+
def _mean_int(vals: list[int]) -> int:
|
|
138
|
+
return int(round(sum(vals) / len(vals)))
|
|
139
|
+
|
|
140
|
+
costs = [m.total_cost for m in items if m.total_cost is not None]
|
|
141
|
+
return Metrics(
|
|
142
|
+
n=items[0].n,
|
|
143
|
+
schema_error_rate=_mean([m.schema_error_rate for m in items]),
|
|
144
|
+
refusal_rate=_mean([m.refusal_rate for m in items]) or 0.0,
|
|
145
|
+
accuracy=_mean([m.accuracy for m in items]),
|
|
146
|
+
precision=_mean([m.precision for m in items]),
|
|
147
|
+
recall=_mean([m.recall for m in items]),
|
|
148
|
+
f1=_mean([m.f1 for m in items]),
|
|
149
|
+
avg_latency_ms=_mean([m.avg_latency_ms for m in items]),
|
|
150
|
+
total_cost=sum(costs) if costs else None,
|
|
151
|
+
score=_mean([m.score for m in items]),
|
|
152
|
+
schema_errors=_mean_int([m.schema_errors for m in items]),
|
|
153
|
+
refusals=_mean_int([m.refusals for m in items]),
|
|
154
|
+
labeled=items[0].labeled,
|
|
155
|
+
scored=items[0].scored,
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
|
|
99
159
|
def load_jsonl(path: Path) -> list[OutputRecord]:
|
|
100
160
|
records: list[OutputRecord] = []
|
|
101
161
|
with path.open(encoding="utf-8") as fh:
|
|
@@ -568,6 +568,7 @@ def result_to_dict(result: MigrationResult) -> dict:
|
|
|
568
568
|
"experiment_log": [asdict(a) for a in result.experiment_log],
|
|
569
569
|
"cluster_trajectory": cluster_trajectories(result.cluster_history),
|
|
570
570
|
"warnings": result.warnings,
|
|
571
|
+
"split_seeds_used": result.split_seeds_used,
|
|
571
572
|
"judge_agreement": asdict(result.judge_agreement) if result.judge_agreement else None,
|
|
572
573
|
"judge_evidence": result.judge_evidence,
|
|
573
574
|
"suggested_thresholds": result.suggested_thresholds,
|
|
@@ -63,6 +63,17 @@ def test_workflow_not_found():
|
|
|
63
63
|
contract.workflow("missing")
|
|
64
64
|
|
|
65
65
|
|
|
66
|
+
def test_split_seed_count_must_be_in_range():
|
|
67
|
+
with pytest.raises(Exception):
|
|
68
|
+
Workflow.model_validate(
|
|
69
|
+
{
|
|
70
|
+
"run": {"command": "true", "input_path": "i", "output_path": "o"},
|
|
71
|
+
"model": {"current": "m", "env_var": "M"},
|
|
72
|
+
"migration": {"split_seed_count": 0},
|
|
73
|
+
}
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
|
|
66
77
|
def test_load_missing_contract(tmp_path: Path):
|
|
67
78
|
with pytest.raises(ContractError):
|
|
68
79
|
load_contract(tmp_path / "nope.yml")
|
|
@@ -191,6 +191,7 @@ def test_small_dataset_run_carries_warning(tmp_path: Path):
|
|
|
191
191
|
wf = _make_workflow(tmp_path) # 6 examples -> below the min thresholds
|
|
192
192
|
result = run_migration("demo", wf, "weak", generator=StrictGen(), cwd=tmp_path, seed=1)
|
|
193
193
|
assert any("Small dataset" in w for w in result.warnings)
|
|
194
|
+
assert any("Low per-class support" in w for w in result.warnings)
|
|
194
195
|
|
|
195
196
|
|
|
196
197
|
def test_cluster_failures():
|
|
@@ -207,3 +208,13 @@ def test_cluster_failures():
|
|
|
207
208
|
assert kinds["refusal"].count == 1
|
|
208
209
|
assert kinds["misclassification"].count == 2 # billing<-technical pair
|
|
209
210
|
assert kinds["misclassification"].key == "billing -> technical"
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def test_multi_seed_tuning_still_passes(tmp_path: Path):
|
|
214
|
+
wf = _make_workflow(tmp_path)
|
|
215
|
+
wf.migration.split_seed_count = 2
|
|
216
|
+
result = run_migration("demo", wf, "weak", generator=StrictGen(), cwd=tmp_path, seed=1)
|
|
217
|
+
|
|
218
|
+
assert result.status == MigrationStatus.PASS
|
|
219
|
+
assert result.split_seeds_used == [1, 2]
|
|
220
|
+
assert any("Multi-seed tuning" in w for w in result.warnings)
|
|
@@ -309,6 +309,25 @@ def test_id_alignment_duplicate_output_id_raises(tmp_path: Path):
|
|
|
309
309
|
evaluate(wf, run, cwd=tmp_path)
|
|
310
310
|
|
|
311
311
|
|
|
312
|
+
def test_assess_class_support_flags_rare_classes():
|
|
313
|
+
from driftless.evaluation import ClassMetrics, Metrics, assess_class_support
|
|
314
|
+
|
|
315
|
+
metrics = Metrics(
|
|
316
|
+
n=12,
|
|
317
|
+
schema_error_rate=0.0,
|
|
318
|
+
refusal_rate=0.0,
|
|
319
|
+
f1=0.9,
|
|
320
|
+
per_class={
|
|
321
|
+
"billing": ClassMetrics(4, 1.0, 1.0, 1.0),
|
|
322
|
+
"technical": ClassMetrics(8, 0.9, 0.9, 0.9),
|
|
323
|
+
},
|
|
324
|
+
)
|
|
325
|
+
warnings = assess_class_support(metrics, context="tuning split")
|
|
326
|
+
assert len(warnings) == 1
|
|
327
|
+
assert "billing (4)" in warnings[0]
|
|
328
|
+
assert "tuning split" in warnings[0]
|
|
329
|
+
|
|
330
|
+
|
|
312
331
|
def test_load_labels_by_id_rejects_duplicates(tmp_path: Path):
|
|
313
332
|
from driftless.evaluation import load_labels_by_id
|
|
314
333
|
|
|
@@ -316,3 +335,14 @@ def test_load_labels_by_id_rejects_duplicates(tmp_path: Path):
|
|
|
316
335
|
p.write_text('{"id":"a","label":"x"}\n{"id":"a","label":"y"}\n')
|
|
317
336
|
with pytest.raises(Exception):
|
|
318
337
|
load_labels_by_id(p, "id", "label")
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def test_average_metrics_means_headline_fields():
    """Check that ``average_metrics`` averages f1 and both error rates.

    Two ``Metrics`` objects with known values are averaged and each headline
    field is compared (approximately) with the arithmetic mean of the inputs.
    """
    from driftless.evaluation import Metrics, average_metrics

    first = Metrics(n=10, schema_error_rate=0.2, refusal_rate=0.1, f1=0.8)
    second = Metrics(n=10, schema_error_rate=0.0, refusal_rate=0.0, f1=1.0)

    averaged = average_metrics([first, second])

    # (field name, expected mean of the two inputs above)
    expected_means = (
        ("f1", 0.9),
        ("schema_error_rate", 0.1),
        ("refusal_rate", 0.05),
    )
    for field_name, mean in expected_means:
        assert getattr(averaged, field_name) == pytest.approx(mean)
|
|
@@ -293,6 +293,46 @@ def test_label_audit_helpers():
|
|
|
293
293
|
assert label_audit_paths(contract) == ["labels.jsonl", "in.jsonl"]
|
|
294
294
|
|
|
295
295
|
|
|
296
|
+
def test_init_ci_scaffolds_plan_workflow(tmp_path, monkeypatch):
    """Check that ``init-ci --plan`` scaffolds the plan workflow file.

    A minimal ``driftless.yml`` is written into the temporary working
    directory, the CLI is invoked with every other scaffold switched off, and
    the generated ``driftless-plan-act.yml`` is inspected for the plan
    command, the ``--act`` flag and the ``GH_TOKEN`` reference.
    """
    monkeypatch.chdir(tmp_path)

    # NOTE(review): the YAML nesting below is reconstructed from the key
    # names; confirm the exact indentation against the packaged test file.
    config_text = """
version: 1
workflows:
  smoke:
    run:
      command: echo ok
      input_path: in.jsonl
      output_path: out.jsonl
    model:
      current: gpt-4o-mini
      env_var: MODEL
    eval:
      labels_path: labels.jsonl
""".lstrip()
    Path("driftless.yml").write_text(config_text)

    workflows_dir = tmp_path / "workflows"
    cli_args = [
        "init-ci",
        "--out-dir",
        str(workflows_dir),
        "--no-scan",
        "--no-migrate",
        "--no-refine",
        "--no-audit-labels",
        "--plan",
    ]
    result = runner.invoke(app, cli_args)

    assert result.exit_code == 0
    plan_text = (workflows_dir / "driftless-plan-act.yml").read_text()
    for marker in ("command: plan", "--act", "GH_TOKEN"):
        assert marker in plan_text
|
|
334
|
+
|
|
335
|
+
|
|
296
336
|
def test_rendered_workflows_use_action_ref():
|
|
297
337
|
ref = "driftless-dev/driftless@v9.9.9"
|
|
298
338
|
assert ref in render_migrate_workflow(ref)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Tests for tuning/holdout splits."""
|
|
2
|
+
|
|
3
|
+
from driftless.contract import Workflow
|
|
4
|
+
from driftless.splits import make_splits
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _workflow() -> Workflow:
    """Return a validated ``Workflow`` with a 50/50 tuning/holdout split.

    The run, model and eval sections use short placeholder values; the file
    names (``i.jsonl``, ``o.jsonl``, ``l.jsonl``) are relative paths.
    """
    run_section = {
        "command": "true",
        "input_path": "i.jsonl",
        "output_path": "o.jsonl",
    }
    model_section = {"current": "m", "env_var": "M"}
    eval_section = {
        "labels_path": "l.jsonl",
        "split": {"tuning": 0.5, "holdout": 0.5},
    }
    return Workflow.model_validate(
        {"run": run_section, "model": model_section, "eval": eval_section}
    )
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_different_seeds_produce_different_partitions(tmp_path):
    """Check that seeds 0 and 1 give different tuning partitions.

    Twenty rows with ids 0..19 and the label "a" are written to both the
    input file and the labels file, then ``make_splits`` is called once per
    seed and the resulting ``tuning_idx`` values are compared.
    """
    rows = [f'{{"id": {row_id}, "label": "a"}}' for row_id in range(20)]
    jsonl = "\n".join(rows) + "\n"
    # The input file and the labels file hold the same rows.
    for file_name in ("i.jsonl", "l.jsonl"):
        (tmp_path / file_name).write_text(jsonl)

    wf = _workflow()
    wf.eval.id_field = "id"
    seed_zero = make_splits(wf, cwd=tmp_path, seed=0)
    seed_one = make_splits(wf, cwd=tmp_path, seed=1)
    assert seed_zero.tuning_idx != seed_one.tuning_idx
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|