driftless 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {driftless-0.2.4 → driftless-0.2.6}/CHANGELOG.md +26 -2
  2. {driftless-0.2.4 → driftless-0.2.6}/PKG-INFO +5 -5
  3. {driftless-0.2.4 → driftless-0.2.6}/README.md +4 -4
  4. {driftless-0.2.4 → driftless-0.2.6}/docs/RELEASE.md +7 -5
  5. {driftless-0.2.4 → driftless-0.2.6}/site/docs.html +1 -1
  6. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/__init__.py +1 -1
  7. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/cli.py +12 -0
  8. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/contract.py +10 -0
  9. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/engine.py +42 -8
  10. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/evaluation.py +33 -0
  11. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/init_ci.py +247 -2
  12. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/report.py +1 -0
  13. {driftless-0.2.4 → driftless-0.2.6}/tests/test_contract.py +11 -0
  14. {driftless-0.2.4 → driftless-0.2.6}/tests/test_engine.py +10 -0
  15. {driftless-0.2.4 → driftless-0.2.6}/tests/test_evaluation.py +11 -0
  16. driftless-0.2.6/tests/test_init_ci.py +354 -0
  17. driftless-0.2.6/tests/test_splits.py +27 -0
  18. driftless-0.2.4/tests/test_init_ci.py +0 -128
  19. {driftless-0.2.4 → driftless-0.2.6}/.gitignore +0 -0
  20. {driftless-0.2.4 → driftless-0.2.6}/LICENSE +0 -0
  21. {driftless-0.2.4 → driftless-0.2.6}/docs/repair-and-generators.md +0 -0
  22. {driftless-0.2.4 → driftless-0.2.6}/pyproject.toml +0 -0
  23. {driftless-0.2.4 → driftless-0.2.6}/site/assets/app.js +0 -0
  24. {driftless-0.2.4 → driftless-0.2.6}/site/assets/hero-workflow.png +0 -0
  25. {driftless-0.2.4 → driftless-0.2.6}/site/assets/landing.css +0 -0
  26. {driftless-0.2.4 → driftless-0.2.6}/site/assets/runs.css +0 -0
  27. {driftless-0.2.4 → driftless-0.2.6}/site/assets/runs.js +0 -0
  28. {driftless-0.2.4 → driftless-0.2.6}/site/assets/sample-run.json +0 -0
  29. {driftless-0.2.4 → driftless-0.2.6}/site/assets/styles.css +0 -0
  30. {driftless-0.2.4 → driftless-0.2.6}/site/index.html +0 -0
  31. {driftless-0.2.4 → driftless-0.2.6}/site/runs.html +0 -0
  32. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/calibrate.py +0 -0
  33. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/compare.py +0 -0
  34. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/configure.py +0 -0
  35. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/data/model_lifecycle.json +0 -0
  36. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/datasource.py +0 -0
  37. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/datastate.py +0 -0
  38. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/discovery.py +0 -0
  39. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/errors.py +0 -0
  40. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/generators.py +0 -0
  41. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/github.py +0 -0
  42. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/harness.py +0 -0
  43. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/judges.py +0 -0
  44. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/label_audit.py +0 -0
  45. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/lifecycle.py +0 -0
  46. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/policy.py +0 -0
  47. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/preflight.py +0 -0
  48. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/progress.py +0 -0
  49. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/scanner.py +0 -0
  50. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/splits.py +0 -0
  51. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/templates.py +0 -0
  52. {driftless-0.2.4 → driftless-0.2.6}/src/driftless/view.py +0 -0
  53. {driftless-0.2.4 → driftless-0.2.6}/tests/fixtures/live_eval_baseline.json +0 -0
  54. {driftless-0.2.4 → driftless-0.2.6}/tests/fixtures/smoke/driftless.yml +0 -0
  55. {driftless-0.2.4 → driftless-0.2.6}/tests/fixtures/smoke/inputs.jsonl +0 -0
  56. {driftless-0.2.4 → driftless-0.2.6}/tests/fixtures/smoke/labels.jsonl +0 -0
  57. {driftless-0.2.4 → driftless-0.2.6}/tests/regression_metrics.py +0 -0
  58. {driftless-0.2.4 → driftless-0.2.6}/tests/scenarios.py +0 -0
  59. {driftless-0.2.4 → driftless-0.2.6}/tests/test_cli.py +0 -0
  60. {driftless-0.2.4 → driftless-0.2.6}/tests/test_compare.py +0 -0
  61. {driftless-0.2.4 → driftless-0.2.6}/tests/test_data_change_gate.py +0 -0
  62. {driftless-0.2.4 → driftless-0.2.6}/tests/test_data_change_regression.py +0 -0
  63. {driftless-0.2.4 → driftless-0.2.6}/tests/test_datasource.py +0 -0
  64. {driftless-0.2.4 → driftless-0.2.6}/tests/test_datastate.py +0 -0
  65. {driftless-0.2.4 → driftless-0.2.6}/tests/test_discovery.py +0 -0
  66. {driftless-0.2.4 → driftless-0.2.6}/tests/test_endpoint.py +0 -0
  67. {driftless-0.2.4 → driftless-0.2.6}/tests/test_extraction.py +0 -0
  68. {driftless-0.2.4 → driftless-0.2.6}/tests/test_generators.py +0 -0
  69. {driftless-0.2.4 → driftless-0.2.6}/tests/test_github.py +0 -0
  70. {driftless-0.2.4 → driftless-0.2.6}/tests/test_grading_loop.py +0 -0
  71. {driftless-0.2.4 → driftless-0.2.6}/tests/test_harness.py +0 -0
  72. {driftless-0.2.4 → driftless-0.2.6}/tests/test_judge.py +0 -0
  73. {driftless-0.2.4 → driftless-0.2.6}/tests/test_judge_loop.py +0 -0
  74. {driftless-0.2.4 → driftless-0.2.6}/tests/test_label_audit.py +0 -0
  75. {driftless-0.2.4 → driftless-0.2.6}/tests/test_lifecycle.py +0 -0
  76. {driftless-0.2.4 → driftless-0.2.6}/tests/test_migration_live.py +0 -0
  77. {driftless-0.2.4 → driftless-0.2.6}/tests/test_migration_regression.py +0 -0
  78. {driftless-0.2.4 → driftless-0.2.6}/tests/test_plan_act.py +0 -0
  79. {driftless-0.2.4 → driftless-0.2.6}/tests/test_policy.py +0 -0
  80. {driftless-0.2.4 → driftless-0.2.6}/tests/test_poll_act.py +0 -0
  81. {driftless-0.2.4 → driftless-0.2.6}/tests/test_preflight.py +0 -0
  82. {driftless-0.2.4 → driftless-0.2.6}/tests/test_progress.py +0 -0
  83. {driftless-0.2.4 → driftless-0.2.6}/tests/test_refine.py +0 -0
  84. {driftless-0.2.4 → driftless-0.2.6}/tests/test_refresh_catalog.py +0 -0
  85. {driftless-0.2.4 → driftless-0.2.6}/tests/test_regression_metrics.py +0 -0
  86. {driftless-0.2.4 → driftless-0.2.6}/tests/test_repair_prompt.py +0 -0
  87. {driftless-0.2.4 → driftless-0.2.6}/tests/test_report.py +0 -0
  88. {driftless-0.2.4 → driftless-0.2.6}/tests/test_scanner.py +0 -0
  89. {driftless-0.2.4 → driftless-0.2.6}/tests/test_view.py +0 -0
@@ -17,6 +17,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
17
17
 
18
18
  ---
19
19
 
20
+ ## [0.2.6] - 2026-07-01
21
+
22
+ ### Added
23
+
24
+ - **P0.3 multi-seed tuning selection** — optional `migration.split_seed_count`
25
+ (1–5) averages tuning-split metrics across shuffle seeds when scoring repair
26
+ candidates; holdout validation still uses the primary `--seed` only.
27
+
28
+ ---
29
+
30
+ ## [0.2.5] - 2026-07-01
31
+
32
+ ### Added
33
+
34
+ - **`init-ci` label-audit workflow** — scaffold `driftless-label-audit.yml` (or
35
+ `-all` matrix) with `audit-labels --fail` on eval dataset path changes.
36
+ - **`init-ci` judge-check workflow** — scaffold `driftless-judge-check.yml` when
37
+ `eval.judge.calibration_path` is set; uses `--enforce` when gate thresholds
38
+ are configured.
39
+
40
+ ---
41
+
20
42
  ## [0.2.4] - 2026-07-01
21
43
 
22
44
  ### Fixed
@@ -120,8 +142,10 @@ First public release on [PyPI](https://pypi.org/project/driftless/0.1.0/).
120
142
  - **Docs** — project overview, repair algorithm spec, 2×2 migration methodology,
121
143
  Poetry + Dependabot product framing.
122
144
 
123
- [Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.4...HEAD
124
- [0.2.4]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.4
145
+ [Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.6...HEAD
146
+ [0.2.6]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.6
147
+ [0.2.5]: https://github.com/driftless-dev/driftless/compare/v0.2.4...v0.2.5
148
+ [0.2.4]: https://github.com/driftless-dev/driftless/compare/v0.2.3...v0.2.4
125
149
  [0.2.3]: https://github.com/driftless-dev/driftless/compare/v0.2.2...v0.2.3
126
150
  [0.2.2]: https://github.com/driftless-dev/driftless/compare/v0.2.1...v0.2.2
127
151
  [0.2.1]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: driftless
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: Keep prompts in sync when model or eval data changes — Poetry-style lock regeneration, Dependabot-style PRs.
5
5
  Project-URL: Homepage, https://github.com/driftless-dev/driftless
6
6
  Project-URL: Repository, https://github.com/driftless-dev/driftless
@@ -96,7 +96,7 @@ optimizes against it, with your team owning the definition of "good":
96
96
  |---|---|
97
97
  | `init` | Scaffold a `driftless.yml`. |
98
98
  | `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
99
- | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, and poll. |
99
+ | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, plan, label audit, and judge check. |
100
100
  | `scan` | Find probable LLM usage and at-risk models. |
101
101
  | `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
102
102
  | `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
@@ -129,11 +129,11 @@ propose it.
129
129
  ## GitHub-native usage
130
130
 
131
131
  A composite GitHub Action (`action.yml`) wraps the CLI so scans and migrations
132
- can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
133
- manually-triggered migration that opens a PR (or an issue when blocked).
132
+ can run in CI. See `.github/workflows/` for a scheduled deprecation scan, weekly
133
+ `plan --act` triage, and manually-triggered migration workflows.
134
134
 
135
135
  ```yaml
136
- - uses: driftless-dev/driftless@v0.2.4
136
+ - uses: driftless-dev/driftless@v0.2.6
137
137
  with:
138
138
  command: scan
139
139
  ```
@@ -57,7 +57,7 @@ optimizes against it, with your team owning the definition of "good":
57
57
  |---|---|
58
58
  | `init` | Scaffold a `driftless.yml`. |
59
59
  | `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
60
- | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, and poll. |
60
+ | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, plan, label audit, and judge check. |
61
61
  | `scan` | Find probable LLM usage and at-risk models. |
62
62
  | `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
63
63
  | `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
@@ -90,11 +90,11 @@ propose it.
90
90
  ## GitHub-native usage
91
91
 
92
92
  A composite GitHub Action (`action.yml`) wraps the CLI so scans and migrations
93
- can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
94
- manually-triggered migration that opens a PR (or an issue when blocked).
93
+ can run in CI. See `.github/workflows/` for a scheduled deprecation scan, weekly
94
+ `plan --act` triage, and manually-triggered migration workflows.
95
95
 
96
96
  ```yaml
97
- - uses: driftless-dev/driftless@v0.2.4
97
+ - uses: driftless-dev/driftless@v0.2.6
98
98
  with:
99
99
  command: scan
100
100
  ```
@@ -153,7 +153,7 @@ After a release, users can pin the composite Action by release tag
153
153
  (`action.yml` lives at the repo root — no `/action` path segment):
154
154
 
155
155
  ```yaml
156
- - uses: driftless-dev/driftless@v0.2.4
156
+ - uses: driftless-dev/driftless@v0.2.6
157
157
  with:
158
158
  command: scan
159
159
  ```
@@ -161,9 +161,9 @@ After a release, users can pin the composite Action by release tag
161
161
  Or pin the PyPI package in the Action input:
162
162
 
163
163
  ```yaml
164
- - uses: driftless-dev/driftless@v0.2.4
164
+ - uses: driftless-dev/driftless@v0.2.6
165
165
  with:
166
- version: "==0.2.4"
166
+ version: "==0.2.6"
167
167
  command: migrate
168
168
  ```
169
169
 
@@ -171,7 +171,7 @@ Optionally maintain a floating **`v1`** tag on the latest stable minor release
171
171
  (point it at the current release tag after each publish):
172
172
 
173
173
  ```bash
174
- git tag -f v1 v0.2.4 && git push origin v1 --force
174
+ git tag -f v1 v0.2.6 && git push origin v1 --force
175
175
  ```
176
176
 
177
177
  Update [`action.yml`](../action.yml) default `version` input when cutting releases.
@@ -213,7 +213,9 @@ In **Settings → Secrets and variables → Actions**, add:
213
213
  | `ANTHROPIC_API_KEY` | Live eval matrix job (`provider: anthropic`) |
214
214
 
215
215
  If a secret is missing, that provider job exits cleanly with a warning (CI stays
216
- green). When both are set, nightly runs append to
216
+ green). On scheduled or manual runs, the **secrets-preflight** job writes a
217
+ summary table to the workflow run so you can see which keys are configured.
218
+ When both are set, nightly runs append to
217
219
  `.driftless/regression-metrics.jsonl` and check against
218
220
  `tests/fixtures/live_eval_baseline.json` with `--require-all`.
219
221
 
@@ -428,7 +428,7 @@ driftless view -w support_classifier</code></pre>
428
428
  <span class="tok-k">runs-on</span>: ubuntu-latest
429
429
  <span class="tok-k">steps</span>:
430
430
  - <span class="tok-k">uses</span>: actions/checkout@v4
431
- - <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.4
431
+ - <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.6
432
432
  <span class="tok-k">with</span>:
433
433
  <span class="tok-k">command</span>: <span class="tok-s">plan</span></code></pre>
434
434
  <p>A scheduled <code class="inline">plan</code> gates CI when a deprecated model needs attention; a manually-triggered <code class="inline">migrate</code> opens a PR (or an issue when blocked) with the evidence attached.</p>
@@ -1,3 +1,3 @@
1
1
  """driftless: Dependabot for LLM models."""
2
2
 
3
- __version__ = "0.2.4"
3
+ __version__ = "0.2.6"
@@ -136,6 +136,16 @@ def init_ci(
136
136
  plan: bool = typer.Option(
137
137
  False, "--plan/--no-plan", help="Scaffold scheduled plan --act workflow."
138
138
  ),
139
+ audit_labels: bool | None = typer.Option(
140
+ None,
141
+ "--audit-labels/--no-audit-labels",
142
+ help="Scaffold label-audit CI workflow (default: on if labels_path is set).",
143
+ ),
144
+ judge_check: bool | None = typer.Option(
145
+ None,
146
+ "--judge-check/--no-judge-check",
147
+ help="Scaffold judge-calibration CI workflow (default: on if calibration_path is set).",
148
+ ),
139
149
  ) -> None:
140
150
  """Scaffold GitHub Actions workflows wired to the driftless composite Action."""
141
151
  from .init_ci import CHECKLIST, scaffold_ci_from_path
@@ -151,6 +161,8 @@ def init_ci(
151
161
  include_refine=refine,
152
162
  include_poll=poll,
153
163
  include_plan=plan,
164
+ include_audit_labels=audit_labels,
165
+ include_judge_check=judge_check,
154
166
  )
155
167
  except DriftlessError as exc:
156
168
  _fail(exc)
@@ -324,6 +324,16 @@ class MigrationSpec(StrictModel):
324
324
  allow_business_logic_edits: bool = False
325
325
  max_iterations: int = 8
326
326
  holdout_required: bool = True
327
+ # When >1, average tuning-split metrics across this many shuffle seeds
328
+ # (seed, seed+1, …) when scoring repair candidates. Holdout uses ``seed`` only.
329
+ split_seed_count: int = 1
330
+
331
+ @field_validator("split_seed_count")
332
+ @classmethod
333
+ def _split_seed_count_range(cls, v: int) -> int:
334
+ if v < 1 or v > 5:
335
+ raise ValueError("migration.split_seed_count must be between 1 and 5")
336
+ return v
327
337
 
328
338
 
329
339
  class RepairSpec(StrictModel):
@@ -30,10 +30,10 @@ from .calibrate import suggest_thresholds
30
30
  from .compare import ThresholdCheck, check_thresholds
31
31
  from .contract import ThresholdsSpec, Workflow
32
32
  from .errors import DriftlessError
33
- from .evaluation import Metrics, RecordRow, RunAnalysis, analyze
33
+ from .evaluation import Metrics, RecordRow, RunAnalysis, analyze, average_metrics
34
34
  from .harness import run_workflow
35
35
  from .progress import log as progress_log
36
- from .splits import make_splits, materialize_inputs
36
+ from .splits import Split, make_splits, materialize_inputs
37
37
 
38
38
 
39
39
  # --------------------------------------------------------------------------- #
@@ -336,6 +336,8 @@ class MigrationResult:
336
336
  message: str = ""
337
337
  # Frozen editable files at loop start — baseline for per-candidate diffs in reports/UI.
338
338
  original_editable_files: dict[str, str] = field(default_factory=dict)
339
+ # Shuffle seeds used for tuning (primary ``seed`` only when split_seed_count==1).
340
+ split_seeds_used: list[int] = field(default_factory=list)
339
341
 
340
342
  @property
341
343
  def succeeded(self) -> bool:
@@ -516,11 +518,19 @@ def run_migration(
516
518
  )
517
519
 
518
520
  split = make_splits(workflow, cwd=cwd, seed=seed)
521
+ split_seeds_used = list(range(seed, seed + mig.split_seed_count))
519
522
  size_warnings = assess_split_sizes(
520
523
  len(split.input_lines),
521
524
  len(split.holdout_idx),
522
525
  holdout_required=mig.holdout_required,
523
526
  )
527
+ if mig.split_seed_count > 1:
528
+ size_warnings.append(
529
+ f"Multi-seed tuning: candidate selection averages metrics across "
530
+ f"{mig.split_seed_count} shuffle seeds ({split_seeds_used[0]}.."
531
+ f"{split_seeds_used[-1]}); each candidate scoring multiplies tuning "
532
+ "workflow runs."
533
+ )
524
534
 
525
535
  use_ids = bool(workflow.eval.id_field) and split.gold is not None
526
536
 
@@ -532,18 +542,23 @@ def run_migration(
532
542
  return judge_evidence_samples(rows)
533
543
 
534
544
  def evaluate_on(
535
- model: str, idx: list[int], files: dict[str, str] | None = None
545
+ model: str,
546
+ idx: list[int],
547
+ files: dict[str, str] | None = None,
548
+ *,
549
+ split_ref: Split | None = None,
536
550
  ) -> RunAnalysis:
551
+ sp = split_ref or split
537
552
  file_ctx = apply_files(files, cwd=cwd) if files else nullcontext()
538
- idx_lines = split.lines_for(idx)
553
+ idx_lines = sp.lines_for(idx)
539
554
  with materialize_inputs(workflow, idx_lines, cwd=cwd):
540
555
  with file_ctx:
541
556
  run = run_workflow(workflow, model, cwd=cwd)
542
- if use_ids:
557
+ if use_ids and sp.gold_ids is not None:
543
558
  return analyze(
544
559
  workflow,
545
560
  run,
546
- gold_by_id=split.gold_by_id_for(idx),
561
+ gold_by_id=sp.gold_by_id_for(idx),
547
562
  inputs=idx_lines,
548
563
  judge=judge,
549
564
  cwd=cwd,
@@ -551,12 +566,27 @@ def run_migration(
551
566
  return analyze(
552
567
  workflow,
553
568
  run,
554
- gold_labels=split.gold_for(idx),
569
+ gold_labels=sp.gold_for(idx),
555
570
  inputs=idx_lines,
556
571
  judge=judge,
557
572
  cwd=cwd,
558
573
  )
559
574
 
575
+ def evaluate_tuning(
576
+ model: str, files: dict[str, str] | None = None
577
+ ) -> RunAnalysis:
578
+ """Score on the tuning split; average across seeds when configured."""
579
+ if mig.split_seed_count <= 1:
580
+ return evaluate_on(model, split.tuning_idx, files)
581
+ tuning_splits = [make_splits(workflow, cwd=cwd, seed=s) for s in split_seeds_used]
582
+ analyses = [
583
+ evaluate_on(model, sp.tuning_idx, files, split_ref=sp) for sp in tuning_splits
584
+ ]
585
+ return RunAnalysis(
586
+ metrics=average_metrics([a.metrics for a in analyses]),
587
+ rows=analyses[0].rows,
588
+ )
589
+
560
590
  progress_log(
561
591
  f"migration: phase 1/3 — initial eval "
562
592
  f"({len(split.tuning_idx)} tuning examples, model={current})"
@@ -597,6 +627,7 @@ def run_migration(
597
627
  holdout_checks=holdout_checks,
598
628
  tuning_checks=naive_checks,
599
629
  warnings=size_warnings,
630
+ split_seeds_used=split_seeds_used,
600
631
  judge_agreement=judge_agreement_info,
601
632
  judge_evidence=_judge_evidence(naive_analysis.rows),
602
633
  message="naive model swap passes thresholds; only the model ID changes",
@@ -691,7 +722,7 @@ def run_migration(
691
722
  cand_size = _patch_diff_size(patch.files, original_editable)
692
723
  try:
693
724
  validate_patch_scope(patch, workflow, cwd)
694
- analysis = evaluate_on(target_model, split.tuning_idx, files=patch.files)
725
+ analysis = evaluate_tuning(target_model, files=patch.files)
695
726
  except DriftlessError as exc:
696
727
  experiment_log.append(
697
728
  AttemptRecord(
@@ -786,6 +817,7 @@ def run_migration(
786
817
  experiment_log=experiment_log,
787
818
  cluster_history=cluster_history,
788
819
  warnings=size_warnings,
820
+ split_seeds_used=split_seeds_used,
789
821
  judge_agreement=judge_agreement_info,
790
822
  judge_evidence=_judge_evidence(best_analysis.rows),
791
823
  original_editable_files=original_editable,
@@ -855,6 +887,7 @@ def run_migration(
855
887
  experiment_log=experiment_log,
856
888
  cluster_history=cluster_history,
857
889
  warnings=size_warnings,
890
+ split_seeds_used=split_seeds_used,
858
891
  suggested_thresholds=suggested,
859
892
  judge_agreement=judge_agreement_info,
860
893
  judge_evidence=_judge_evidence(best_analysis.rows),
@@ -887,6 +920,7 @@ def run_migration(
887
920
  experiment_log=experiment_log,
888
921
  cluster_history=cluster_history,
889
922
  warnings=size_warnings,
923
+ split_seeds_used=split_seeds_used,
890
924
  judge_agreement=judge_agreement_info,
891
925
  judge_evidence=_judge_evidence(best_analysis.rows),
892
926
  original_editable_files=original_editable,
@@ -96,6 +96,39 @@ class Metrics:
96
96
  scored: int = 0
97
97
 
98
98
 
99
def average_metrics(items: list[Metrics]) -> Metrics:
    """Fold the metrics of several eval runs into one ``Metrics`` (multi-seed tuning).

    Rate/score/latency fields are the mean of the values that are not ``None``
    (``None`` when no run reported the field; ``refusal_rate`` falls back to
    ``0.0``). ``total_cost`` is the sum of the reported costs, the
    ``schema_errors`` and ``refusals`` counts are rounded means, and ``n``,
    ``labeled`` and ``scored`` are copied from the first run.

    A one-element list is returned as that same element, without copying.

    Raises:
        ValueError: if ``items`` is empty.
    """
    if not items:
        raise ValueError("average_metrics requires at least one Metrics")
    first = items[0]
    if len(items) == 1:
        return first

    def _avg_present(attr: str) -> float | None:
        # Runs that did not report the field (None) are left out of the mean.
        present = [v for v in (getattr(m, attr) for m in items) if v is not None]
        return sum(present) / len(present) if present else None

    def _avg_count(attr: str) -> int:
        # Counts stay integers: take the mean over all runs, then round.
        return int(round(sum(getattr(m, attr) for m in items) / len(items)))

    known_costs = [m.total_cost for m in items if m.total_cost is not None]
    return Metrics(
        n=first.n,
        schema_error_rate=_avg_present("schema_error_rate"),
        refusal_rate=_avg_present("refusal_rate") or 0.0,
        accuracy=_avg_present("accuracy"),
        precision=_avg_present("precision"),
        recall=_avg_present("recall"),
        f1=_avg_present("f1"),
        avg_latency_ms=_avg_present("avg_latency_ms"),
        # Cost is cumulative across runs, not averaged.
        total_cost=sum(known_costs) if known_costs else None,
        score=_avg_present("score"),
        schema_errors=_avg_count("schema_errors"),
        refusals=_avg_count("refusals"),
        labeled=first.labeled,
        scored=first.scored,
    )
130
+
131
+
99
132
  def load_jsonl(path: Path) -> list[OutputRecord]:
100
133
  records: list[OutputRecord] = []
101
134
  with path.open(encoding="utf-8") as fh:
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from dataclasses import dataclass
5
6
  from pathlib import Path
6
7
 
7
8
  from . import __version__
@@ -203,6 +204,204 @@ jobs:
203
204
  """
204
205
 
205
206
 
207
def label_audit_workflows(contract: Contract) -> list[str]:
    """Return the names of workflows that can be label-audited.

    A workflow qualifies when its ``eval.grading`` is ``"label"`` and its
    ``eval.labels_path`` is set (truthy). Names keep the iteration order of
    ``contract.workflows``.
    """
    return [
        wf_name
        for wf_name, wf in contract.workflows.items()
        if wf.eval.grading == "label" and wf.eval.labels_path
    ]
217
+
218
+
219
def label_audit_paths(contract: Contract) -> list[str]:
    """Collect the dataset paths of every label-audit workflow.

    Paths come from ``dataset_paths`` for each workflow returned by
    ``label_audit_workflows``; duplicates are dropped and first-seen order is kept.
    """
    # A dict doubles as an insertion-ordered set.
    unique: dict[str, None] = {}
    for wf_name in label_audit_workflows(contract):
        for dataset_path in dataset_paths(contract.workflows[wf_name]):
            unique.setdefault(dataset_path, None)
    return list(unique)
227
+
228
+
229
# Render the GitHub Actions YAML for the label-audit workflow.
# action_ref is placed in the step's `uses:`; workflow_names are the workflows to
# audit; paths feed the pull_request/push path filters via _path_filter_block.
# Returns the YAML text. Raises ValueError when workflow_names is empty.
+ def render_audit_labels_workflow(
230
+ action_ref: str,
231
+ workflow_names: list[str],
232
+ paths: list[str],
233
+ ) -> str:
234
# An empty list would leave nothing to audit, so reject it up front.
+ if not workflow_names:
235
+ raise ValueError("workflow_names must not be empty")
236
# The workflow title carries the workflow name only when there is exactly one.
+ title = (
237
+ f"driftless label audit ({workflow_names[0]})"
238
+ if len(workflow_names) == 1
239
+ else "driftless label audit"
240
+ )
241
# Single workflow: no matrix; the step names the workflow directly and runs
# `audit-labels` with "--fail".
+ if len(workflow_names) == 1:
242
+ matrix_block = ""
243
+ workflow_arg = workflow_names[0]
244
+ workflow_step = f"""\
245
+ - name: Audit gold labels ({workflow_names[0]})
246
+ uses: {action_ref}
247
+ with:
248
+ command: audit-labels
249
+ workflow: {workflow_arg}
250
+ args: "--fail"
251
+ """
252
# Several workflows: one matrix entry per name (written with Python repr quoting
# via !r, fail-fast disabled); the step reads the name from matrix.workflow.
+ else:
253
+ matrix_yaml = "\n".join(f" - {name!r}" for name in workflow_names)
254
+ matrix_block = f"""\
255
+ strategy:
256
+ fail-fast: false
257
+ matrix:
258
+ workflow:
259
+ {matrix_yaml}
260
+
261
+ """
262
+ workflow_step = f"""\
263
+ - name: Audit gold labels (${{{{ matrix.workflow }}}})
264
+ uses: {action_ref}
265
+ with:
266
+ command: audit-labels
267
+ workflow: ${{{{ matrix.workflow }}}}
268
+ args: "--fail"
269
+ """
270
# Assemble the document: triggers are pull_request and push to main (both
# path-filtered) plus workflow_dispatch; the single job is named `audit`.
+ return f"""\
271
+ name: {title}
272
+
273
+ # Fail CI when duplicate/near-duplicate inputs carry disagreeing gold labels.
274
+ on:
275
+ pull_request:
276
+ paths:
277
+ {_path_filter_block(paths)}\
278
+ push:
279
+ branches: [main]
280
+ paths:
281
+ {_path_filter_block(paths)}\
282
+ workflow_dispatch:
283
+
284
+ jobs:
285
+ audit:
286
+ runs-on: ubuntu-latest
287
+ {matrix_block}\
288
+ steps:
289
+ - uses: actions/checkout@v4
290
+ {workflow_step}\
291
+ """
292
+
293
+
294
@dataclass(frozen=True)
class JudgeCheckTarget:
    """One workflow to include in the generated judge-check CI workflow (immutable)."""

    # Workflow name, i.e. its key in ``contract.workflows``.
    name: str
    # Value of the workflow's ``eval.judge.calibration_path``.
    calibration_path: str
    # True when the judge spec sets ``max_mae`` or ``min_correlation``; the
    # rendered step then passes "--enforce".
    enforce: bool
299
+
300
+
301
def judge_check_targets(contract: Contract) -> list[JudgeCheckTarget]:
    """List the workflows that should get a judge-calibration check.

    A workflow qualifies when ``eval.grading`` is ``"judge"``, ``eval.judge`` is
    present, and the judge spec has a ``calibration_path``. ``enforce`` is set
    when the spec defines ``max_mae`` or ``min_correlation``. Targets keep the
    iteration order of ``contract.workflows``.
    """
    found: list[JudgeCheckTarget] = []
    for wf_name, wf in contract.workflows.items():
        if wf.eval.grading == "judge" and wf.eval.judge is not None:
            judge_spec = wf.eval.judge
            if judge_spec.calibration_path:
                # Either gate threshold being configured turns enforcement on.
                has_gate = not (
                    judge_spec.max_mae is None and judge_spec.min_correlation is None
                )
                found.append(
                    JudgeCheckTarget(
                        name=wf_name,
                        calibration_path=judge_spec.calibration_path,
                        enforce=has_gate,
                    )
                )
    return found
319
+
320
+
321
def judge_check_paths(contract: Contract) -> list[str]:
    """Return the distinct calibration paths of all judge-check targets, in first-seen order."""
    ordered = dict.fromkeys(
        target.calibration_path for target in judge_check_targets(contract)
    )
    return list(ordered)
327
+
328
+
329
# Render the GitHub Actions YAML for the judge-calibration check workflow.
# action_ref is placed in the step's `uses:`; targets are the workflows to check;
# paths feed the pull_request/push path filters via _path_filter_block.
# Returns the YAML text. Raises ValueError when targets is empty.
+ def render_judge_check_workflow(
330
+ action_ref: str,
331
+ targets: list[JudgeCheckTarget],
332
+ paths: list[str],
333
+ ) -> str:
334
# An empty list would leave nothing to check, so reject it up front.
+ if not targets:
335
+ raise ValueError("targets must not be empty")
336
# The workflow title carries the target name only when there is exactly one.
+ title = (
337
+ f"driftless judge check ({targets[0].name})"
338
+ if len(targets) == 1
339
+ else "driftless judge check"
340
+ )
341
# Single target: no matrix; `args` is the quoted "--enforce" flag when the
# target enforces, otherwise a quoted empty string.
+ if len(targets) == 1:
342
+ target = targets[0]
343
+ matrix_block = ""
344
+ args = '"--enforce"' if target.enforce else '""'
345
+ workflow_step = f"""\
346
+ - name: Judge calibration check ({target.name})
347
+ uses: {action_ref}
348
+ with:
349
+ command: judge-check
350
+ workflow: {target.name}
351
+ args: {args}
352
+ env:
353
+ {_provider_env_block()}\
354
+ """
355
# Several targets: a matrix `include` list pairs each workflow name with its
# own args, so enforcement can differ per workflow.
+ else:
356
+ include_lines: list[str] = []
357
# One include entry per target; the name is written with Python repr quoting (!r).
+ for target in targets:
358
+ args = '"--enforce"' if target.enforce else '""'
359
+ include_lines.append(
360
+ f" - workflow: {target.name!r}\n"
361
+ f" args: {args}"
362
+ )
363
+ matrix_block = (
364
+ " strategy:\n"
365
+ " fail-fast: false\n"
366
+ " matrix:\n"
367
+ " include:\n"
368
+ + "\n".join(include_lines)
369
+ + "\n\n"
370
+ )
371
+ workflow_step = f"""\
372
+ - name: Judge calibration check (${{{{ matrix.workflow }}}})
373
+ uses: {action_ref}
374
+ with:
375
+ command: judge-check
376
+ workflow: ${{{{ matrix.workflow }}}}
377
+ args: ${{{{ matrix.args }}}}
378
+ env:
379
+ {_provider_env_block()}\
380
+ """
381
# Assemble the document: triggers are pull_request and push to main (both
# path-filtered) plus workflow_dispatch; the single job is named `judge-check`.
+ return f"""\
382
+ name: {title}
383
+
384
+ # Measure LLM-judge agreement against human-scored calibration records.
385
+ on:
386
+ pull_request:
387
+ paths:
388
+ {_path_filter_block(paths)}\
389
+ push:
390
+ branches: [main]
391
+ paths:
392
+ {_path_filter_block(paths)}\
393
+ workflow_dispatch:
394
+
395
+ jobs:
396
+ judge-check:
397
+ runs-on: ubuntu-latest
398
+ {matrix_block}\
399
+ steps:
400
+ - uses: actions/checkout@v4
401
+ {workflow_step}\
402
+ """
403
+
404
+
206
405
  def render_plan_workflow(action_ref: str) -> str:
207
406
  return f"""\
208
407
  name: driftless plan (deprecation triage)
@@ -251,6 +450,8 @@ def scaffold_ci(
251
450
  include_refine: bool = True,
252
451
  include_poll: bool | None = None,
253
452
  include_plan: bool = False,
453
+ include_audit_labels: bool | None = None,
454
+ include_judge_check: bool | None = None,
254
455
  ) -> list[Path]:
255
456
  """Write GitHub workflow YAML files under ``out_dir``."""
256
457
  action_ref = action_ref or default_action_ref()
@@ -293,10 +494,52 @@ def scaffold_ci(
293
494
  if include_plan:
294
495
  write(out_dir / "driftless-plan-act.yml", render_plan_workflow(action_ref))
295
496
 
497
+ audit_names = label_audit_workflows(contract)
498
+ audit_needed = include_audit_labels
499
+ if audit_needed is None:
500
+ audit_needed = bool(audit_names)
501
+ if audit_needed:
502
+ if not audit_names:
503
+ raise DriftlessError(
504
+ "label audit workflow requires a classification workflow with eval.labels_path",
505
+ hint="add labels_path to a workflow or pass --no-audit-labels",
506
+ )
507
+ audit_paths = label_audit_paths(contract)
508
+ fname = (
509
+ "driftless-label-audit.yml"
510
+ if len(audit_names) == 1
511
+ else "driftless-label-audit-all.yml"
512
+ )
513
+ write(
514
+ out_dir / fname,
515
+ render_audit_labels_workflow(action_ref, audit_names, audit_paths),
516
+ )
517
+
518
+ judge_targets = judge_check_targets(contract)
519
+ judge_needed = include_judge_check
520
+ if judge_needed is None:
521
+ judge_needed = bool(judge_targets)
522
+ if judge_needed:
523
+ if not judge_targets:
524
+ raise DriftlessError(
525
+ "judge-check workflow requires eval.judge.calibration_path",
526
+ hint="add a human-scored calibration set or pass --no-judge-check",
527
+ )
528
+ judge_paths = judge_check_paths(contract)
529
+ fname = (
530
+ "driftless-judge-check.yml"
531
+ if len(judge_targets) == 1
532
+ else "driftless-judge-check-all.yml"
533
+ )
534
+ write(
535
+ out_dir / fname,
536
+ render_judge_check_workflow(action_ref, judge_targets, judge_paths),
537
+ )
538
+
296
539
  if not written:
297
540
  raise DriftlessError(
298
541
  "nothing to scaffold",
299
- hint="enable at least one of scan, migrate, refine, poll, or plan",
542
+ hint="enable at least one of scan, migrate, refine, poll, plan, audit-labels, or judge-check",
300
543
  )
301
544
  return written
302
545
 
@@ -321,5 +564,7 @@ Next steps:
321
564
  2. For poll workflows: DRIFTLESS_DATASOURCE_TOKEN if eval.data_source URLs need auth.
322
565
  3. Confirm workflow path filters match your eval dataset paths in driftless.yml.
323
566
  4. Run driftless validate -w <workflow> locally before enabling scheduled jobs.
324
- 5. Pin the Action ref when upgrading: uses: driftless-dev/driftless@vX.Y.Z
567
+ 5. Run driftless audit-labels -w <workflow> locally; CI uses --fail on label conflicts.
568
+ 6. For judge-graded workflows: driftless judge-check -w <workflow> --enforce when gates are set.
569
+ 7. Pin the Action ref when upgrading: uses: driftless-dev/driftless@vX.Y.Z
325
570
  """
@@ -568,6 +568,7 @@ def result_to_dict(result: MigrationResult) -> dict:
568
568
  "experiment_log": [asdict(a) for a in result.experiment_log],
569
569
  "cluster_trajectory": cluster_trajectories(result.cluster_history),
570
570
  "warnings": result.warnings,
571
+ "split_seeds_used": result.split_seeds_used,
571
572
  "judge_agreement": asdict(result.judge_agreement) if result.judge_agreement else None,
572
573
  "judge_evidence": result.judge_evidence,
573
574
  "suggested_thresholds": result.suggested_thresholds,