driftless-0.2.5.tar.gz → driftless-0.2.7.tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {driftless-0.2.5 → driftless-0.2.7}/CHANGELOG.md +24 -2
  2. {driftless-0.2.5 → driftless-0.2.7}/PKG-INFO +5 -5
  3. {driftless-0.2.5 → driftless-0.2.7}/README.md +4 -4
  4. {driftless-0.2.5 → driftless-0.2.7}/docs/RELEASE.md +7 -5
  5. {driftless-0.2.5 → driftless-0.2.7}/site/docs.html +1 -1
  6. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/__init__.py +1 -1
  7. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/cli.py +5 -0
  8. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/compare.py +12 -1
  9. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/contract.py +10 -0
  10. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/engine.py +51 -8
  11. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/evaluation.py +60 -0
  12. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/report.py +1 -0
  13. {driftless-0.2.5 → driftless-0.2.7}/tests/test_contract.py +11 -0
  14. {driftless-0.2.5 → driftless-0.2.7}/tests/test_engine.py +11 -0
  15. {driftless-0.2.5 → driftless-0.2.7}/tests/test_evaluation.py +30 -0
  16. {driftless-0.2.5 → driftless-0.2.7}/tests/test_init_ci.py +40 -0
  17. driftless-0.2.7/tests/test_splits.py +27 -0
  18. {driftless-0.2.5 → driftless-0.2.7}/.gitignore +0 -0
  19. {driftless-0.2.5 → driftless-0.2.7}/LICENSE +0 -0
  20. {driftless-0.2.5 → driftless-0.2.7}/docs/repair-and-generators.md +0 -0
  21. {driftless-0.2.5 → driftless-0.2.7}/pyproject.toml +0 -0
  22. {driftless-0.2.5 → driftless-0.2.7}/site/assets/app.js +0 -0
  23. {driftless-0.2.5 → driftless-0.2.7}/site/assets/hero-workflow.png +0 -0
  24. {driftless-0.2.5 → driftless-0.2.7}/site/assets/landing.css +0 -0
  25. {driftless-0.2.5 → driftless-0.2.7}/site/assets/runs.css +0 -0
  26. {driftless-0.2.5 → driftless-0.2.7}/site/assets/runs.js +0 -0
  27. {driftless-0.2.5 → driftless-0.2.7}/site/assets/sample-run.json +0 -0
  28. {driftless-0.2.5 → driftless-0.2.7}/site/assets/styles.css +0 -0
  29. {driftless-0.2.5 → driftless-0.2.7}/site/index.html +0 -0
  30. {driftless-0.2.5 → driftless-0.2.7}/site/runs.html +0 -0
  31. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/calibrate.py +0 -0
  32. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/configure.py +0 -0
  33. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/data/model_lifecycle.json +0 -0
  34. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/datasource.py +0 -0
  35. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/datastate.py +0 -0
  36. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/discovery.py +0 -0
  37. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/errors.py +0 -0
  38. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/generators.py +0 -0
  39. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/github.py +0 -0
  40. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/harness.py +0 -0
  41. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/init_ci.py +0 -0
  42. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/judges.py +0 -0
  43. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/label_audit.py +0 -0
  44. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/lifecycle.py +0 -0
  45. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/policy.py +0 -0
  46. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/preflight.py +0 -0
  47. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/progress.py +0 -0
  48. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/scanner.py +0 -0
  49. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/splits.py +0 -0
  50. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/templates.py +0 -0
  51. {driftless-0.2.5 → driftless-0.2.7}/src/driftless/view.py +0 -0
  52. {driftless-0.2.5 → driftless-0.2.7}/tests/fixtures/live_eval_baseline.json +0 -0
  53. {driftless-0.2.5 → driftless-0.2.7}/tests/fixtures/smoke/driftless.yml +0 -0
  54. {driftless-0.2.5 → driftless-0.2.7}/tests/fixtures/smoke/inputs.jsonl +0 -0
  55. {driftless-0.2.5 → driftless-0.2.7}/tests/fixtures/smoke/labels.jsonl +0 -0
  56. {driftless-0.2.5 → driftless-0.2.7}/tests/regression_metrics.py +0 -0
  57. {driftless-0.2.5 → driftless-0.2.7}/tests/scenarios.py +0 -0
  58. {driftless-0.2.5 → driftless-0.2.7}/tests/test_cli.py +0 -0
  59. {driftless-0.2.5 → driftless-0.2.7}/tests/test_compare.py +0 -0
  60. {driftless-0.2.5 → driftless-0.2.7}/tests/test_data_change_gate.py +0 -0
  61. {driftless-0.2.5 → driftless-0.2.7}/tests/test_data_change_regression.py +0 -0
  62. {driftless-0.2.5 → driftless-0.2.7}/tests/test_datasource.py +0 -0
  63. {driftless-0.2.5 → driftless-0.2.7}/tests/test_datastate.py +0 -0
  64. {driftless-0.2.5 → driftless-0.2.7}/tests/test_discovery.py +0 -0
  65. {driftless-0.2.5 → driftless-0.2.7}/tests/test_endpoint.py +0 -0
  66. {driftless-0.2.5 → driftless-0.2.7}/tests/test_extraction.py +0 -0
  67. {driftless-0.2.5 → driftless-0.2.7}/tests/test_generators.py +0 -0
  68. {driftless-0.2.5 → driftless-0.2.7}/tests/test_github.py +0 -0
  69. {driftless-0.2.5 → driftless-0.2.7}/tests/test_grading_loop.py +0 -0
  70. {driftless-0.2.5 → driftless-0.2.7}/tests/test_harness.py +0 -0
  71. {driftless-0.2.5 → driftless-0.2.7}/tests/test_judge.py +0 -0
  72. {driftless-0.2.5 → driftless-0.2.7}/tests/test_judge_loop.py +0 -0
  73. {driftless-0.2.5 → driftless-0.2.7}/tests/test_label_audit.py +0 -0
  74. {driftless-0.2.5 → driftless-0.2.7}/tests/test_lifecycle.py +0 -0
  75. {driftless-0.2.5 → driftless-0.2.7}/tests/test_migration_live.py +0 -0
  76. {driftless-0.2.5 → driftless-0.2.7}/tests/test_migration_regression.py +0 -0
  77. {driftless-0.2.5 → driftless-0.2.7}/tests/test_plan_act.py +0 -0
  78. {driftless-0.2.5 → driftless-0.2.7}/tests/test_policy.py +0 -0
  79. {driftless-0.2.5 → driftless-0.2.7}/tests/test_poll_act.py +0 -0
  80. {driftless-0.2.5 → driftless-0.2.7}/tests/test_preflight.py +0 -0
  81. {driftless-0.2.5 → driftless-0.2.7}/tests/test_progress.py +0 -0
  82. {driftless-0.2.5 → driftless-0.2.7}/tests/test_refine.py +0 -0
  83. {driftless-0.2.5 → driftless-0.2.7}/tests/test_refresh_catalog.py +0 -0
  84. {driftless-0.2.5 → driftless-0.2.7}/tests/test_regression_metrics.py +0 -0
  85. {driftless-0.2.5 → driftless-0.2.7}/tests/test_repair_prompt.py +0 -0
  86. {driftless-0.2.5 → driftless-0.2.7}/tests/test_report.py +0 -0
  87. {driftless-0.2.5 → driftless-0.2.7}/tests/test_scanner.py +0 -0
  88. {driftless-0.2.5 → driftless-0.2.7}/tests/test_view.py +0 -0
@@ -17,6 +17,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
17
17
 
18
18
  ---
19
19
 
20
+ ## [0.2.7] - 2026-07-01
21
+
22
+ ### Added
23
+
24
+ - **P0.3 per-class support floors** — warn when any class has fewer than five gold
25
+ examples on a split (`assess_class_support`); surfaced on `migrate` (tuning +
26
+ holdout), `compare` (baseline + target), CLI "Confidence caveats", and saved
27
+ compare JSON.
28
+
29
+ ---
30
+
31
+ ## [0.2.6] - 2026-07-01
32
+
33
+ ### Added
34
+
35
+ - **P0.3 multi-seed tuning selection** — optional `migration.split_seed_count`
36
+ (1–5) averages tuning-split metrics across shuffle seeds when scoring repair
37
+ candidates; holdout validation still uses the primary `--seed` only.
38
+
39
+ ---
40
+
20
41
  ## [0.2.5] - 2026-07-01
21
42
 
22
43
  ### Added
@@ -132,8 +153,9 @@ First public release on [PyPI](https://pypi.org/project/driftless/0.1.0/).
132
153
  - **Docs** — project overview, repair algorithm spec, 2×2 migration methodology,
133
154
  Poetry + Dependabot product framing.
134
155
 
135
- [Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.5...HEAD
136
- [0.2.5]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.5
156
+ [Unreleased]: https://github.com/driftless-dev/driftless/compare/v0.2.7...HEAD
157
+ [0.2.7]: https://github.com/driftless-dev/driftless/releases/tag/v0.2.7
158
+ [0.2.6]: https://github.com/driftless-dev/driftless/compare/v0.2.6...v0.2.7
137
159
  [0.2.4]: https://github.com/driftless-dev/driftless/compare/v0.2.4...v0.2.5
138
160
  [0.2.3]: https://github.com/driftless-dev/driftless/compare/v0.2.3...v0.2.4
139
161
  [0.2.2]: https://github.com/driftless-dev/driftless/compare/v0.2.2...v0.2.3
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: driftless
3
- Version: 0.2.5
3
+ Version: 0.2.7
4
4
  Summary: Keep prompts in sync when model or eval data changes — Poetry-style lock regeneration, Dependabot-style PRs.
5
5
  Project-URL: Homepage, https://github.com/driftless-dev/driftless
6
6
  Project-URL: Repository, https://github.com/driftless-dev/driftless
@@ -96,7 +96,7 @@ optimizes against it, with your team owning the definition of "good":
96
96
  |---|---|
97
97
  | `init` | Scaffold a `driftless.yml`. |
98
98
  | `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
99
- | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, label audit, and judge check. |
99
+ | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, plan, label audit, and judge check. |
100
100
  | `scan` | Find probable LLM usage and at-risk models. |
101
101
  | `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
102
102
  | `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
@@ -129,11 +129,11 @@ propose it.
129
129
  ## GitHub-native usage
130
130
 
131
131
  A composite GitHub Action (`action.yml`) wraps the CLI so scans and migrations
132
- can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
133
- manually-triggered migration that opens a PR (or an issue when blocked).
132
+ can run in CI. See `.github/workflows/` for a scheduled deprecation scan, weekly
133
+ `plan --act` triage, and manually-triggered migration workflows.
134
134
 
135
135
  ```yaml
136
- - uses: driftless-dev/driftless@v0.2.5
136
+ - uses: driftless-dev/driftless@v0.2.7
137
137
  with:
138
138
  command: scan
139
139
  ```
@@ -57,7 +57,7 @@ optimizes against it, with your team owning the definition of "good":
57
57
  |---|---|
58
58
  | `init` | Scaffold a `driftless.yml`. |
59
59
  | `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
60
- | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, label audit, and judge check. |
60
+ | `init-ci` | Scaffold `.github/workflows/` for scan, migrate, refine, poll, plan, label audit, and judge check. |
61
61
  | `scan` | Find probable LLM usage and at-risk models. |
62
62
  | `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
63
63
  | `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
@@ -90,11 +90,11 @@ propose it.
90
90
  ## GitHub-native usage
91
91
 
92
92
  A composite GitHub Action (`action.yml`) wraps the CLI so scans and migrations
93
- can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
94
- manually-triggered migration that opens a PR (or an issue when blocked).
93
+ can run in CI. See `.github/workflows/` for a scheduled deprecation scan, weekly
94
+ `plan --act` triage, and manually-triggered migration workflows.
95
95
 
96
96
  ```yaml
97
- - uses: driftless-dev/driftless@v0.2.5
97
+ - uses: driftless-dev/driftless@v0.2.7
98
98
  with:
99
99
  command: scan
100
100
  ```
@@ -153,7 +153,7 @@ After a release, users can pin the composite Action by release tag
153
153
  (`action.yml` lives at the repo root — no `/action` path segment):
154
154
 
155
155
  ```yaml
156
- - uses: driftless-dev/driftless@v0.2.5
156
+ - uses: driftless-dev/driftless@v0.2.7
157
157
  with:
158
158
  command: scan
159
159
  ```
@@ -161,9 +161,9 @@ After a release, users can pin the composite Action by release tag
161
161
  Or pin the PyPI package in the Action input:
162
162
 
163
163
  ```yaml
164
- - uses: driftless-dev/driftless@v0.2.5
164
+ - uses: driftless-dev/driftless@v0.2.7
165
165
  with:
166
- version: "==0.2.5"
166
+ version: "==0.2.7"
167
167
  command: migrate
168
168
  ```
169
169
 
@@ -171,7 +171,7 @@ Optionally maintain a floating **`v1`** tag on the latest stable minor release
171
171
  (point it at the current release tag after each publish):
172
172
 
173
173
  ```bash
174
- git tag -f v1 v0.2.5 && git push origin v1 --force
174
+ git tag -f v1 v0.2.7 && git push origin v1 --force
175
175
  ```
176
176
 
177
177
  Update [`action.yml`](../action.yml) default `version` input when cutting releases.
@@ -213,7 +213,9 @@ In **Settings → Secrets and variables → Actions**, add:
213
213
  | `ANTHROPIC_API_KEY` | Live eval matrix job (`provider: anthropic`) |
214
214
 
215
215
  If a secret is missing, that provider job exits cleanly with a warning (CI stays
216
- green). When both are set, nightly runs append to
216
+ green). On scheduled or manual runs, the **secrets-preflight** job writes a
217
+ summary table to the workflow run so you can see which keys are configured.
218
+ When both are set, nightly runs append to
217
219
  `.driftless/regression-metrics.jsonl` and check against
218
220
  `tests/fixtures/live_eval_baseline.json` with `--require-all`.
219
221
 
@@ -428,7 +428,7 @@ driftless view -w support_classifier</code></pre>
428
428
  <span class="tok-k">runs-on</span>: ubuntu-latest
429
429
  <span class="tok-k">steps</span>:
430
430
  - <span class="tok-k">uses</span>: actions/checkout@v4
431
- - <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.5
431
+ - <span class="tok-k">uses</span>: driftless-dev/driftless@v0.2.7
432
432
  <span class="tok-k">with</span>:
433
433
  <span class="tok-k">command</span>: <span class="tok-s">plan</span></code></pre>
434
434
  <p>A scheduled <code class="inline">plan</code> gates CI when a deprecated model needs attention; a manually-triggered <code class="inline">migrate</code> opens a PR (or an issue when blocked) with the evidence attached.</p>
@@ -1,3 +1,3 @@
1
1
  """driftless: Dependabot for LLM models."""
2
2
 
3
- __version__ = "0.2.5"
3
+ __version__ = "0.2.7"
@@ -446,6 +446,11 @@ def compare(
446
446
 
447
447
  console.print(_scorecard(comparison))
448
448
 
449
+ if comparison.warnings:
450
+ console.print("\n[bold yellow]Confidence caveats[/]:")
451
+ for w in comparison.warnings:
452
+ console.print(f" • {w}")
453
+
449
454
  console.print("\n[bold]Thresholds[/] (target vs contract):")
450
455
  if not comparison.checks:
451
456
  console.print(" [dim]no thresholds configured[/]")
@@ -15,7 +15,7 @@ from typing import cast
15
15
 
16
16
  from .contract import ThresholdsSpec, Workflow
17
17
  from .errors import DriftlessError
18
- from .evaluation import Metrics, evaluate
18
+ from .evaluation import Metrics, assess_class_support, evaluate
19
19
  from .harness import run_workflow
20
20
  from .progress import log as progress_log
21
21
 
@@ -35,6 +35,7 @@ class Comparison:
35
35
  baseline: Metrics
36
36
  target: Metrics
37
37
  checks: list[ThresholdCheck] = field(default_factory=list)
38
+ warnings: list[str] = field(default_factory=list)
38
39
 
39
40
  @property
40
41
  def passed(self) -> bool:
@@ -218,6 +219,14 @@ def compare_models(
218
219
  )
219
220
 
220
221
  checks = check_thresholds(workflow.thresholds, baseline_metrics, target_metrics)
222
+ warnings: list[str] = []
223
+ for metrics, label in (
224
+ (baseline_metrics, "baseline"),
225
+ (target_metrics, "target"),
226
+ ):
227
+ for w in assess_class_support(metrics, context=f"{label} eval"):
228
+ if w not in warnings:
229
+ warnings.append(w)
221
230
 
222
231
  return Comparison(
223
232
  workflow=workflow_name,
@@ -226,6 +235,7 @@ def compare_models(
226
235
  baseline=baseline_metrics,
227
236
  target=target_metrics,
228
237
  checks=checks,
238
+ warnings=warnings,
229
239
  )
230
240
 
231
241
 
@@ -241,6 +251,7 @@ def save_comparison(comparison: Comparison, cwd: Path | None = None) -> Path:
241
251
  "baseline": asdict(comparison.baseline),
242
252
  "target": asdict(comparison.target),
243
253
  "checks": [asdict(c) for c in comparison.checks],
254
+ "warnings": comparison.warnings,
244
255
  "passed": comparison.passed,
245
256
  }
246
257
  out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
@@ -324,6 +324,16 @@ class MigrationSpec(StrictModel):
324
324
  allow_business_logic_edits: bool = False
325
325
  max_iterations: int = 8
326
326
  holdout_required: bool = True
327
+ # When >1, average tuning-split metrics across this many shuffle seeds
328
+ # (seed, seed+1, …) when scoring repair candidates. Holdout uses ``seed`` only.
329
+ split_seed_count: int = 1
330
+
331
+ @field_validator("split_seed_count")
332
+ @classmethod
333
+ def _split_seed_count_range(cls, v: int) -> int:
334
+ if v < 1 or v > 5:
335
+ raise ValueError("migration.split_seed_count must be between 1 and 5")
336
+ return v
327
337
 
328
338
 
329
339
  class RepairSpec(StrictModel):
@@ -30,10 +30,10 @@ from .calibrate import suggest_thresholds
30
30
  from .compare import ThresholdCheck, check_thresholds
31
31
  from .contract import ThresholdsSpec, Workflow
32
32
  from .errors import DriftlessError
33
- from .evaluation import Metrics, RecordRow, RunAnalysis, analyze
33
+ from .evaluation import Metrics, RecordRow, RunAnalysis, analyze, average_metrics, assess_class_support
34
34
  from .harness import run_workflow
35
35
  from .progress import log as progress_log
36
- from .splits import make_splits, materialize_inputs
36
+ from .splits import Split, make_splits, materialize_inputs
37
37
 
38
38
 
39
39
  # --------------------------------------------------------------------------- #
@@ -336,6 +336,8 @@ class MigrationResult:
336
336
  message: str = ""
337
337
  # Frozen editable files at loop start — baseline for per-candidate diffs in reports/UI.
338
338
  original_editable_files: dict[str, str] = field(default_factory=dict)
339
+ # Shuffle seeds used for tuning (primary ``seed`` only when split_seed_count==1).
340
+ split_seeds_used: list[int] = field(default_factory=list)
339
341
 
340
342
  @property
341
343
  def succeeded(self) -> bool:
@@ -516,11 +518,19 @@ def run_migration(
516
518
  )
517
519
 
518
520
  split = make_splits(workflow, cwd=cwd, seed=seed)
521
+ split_seeds_used = list(range(seed, seed + mig.split_seed_count))
519
522
  size_warnings = assess_split_sizes(
520
523
  len(split.input_lines),
521
524
  len(split.holdout_idx),
522
525
  holdout_required=mig.holdout_required,
523
526
  )
527
+ if mig.split_seed_count > 1:
528
+ size_warnings.append(
529
+ f"Multi-seed tuning: candidate selection averages metrics across "
530
+ f"{mig.split_seed_count} shuffle seeds ({split_seeds_used[0]}.."
531
+ f"{split_seeds_used[-1]}); each candidate scoring multiplies tuning "
532
+ "workflow runs."
533
+ )
524
534
 
525
535
  use_ids = bool(workflow.eval.id_field) and split.gold is not None
526
536
 
@@ -532,18 +542,23 @@ def run_migration(
532
542
  return judge_evidence_samples(rows)
533
543
 
534
544
  def evaluate_on(
535
- model: str, idx: list[int], files: dict[str, str] | None = None
545
+ model: str,
546
+ idx: list[int],
547
+ files: dict[str, str] | None = None,
548
+ *,
549
+ split_ref: Split | None = None,
536
550
  ) -> RunAnalysis:
551
+ sp = split_ref or split
537
552
  file_ctx = apply_files(files, cwd=cwd) if files else nullcontext()
538
- idx_lines = split.lines_for(idx)
553
+ idx_lines = sp.lines_for(idx)
539
554
  with materialize_inputs(workflow, idx_lines, cwd=cwd):
540
555
  with file_ctx:
541
556
  run = run_workflow(workflow, model, cwd=cwd)
542
- if use_ids:
557
+ if use_ids and sp.gold_ids is not None:
543
558
  return analyze(
544
559
  workflow,
545
560
  run,
546
- gold_by_id=split.gold_by_id_for(idx),
561
+ gold_by_id=sp.gold_by_id_for(idx),
547
562
  inputs=idx_lines,
548
563
  judge=judge,
549
564
  cwd=cwd,
@@ -551,18 +566,34 @@ def run_migration(
551
566
  return analyze(
552
567
  workflow,
553
568
  run,
554
- gold_labels=split.gold_for(idx),
569
+ gold_labels=sp.gold_for(idx),
555
570
  inputs=idx_lines,
556
571
  judge=judge,
557
572
  cwd=cwd,
558
573
  )
559
574
 
575
+ def evaluate_tuning(
576
+ model: str, files: dict[str, str] | None = None
577
+ ) -> RunAnalysis:
578
+ """Score on the tuning split; average across seeds when configured."""
579
+ if mig.split_seed_count <= 1:
580
+ return evaluate_on(model, split.tuning_idx, files)
581
+ tuning_splits = [make_splits(workflow, cwd=cwd, seed=s) for s in split_seeds_used]
582
+ analyses = [
583
+ evaluate_on(model, sp.tuning_idx, files, split_ref=sp) for sp in tuning_splits
584
+ ]
585
+ return RunAnalysis(
586
+ metrics=average_metrics([a.metrics for a in analyses]),
587
+ rows=analyses[0].rows,
588
+ )
589
+
560
590
  progress_log(
561
591
  f"migration: phase 1/3 — initial eval "
562
592
  f"({len(split.tuning_idx)} tuning examples, model={current})"
563
593
  )
564
594
  progress_log("migration: phase 1/3 — baseline prompt on tuning split...")
565
595
  baseline_tuning = evaluate_on(current, split.tuning_idx).metrics
596
+ size_warnings.extend(assess_class_support(baseline_tuning, context="tuning split"))
566
597
  progress_log(f"migration: phase 1/3 — baseline F1={_fmt_f1(baseline_tuning.f1)}")
567
598
  progress_log("migration: phase 1/3 — current prompt on tuning split...")
568
599
  naive_analysis = evaluate_on(target_model, split.tuning_idx)
@@ -575,8 +606,15 @@ def run_migration(
575
606
  baseline_holdout = evaluate_on(current, split.holdout_idx).metrics
576
607
  holdout_metrics = evaluate_on(target_model, split.holdout_idx, files=files).metrics
577
608
  checks = check_thresholds(thresholds, baseline_holdout, holdout_metrics)
609
+ append_holdout_class_warnings(holdout_metrics)
578
610
  return all(c.passed for c in checks), holdout_metrics, checks
579
611
 
612
+ def append_holdout_class_warnings(holdout_metrics: Metrics | None) -> None:
613
+ if holdout_metrics is not None:
614
+ size_warnings.extend(
615
+ assess_class_support(holdout_metrics, context="holdout split")
616
+ )
617
+
580
618
  # Step: naive target already good? (migrate only -- in refine the model is
581
619
  # pinned, so the "naive target" is just the current prompt and there's no
582
620
  # model-only change to short-circuit on.)
@@ -597,6 +635,7 @@ def run_migration(
597
635
  holdout_checks=holdout_checks,
598
636
  tuning_checks=naive_checks,
599
637
  warnings=size_warnings,
638
+ split_seeds_used=split_seeds_used,
600
639
  judge_agreement=judge_agreement_info,
601
640
  judge_evidence=_judge_evidence(naive_analysis.rows),
602
641
  message="naive model swap passes thresholds; only the model ID changes",
@@ -691,7 +730,7 @@ def run_migration(
691
730
  cand_size = _patch_diff_size(patch.files, original_editable)
692
731
  try:
693
732
  validate_patch_scope(patch, workflow, cwd)
694
- analysis = evaluate_on(target_model, split.tuning_idx, files=patch.files)
733
+ analysis = evaluate_tuning(target_model, files=patch.files)
695
734
  except DriftlessError as exc:
696
735
  experiment_log.append(
697
736
  AttemptRecord(
@@ -786,6 +825,7 @@ def run_migration(
786
825
  experiment_log=experiment_log,
787
826
  cluster_history=cluster_history,
788
827
  warnings=size_warnings,
828
+ split_seeds_used=split_seeds_used,
789
829
  judge_agreement=judge_agreement_info,
790
830
  judge_evidence=_judge_evidence(best_analysis.rows),
791
831
  original_editable_files=original_editable,
@@ -826,6 +866,7 @@ def run_migration(
826
866
  refine_holdout_checks = check_thresholds(
827
867
  ThresholdsSpec(), baseline_holdout, refine_holdout_metrics
828
868
  )
869
+ append_holdout_class_warnings(refine_holdout_metrics)
829
870
  basis = refine_holdout_metrics if refine_holdout_metrics is not None else best_metrics
830
871
  suggested = suggest_thresholds(basis)
831
872
 
@@ -855,6 +896,7 @@ def run_migration(
855
896
  experiment_log=experiment_log,
856
897
  cluster_history=cluster_history,
857
898
  warnings=size_warnings,
899
+ split_seeds_used=split_seeds_used,
858
900
  suggested_thresholds=suggested,
859
901
  judge_agreement=judge_agreement_info,
860
902
  judge_evidence=_judge_evidence(best_analysis.rows),
@@ -887,6 +929,7 @@ def run_migration(
887
929
  experiment_log=experiment_log,
888
930
  cluster_history=cluster_history,
889
931
  warnings=size_warnings,
932
+ split_seeds_used=split_seeds_used,
890
933
  judge_agreement=judge_agreement_info,
891
934
  judge_evidence=_judge_evidence(best_analysis.rows),
892
935
  original_editable_files=original_editable,
@@ -74,6 +74,33 @@ class ClassMetrics:
74
74
  f1: float
75
75
 
76
76
 
77
+ # Warn when macro-F1 aggregates classes with very few gold examples on a split.
78
+ MIN_CLASS_SUPPORT = 5
79
+
80
+
81
+ def assess_class_support(
82
+ metrics: Metrics,
83
+ *,
84
+ context: str,
85
+ min_support: int = MIN_CLASS_SUPPORT,
86
+ ) -> list[str]:
87
+ """Low-confidence warnings for rare classes in classification metrics."""
88
+ if metrics.f1 is None or not metrics.per_class or min_support <= 0:
89
+ return []
90
+ low = [
91
+ (name, cm.support)
92
+ for name, cm in sorted(metrics.per_class.items())
93
+ if 0 < cm.support < min_support
94
+ ]
95
+ if not low:
96
+ return []
97
+ bits = ", ".join(f"{name} ({n})" for name, n in low)
98
+ return [
99
+ f"Low per-class support on {context}: {bits} — each below {min_support} gold "
100
+ "examples. Macro-F1 may not reflect rare-class performance."
101
+ ]
102
+
103
+
77
104
  @dataclass
78
105
  class Metrics:
79
106
  n: int
@@ -96,6 +123,39 @@ class Metrics:
96
123
  scored: int = 0
97
124
 
98
125
 
126
+ def average_metrics(items: list[Metrics]) -> Metrics:
127
+ """Mean of headline metrics across multiple eval runs (multi-seed tuning)."""
128
+ if not items:
129
+ raise ValueError("average_metrics requires at least one Metrics")
130
+ if len(items) == 1:
131
+ return items[0]
132
+
133
+ def _mean(vals: list[float | None]) -> float | None:
134
+ nums = [v for v in vals if v is not None]
135
+ return sum(nums) / len(nums) if nums else None
136
+
137
+ def _mean_int(vals: list[int]) -> int:
138
+ return int(round(sum(vals) / len(vals)))
139
+
140
+ costs = [m.total_cost for m in items if m.total_cost is not None]
141
+ return Metrics(
142
+ n=items[0].n,
143
+ schema_error_rate=_mean([m.schema_error_rate for m in items]),
144
+ refusal_rate=_mean([m.refusal_rate for m in items]) or 0.0,
145
+ accuracy=_mean([m.accuracy for m in items]),
146
+ precision=_mean([m.precision for m in items]),
147
+ recall=_mean([m.recall for m in items]),
148
+ f1=_mean([m.f1 for m in items]),
149
+ avg_latency_ms=_mean([m.avg_latency_ms for m in items]),
150
+ total_cost=sum(costs) if costs else None,
151
+ score=_mean([m.score for m in items]),
152
+ schema_errors=_mean_int([m.schema_errors for m in items]),
153
+ refusals=_mean_int([m.refusals for m in items]),
154
+ labeled=items[0].labeled,
155
+ scored=items[0].scored,
156
+ )
157
+
158
+
99
159
  def load_jsonl(path: Path) -> list[OutputRecord]:
100
160
  records: list[OutputRecord] = []
101
161
  with path.open(encoding="utf-8") as fh:
@@ -568,6 +568,7 @@ def result_to_dict(result: MigrationResult) -> dict:
568
568
  "experiment_log": [asdict(a) for a in result.experiment_log],
569
569
  "cluster_trajectory": cluster_trajectories(result.cluster_history),
570
570
  "warnings": result.warnings,
571
+ "split_seeds_used": result.split_seeds_used,
571
572
  "judge_agreement": asdict(result.judge_agreement) if result.judge_agreement else None,
572
573
  "judge_evidence": result.judge_evidence,
573
574
  "suggested_thresholds": result.suggested_thresholds,
@@ -63,6 +63,17 @@ def test_workflow_not_found():
63
63
  contract.workflow("missing")
64
64
 
65
65
 
66
+ def test_split_seed_count_must_be_in_range():
67
+ with pytest.raises(Exception):
68
+ Workflow.model_validate(
69
+ {
70
+ "run": {"command": "true", "input_path": "i", "output_path": "o"},
71
+ "model": {"current": "m", "env_var": "M"},
72
+ "migration": {"split_seed_count": 0},
73
+ }
74
+ )
75
+
76
+
66
77
  def test_load_missing_contract(tmp_path: Path):
67
78
  with pytest.raises(ContractError):
68
79
  load_contract(tmp_path / "nope.yml")
@@ -191,6 +191,7 @@ def test_small_dataset_run_carries_warning(tmp_path: Path):
191
191
  wf = _make_workflow(tmp_path) # 6 examples -> below the min thresholds
192
192
  result = run_migration("demo", wf, "weak", generator=StrictGen(), cwd=tmp_path, seed=1)
193
193
  assert any("Small dataset" in w for w in result.warnings)
194
+ assert any("Low per-class support" in w for w in result.warnings)
194
195
 
195
196
 
196
197
  def test_cluster_failures():
@@ -207,3 +208,13 @@ def test_cluster_failures():
207
208
  assert kinds["refusal"].count == 1
208
209
  assert kinds["misclassification"].count == 2 # billing<-technical pair
209
210
  assert kinds["misclassification"].key == "billing -> technical"
211
+
212
+
213
+ def test_multi_seed_tuning_still_passes(tmp_path: Path):
214
+ wf = _make_workflow(tmp_path)
215
+ wf.migration.split_seed_count = 2
216
+ result = run_migration("demo", wf, "weak", generator=StrictGen(), cwd=tmp_path, seed=1)
217
+
218
+ assert result.status == MigrationStatus.PASS
219
+ assert result.split_seeds_used == [1, 2]
220
+ assert any("Multi-seed tuning" in w for w in result.warnings)
@@ -309,6 +309,25 @@ def test_id_alignment_duplicate_output_id_raises(tmp_path: Path):
309
309
  evaluate(wf, run, cwd=tmp_path)
310
310
 
311
311
 
312
+ def test_assess_class_support_flags_rare_classes():
313
+ from driftless.evaluation import ClassMetrics, Metrics, assess_class_support
314
+
315
+ metrics = Metrics(
316
+ n=12,
317
+ schema_error_rate=0.0,
318
+ refusal_rate=0.0,
319
+ f1=0.9,
320
+ per_class={
321
+ "billing": ClassMetrics(4, 1.0, 1.0, 1.0),
322
+ "technical": ClassMetrics(8, 0.9, 0.9, 0.9),
323
+ },
324
+ )
325
+ warnings = assess_class_support(metrics, context="tuning split")
326
+ assert len(warnings) == 1
327
+ assert "billing (4)" in warnings[0]
328
+ assert "tuning split" in warnings[0]
329
+
330
+
312
331
  def test_load_labels_by_id_rejects_duplicates(tmp_path: Path):
313
332
  from driftless.evaluation import load_labels_by_id
314
333
 
@@ -316,3 +335,14 @@ def test_load_labels_by_id_rejects_duplicates(tmp_path: Path):
316
335
  p.write_text('{"id":"a","label":"x"}\n{"id":"a","label":"y"}\n')
317
336
  with pytest.raises(Exception):
318
337
  load_labels_by_id(p, "id", "label")
338
+
339
+
340
+ def test_average_metrics_means_headline_fields():
341
+ from driftless.evaluation import Metrics, average_metrics
342
+
343
+ a = Metrics(n=10, schema_error_rate=0.2, refusal_rate=0.1, f1=0.8)
344
+ b = Metrics(n=10, schema_error_rate=0.0, refusal_rate=0.0, f1=1.0)
345
+ avg = average_metrics([a, b])
346
+ assert avg.f1 == pytest.approx(0.9)
347
+ assert avg.schema_error_rate == pytest.approx(0.1)
348
+ assert avg.refusal_rate == pytest.approx(0.05)
@@ -293,6 +293,46 @@ def test_label_audit_helpers():
293
293
  assert label_audit_paths(contract) == ["labels.jsonl", "in.jsonl"]
294
294
 
295
295
 
296
+ def test_init_ci_scaffolds_plan_workflow(tmp_path, monkeypatch):
297
+ monkeypatch.chdir(tmp_path)
298
+ Path("driftless.yml").write_text(
299
+ """
300
+ version: 1
301
+ workflows:
302
+ smoke:
303
+ run:
304
+ command: echo ok
305
+ input_path: in.jsonl
306
+ output_path: out.jsonl
307
+ model:
308
+ current: gpt-4o-mini
309
+ env_var: MODEL
310
+ eval:
311
+ labels_path: labels.jsonl
312
+ """.lstrip()
313
+ )
314
+ out = tmp_path / "workflows"
315
+ result = runner.invoke(
316
+ app,
317
+ [
318
+ "init-ci",
319
+ "--out-dir",
320
+ str(out),
321
+ "--no-scan",
322
+ "--no-migrate",
323
+ "--no-refine",
324
+ "--no-audit-labels",
325
+ "--plan",
326
+ ],
327
+ )
328
+
329
+ assert result.exit_code == 0
330
+ plan = (out / "driftless-plan-act.yml").read_text()
331
+ assert "command: plan" in plan
332
+ assert "--act" in plan
333
+ assert "GH_TOKEN" in plan
334
+
335
+
296
336
  def test_rendered_workflows_use_action_ref():
297
337
  ref = "driftless-dev/driftless@v9.9.9"
298
338
  assert ref in render_migrate_workflow(ref)
@@ -0,0 +1,27 @@
1
+ """Tests for tuning/holdout splits."""
2
+
3
+ from driftless.contract import Workflow
4
+ from driftless.splits import make_splits
5
+
6
+
7
+ def _workflow() -> Workflow:
8
+ return Workflow.model_validate(
9
+ {
10
+ "run": {"command": "true", "input_path": "i.jsonl", "output_path": "o.jsonl"},
11
+ "model": {"current": "m", "env_var": "M"},
12
+ "eval": {"labels_path": "l.jsonl", "split": {"tuning": 0.5, "holdout": 0.5}},
13
+ }
14
+ )
15
+
16
+
17
+ def test_different_seeds_produce_different_partitions(tmp_path):
18
+ lines = "\n".join(f'{{"id": {i}, "label": "a"}}' for i in range(20)) + "\n"
19
+ labels = "\n".join('{"id": ' + str(i) + ', "label": "a"}' for i in range(20)) + "\n"
20
+ (tmp_path / "i.jsonl").write_text(lines)
21
+ (tmp_path / "l.jsonl").write_text(labels)
22
+
23
+ wf = _workflow()
24
+ wf.eval.id_field = "id"
25
+ a = make_splits(wf, cwd=tmp_path, seed=0)
26
+ b = make_splits(wf, cwd=tmp_path, seed=1)
27
+ assert a.tuning_idx != b.tuning_idx
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes