driftless 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- driftless-0.1.0/.gitignore +22 -0
- driftless-0.1.0/IMPLEMENTATION_PLAN.md +97 -0
- driftless-0.1.0/LICENSE +21 -0
- driftless-0.1.0/PKG-INFO +137 -0
- driftless-0.1.0/README.md +102 -0
- driftless-0.1.0/docs/DECISIONS.md +611 -0
- driftless-0.1.0/docs/NEXT_STEPS.md +142 -0
- driftless-0.1.0/docs/PROJECT_OVERVIEW.md +574 -0
- driftless-0.1.0/docs/repair-and-generators.md +418 -0
- driftless-0.1.0/pyproject.toml +70 -0
- driftless-0.1.0/site/assets/app.js +100 -0
- driftless-0.1.0/site/assets/runs.css +327 -0
- driftless-0.1.0/site/assets/runs.js +520 -0
- driftless-0.1.0/site/assets/sample-run.json +361 -0
- driftless-0.1.0/site/assets/styles.css +517 -0
- driftless-0.1.0/site/docs.html +464 -0
- driftless-0.1.0/site/index.html +390 -0
- driftless-0.1.0/site/runs.html +125 -0
- driftless-0.1.0/src/driftless/__init__.py +3 -0
- driftless-0.1.0/src/driftless/calibrate.py +33 -0
- driftless-0.1.0/src/driftless/cli.py +1163 -0
- driftless-0.1.0/src/driftless/compare.py +213 -0
- driftless-0.1.0/src/driftless/configure.py +98 -0
- driftless-0.1.0/src/driftless/contract.py +414 -0
- driftless-0.1.0/src/driftless/data/model_lifecycle.json +51 -0
- driftless-0.1.0/src/driftless/datasource.py +103 -0
- driftless-0.1.0/src/driftless/datastate.py +258 -0
- driftless-0.1.0/src/driftless/discovery.py +392 -0
- driftless-0.1.0/src/driftless/engine.py +769 -0
- driftless-0.1.0/src/driftless/errors.py +28 -0
- driftless-0.1.0/src/driftless/evaluation.py +821 -0
- driftless-0.1.0/src/driftless/generators.py +460 -0
- driftless-0.1.0/src/driftless/github.py +225 -0
- driftless-0.1.0/src/driftless/harness.py +259 -0
- driftless-0.1.0/src/driftless/judges.py +185 -0
- driftless-0.1.0/src/driftless/lifecycle.py +145 -0
- driftless-0.1.0/src/driftless/policy.py +402 -0
- driftless-0.1.0/src/driftless/preflight.py +64 -0
- driftless-0.1.0/src/driftless/report.py +339 -0
- driftless-0.1.0/src/driftless/scanner.py +170 -0
- driftless-0.1.0/src/driftless/splits.py +143 -0
- driftless-0.1.0/src/driftless/templates.py +182 -0
- driftless-0.1.0/src/driftless/view.py +139 -0
- driftless-0.1.0/tests/fixtures/smoke/driftless.yml +17 -0
- driftless-0.1.0/tests/fixtures/smoke/inputs.jsonl +2 -0
- driftless-0.1.0/tests/fixtures/smoke/labels.jsonl +2 -0
- driftless-0.1.0/tests/scenarios.py +387 -0
- driftless-0.1.0/tests/test_cli.py +75 -0
- driftless-0.1.0/tests/test_compare.py +103 -0
- driftless-0.1.0/tests/test_contract.py +68 -0
- driftless-0.1.0/tests/test_data_change_gate.py +145 -0
- driftless-0.1.0/tests/test_data_change_regression.py +70 -0
- driftless-0.1.0/tests/test_datasource.py +104 -0
- driftless-0.1.0/tests/test_datastate.py +82 -0
- driftless-0.1.0/tests/test_discovery.py +214 -0
- driftless-0.1.0/tests/test_endpoint.py +120 -0
- driftless-0.1.0/tests/test_engine.py +209 -0
- driftless-0.1.0/tests/test_evaluation.py +318 -0
- driftless-0.1.0/tests/test_extraction.py +141 -0
- driftless-0.1.0/tests/test_generators.py +236 -0
- driftless-0.1.0/tests/test_github.py +151 -0
- driftless-0.1.0/tests/test_grading_loop.py +103 -0
- driftless-0.1.0/tests/test_harness.py +106 -0
- driftless-0.1.0/tests/test_judge.py +138 -0
- driftless-0.1.0/tests/test_judge_loop.py +93 -0
- driftless-0.1.0/tests/test_lifecycle.py +41 -0
- driftless-0.1.0/tests/test_migration_live.py +40 -0
- driftless-0.1.0/tests/test_migration_regression.py +100 -0
- driftless-0.1.0/tests/test_plan_act.py +63 -0
- driftless-0.1.0/tests/test_policy.py +137 -0
- driftless-0.1.0/tests/test_poll_act.py +31 -0
- driftless-0.1.0/tests/test_preflight.py +53 -0
- driftless-0.1.0/tests/test_refine.py +82 -0
- driftless-0.1.0/tests/test_refresh_catalog.py +91 -0
- driftless-0.1.0/tests/test_repair_prompt.py +98 -0
- driftless-0.1.0/tests/test_report.py +166 -0
- driftless-0.1.0/tests/test_scanner.py +108 -0
- driftless-0.1.0/tests/test_view.py +60 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.venv/
|
|
9
|
+
venv/
|
|
10
|
+
.env
|
|
11
|
+
|
|
12
|
+
# Tooling caches
|
|
13
|
+
.pytest_cache/
|
|
14
|
+
.mypy_cache/
|
|
15
|
+
.ruff_cache/
|
|
16
|
+
|
|
17
|
+
# driftless run artifacts
|
|
18
|
+
.driftless/
|
|
19
|
+
|
|
20
|
+
# OS / editor
|
|
21
|
+
.DS_Store
|
|
22
|
+
.scratch/
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# Implementation Plan — Dependabot for LLM Models (`driftless`)
|
|
2
|
+
|
|
3
|
+
A GitHub-native tool that detects risky/deprecated/expensive LLM model
|
|
4
|
+
dependencies, evaluates replacement models through the **real application
|
|
5
|
+
path**, repairs prompts/configs where allowed, validates against thresholds,
|
|
6
|
+
and opens migration PRs with evidence.
|
|
7
|
+
|
|
8
|
+
## Build philosophy
|
|
9
|
+
|
|
10
|
+
- **CLI-first.** The CLI is the engine. The GitHub Action and App just invoke
|
|
11
|
+
it. Everything is testable locally before any GitHub integration exists.
|
|
12
|
+
- **The workflow contract is the spine.** A single typed schema
|
|
13
|
+
(`driftless.yml`) drives `scan`, `compare`, `migrate`, `validate`, and
|
|
14
|
+
`report`. Build it first and rigorously.
|
|
15
|
+
- **The customer owns the workflow; we orchestrate.** We shell out to *their*
|
|
16
|
+
eval command with the model overridden via env var. We never reimplement
|
|
17
|
+
their preprocessing/parsing/postprocessing.
|
|
18
|
+
- **Failure is a first-class output.** Every command can emit a
|
|
19
|
+
`pass` / `partial` / `blocked` result, not just success.
|
|
20
|
+
|
|
21
|
+
## Stack
|
|
22
|
+
|
|
23
|
+
- **Language:** Python (3.10+)
|
|
24
|
+
- **CLI framework:** Typer
|
|
25
|
+
- **Validation:** Pydantic v2
|
|
26
|
+
- **Config:** YAML (`driftless.yml`)
|
|
27
|
+
- **Tests:** pytest
|
|
28
|
+
- **Distribution:** pip / pipx; GitHub Action wraps the same CLI.
|
|
29
|
+
|
|
30
|
+
## Milestones
|
|
31
|
+
|
|
32
|
+
### Milestone 1 — Contract + harness (foundation)
|
|
33
|
+
- Project scaffold, CLI entrypoint, config loader.
|
|
34
|
+
- Workflow contract schema (`workflows`, `model`, `files.editable/readonly`,
|
|
35
|
+
`eval`, `thresholds`, `migration`) with strict validation + good errors.
|
|
36
|
+
- Runnable harness: execute the user's `command` with the model injected via
|
|
37
|
+
env var; read `input_path`, capture `output_path`; handle exit codes/timeouts.
|
|
38
|
+
- `driftless init` (scaffold a contract) and `driftless validate`
|
|
39
|
+
(contract parses + harness runs with current model).
|
|
40
|
+
- **Done when:** we can run the current model through the real command and
|
|
41
|
+
capture production-shaped outputs.
|
|
42
|
+
|
|
43
|
+
### Milestone 2 — Evaluation + comparison
|
|
44
|
+
- Output loaders + schema validation (JSON schema), enum/field-level checks.
|
|
45
|
+
- Metrics: accuracy, precision, recall, F1, schema-error rate, refusal rate;
|
|
46
|
+
cost + latency capture per run.
|
|
47
|
+
- `driftless compare --workflow X --to <model>`: baseline vs naive target
|
|
48
|
+
scorecard.
|
|
49
|
+
- **Done when:** `compare` reproduces the "Target, Original Files" column.
|
|
50
|
+
|
|
51
|
+
### Milestone 3 — Migration loop (port validated work)
|
|
52
|
+
- Failure clustering over baseline-vs-target diff.
|
|
53
|
+
- Candidate patch generation scoped strictly to `files.editable`.
|
|
54
|
+
- Tuning/holdout split, iterate <= `max_iterations`, `select_best` under
|
|
55
|
+
constraints; holdout validation as a hard gate.
|
|
56
|
+
- `driftless migrate --workflow X --to <model>` -> migrated files + result
|
|
57
|
+
object (`pass` / `partial` / `blocked`).
|
|
58
|
+
- **Done when:** on validated workflows, `migrate` recovers performance and
|
|
59
|
+
only edits allowed files.
|
|
60
|
+
|
|
61
|
+
### Milestone 4 — Reporting (incl. failure path)
|
|
62
|
+
- Markdown migration report: before/after metrics, changes made, remaining
|
|
63
|
+
risks, fixed/remaining examples, cost/latency.
|
|
64
|
+
- Distinct, useful artifacts for partial/blocked outcomes (remaining clusters,
|
|
65
|
+
recommended fallback model).
|
|
66
|
+
- `driftless report`.
|
|
67
|
+
- **Done when:** a partial migration still produces something a partner acts on.
|
|
68
|
+
|
|
69
|
+
### Milestone 5 — Static scanner
|
|
70
|
+
- Detect probable LLM usage (SDK imports, model-ID literals, env-var config).
|
|
71
|
+
- Provider lifecycle data (deprecation/retirement) as static JSON first,
|
|
72
|
+
structured to later become a hosted DB.
|
|
73
|
+
- `driftless scan` + `configure` helper to scaffold a contract from a
|
|
74
|
+
detected workflow.
|
|
75
|
+
- **Done when:** `scan` surfaces model usage + deprecation risk with no
|
|
76
|
+
contract required.
|
|
77
|
+
|
|
78
|
+
### Milestone 6 — GitHub surface
|
|
79
|
+
- GitHub Action wrapping `scan`/`compare`/`migrate`.
|
|
80
|
+
- PR generation: branch, commit migrated files, evidence-rich PR body. Open
|
|
81
|
+
issue/draft PR on partial/blocked.
|
|
82
|
+
- **Done when:** a real migration opens a real PR with the evidence table.
|
|
83
|
+
|
|
84
|
+
Milestones 1–4 yield a locally usable, dogfoodable tool. Milestones 5–6 make it the
|
|
85
|
+
Dependabot-style experience. Paid App/dashboard/billing are scope to be earned afterwards.
|
|
86
|
+
|
|
87
|
+
## CLI surface (target)
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
driftless init
|
|
91
|
+
driftless scan
|
|
92
|
+
driftless configure <workflow>
|
|
93
|
+
driftless compare --workflow <w> --to <model>
|
|
94
|
+
driftless migrate --workflow <w> --to <model>
|
|
95
|
+
driftless validate --workflow <w>
|
|
96
|
+
driftless report
|
|
97
|
+
```
|
driftless-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 driftless contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
driftless-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: driftless
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Keep prompts in sync when model or eval data changes — Poetry-style lock regeneration, Dependabot-style PRs.
|
|
5
|
+
Project-URL: Homepage, https://github.com/driftless/driftless
|
|
6
|
+
Project-URL: Repository, https://github.com/driftless/driftless
|
|
7
|
+
Project-URL: Documentation, https://github.com/driftless/driftless/blob/main/docs/repair-and-generators.md
|
|
8
|
+
Project-URL: Issues, https://github.com/driftless/driftless/issues
|
|
9
|
+
Author: driftless contributors
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: anthropic,ci,dependabot,evaluation,llm,migration,openai,prompts
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: jsonschema>=4.21
|
|
24
|
+
Requires-Dist: pydantic>=2.6
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Requires-Dist: rich>=13.7
|
|
27
|
+
Requires-Dist: typer>=0.12
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
31
|
+
Provides-Extra: llm
|
|
32
|
+
Requires-Dist: anthropic>=0.30; extra == 'llm'
|
|
33
|
+
Requires-Dist: openai>=1.0; extra == 'llm'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# driftless
|
|
37
|
+
|
|
38
|
+
**Poetry-style lock regeneration for prompts — delivered Dependabot-style.**
|
|
39
|
+
|
|
40
|
+
A prompt is pinned to a **model** and an **eval dataset** (like `pyproject.toml`
|
|
41
|
+
declares deps and `poetry.lock` pins what works). When either moves, the prompt
|
|
42
|
+
goes stale. driftless re-derives it through your real eval, validates on holdout,
|
|
43
|
+
and opens a PR with evidence.
|
|
44
|
+
|
|
45
|
+
> Also described as *Dependabot for LLM models* — same automation shape, different
|
|
46
|
+
> core insight: prompts are lockfiles, not just config files.
|
|
47
|
+
|
|
48
|
+
> Status: early development. See [`IMPLEMENTATION_PLAN.md`](./IMPLEMENTATION_PLAN.md).
|
|
49
|
+
|
|
50
|
+
## Install (dev)
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
python -m venv .venv
|
|
54
|
+
source .venv/bin/activate
|
|
55
|
+
pip install -e ".[dev]"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Quickstart
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
driftless init # scaffold a driftless.yml
|
|
62
|
+
driftless validate -w support_classifier # contract parses + harness runs
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## How it works
|
|
66
|
+
|
|
67
|
+
You describe your model-dependent workflow once in `driftless.yml`: how to
|
|
68
|
+
run it, how to override the model, which files may be edited, and what quality
|
|
69
|
+
thresholds must hold. `driftless` orchestrates *your* workflow under
|
|
70
|
+
different models, compares results, repairs allowed files, validates on
|
|
71
|
+
holdout, and opens a PR with the evidence.
|
|
72
|
+
|
|
73
|
+
The customer owns the workflow. The tool orchestrates it.
|
|
74
|
+
|
|
75
|
+
Not a classifier? Choose a grading mode that fits the task — the same loop then
|
|
76
|
+
optimizes against it, with your team owning the definition of "good":
|
|
77
|
+
|
|
78
|
+
- **`eval.score_field` / `eval.pass_field`** — your command emits a numeric score
|
|
79
|
+
or a pass/fail per record (works for any task: summarization, codegen, agents).
|
|
80
|
+
- **`eval.fields`** — structured extraction, scored per field with
|
|
81
|
+
precision/recall/F1 against the gold record.
|
|
82
|
+
- **`eval.judge`** — an LLM judge grades each free-form output against a rubric
|
|
83
|
+
(with an optional human-scored calibration set for a judge-agreement check).
|
|
84
|
+
|
|
85
|
+
## CLI
|
|
86
|
+
|
|
87
|
+
| Command | Purpose |
|
|
88
|
+
|---|---|
|
|
89
|
+
| `init` | Scaffold a `driftless.yml`. |
|
|
90
|
+
| `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
|
|
91
|
+
| `scan` | Find probable LLM usage and at-risk models. |
|
|
92
|
+
| `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
|
|
93
|
+
| `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
|
|
94
|
+
| `configure <workflow>` | Turn a detected workflow into a migration-ready contract. |
|
|
95
|
+
| `calibrate -w <w>` | Measure the baseline and suggest starting thresholds. |
|
|
96
|
+
| `compare -w <w> --to <model>` | Baseline vs target scorecard. |
|
|
97
|
+
| `migrate -w <w> --to <model>` | Repair + validate + produce migrated files. |
|
|
98
|
+
| `refine -w <w>` | Re-optimize the prompt for a changed eval dataset (model pinned). |
|
|
99
|
+
| `poll [--act]` | Detect external eval-dataset changes and refine on a meaningful change. |
|
|
100
|
+
| `validate -w <w>` | Check the contract parses and the harness runs. |
|
|
101
|
+
| `report` | Render the latest migration report. |
|
|
102
|
+
| `view` | Open the optimization run viewer (charts + attempt log). |
|
|
103
|
+
| `open-pr -w <w>` | Open a PR (or issue) from the latest migration result. |
|
|
104
|
+
|
|
105
|
+
## Configuring *when* to migrate
|
|
106
|
+
|
|
107
|
+
`plan` reads an optional `.driftless/policy.yml` — the "dependabot.yml" layer.
|
|
108
|
+
Scaffold it with `driftless init-policy`; every scaffolded field matches its default, so an
|
|
109
|
+
empty file behaves like no file. It controls which triggers are enabled
|
|
110
|
+
(`deprecation` is on and forced; `cost`/`quality`/`new_model` are opportunistic),
|
|
111
|
+
the thresholds a candidate must clear (`min_savings_pct`, `min_gain`), a
|
|
112
|
+
`cooldown_days` to skip freshly-released models, candidate `allow`/`deny` globs,
|
|
113
|
+
and an `ignore` list to snooze specific models or moves. The engine still decides
|
|
114
|
+
whether a candidate actually passes *your* eval — policy only decides whether to
|
|
115
|
+
propose it.
|
|
116
|
+
|
|
117
|
+
## GitHub-native usage
|
|
118
|
+
|
|
119
|
+
A composite GitHub Action (`action.yml`) wraps the CLI so scans and migrations
|
|
120
|
+
can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
|
|
121
|
+
manually-triggered migration that opens a PR (or an issue when blocked).
|
|
122
|
+
|
|
123
|
+
```yaml
|
|
124
|
+
- uses: driftless/action@v1
|
|
125
|
+
with:
|
|
126
|
+
command: scan
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Documentation
|
|
130
|
+
|
|
131
|
+
- [Project overview](./docs/PROJECT_OVERVIEW.md) — what's been built so far:
|
|
132
|
+
architecture, components, the migration loop, and the testbed repo.
|
|
133
|
+
- [Next steps / roadmap](./docs/NEXT_STEPS.md) — prioritized backlog and where
|
|
134
|
+
each item lands in the code.
|
|
135
|
+
- [Repair prompts & custom generators](./docs/repair-and-generators.md) — customize
|
|
136
|
+
the LLM repair prompt or plug in your own patch generator.
|
|
137
|
+
- [Implementation plan](./IMPLEMENTATION_PLAN.md) — milestones and architecture.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# driftless
|
|
2
|
+
|
|
3
|
+
**Poetry-style lock regeneration for prompts — delivered Dependabot-style.**
|
|
4
|
+
|
|
5
|
+
A prompt is pinned to a **model** and an **eval dataset** (like `pyproject.toml`
|
|
6
|
+
declares deps and `poetry.lock` pins what works). When either moves, the prompt
|
|
7
|
+
goes stale. driftless re-derives it through your real eval, validates on holdout,
|
|
8
|
+
and opens a PR with evidence.
|
|
9
|
+
|
|
10
|
+
> Also described as *Dependabot for LLM models* — same automation shape, different
|
|
11
|
+
> core insight: prompts are lockfiles, not just config files.
|
|
12
|
+
|
|
13
|
+
> Status: early development. See [`IMPLEMENTATION_PLAN.md`](./IMPLEMENTATION_PLAN.md).
|
|
14
|
+
|
|
15
|
+
## Install (dev)
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
python -m venv .venv
|
|
19
|
+
source .venv/bin/activate
|
|
20
|
+
pip install -e ".[dev]"
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quickstart
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
driftless init # scaffold a driftless.yml
|
|
27
|
+
driftless validate -w support_classifier # contract parses + harness runs
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## How it works
|
|
31
|
+
|
|
32
|
+
You describe your model-dependent workflow once in `driftless.yml`: how to
|
|
33
|
+
run it, how to override the model, which files may be edited, and what quality
|
|
34
|
+
thresholds must hold. `driftless` orchestrates *your* workflow under
|
|
35
|
+
different models, compares results, repairs allowed files, validates on
|
|
36
|
+
holdout, and opens a PR with the evidence.
|
|
37
|
+
|
|
38
|
+
The customer owns the workflow. The tool orchestrates it.
|
|
39
|
+
|
|
40
|
+
Not a classifier? Choose a grading mode that fits the task — the same loop then
|
|
41
|
+
optimizes against it, with your team owning the definition of "good":
|
|
42
|
+
|
|
43
|
+
- **`eval.score_field` / `eval.pass_field`** — your command emits a numeric score
|
|
44
|
+
or a pass/fail per record (works for any task: summarization, codegen, agents).
|
|
45
|
+
- **`eval.fields`** — structured extraction, scored per field with
|
|
46
|
+
precision/recall/F1 against the gold record.
|
|
47
|
+
- **`eval.judge`** — an LLM judge grades each free-form output against a rubric
|
|
48
|
+
(with an optional human-scored calibration set for a judge-agreement check).
|
|
49
|
+
|
|
50
|
+
## CLI
|
|
51
|
+
|
|
52
|
+
| Command | Purpose |
|
|
53
|
+
|---|---|
|
|
54
|
+
| `init` | Scaffold a `driftless.yml`. |
|
|
55
|
+
| `init-policy` | Scaffold a `.driftless/policy.yml` (when to migrate). |
|
|
56
|
+
| `scan` | Find probable LLM usage and at-risk models. |
|
|
57
|
+
| `plan` | Discover at-risk workflows and apply the migration policy (CI triage). |
|
|
58
|
+
| `plan --act` | Migrate + open a PR/issue for every actionable trigger (close the loop). |
|
|
59
|
+
| `configure <workflow>` | Turn a detected workflow into a migration-ready contract. |
|
|
60
|
+
| `calibrate -w <w>` | Measure the baseline and suggest starting thresholds. |
|
|
61
|
+
| `compare -w <w> --to <model>` | Baseline vs target scorecard. |
|
|
62
|
+
| `migrate -w <w> --to <model>` | Repair + validate + produce migrated files. |
|
|
63
|
+
| `refine -w <w>` | Re-optimize the prompt for a changed eval dataset (model pinned). |
|
|
64
|
+
| `poll [--act]` | Detect external eval-dataset changes and refine on a meaningful change. |
|
|
65
|
+
| `validate -w <w>` | Check the contract parses and the harness runs. |
|
|
66
|
+
| `report` | Render the latest migration report. |
|
|
67
|
+
| `view` | Open the optimization run viewer (charts + attempt log). |
|
|
68
|
+
| `open-pr -w <w>` | Open a PR (or issue) from the latest migration result. |
|
|
69
|
+
|
|
70
|
+
## Configuring *when* to migrate
|
|
71
|
+
|
|
72
|
+
`plan` reads an optional `.driftless/policy.yml` — the "dependabot.yml" layer.
|
|
73
|
+
Scaffold it with `driftless init-policy`; every scaffolded field matches its default, so an
|
|
74
|
+
empty file behaves like no file. It controls which triggers are enabled
|
|
75
|
+
(`deprecation` is on and forced; `cost`/`quality`/`new_model` are opportunistic),
|
|
76
|
+
the thresholds a candidate must clear (`min_savings_pct`, `min_gain`), a
|
|
77
|
+
`cooldown_days` to skip freshly-released models, candidate `allow`/`deny` globs,
|
|
78
|
+
and an `ignore` list to snooze specific models or moves. The engine still decides
|
|
79
|
+
whether a candidate actually passes *your* eval — policy only decides whether to
|
|
80
|
+
propose it.
|
|
81
|
+
|
|
82
|
+
## GitHub-native usage
|
|
83
|
+
|
|
84
|
+
A composite GitHub Action (`action.yml`) wraps the CLI so scans and migrations
|
|
85
|
+
can run in CI. See `.github/workflows/` for a scheduled deprecation scan and a
|
|
86
|
+
manually-triggered migration that opens a PR (or an issue when blocked).
|
|
87
|
+
|
|
88
|
+
```yaml
|
|
89
|
+
- uses: driftless/action@v1
|
|
90
|
+
with:
|
|
91
|
+
command: scan
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Documentation
|
|
95
|
+
|
|
96
|
+
- [Project overview](./docs/PROJECT_OVERVIEW.md) — what's been built so far:
|
|
97
|
+
architecture, components, the migration loop, and the testbed repo.
|
|
98
|
+
- [Next steps / roadmap](./docs/NEXT_STEPS.md) — prioritized backlog and where
|
|
99
|
+
each item lands in the code.
|
|
100
|
+
- [Repair prompts & custom generators](./docs/repair-and-generators.md) — customize
|
|
101
|
+
the LLM repair prompt or plug in your own patch generator.
|
|
102
|
+
- [Implementation plan](./IMPLEMENTATION_PLAN.md) — milestones and architecture.
|