driftcut 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- driftcut-0.8.0/.dockerignore +14 -0
- driftcut-0.8.0/.github/workflows/ci.yml +48 -0
- driftcut-0.8.0/.github/workflows/pages.yml +35 -0
- driftcut-0.8.0/.github/workflows/publish.yml +75 -0
- driftcut-0.8.0/.github/workflows/release-audit.yml +21 -0
- driftcut-0.8.0/.gitignore +43 -0
- driftcut-0.8.0/CHANGELOG.md +179 -0
- driftcut-0.8.0/CLAUDE.md +88 -0
- driftcut-0.8.0/Dockerfile +14 -0
- driftcut-0.8.0/LICENSE +21 -0
- driftcut-0.8.0/PKG-INFO +537 -0
- driftcut-0.8.0/README.md +497 -0
- driftcut-0.8.0/assets/logo.svg +4 -0
- driftcut-0.8.0/docker-compose.yml +25 -0
- driftcut-0.8.0/docs/concept-document-v0.3.md +605 -0
- driftcut-0.8.0/docs/demo-report.html +523 -0
- driftcut-0.8.0/docs/redis-recall-v0.7.0-plan.md +428 -0
- driftcut-0.8.0/examples/migration.redis.yaml +54 -0
- driftcut-0.8.0/examples/migration.yaml +68 -0
- driftcut-0.8.0/examples/migration_test_openrouter.yaml +66 -0
- driftcut-0.8.0/examples/prompts.csv +31 -0
- driftcut-0.8.0/examples/replay.json +58 -0
- driftcut-0.8.0/examples/replay.yaml +39 -0
- driftcut-0.8.0/pyproject.toml +88 -0
- driftcut-0.8.0/scripts/check_release_alignment.py +141 -0
- driftcut-0.8.0/scripts/check_release_state.py +172 -0
- driftcut-0.8.0/scripts/replay_converter_template.py +60 -0
- driftcut-0.8.0/site/CNAME +1 -0
- driftcut-0.8.0/site/index.html +691 -0
- driftcut-0.8.0/src/driftcut/__init__.py +3 -0
- driftcut-0.8.0/src/driftcut/__main__.py +5 -0
- driftcut-0.8.0/src/driftcut/cli.py +265 -0
- driftcut-0.8.0/src/driftcut/config.py +108 -0
- driftcut-0.8.0/src/driftcut/corpus.py +146 -0
- driftcut-0.8.0/src/driftcut/decision.py +470 -0
- driftcut-0.8.0/src/driftcut/executor.py +175 -0
- driftcut-0.8.0/src/driftcut/judge.py +423 -0
- driftcut-0.8.0/src/driftcut/models.py +197 -0
- driftcut-0.8.0/src/driftcut/quality.py +152 -0
- driftcut-0.8.0/src/driftcut/replay.py +177 -0
- driftcut-0.8.0/src/driftcut/reporting.py +633 -0
- driftcut-0.8.0/src/driftcut/runner.py +470 -0
- driftcut-0.8.0/src/driftcut/sampler.py +96 -0
- driftcut-0.8.0/src/driftcut/store.py +72 -0
- driftcut-0.8.0/src/driftcut/store_null.py +40 -0
- driftcut-0.8.0/src/driftcut/store_redis.py +225 -0
- driftcut-0.8.0/src/driftcut/trackers.py +135 -0
- driftcut-0.8.0/tests/__init__.py +0 -0
- driftcut-0.8.0/tests/test_cli.py +106 -0
- driftcut-0.8.0/tests/test_config.py +163 -0
- driftcut-0.8.0/tests/test_corpus.py +103 -0
- driftcut-0.8.0/tests/test_executor.py +290 -0
- driftcut-0.8.0/tests/test_judge.py +193 -0
- driftcut-0.8.0/tests/test_models.py +166 -0
- driftcut-0.8.0/tests/test_quality.py +85 -0
- driftcut-0.8.0/tests/test_replay.py +196 -0
- driftcut-0.8.0/tests/test_reporting.py +183 -0
- driftcut-0.8.0/tests/test_runner.py +373 -0
- driftcut-0.8.0/tests/test_sampler.py +88 -0
- driftcut-0.8.0/tests/test_store.py +108 -0
- driftcut-0.8.0/tests/test_trackers.py +195 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v5
|
|
14
|
+
- uses: actions/setup-python@v6
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- run: pip install -e ".[dev]"
|
|
18
|
+
- run: python scripts/check_release_alignment.py
|
|
19
|
+
- run: ruff check src tests
|
|
20
|
+
- run: ruff format --check src tests
|
|
21
|
+
- run: mypy src tests
|
|
22
|
+
|
|
23
|
+
test:
|
|
24
|
+
runs-on: ubuntu-latest
|
|
25
|
+
strategy:
|
|
26
|
+
matrix:
|
|
27
|
+
python-version: ["3.12", "3.13"]
|
|
28
|
+
steps:
|
|
29
|
+
- uses: actions/checkout@v5
|
|
30
|
+
- uses: actions/setup-python@v6
|
|
31
|
+
with:
|
|
32
|
+
python-version: ${{ matrix.python-version }}
|
|
33
|
+
- run: pip install -e ".[dev]"
|
|
34
|
+
- run: pytest -v
|
|
35
|
+
|
|
36
|
+
package:
|
|
37
|
+
runs-on: ubuntu-latest
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/checkout@v5
|
|
40
|
+
- uses: actions/setup-python@v6
|
|
41
|
+
with:
|
|
42
|
+
python-version: "3.13"
|
|
43
|
+
- run: pip install -e ".[dev]"
|
|
44
|
+
- run: python -m build
|
|
45
|
+
- run: python -m twine check dist/*
|
|
46
|
+
- run: python -m venv .pkg-venv
|
|
47
|
+
- run: .pkg-venv/bin/pip install dist/*.whl
|
|
48
|
+
- run: .pkg-venv/bin/driftcut --version
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Deploy landing page
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
paths:
|
|
7
|
+
- 'site/**'
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
permissions:
|
|
11
|
+
contents: read
|
|
12
|
+
pages: write
|
|
13
|
+
id-token: write
|
|
14
|
+
|
|
15
|
+
concurrency:
|
|
16
|
+
group: pages
|
|
17
|
+
cancel-in-progress: true
|
|
18
|
+
|
|
19
|
+
env:
|
|
20
|
+
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
|
21
|
+
|
|
22
|
+
jobs:
|
|
23
|
+
deploy:
|
|
24
|
+
environment:
|
|
25
|
+
name: github-pages
|
|
26
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
steps:
|
|
29
|
+
- uses: actions/checkout@v5
|
|
30
|
+
- uses: actions/configure-pages@v5
|
|
31
|
+
- uses: actions/upload-pages-artifact@v4
|
|
32
|
+
with:
|
|
33
|
+
path: site
|
|
34
|
+
- id: deployment
|
|
35
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types:
|
|
6
|
+
- published
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
inputs:
|
|
9
|
+
ref:
|
|
10
|
+
description: Git ref to build and publish, for example main or v0.8.0
|
|
11
|
+
required: true
|
|
12
|
+
default: main
|
|
13
|
+
|
|
14
|
+
concurrency:
|
|
15
|
+
group: pypi-publish-${{ github.event.release.tag_name || github.event.inputs.ref || github.ref_name }}
|
|
16
|
+
cancel-in-progress: false
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
build-distributions:
|
|
20
|
+
name: Build distributions
|
|
21
|
+
if: github.event_name != 'release' || github.event.release.prerelease == false
|
|
22
|
+
runs-on: ubuntu-latest
|
|
23
|
+
permissions:
|
|
24
|
+
contents: read
|
|
25
|
+
|
|
26
|
+
steps:
|
|
27
|
+
- name: Check out repository
|
|
28
|
+
uses: actions/checkout@v5
|
|
29
|
+
with:
|
|
30
|
+
ref: ${{ github.event.inputs.ref || github.ref }}
|
|
31
|
+
|
|
32
|
+
- name: Set up Python
|
|
33
|
+
uses: actions/setup-python@v6
|
|
34
|
+
with:
|
|
35
|
+
python-version: "3.13"
|
|
36
|
+
|
|
37
|
+
- name: Install build tooling
|
|
38
|
+
run: |
|
|
39
|
+
python -m pip install --upgrade pip
|
|
40
|
+
python -m pip install build twine
|
|
41
|
+
|
|
42
|
+
- name: Build distributions
|
|
43
|
+
run: python -m build
|
|
44
|
+
|
|
45
|
+
- name: Validate distribution metadata
|
|
46
|
+
run: python -m twine check dist/*
|
|
47
|
+
|
|
48
|
+
- name: Upload built distributions
|
|
49
|
+
uses: actions/upload-artifact@v4
|
|
50
|
+
with:
|
|
51
|
+
name: python-package-distributions
|
|
52
|
+
path: dist/
|
|
53
|
+
if-no-files-found: error
|
|
54
|
+
|
|
55
|
+
publish-to-pypi:
|
|
56
|
+
name: Publish to PyPI
|
|
57
|
+
if: github.event_name != 'release' || github.event.release.prerelease == false
|
|
58
|
+
needs:
|
|
59
|
+
- build-distributions
|
|
60
|
+
runs-on: ubuntu-latest
|
|
61
|
+
environment:
|
|
62
|
+
name: pypi
|
|
63
|
+
url: https://pypi.org/project/driftcut/
|
|
64
|
+
permissions:
|
|
65
|
+
id-token: write
|
|
66
|
+
|
|
67
|
+
steps:
|
|
68
|
+
- name: Download distributions
|
|
69
|
+
uses: actions/download-artifact@v4
|
|
70
|
+
with:
|
|
71
|
+
name: python-package-distributions
|
|
72
|
+
path: dist/
|
|
73
|
+
|
|
74
|
+
- name: Publish package distributions to PyPI
|
|
75
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Release Audit
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
|
|
6
|
+
permissions:
|
|
7
|
+
contents: read
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
audit:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v5
|
|
14
|
+
|
|
15
|
+
- uses: actions/setup-python@v6
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.12"
|
|
18
|
+
|
|
19
|
+
- run: python scripts/check_release_alignment.py
|
|
20
|
+
|
|
21
|
+
- run: python scripts/check_release_state.py --retries 6 --delay-seconds 10
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
*.egg
|
|
9
|
+
.eggs/
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# IDE
|
|
17
|
+
.vscode/
|
|
18
|
+
.idea/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
*~
|
|
22
|
+
|
|
23
|
+
# OS
|
|
24
|
+
.DS_Store
|
|
25
|
+
Thumbs.db
|
|
26
|
+
|
|
27
|
+
# Testing
|
|
28
|
+
.pytest_cache/
|
|
29
|
+
.coverage
|
|
30
|
+
htmlcov/
|
|
31
|
+
.mypy_cache/
|
|
32
|
+
.ruff_cache/
|
|
33
|
+
|
|
34
|
+
# Environment
|
|
35
|
+
.env
|
|
36
|
+
.env.local
|
|
37
|
+
|
|
38
|
+
# SQLite
|
|
39
|
+
*.db
|
|
40
|
+
*.sqlite3
|
|
41
|
+
|
|
42
|
+
# Driftcut run outputs
|
|
43
|
+
driftcut-results/
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.8.0] - 2026-04-02
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- Richer prompt-level failure archetypes, including semantic regressions such as `refusal_regression`, `instruction_miss`, `incomplete_answer`, and `format_drift`
|
|
13
|
+
- Per-category scorecards in decision metrics, JSON output, and HTML reports
|
|
14
|
+
- Category-aware decision reasoning and console summaries that highlight the highest-risk category
|
|
15
|
+
- 4 new tests covering richer archetypes, category scorecards, and clearer run-level reasoning (110 total)
|
|
16
|
+
|
|
17
|
+
### Changed
|
|
18
|
+
|
|
19
|
+
- Prompt evaluations now retain multiple failure archetypes instead of collapsing to a single coarse label
|
|
20
|
+
- Judge-driven regressions can now classify into more actionable semantic buckets instead of only `judge_worse`
|
|
21
|
+
- HTML examples now surface archetype summaries alongside deterministic and judge rationale
|
|
22
|
+
|
|
23
|
+
## [0.7.0] - 2026-04-02
|
|
24
|
+
|
|
25
|
+
### Added
|
|
26
|
+
|
|
27
|
+
- Optional Redis memory layer for baseline response caching and searchable run-history persistence
|
|
28
|
+
- `cache_hit`, cache summary, and saved baseline cost metrics in JSON and HTML outputs
|
|
29
|
+
- `driftcut[redis]` extra plus a Redis-enabled sample config for local testing
|
|
30
|
+
- `Dockerfile`, `docker-compose.yml`, and `.dockerignore` for reproducible local Redis-backed runs
|
|
31
|
+
- 9 new tests covering Redis config, caching behavior, reporting, and store adapters (106 total)
|
|
32
|
+
|
|
33
|
+
### Changed
|
|
34
|
+
|
|
35
|
+
- Cached baseline responses are excluded from live latency comparisons so reuse does not distort candidate latency decisions
|
|
36
|
+
- Memory-backed runs now persist canonical run payloads through the same reporting shape used for file exports
|
|
37
|
+
- README and docs now document local Docker + Redis workflows alongside the normal Python path
|
|
38
|
+
|
|
39
|
+
## [0.6.0] - 2026-04-01
|
|
40
|
+
|
|
41
|
+
### Added
|
|
42
|
+
|
|
43
|
+
- Real tiered judging with light-to-heavy escalation when light judge confidence is below threshold
|
|
44
|
+
- Configurable `tiered_escalation_threshold` in evaluation config (default 0.6)
|
|
45
|
+
- `tier` and `escalated` fields on judge results for tracking which judge tier produced the verdict
|
|
46
|
+
- `escalated_prompts` metric in decision metrics and JSON/HTML reports
|
|
47
|
+
- Split judge cost tracking: `judge_light_usd` and `judge_heavy_usd` in cost summaries
|
|
48
|
+
- Escalation threshold shown in HTML thresholds table when strategy is tiered
|
|
49
|
+
- 8 new tests for tiered escalation, config validation, cost splitting, and reporting (97 total)
|
|
50
|
+
|
|
51
|
+
### Changed
|
|
52
|
+
|
|
53
|
+
- `judge_strategy: tiered` now performs actual light-then-heavy escalation instead of aliasing to light
|
|
54
|
+
- Judge cost breakdown (light vs heavy) shown in HTML report when both tiers are used
|
|
55
|
+
- Console run summary includes escalated count when escalation occurs
|
|
56
|
+
|
|
57
|
+
## [0.5.1] - 2026-03-30
|
|
58
|
+
|
|
59
|
+
### Added
|
|
60
|
+
|
|
61
|
+
- CLI reference documentation page for `validate`, `run`, `replay`, and global flags
|
|
62
|
+
- Visible quickstart output examples across the README and docs site, including terminal and `results.json` excerpts
|
|
63
|
+
|
|
64
|
+
### Changed
|
|
65
|
+
|
|
66
|
+
- README, landing page, and docs now show the produced artifacts more concretely instead of only describing the workflow
|
|
67
|
+
- Concept documentation now reflects the shipped three-way decision engine and no longer implies category-scoped proceed decisions
|
|
68
|
+
|
|
69
|
+
### Fixed
|
|
70
|
+
|
|
71
|
+
- Live model calls now retry transient rate-limit, timeout, connection, and 5xx failures before counting them as API errors
|
|
72
|
+
- JSON exports now include per-response `retry_count` so retry behavior is auditable in saved artifacts
|
|
73
|
+
|
|
74
|
+
## [0.5.0] - 2026-03-30
|
|
75
|
+
|
|
76
|
+
### Added
|
|
77
|
+
|
|
78
|
+
- `driftcut replay --config ... --input ...` for historical paired-output backtesting
|
|
79
|
+
- Canonical replay JSON loader with versioned schema validation
|
|
80
|
+
- Replay-aware JSON and HTML reports with explicit `mode: replay` labeling
|
|
81
|
+
- Replay example assets and an external converter template under `scripts/`
|
|
82
|
+
- 8 new tests covering replay loading, CLI behavior, reporting, and live/replay parity
|
|
83
|
+
|
|
84
|
+
### Changed
|
|
85
|
+
|
|
86
|
+
- Refactored the runner so live execution and replay share the same post-processing path
|
|
87
|
+
- Replay now reuses stratified sampling, deterministic checks, judge flow, decision logic, and early-stop behavior
|
|
88
|
+
- Replay configs can omit `corpus.file`; the canonical replay input provides prompt metadata directly
|
|
89
|
+
|
|
90
|
+
## [0.4.0] - 2026-03-29
|
|
91
|
+
|
|
92
|
+
### Added
|
|
93
|
+
|
|
94
|
+
- Judge layer for ambiguous prompts with semantic verdicts, confidence, and rationale
|
|
95
|
+
- Judge-aware decision metrics, confidence scoring, and cost tracking
|
|
96
|
+
- Judge details in JSON output and HTML reports
|
|
97
|
+
- 6 new tests for judge helpers and runtime integration
|
|
98
|
+
|
|
99
|
+
### Changed
|
|
100
|
+
|
|
101
|
+
- `judge_strategy: light` is now the active default for the alpha runtime
|
|
102
|
+
- Ambiguous prompts now lower confidence until they are judged or the strategy is disabled
|
|
103
|
+
- `tiered` remains a compatibility alias for light judging until heavy escalation lands
|
|
104
|
+
|
|
105
|
+
## [0.3.0] - 2026-03-29
|
|
106
|
+
|
|
107
|
+
### Added
|
|
108
|
+
|
|
109
|
+
- Deterministic quality checks for expected output formats, required content, JSON keys, and length limits
|
|
110
|
+
- Early-stop decision engine with `STOP`, `CONTINUE`, and `PROCEED` outcomes
|
|
111
|
+
- HTML report generation alongside richer JSON output
|
|
112
|
+
- Run-level risk metrics, confidence, and failure archetype summaries
|
|
113
|
+
- 6 new tests for quality checks, reporting, and decision behavior
|
|
114
|
+
|
|
115
|
+
### Changed
|
|
116
|
+
|
|
117
|
+
- `min_batches` now actively gates early `PROCEED` decisions
|
|
118
|
+
- `risk` and most `output` settings are now active runtime behavior instead of validation-only config
|
|
119
|
+
- Public repo messaging and examples now reflect the real runtime feature set
|
|
120
|
+
|
|
121
|
+
## [0.2.2] - 2026-03-29
|
|
122
|
+
|
|
123
|
+
### Changed
|
|
124
|
+
|
|
125
|
+
- Raised the repository quality bar with enforced `mypy` checks in CI and a clean strict-typing pass
|
|
126
|
+
- Stabilized sampled batch result ordering and refreshed the public alpha messaging across the app site
|
|
127
|
+
|
|
128
|
+
### Fixed
|
|
129
|
+
|
|
130
|
+
- Preserved successful model responses when pricing metadata is unavailable
|
|
131
|
+
- Aligned the shipped app version across package metadata, CLI output, changelog, and landing page badge
|
|
132
|
+
|
|
133
|
+
## [0.2.1] - 2026-03-29
|
|
134
|
+
|
|
135
|
+
### Added
|
|
136
|
+
|
|
137
|
+
- `api_base` field on model config for custom endpoints (Azure, proxies, self-hosted)
|
|
138
|
+
- OpenRouter support documented and tested
|
|
139
|
+
- Multi-provider config examples (same-provider, OpenRouter, custom endpoint)
|
|
140
|
+
- 5 new tests (66 total)
|
|
141
|
+
|
|
142
|
+
## [0.2.0] - 2026-03-29
|
|
143
|
+
|
|
144
|
+
### Added
|
|
145
|
+
|
|
146
|
+
- Async model execution via LiteLLM (`executor.py`)
|
|
147
|
+
- Latency tracker with p50/p95 per category (`trackers.py`)
|
|
148
|
+
- Cost tracker with per-prompt and cumulative spend (`trackers.py`)
|
|
149
|
+
- Migration runner with concurrent batch execution (`runner.py`)
|
|
150
|
+
- `driftcut run --config` command — fully wired end-to-end
|
|
151
|
+
- JSON results export to `driftcut-results/results.json`
|
|
152
|
+
- Result data models: `ModelResponse`, `PromptResult`, `BatchResult` (`models.py`)
|
|
153
|
+
- Rich progress bars during batch execution
|
|
154
|
+
- 26 new tests (61 total)
|
|
155
|
+
|
|
156
|
+
## [0.1.0] - 2026-03-28
|
|
157
|
+
|
|
158
|
+
### Added
|
|
159
|
+
|
|
160
|
+
- YAML config loading and validation with Pydantic models
|
|
161
|
+
- Corpus loading from CSV and JSON with full validation
|
|
162
|
+
- Stratified batch sampler (high-criticality prioritized in early batches)
|
|
163
|
+
- `driftcut validate --config` command with Rich terminal output
|
|
164
|
+
- CI pipeline (ruff lint + format + pytest on Python 3.12 & 3.13)
|
|
165
|
+
- Pre-launch landing page at driftcut.dev
|
|
166
|
+
- Documentation site at docs.driftcut.dev
|
|
167
|
+
- 35 tests covering config, corpus, sampler, and CLI
|
|
168
|
+
|
|
169
|
+
[0.8.0]: https://github.com/riccardomerenda/driftcut/compare/v0.7.0...v0.8.0
|
|
170
|
+
[0.7.0]: https://github.com/riccardomerenda/driftcut/compare/v0.6.0...v0.7.0
|
|
171
|
+
[0.6.0]: https://github.com/riccardomerenda/driftcut/compare/v0.5.1...v0.6.0
|
|
172
|
+
[0.5.1]: https://github.com/riccardomerenda/driftcut/compare/v0.5.0...v0.5.1
|
|
173
|
+
[0.5.0]: https://github.com/riccardomerenda/driftcut/compare/v0.4.0...v0.5.0
|
|
174
|
+
[0.4.0]: https://github.com/riccardomerenda/driftcut/compare/v0.3.0...v0.4.0
|
|
175
|
+
[0.3.0]: https://github.com/riccardomerenda/driftcut/compare/v0.2.2...v0.3.0
|
|
176
|
+
[0.2.2]: https://github.com/riccardomerenda/driftcut/compare/v0.2.1...v0.2.2
|
|
177
|
+
[0.2.1]: https://github.com/riccardomerenda/driftcut/compare/v0.2.0...v0.2.1
|
|
178
|
+
[0.2.0]: https://github.com/riccardomerenda/driftcut/compare/v0.1.0...v0.2.0
|
|
179
|
+
[0.1.0]: https://github.com/riccardomerenda/driftcut/releases/tag/v0.1.0
|
driftcut-0.8.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## What is this?
|
|
6
|
+
|
|
7
|
+
Driftcut is an early-stop canary testing tool for LLM model migrations. It samples strategically from a prompt corpus, compares baseline and candidate models, runs deterministic checks first, escalates only ambiguous prompts to a judge model, and outputs a STOP/CONTINUE/PROCEED decision — all before committing to a full evaluation run.
|
|
8
|
+
|
|
9
|
+
## Commands
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install -e ".[dev]" # Install in dev mode
|
|
13
|
+
pytest # Run all tests
|
|
14
|
+
pytest tests/test_runner.py # Run a single test file
|
|
15
|
+
pytest tests/test_runner.py -k "test_name" # Run a single test
|
|
16
|
+
ruff check src tests # Lint
|
|
17
|
+
ruff format src tests # Format
|
|
18
|
+
mypy src tests # Type check (same targets as CI)
|
|
19
|
+
driftcut validate --config migration.yaml # Validate config + corpus
|
|
20
|
+
driftcut run --config migration.yaml # Run a migration test
|
|
21
|
+
driftcut replay --config replay.yaml --input replay.json # Replay historical outputs
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Architecture
|
|
25
|
+
|
|
26
|
+
### Data flow
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
CLI (cli.py) loads config + corpus
|
|
30
|
+
→ StratifiedSampler yields Batch objects (criticality-first)
|
|
31
|
+
→ runner.py orchestrates batch-by-batch execution
|
|
32
|
+
→ executor.py runs baseline + candidate concurrently per prompt (asyncio.gather)
|
|
33
|
+
→ quality.py runs deterministic checks on every response (free)
|
|
34
|
+
→ judge.py is called only for ambiguous prompts (both responses pass the deterministic checks but differ semantically)
|
|
35
|
+
→ decision.py evaluates STOP/CONTINUE/PROCEED after each batch
|
|
36
|
+
→ reporting.py writes JSON + HTML to driftcut-results/
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Replay mode substitutes pre-recorded responses for live execution but uses the same evaluation pipeline.
|
|
40
|
+
|
|
41
|
+
### Module roles
|
|
42
|
+
|
|
43
|
+
| Module | Role |
|
|
44
|
+
|---|---|
|
|
45
|
+
| `cli.py` | Typer commands: `run`, `validate`, `replay` |
|
|
46
|
+
| `config.py` | Pydantic models for YAML config with constrained fields |
|
|
47
|
+
| `corpus.py` | CSV/JSON loader → `PromptRecord` list with flexible delimiter parsing (`\|` or `;`) |
|
|
48
|
+
| `sampler.py` | `StratifiedSampler` iterator — yields equal-sized batches, pre-sorted by criticality within category |
|
|
49
|
+
| `runner.py` | Async orchestrator — batch loop, wires executor → quality → judge → decision |
|
|
50
|
+
| `executor.py` | Async LiteLLM wrapper with retry (3 attempts, exponential backoff for rate limits/5xx/timeouts) |
|
|
51
|
+
| `quality.py` | Deterministic checks driven by `expected_output_type`: JSON validity, required keys, substrings, length |
|
|
52
|
+
| `judge.py` | Sends ambiguous prompt pairs to judge model, extracts JSON verdict from freeform responses |
|
|
53
|
+
| `decision.py` | Decision engine: hard stops on schema breaks/high-crit failures, proceed gate on overall risk + latency |
|
|
54
|
+
| `trackers.py` | Cost and latency accumulators (cost gracefully handles missing LiteLLM pricing) |
|
|
55
|
+
| `models.py` | Frozen dataclasses: `ModelResponse`, `PromptEvaluation`, `JudgeResult`, `RunDecision`, `RunResult` |
|
|
56
|
+
| `reporting.py` | JSON serialization + HTML report with failure archetypes |
|
|
57
|
+
| `replay.py` | Replay-specific data models for loading historical paired outputs |
|
|
58
|
+
|
|
59
|
+
### Decision engine (`decision.py`)
|
|
60
|
+
|
|
61
|
+
Evaluated after every batch:
|
|
62
|
+
|
|
63
|
+
1. **Hard stops** (checked first): `schema_break_rate >= 0.25` or `high_criticality_failure_rate >= 0.20`
|
|
64
|
+
2. **Proceed gate**: `overall_risk < 0.08` AND latency ratios within bounds AND `min_batches` reached
|
|
65
|
+
3. **Continue**: neither triggered and batches remain
|
|
66
|
+
4. **Final stop**: budget exhausted without proceeding
|
|
67
|
+
|
|
68
|
+
Overall risk is a weighted average of regression rate, failure rate, schema breaks, high-criticality failures (weighted 2x), and latency penalty.
|
|
69
|
+
|
|
70
|
+
### Concurrency model
|
|
71
|
+
|
|
72
|
+
- Prompts within a batch run concurrently via `asyncio.as_completed()`
|
|
73
|
+
- Baseline and candidate for each prompt run in parallel via `asyncio.gather()`
|
|
74
|
+
- Results are re-sorted by original index after completion (preserves deterministic order)
|
|
75
|
+
- Judge calls are sequential per prompt
|
|
76
|
+
|
|
77
|
+
## Key conventions
|
|
78
|
+
|
|
79
|
+
- **src layout**: imports are `from driftcut.xxx import yyy`
|
|
80
|
+
- **Config validation at load time**: Pydantic fields have `ge`/`le` constraints; invalid configs fail immediately
|
|
81
|
+
- **`expected_output_type` drives evaluation**: `"json"` checks validity + keys, `"labels"` parses as list, `"free_text"`/`"markdown"` check substrings + length only
|
|
82
|
+
- **Judge strategy enum**: `"none"` (deterministic only), `"light"` (cheap model), `"heavy"` (expensive model), `"tiered"` (light first, escalates to heavy when confidence < `tiered_escalation_threshold`)
|
|
83
|
+
- **Cost error tolerance**: if LiteLLM can't price a model, run continues with `cost_usd=0.0` and `cost_error` stores the message
|
|
84
|
+
- **Conservative defaults**: the decision engine favors false negatives (a wrong STOP on a migration that was actually safe) over false positives (a wrong PROCEED on a migration that has regressed)
|
|
85
|
+
- **B008 suppressed in cli.py**: `typer.Option()` in function defaults is idiomatic Typer
|
|
86
|
+
- **ruff line length**: 100 chars, target Python 3.12
|
|
87
|
+
- **mypy strict mode** enabled
|
|
88
|
+
- **pytest-asyncio auto mode**: async test functions are auto-detected
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
FROM python:3.13-slim
|
|
2
|
+
|
|
3
|
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
4
|
+
PYTHONUNBUFFERED=1 \
|
|
5
|
+
PIP_NO_CACHE_DIR=1
|
|
6
|
+
|
|
7
|
+
WORKDIR /workspace
|
|
8
|
+
|
|
9
|
+
COPY . /workspace
|
|
10
|
+
|
|
11
|
+
RUN python -m pip install --upgrade pip \
|
|
12
|
+
&& python -m pip install -e ".[dev,redis]"
|
|
13
|
+
|
|
14
|
+
CMD ["driftcut", "--help"]
|
driftcut-0.8.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Riccardo Merenda
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|