driftcut 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. driftcut-0.8.0/.dockerignore +14 -0
  2. driftcut-0.8.0/.github/workflows/ci.yml +48 -0
  3. driftcut-0.8.0/.github/workflows/pages.yml +35 -0
  4. driftcut-0.8.0/.github/workflows/publish.yml +75 -0
  5. driftcut-0.8.0/.github/workflows/release-audit.yml +21 -0
  6. driftcut-0.8.0/.gitignore +43 -0
  7. driftcut-0.8.0/CHANGELOG.md +179 -0
  8. driftcut-0.8.0/CLAUDE.md +88 -0
  9. driftcut-0.8.0/Dockerfile +14 -0
  10. driftcut-0.8.0/LICENSE +21 -0
  11. driftcut-0.8.0/PKG-INFO +537 -0
  12. driftcut-0.8.0/README.md +497 -0
  13. driftcut-0.8.0/assets/logo.svg +4 -0
  14. driftcut-0.8.0/docker-compose.yml +25 -0
  15. driftcut-0.8.0/docs/concept-document-v0.3.md +605 -0
  16. driftcut-0.8.0/docs/demo-report.html +523 -0
  17. driftcut-0.8.0/docs/redis-recall-v0.7.0-plan.md +428 -0
  18. driftcut-0.8.0/examples/migration.redis.yaml +54 -0
  19. driftcut-0.8.0/examples/migration.yaml +68 -0
  20. driftcut-0.8.0/examples/migration_test_openrouter.yaml +66 -0
  21. driftcut-0.8.0/examples/prompts.csv +31 -0
  22. driftcut-0.8.0/examples/replay.json +58 -0
  23. driftcut-0.8.0/examples/replay.yaml +39 -0
  24. driftcut-0.8.0/pyproject.toml +88 -0
  25. driftcut-0.8.0/scripts/check_release_alignment.py +141 -0
  26. driftcut-0.8.0/scripts/check_release_state.py +172 -0
  27. driftcut-0.8.0/scripts/replay_converter_template.py +60 -0
  28. driftcut-0.8.0/site/CNAME +1 -0
  29. driftcut-0.8.0/site/index.html +691 -0
  30. driftcut-0.8.0/src/driftcut/__init__.py +3 -0
  31. driftcut-0.8.0/src/driftcut/__main__.py +5 -0
  32. driftcut-0.8.0/src/driftcut/cli.py +265 -0
  33. driftcut-0.8.0/src/driftcut/config.py +108 -0
  34. driftcut-0.8.0/src/driftcut/corpus.py +146 -0
  35. driftcut-0.8.0/src/driftcut/decision.py +470 -0
  36. driftcut-0.8.0/src/driftcut/executor.py +175 -0
  37. driftcut-0.8.0/src/driftcut/judge.py +423 -0
  38. driftcut-0.8.0/src/driftcut/models.py +197 -0
  39. driftcut-0.8.0/src/driftcut/quality.py +152 -0
  40. driftcut-0.8.0/src/driftcut/replay.py +177 -0
  41. driftcut-0.8.0/src/driftcut/reporting.py +633 -0
  42. driftcut-0.8.0/src/driftcut/runner.py +470 -0
  43. driftcut-0.8.0/src/driftcut/sampler.py +96 -0
  44. driftcut-0.8.0/src/driftcut/store.py +72 -0
  45. driftcut-0.8.0/src/driftcut/store_null.py +40 -0
  46. driftcut-0.8.0/src/driftcut/store_redis.py +225 -0
  47. driftcut-0.8.0/src/driftcut/trackers.py +135 -0
  48. driftcut-0.8.0/tests/__init__.py +0 -0
  49. driftcut-0.8.0/tests/test_cli.py +106 -0
  50. driftcut-0.8.0/tests/test_config.py +163 -0
  51. driftcut-0.8.0/tests/test_corpus.py +103 -0
  52. driftcut-0.8.0/tests/test_executor.py +290 -0
  53. driftcut-0.8.0/tests/test_judge.py +193 -0
  54. driftcut-0.8.0/tests/test_models.py +166 -0
  55. driftcut-0.8.0/tests/test_quality.py +85 -0
  56. driftcut-0.8.0/tests/test_replay.py +196 -0
  57. driftcut-0.8.0/tests/test_reporting.py +183 -0
  58. driftcut-0.8.0/tests/test_runner.py +373 -0
  59. driftcut-0.8.0/tests/test_sampler.py +88 -0
  60. driftcut-0.8.0/tests/test_store.py +108 -0
  61. driftcut-0.8.0/tests/test_trackers.py +195 -0
@@ -0,0 +1,14 @@
1
+ .git
2
+ .github
3
+ .mypy_cache
4
+ .pytest_cache
5
+ .ruff_cache
6
+ .tmp
7
+ .venv
8
+ driftcut-results
9
+ site
10
+ docs-live.html
11
+ pytest-cache-files-*
12
+ pytest-temp-*
13
+ __pycache__
14
+ *.pyc
@@ -0,0 +1,48 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v5
14
+ - uses: actions/setup-python@v6
15
+ with:
16
+ python-version: "3.12"
17
+ - run: pip install -e ".[dev]"
18
+ - run: python scripts/check_release_alignment.py
19
+ - run: ruff check src tests
20
+ - run: ruff format --check src tests
21
+ - run: mypy src tests
22
+
23
+ test:
24
+ runs-on: ubuntu-latest
25
+ strategy:
26
+ matrix:
27
+ python-version: ["3.12", "3.13"]
28
+ steps:
29
+ - uses: actions/checkout@v5
30
+ - uses: actions/setup-python@v6
31
+ with:
32
+ python-version: ${{ matrix.python-version }}
33
+ - run: pip install -e ".[dev]"
34
+ - run: pytest -v
35
+
36
+ package:
37
+ runs-on: ubuntu-latest
38
+ steps:
39
+ - uses: actions/checkout@v5
40
+ - uses: actions/setup-python@v6
41
+ with:
42
+ python-version: "3.13"
43
+ - run: pip install -e ".[dev]"
44
+ - run: python -m build
45
+ - run: python -m twine check dist/*
46
+ - run: python -m venv .pkg-venv
47
+ - run: .pkg-venv/bin/pip install dist/*.whl
48
+ - run: .pkg-venv/bin/driftcut --version
@@ -0,0 +1,35 @@
1
+ name: Deploy landing page
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths:
7
+ - 'site/**'
8
+ workflow_dispatch:
9
+
10
+ permissions:
11
+ contents: read
12
+ pages: write
13
+ id-token: write
14
+
15
+ concurrency:
16
+ group: pages
17
+ cancel-in-progress: true
18
+
19
+ env:
20
+ FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
21
+
22
+ jobs:
23
+ deploy:
24
+ environment:
25
+ name: github-pages
26
+ url: ${{ steps.deployment.outputs.page_url }}
27
+ runs-on: ubuntu-latest
28
+ steps:
29
+ - uses: actions/checkout@v5
30
+ - uses: actions/configure-pages@v5
31
+ - uses: actions/upload-pages-artifact@v4
32
+ with:
33
+ path: site
34
+ - id: deployment
35
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,75 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types:
6
+ - published
7
+ workflow_dispatch:
8
+ inputs:
9
+ ref:
10
+ description: Git ref to build and publish, for example main or v0.8.0
11
+ required: true
12
+ default: main
13
+
14
+ concurrency:
15
+ group: pypi-publish-${{ github.event.release.tag_name || github.event.inputs.ref || github.ref_name }}
16
+ cancel-in-progress: false
17
+
18
+ jobs:
19
+ build-distributions:
20
+ name: Build distributions
21
+ if: github.event_name != 'release' || github.event.release.prerelease == false
22
+ runs-on: ubuntu-latest
23
+ permissions:
24
+ contents: read
25
+
26
+ steps:
27
+ - name: Check out repository
28
+ uses: actions/checkout@v5
29
+ with:
30
+ ref: ${{ github.event.inputs.ref || github.ref }}
31
+
32
+ - name: Set up Python
33
+ uses: actions/setup-python@v6
34
+ with:
35
+ python-version: "3.13"
36
+
37
+ - name: Install build tooling
38
+ run: |
39
+ python -m pip install --upgrade pip
40
+ python -m pip install build twine
41
+
42
+ - name: Build distributions
43
+ run: python -m build
44
+
45
+ - name: Validate distribution metadata
46
+ run: python -m twine check dist/*
47
+
48
+ - name: Upload built distributions
49
+ uses: actions/upload-artifact@v4
50
+ with:
51
+ name: python-package-distributions
52
+ path: dist/
53
+ if-no-files-found: error
54
+
55
+ publish-to-pypi:
56
+ name: Publish to PyPI
57
+ if: github.event_name != 'release' || github.event.release.prerelease == false
58
+ needs:
59
+ - build-distributions
60
+ runs-on: ubuntu-latest
61
+ environment:
62
+ name: pypi
63
+ url: https://pypi.org/project/driftcut/
64
+ permissions:
65
+ id-token: write
66
+
67
+ steps:
68
+ - name: Download distributions
69
+ uses: actions/download-artifact@v4
70
+ with:
71
+ name: python-package-distributions
72
+ path: dist/
73
+
74
+ - name: Publish package distributions to PyPI
75
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,21 @@
1
+ name: Release Audit
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ permissions:
7
+ contents: read
8
+
9
+ jobs:
10
+ audit:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v5
14
+
15
+ - uses: actions/setup-python@v6
16
+ with:
17
+ python-version: "3.12"
18
+
19
+ - run: python scripts/check_release_alignment.py
20
+
21
+ - run: python scripts/check_release_state.py --retries 6 --delay-seconds 10
@@ -0,0 +1,43 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ *.egg
9
+ .eggs/
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # IDE
17
+ .vscode/
18
+ .idea/
19
+ *.swp
20
+ *.swo
21
+ *~
22
+
23
+ # OS
24
+ .DS_Store
25
+ Thumbs.db
26
+
27
+ # Testing
28
+ .pytest_cache/
29
+ .coverage
30
+ htmlcov/
31
+ .mypy_cache/
32
+ .ruff_cache/
33
+
34
+ # Environment
35
+ .env
36
+ .env.local
37
+
38
+ # SQLite
39
+ *.db
40
+ *.sqlite3
41
+
42
+ # Driftcut run outputs
43
+ driftcut-results/
@@ -0,0 +1,179 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.8.0] - 2026-04-02
9
+
10
+ ### Added
11
+
12
+ - Richer prompt-level failure archetypes, including semantic regressions such as `refusal_regression`, `instruction_miss`, `incomplete_answer`, and `format_drift`
13
+ - Per-category scorecards in decision metrics, JSON output, and HTML reports
14
+ - Category-aware decision reasoning and console summaries that highlight the highest-risk category
15
+ - 4 new tests covering richer archetypes, category scorecards, and clearer run-level reasoning (110 total)
16
+
17
+ ### Changed
18
+
19
+ - Prompt evaluations now retain multiple failure archetypes instead of collapsing to a single coarse label
20
+ - Judge-driven regressions can now classify into more actionable semantic buckets instead of only `judge_worse`
21
+ - HTML examples now surface archetype summaries alongside deterministic and judge rationale
22
+
23
+ ## [0.7.0] - 2026-04-02
24
+
25
+ ### Added
26
+
27
+ - Optional Redis memory layer for baseline response caching and searchable run-history persistence
28
+ - `cache_hit`, cache summary, and saved baseline cost metrics in JSON and HTML outputs
29
+ - `driftcut[redis]` extra plus a Redis-enabled sample config for local testing
30
+ - `Dockerfile`, `docker-compose.yml`, and `.dockerignore` for reproducible local Redis-backed runs
31
+ - 9 new tests covering Redis config, caching behavior, reporting, and store adapters (106 total)
32
+
33
+ ### Changed
34
+
35
+ - Cached baseline responses are excluded from live latency comparisons so reuse does not distort candidate latency decisions
36
+ - Memory-backed runs now persist canonical run payloads through the same reporting shape used for file exports
37
+ - README and docs now document local Docker + Redis workflows alongside the normal Python path
38
+
39
+ ## [0.6.0] - 2026-04-01
40
+
41
+ ### Added
42
+
43
+ - Real tiered judging with light-to-heavy escalation when light judge confidence is below threshold
44
+ - Configurable `tiered_escalation_threshold` in evaluation config (default 0.6)
45
+ - `tier` and `escalated` fields on judge results for tracking which judge tier produced the verdict
46
+ - `escalated_prompts` metric in decision metrics and JSON/HTML reports
47
+ - Split judge cost tracking: `judge_light_usd` and `judge_heavy_usd` in cost summaries
48
+ - Escalation threshold shown in HTML thresholds table when strategy is tiered
49
+ - 8 new tests for tiered escalation, config validation, cost splitting, and reporting (97 total)
50
+
51
+ ### Changed
52
+
53
+ - `judge_strategy: tiered` now performs actual light-then-heavy escalation instead of aliasing to light
54
+ - Judge cost breakdown (light vs heavy) shown in HTML report when both tiers are used
55
+ - Console run summary includes escalated count when escalation occurs
56
+
57
+ ## [0.5.1] - 2026-03-30
58
+
59
+ ### Added
60
+
61
+ - CLI reference documentation page for `validate`, `run`, `replay`, and global flags
62
+ - Visible quickstart output examples across the README and docs site, including terminal and `results.json` excerpts
63
+
64
+ ### Changed
65
+
66
+ - README, landing page, and docs now show the produced artifacts more concretely instead of only describing the workflow
67
+ - Concept documentation now reflects the shipped three-way decision engine and no longer implies category-scoped proceed decisions
68
+
69
+ ### Fixed
70
+
71
+ - Live model calls now retry transient rate-limit, timeout, connection, and 5xx failures before counting them as API errors
72
+ - JSON exports now include per-response `retry_count` so retry behavior is auditable in saved artifacts
73
+
74
+ ## [0.5.0] - 2026-03-30
75
+
76
+ ### Added
77
+
78
+ - `driftcut replay --config ... --input ...` for historical paired-output backtesting
79
+ - Canonical replay JSON loader with versioned schema validation
80
+ - Replay-aware JSON and HTML reports with explicit `mode: replay` labeling
81
+ - Replay example assets and an external converter template under `scripts/`
82
+ - 8 new tests covering replay loading, CLI behavior, reporting, and live/replay parity
83
+
84
+ ### Changed
85
+
86
+ - Refactored the runner so live execution and replay share the same post-processing path
87
+ - Replay now reuses stratified sampling, deterministic checks, judge flow, decision logic, and early-stop behavior
88
+ - Replay configs can omit `corpus.file`; the canonical replay input provides prompt metadata directly
89
+
90
+ ## [0.4.0] - 2026-03-29
91
+
92
+ ### Added
93
+
94
+ - Judge layer for ambiguous prompts with semantic verdicts, confidence, and rationale
95
+ - Judge-aware decision metrics, confidence scoring, and cost tracking
96
+ - Judge details in JSON output and HTML reports
97
+ - 6 new tests for judge helpers and runtime integration
98
+
99
+ ### Changed
100
+
101
+ - `judge_strategy: light` is now the active default for the alpha runtime
102
+ - Ambiguous prompts now lower confidence until they are judged or the strategy is disabled
103
+ - `tiered` remains a compatibility alias for light judging until heavy escalation lands
104
+
105
+ ## [0.3.0] - 2026-03-29
106
+
107
+ ### Added
108
+
109
+ - Deterministic quality checks for expected output formats, required content, JSON keys, and length limits
110
+ - Early-stop decision engine with `STOP`, `CONTINUE`, and `PROCEED` outcomes
111
+ - HTML report generation alongside richer JSON output
112
+ - Run-level risk metrics, confidence, and failure archetype summaries
113
+ - 6 new tests for quality checks, reporting, and decision behavior
114
+
115
+ ### Changed
116
+
117
+ - `min_batches` now actively gates early `PROCEED` decisions
118
+ - `risk` and most `output` settings are now active runtime behavior instead of validation-only config
119
+ - Public repo messaging and examples now reflect the real runtime feature set
120
+
121
+ ## [0.2.2] - 2026-03-29
122
+
123
+ ### Changed
124
+
125
+ - Raised the repository quality bar with enforced `mypy` checks in CI and a clean strict-typing pass
126
+ - Stabilized sampled batch result ordering and refreshed the public alpha messaging across the app site
127
+
128
+ ### Fixed
129
+
130
+ - Preserved successful model responses when pricing metadata is unavailable
131
+ - Aligned the shipped app version across package metadata, CLI output, changelog, and landing page badge
132
+
133
+ ## [0.2.1] - 2026-03-29
134
+
135
+ ### Added
136
+
137
+ - `api_base` field on model config for custom endpoints (Azure, proxies, self-hosted)
138
+ - OpenRouter support documented and tested
139
+ - Multi-provider config examples (same-provider, OpenRouter, custom endpoint)
140
+ - 5 new tests (66 total)
141
+
142
+ ## [0.2.0] - 2026-03-29
143
+
144
+ ### Added
145
+
146
+ - Async model execution via LiteLLM (`executor.py`)
147
+ - Latency tracker with p50/p95 per category (`trackers.py`)
148
+ - Cost tracker with per-prompt and cumulative spend (`trackers.py`)
149
+ - Migration runner with concurrent batch execution (`runner.py`)
150
+ - `driftcut run --config` command — fully wired end-to-end
151
+ - JSON results export to `driftcut-results/results.json`
152
+ - Result data models: `ModelResponse`, `PromptResult`, `BatchResult` (`models.py`)
153
+ - Rich progress bars during batch execution
154
+ - 26 new tests (61 total)
155
+
156
+ ## [0.1.0] - 2026-03-28
157
+
158
+ ### Added
159
+
160
+ - YAML config loading and validation with Pydantic models
161
+ - Corpus loading from CSV and JSON with full validation
162
+ - Stratified batch sampler (high-criticality prioritized in early batches)
163
+ - `driftcut validate --config` command with Rich terminal output
164
+ - CI pipeline (ruff lint + format + pytest on Python 3.12 & 3.13)
165
+ - Pre-launch landing page at driftcut.dev
166
+ - Documentation site at docs.driftcut.dev
167
+ - 35 tests covering config, corpus, sampler, and CLI
168
+
169
+ [0.8.0]: https://github.com/riccardomerenda/driftcut/compare/v0.7.0...v0.8.0
170
+ [0.7.0]: https://github.com/riccardomerenda/driftcut/compare/v0.6.0...v0.7.0
171
+ [0.6.0]: https://github.com/riccardomerenda/driftcut/compare/v0.5.1...v0.6.0
172
+ [0.5.1]: https://github.com/riccardomerenda/driftcut/compare/v0.5.0...v0.5.1
173
+ [0.5.0]: https://github.com/riccardomerenda/driftcut/compare/v0.4.0...v0.5.0
174
+ [0.4.0]: https://github.com/riccardomerenda/driftcut/compare/v0.3.0...v0.4.0
175
+ [0.3.0]: https://github.com/riccardomerenda/driftcut/compare/v0.2.2...v0.3.0
176
+ [0.2.2]: https://github.com/riccardomerenda/driftcut/compare/v0.2.1...v0.2.2
177
+ [0.2.1]: https://github.com/riccardomerenda/driftcut/compare/v0.2.0...v0.2.1
178
+ [0.2.0]: https://github.com/riccardomerenda/driftcut/compare/v0.1.0...v0.2.0
179
+ [0.1.0]: https://github.com/riccardomerenda/driftcut/releases/tag/v0.1.0
@@ -0,0 +1,88 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## What is this?
6
+
7
+ Driftcut is an early-stop canary testing tool for LLM model migrations. It samples strategically from a prompt corpus, compares baseline and candidate models, runs deterministic checks first, escalates only ambiguous prompts to a judge model, and outputs a STOP/CONTINUE/PROCEED decision — all before committing to a full evaluation run.
8
+
9
+ ## Commands
10
+
11
+ ```bash
12
+ pip install -e ".[dev]" # Install in dev mode
13
+ pytest # Run all tests
14
+ pytest tests/test_runner.py # Run a single test file
15
+ pytest tests/test_runner.py -k "test_name" # Run a single test
16
+ ruff check src tests # Lint
17
+ ruff format src tests # Format
18
+ mypy src tests # Type check (same targets as the CI lint job)
19
+ driftcut validate --config migration.yaml # Validate config + corpus
20
+ driftcut run --config migration.yaml # Run a migration test
21
+ driftcut replay --config replay.yaml --input replay.json # Replay historical outputs
22
+ ```
23
+
24
+ ## Architecture
25
+
26
+ ### Data flow
27
+
28
+ ```
29
+ CLI (cli.py) loads config + corpus
30
+ → StratifiedSampler yields Batch objects (criticality-first)
31
+ → runner.py orchestrates batch-by-batch execution
32
+ → executor.py runs baseline + candidate concurrently per prompt (asyncio.gather)
33
+ → quality.py runs deterministic checks on every response (free)
34
+ → judge.py called only for ambiguous prompts (both responses pass the deterministic checks but differ semantically)
35
+ → decision.py evaluates STOP/CONTINUE/PROCEED after each batch
36
+ → reporting.py writes JSON + HTML to driftcut-results/
37
+ ```
38
+
39
+ Replay mode substitutes pre-recorded responses for live execution but uses the same evaluation pipeline.
40
+
41
+ ### Module roles
42
+
43
+ | Module | Role |
44
+ |---|---|
45
+ | `cli.py` | Typer commands: `run`, `validate`, `replay` |
46
+ | `config.py` | Pydantic models for YAML config with constrained fields |
47
+ | `corpus.py` | CSV/JSON loader → `PromptRecord` list with flexible delimiter parsing (`\|` or `;`) |
48
+ | `sampler.py` | `StratifiedSampler` iterator — yields equal-sized batches, pre-sorted by criticality within category |
49
+ | `runner.py` | Async orchestrator — batch loop, wires executor → quality → judge → decision |
50
+ | `executor.py` | Async LiteLLM wrapper with retry (3 attempts, exponential backoff for rate limits/timeouts/connection errors/5xx) |
51
+ | `quality.py` | Deterministic checks driven by `expected_output_type`: JSON validity, required keys, substrings, length |
52
+ | `judge.py` | Sends ambiguous prompt pairs to judge model, extracts JSON verdict from freeform responses |
53
+ | `decision.py` | Decision engine: hard stops on schema breaks/high-crit failures, proceed gate on overall risk + latency |
54
+ | `trackers.py` | Cost and latency accumulators (cost gracefully handles missing LiteLLM pricing) |
55
+ | `models.py` | Frozen dataclasses: `ModelResponse`, `PromptEvaluation`, `JudgeResult`, `RunDecision`, `RunResult` |
56
+ | `reporting.py` | JSON serialization + HTML report with failure archetypes |
57
+ | `replay.py` | Replay-specific data models for loading historical paired outputs |
58
+
59
+ ### Decision engine (`decision.py`)
60
+
61
+ Evaluated after every batch:
62
+
63
+ 1. **Hard stops** (checked first): `schema_break_rate >= 0.25` or `high_criticality_failure_rate >= 0.20`
64
+ 2. **Proceed gate**: `overall_risk < 0.08` AND latency ratios within bounds AND `min_batches` reached
65
+ 3. **Continue**: neither triggered and batches remain
66
+ 4. **Final stop**: budget exhausted without proceeding
67
+
68
+ Overall risk is a weighted average of regression rate, failure rate, schema breaks, high-criticality failures (weighted 2x), and latency penalty.
69
+
70
+ ### Concurrency model
71
+
72
+ - Prompts within a batch run concurrently via `asyncio.as_completed()`
73
+ - Baseline and candidate for each prompt run in parallel via `asyncio.gather()`
74
+ - Results are re-sorted by original index after completion (preserves deterministic order)
75
+ - Judge calls are sequential per prompt
76
+
77
+ ## Key conventions
78
+
79
+ - **src layout**: imports are `from driftcut.xxx import yyy`
80
+ - **Config validation at load time**: Pydantic fields have `ge`/`le` constraints; invalid configs fail immediately
81
+ - **`expected_output_type` drives evaluation**: `"json"` checks validity + keys, `"labels"` parses as list, `"free_text"`/`"markdown"` check substrings + length only
82
+ - **Judge strategy enum**: `"none"` (deterministic only), `"light"` (cheap model), `"heavy"` (expensive model), `"tiered"` (light first, escalates to heavy when confidence < `tiered_escalation_threshold`)
83
+ - **Cost error tolerance**: if LiteLLM can't price a model, run continues with `cost_usd=0.0` and `cost_error` stores the message
84
+ - **Conservative defaults**: the decision engine prefers a false negative (saying STOP when the migration was actually safe) over a false positive (saying PROCEED when it was not)
85
+ - **B008 suppressed in cli.py**: `typer.Option()` in function defaults is idiomatic Typer
86
+ - **ruff line length**: 100 chars, target Python 3.12
87
+ - **mypy strict mode** enabled
88
+ - **pytest-asyncio auto mode**: async test functions are auto-detected
@@ -0,0 +1,14 @@
1
+ FROM python:3.13-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1
6
+
7
+ WORKDIR /workspace
8
+
9
+ COPY . /workspace
10
+
11
+ RUN python -m pip install --upgrade pip \
12
+ && python -m pip install -e ".[dev,redis]"
13
+
14
+ CMD ["driftcut", "--help"]
driftcut-0.8.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Riccardo Merenda
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.