inject-lock 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inject_lock-0.1.0/.github/workflows/ci.yml +59 -0
- inject_lock-0.1.0/.github/workflows/publish.yml +35 -0
- inject_lock-0.1.0/.gitignore +12 -0
- inject_lock-0.1.0/LICENSE +21 -0
- inject_lock-0.1.0/PKG-INFO +278 -0
- inject_lock-0.1.0/README.md +244 -0
- inject_lock-0.1.0/action.yml +59 -0
- inject_lock-0.1.0/pyproject.toml +71 -0
- inject_lock-0.1.0/src/prompt_lock/__init__.py +8 -0
- inject_lock-0.1.0/src/prompt_lock/cli.py +494 -0
- inject_lock-0.1.0/src/prompt_lock/config.py +79 -0
- inject_lock-0.1.0/src/prompt_lock/detector.py +63 -0
- inject_lock-0.1.0/src/prompt_lock/gate.py +80 -0
- inject_lock-0.1.0/src/prompt_lock/judge/__init__.py +1 -0
- inject_lock-0.1.0/src/prompt_lock/judge/calibrate.py +160 -0
- inject_lock-0.1.0/src/prompt_lock/judge/llm.py +64 -0
- inject_lock-0.1.0/src/prompt_lock/runner.py +221 -0
- inject_lock-0.1.0/src/prompt_lock/tracer.py +224 -0
- inject_lock-0.1.0/tests/__init__.py +0 -0
- inject_lock-0.1.0/tests/test_config.py +76 -0
- inject_lock-0.1.0/tests/test_gate.py +100 -0
- inject_lock-0.1.0/tests/test_runner.py +77 -0
- inject_lock-0.1.0/tests/test_tracer.py +103 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install uv
|
|
25
|
+
run: pip install uv
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: uv pip install -e ".[dev]" --system
|
|
29
|
+
|
|
30
|
+
- name: Lint
|
|
31
|
+
run: ruff check src/
|
|
32
|
+
|
|
33
|
+
- name: Type check
|
|
34
|
+
run: mypy src/prompt_lock --ignore-missing-imports || true
|
|
35
|
+
|
|
36
|
+
- name: Run tests
|
|
37
|
+
run: pytest tests/ -v --cov=prompt_lock --cov-report=term-missing
|
|
38
|
+
|
|
39
|
+
build:
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
needs: test
|
|
42
|
+
steps:
|
|
43
|
+
- uses: actions/checkout@v4
|
|
44
|
+
|
|
45
|
+
- name: Set up Python
|
|
46
|
+
uses: actions/setup-python@v5
|
|
47
|
+
with:
|
|
48
|
+
python-version: "3.11"
|
|
49
|
+
|
|
50
|
+
- name: Install uv
|
|
51
|
+
run: pip install uv
|
|
52
|
+
|
|
53
|
+
- name: Build package
|
|
54
|
+
run: uv build
|
|
55
|
+
|
|
56
|
+
- name: Verify install from wheel
|
|
57
|
+
run: |
|
|
58
|
+
pip install dist/*.whl
|
|
59
|
+
prompt-lock --version
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build-and-publish:
|
|
9
|
+
name: Build and publish to PyPI
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment:
|
|
12
|
+
name: pypi
|
|
13
|
+
url: https://pypi.org/p/inject-lock
|
|
14
|
+
|
|
15
|
+
permissions:
|
|
16
|
+
id-token: write # OIDC trusted publishing — no stored API tokens needed
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.11"
|
|
25
|
+
|
|
26
|
+
- name: Install uv
|
|
27
|
+
run: pip install uv
|
|
28
|
+
|
|
29
|
+
- name: Build
|
|
30
|
+
run: uv build
|
|
31
|
+
|
|
32
|
+
- name: Publish to PyPI
|
|
33
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
34
|
+
# Uses OIDC trusted publishing — configure at pypi.org/manage/account/publishing/
|
|
35
|
+
# No API token needed.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 BuildWorld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inject-lock
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Git-native prompt regression testing with judge calibration for LLM CI/CD pipelines
|
|
5
|
+
Project-URL: Homepage, https://github.com/Rowusuduah/prompt-lock
|
|
6
|
+
Project-URL: Repository, https://github.com/Rowusuduah/prompt-lock
|
|
7
|
+
Project-URL: Issues, https://github.com/Rowusuduah/prompt-lock/issues
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: ci-cd,evaluation,judge-calibration,llm,prompt,regression,testing
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development :: Testing
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: click>=8.1
|
|
20
|
+
Requires-Dist: gitpython>=3.1
|
|
21
|
+
Requires-Dist: litellm>=1.0
|
|
22
|
+
Requires-Dist: pydantic>=2.0
|
|
23
|
+
Requires-Dist: pyyaml>=6.0
|
|
24
|
+
Requires-Dist: rich>=13.0
|
|
25
|
+
Requires-Dist: scipy>=1.11
|
|
26
|
+
Requires-Dist: sentence-transformers>=5.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: black>=24.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff>=0.3; extra == 'dev'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# prompt-lock
|
|
36
|
+
|
|
37
|
+
**Git-native prompt regression testing with judge calibration.**
|
|
38
|
+
|
|
39
|
+
[PyPI](https://pypi.org/project/inject-lock/)
|
|
40
|
+
[CI](https://github.com/buildworld-ai/prompt-lock/actions)
|
|
41
|
+
[License: MIT](LICENSE)
|
|
42
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
43
|
+
|
|
44
|
+
Guards at the gaps in your LLM CI/CD pipeline. Fails the build when a prompt change causes a regression — and verifies that your LLM judge actually agrees with humans before trusting it as a gate.
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
pip install inject-lock
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## The problem
|
|
53
|
+
|
|
54
|
+
You changed a prompt. Did your model outputs get worse?
|
|
55
|
+
|
|
56
|
+
You probably don't know. 82% of teams have no automated detection for prompt quality regressions. The few that do often use LLM-as-a-judge — but their judge is miscalibrated: it disagrees with human evaluators on 20–40% of examples and they've never measured it.
|
|
57
|
+
|
|
58
|
+
## The solution
|
|
59
|
+
|
|
60
|
+
prompt-lock does three things no other tool does together in a single `pip install`:
|
|
61
|
+
|
|
62
|
+
1. **Detects changed prompts via git diff** — only evaluates what changed, keeping costs low
|
|
63
|
+
2. **Verifies judge calibration** — runs your LLM judge against human-labeled examples, measures agreement rate and Spearman correlation, and *blocks the CI pipeline if the judge can't be trusted*
|
|
64
|
+
3. **Regression gate** — fails the build if eval scores drop more than a configurable threshold from baseline
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Quick start
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install inject-lock
|
|
72
|
+
cd your-llm-project
|
|
73
|
+
prompt-lock init
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
`init` creates `.prompt-lock.yml`, `prompts/`, `tests/test_cases.jsonl`, and `tests/human_labels.jsonl`.
|
|
77
|
+
|
|
78
|
+
Fill in your test cases:
|
|
79
|
+
```jsonl
|
|
80
|
+
{"input": "Summarize this article: ...", "output": "The article discusses ...", "expected_output": "A summary of the article."}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Run:
|
|
84
|
+
```bash
|
|
85
|
+
prompt-lock check
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Configuration
|
|
91
|
+
|
|
92
|
+
```yaml
|
|
93
|
+
# .prompt-lock.yml
|
|
94
|
+
version: "1"
|
|
95
|
+
model: gpt-4o-mini
|
|
96
|
+
|
|
97
|
+
# Judge calibration — the key differentiator
|
|
98
|
+
judge:
|
|
99
|
+
enabled: true
|
|
100
|
+
human_labels_file: tests/human_labels.jsonl
|
|
101
|
+
model: gpt-4o-mini
|
|
102
|
+
criteria: "Rate the quality of this response from 0.0 to 1.0."
|
|
103
|
+
min_agreement: 0.80 # 80% of examples must agree (within ±0.15)
|
|
104
|
+
min_spearman: 0.70 # Spearman correlation with human scores
|
|
105
|
+
|
|
106
|
+
prompts:
|
|
107
|
+
- path: "prompts/*.txt"
|
|
108
|
+
name: "My Prompts"
|
|
109
|
+
test_cases_file: tests/test_cases.jsonl
|
|
110
|
+
evals:
|
|
111
|
+
- type: llm_judge
|
|
112
|
+
criteria: "Is the response helpful, accurate, and well-structured?"
|
|
113
|
+
threshold: 0.70
|
|
114
|
+
- type: semantic_similarity
|
|
115
|
+
threshold: 0.80
|
|
116
|
+
|
|
117
|
+
gate:
|
|
118
|
+
mode: regression # hard | regression | soft
|
|
119
|
+
regression_threshold: 0.05 # fail if score drops >5% from baseline
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Eval types
|
|
125
|
+
|
|
126
|
+
| Type | What it checks | Requires |
|
|
127
|
+
|------|---------------|----------|
|
|
128
|
+
| `llm_judge` | LLM scores output against criteria (0.0–1.0) | `criteria` |
|
|
129
|
+
| `semantic_similarity` | Cosine similarity to expected output (offline, all-MiniLM-L6-v2) | `expected_output` in test cases |
|
|
130
|
+
| `exact_match` | Exact string match | `expected_output` in test cases |
|
|
131
|
+
| `regex` | Output matches a regex pattern | `pattern` |
|
|
132
|
+
| `custom` | Your own Python function `fn(input, output) -> float` | `custom_fn` |
|
|
133
|
+
|
|
134
|
+
Works with any LLM provider via [LiteLLM](https://github.com/BerriAI/litellm):
|
|
135
|
+
- `gpt-4o-mini`, `gpt-4o`
|
|
136
|
+
- `claude-haiku-4-5-20251001`, `claude-sonnet-4-6`
|
|
137
|
+
- `mistral/mistral-small`
|
|
138
|
+
- Any local model via Ollama: `ollama/llama3`
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## Gate modes
|
|
143
|
+
|
|
144
|
+
**`regression`** (default) — fail if score drops more than `regression_threshold` from recent baseline. Good for ongoing development.
|
|
145
|
+
|
|
146
|
+
**`hard`** — fail if score is below `hard_threshold`. Good for critical prompts with known minimum quality.
|
|
147
|
+
|
|
148
|
+
**`soft`** — never fail, warn only. Good for new prompts without established baselines.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Judge calibration
|
|
153
|
+
|
|
154
|
+
The unique feature. Before running evals, prompt-lock checks whether your LLM judge actually agrees with human evaluators:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
prompt-lock calibrate
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
162
|
+
│ Calibration Summary │
|
|
163
|
+
│ │
|
|
164
|
+
│ PASSED │
|
|
165
|
+
│ │
|
|
166
|
+
│ Agreement rate 87.5% (min: 80%) │
|
|
167
|
+
│ Spearman r 0.831 (min: 0.70) │
|
|
168
|
+
│ Bias +0.042 (positive = judge inflates scores) │
|
|
169
|
+
│ Examples 16 │
|
|
170
|
+
└─────────────────────────────────────────────────────────────┘
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
If calibration fails, `prompt-lock check` exits with code 2 and blocks deployment. Your CI pipeline doesn't trust an uncalibrated judge.
|
|
174
|
+
|
|
175
|
+
Create `tests/human_labels.jsonl`:
|
|
176
|
+
```jsonl
|
|
177
|
+
{"input": "What is 2+2?", "output": "The answer is 4.", "human_score": 1.0}
|
|
178
|
+
{"input": "What is 2+2?", "output": "It's roughly 5.", "human_score": 0.0}
|
|
179
|
+
{"input": "Explain Python.", "output": "Python is a high-level language.", "human_score": 0.9}
|
|
180
|
+
```
|
|
181
|
+
Minimum 5 examples. More is better.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## GitHub Actions
|
|
186
|
+
|
|
187
|
+
```yaml
|
|
188
|
+
# .github/workflows/prompt-lock.yml
|
|
189
|
+
name: Prompt Regression Tests
|
|
190
|
+
|
|
191
|
+
on: [push, pull_request]
|
|
192
|
+
|
|
193
|
+
jobs:
|
|
194
|
+
prompt-lock:
|
|
195
|
+
runs-on: ubuntu-latest
|
|
196
|
+
steps:
|
|
197
|
+
- uses: actions/checkout@v4
|
|
198
|
+
with:
|
|
199
|
+
fetch-depth: 2 # needed for git diff detection
|
|
200
|
+
|
|
201
|
+
- uses: buildworld-ai/prompt-lock@v1
|
|
202
|
+
with:
|
|
203
|
+
config: .prompt-lock.yml
|
|
204
|
+
env:
|
|
205
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Or with other providers:
|
|
209
|
+
```yaml
|
|
210
|
+
env:
|
|
211
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## CLI reference
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
prompt-lock init # initialize config and example files
|
|
220
|
+
prompt-lock check # run regression checks (git-diff aware)
|
|
221
|
+
prompt-lock check --all-prompts # eval all prompts, not just changed ones
|
|
222
|
+
prompt-lock check --no-calibrate # skip calibration check
|
|
223
|
+
prompt-lock check -v # verbose: show per-test-case results
|
|
224
|
+
prompt-lock calibrate # run calibration and show detailed results
|
|
225
|
+
prompt-lock traces show # show recent eval runs from trace ledger
|
|
226
|
+
prompt-lock traces show -n 50 # show last 50 runs
|
|
227
|
+
prompt-lock traces diff abc123 def456 # compare scores between two commits
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Trace ledger
|
|
233
|
+
|
|
234
|
+
Every eval run is recorded in a local SQLite database (`.prompt-lock/traces.db`) with the git commit SHA. This is how regression detection works — it compares current scores to recent passing baselines.
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
prompt-lock traces show
|
|
238
|
+
|
|
239
|
+
┌───────────────────────┬─────────┬─────────────────┬───────────┬───────┬──────┐
|
|
240
|
+
│ Timestamp │ Commit │ Prompt │ Type │ Score │ Pass │
|
|
241
|
+
├───────────────────────┼─────────┼─────────────────┼───────────┼───────┼──────┤
|
|
242
|
+
│ 2026-03-27T14:32:01 │ a1b2c3d │ prompts/sum.txt │ llm_judge │ 0.841 │ ✓ │
|
|
243
|
+
│ 2026-03-27T14:32:00 │ a1b2c3d │ prompts/sum.txt │ semantic │ 0.923 │ ✓ │
|
|
244
|
+
│ 2026-03-26T09:15:44 │ e4f5a6b │ prompts/sum.txt │ llm_judge │ 0.710 │ ✓ │
|
|
245
|
+
└───────────────────────┴─────────┴─────────────────┴───────────┴───────┴──────┘
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## Why not Promptfoo / LangSmith / DeepEval?
|
|
251
|
+
|
|
252
|
+
| Capability | prompt-lock | Promptfoo | LangSmith | DeepEval |
|
|
253
|
+
|-----------|:-----------:|:---------:|:---------:|:--------:|
|
|
254
|
+
| Git-diff aware (only eval changed prompts) | ✓ | ✗ | ✗ | ✗ |
|
|
255
|
+
| Judge calibration against human labels | ✓ | ✗ | partial | ✗ |
|
|
256
|
+
| Block CI if judge is miscalibrated | ✓ | ✗ | ✗ | ✗ |
|
|
257
|
+
| Regression gate (baseline comparison) | ✓ | ✓ | ✓ | ✓ |
|
|
258
|
+
| Commit-linked trace ledger | ✓ | ✗ | ✓ | ✗ |
|
|
259
|
+
| Framework-agnostic (LiteLLM) | ✓ | ✓ | ✗ | ✓ |
|
|
260
|
+
| Offline semantic similarity | ✓ | ✗ | ✗ | ✓ |
|
|
261
|
+
| Zero hosted infrastructure | ✓ | ✓ | ✗ | partial |
|
|
262
|
+
| `pip install` in 30 seconds | ✓ | ✗ | ✗ | ✓ |
|
|
263
|
+
|
|
264
|
+
Promptfoo was acquired by OpenAI in March 2026 — its roadmap is now OpenAI-aligned. prompt-lock is MIT licensed and provider-agnostic.
|
|
265
|
+
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
## Contributing
|
|
269
|
+
|
|
270
|
+
Issues and PRs welcome. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
274
|
+
## License
|
|
275
|
+
|
|
276
|
+
MIT. Built by [BuildWorld](https://github.com/buildworld-ai).
|
|
277
|
+
|
|
278
|
+
*Guards at the gaps. Nehemiah 4:13.*
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# prompt-lock
|
|
2
|
+
|
|
3
|
+
**Git-native prompt regression testing with judge calibration.**
|
|
4
|
+
|
|
5
|
+
[PyPI](https://pypi.org/project/inject-lock/)
|
|
6
|
+
[CI](https://github.com/buildworld-ai/prompt-lock/actions)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
9
|
+
|
|
10
|
+
Guards at the gaps in your LLM CI/CD pipeline. Fails the build when a prompt change causes a regression — and verifies that your LLM judge actually agrees with humans before trusting it as a gate.
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
pip install inject-lock
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## The problem
|
|
19
|
+
|
|
20
|
+
You changed a prompt. Did your model outputs get worse?
|
|
21
|
+
|
|
22
|
+
You probably don't know. 82% of teams have no automated detection for prompt quality regressions. The few that do often use LLM-as-a-judge — but their judge is miscalibrated: it disagrees with human evaluators on 20–40% of examples and they've never measured it.
|
|
23
|
+
|
|
24
|
+
## The solution
|
|
25
|
+
|
|
26
|
+
prompt-lock does three things no other tool does together in a single `pip install`:
|
|
27
|
+
|
|
28
|
+
1. **Detects changed prompts via git diff** — only evaluates what changed, keeping costs low
|
|
29
|
+
2. **Verifies judge calibration** — runs your LLM judge against human-labeled examples, measures agreement rate and Spearman correlation, and *blocks the CI pipeline if the judge can't be trusted*
|
|
30
|
+
3. **Regression gate** — fails the build if eval scores drop more than a configurable threshold from baseline
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Quick start
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install inject-lock
|
|
38
|
+
cd your-llm-project
|
|
39
|
+
prompt-lock init
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
`init` creates `.prompt-lock.yml`, `prompts/`, `tests/test_cases.jsonl`, and `tests/human_labels.jsonl`.
|
|
43
|
+
|
|
44
|
+
Fill in your test cases:
|
|
45
|
+
```jsonl
|
|
46
|
+
{"input": "Summarize this article: ...", "output": "The article discusses ...", "expected_output": "A summary of the article."}
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Run:
|
|
50
|
+
```bash
|
|
51
|
+
prompt-lock check
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Configuration
|
|
57
|
+
|
|
58
|
+
```yaml
|
|
59
|
+
# .prompt-lock.yml
|
|
60
|
+
version: "1"
|
|
61
|
+
model: gpt-4o-mini
|
|
62
|
+
|
|
63
|
+
# Judge calibration — the key differentiator
|
|
64
|
+
judge:
|
|
65
|
+
enabled: true
|
|
66
|
+
human_labels_file: tests/human_labels.jsonl
|
|
67
|
+
model: gpt-4o-mini
|
|
68
|
+
criteria: "Rate the quality of this response from 0.0 to 1.0."
|
|
69
|
+
min_agreement: 0.80 # 80% of examples must agree (within ±0.15)
|
|
70
|
+
min_spearman: 0.70 # Spearman correlation with human scores
|
|
71
|
+
|
|
72
|
+
prompts:
|
|
73
|
+
- path: "prompts/*.txt"
|
|
74
|
+
name: "My Prompts"
|
|
75
|
+
test_cases_file: tests/test_cases.jsonl
|
|
76
|
+
evals:
|
|
77
|
+
- type: llm_judge
|
|
78
|
+
criteria: "Is the response helpful, accurate, and well-structured?"
|
|
79
|
+
threshold: 0.70
|
|
80
|
+
- type: semantic_similarity
|
|
81
|
+
threshold: 0.80
|
|
82
|
+
|
|
83
|
+
gate:
|
|
84
|
+
mode: regression # hard | regression | soft
|
|
85
|
+
regression_threshold: 0.05 # fail if score drops >5% from baseline
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Eval types
|
|
91
|
+
|
|
92
|
+
| Type | What it checks | Requires |
|
|
93
|
+
|------|---------------|----------|
|
|
94
|
+
| `llm_judge` | LLM scores output against criteria (0.0–1.0) | `criteria` |
|
|
95
|
+
| `semantic_similarity` | Cosine similarity to expected output (offline, all-MiniLM-L6-v2) | `expected_output` in test cases |
|
|
96
|
+
| `exact_match` | Exact string match | `expected_output` in test cases |
|
|
97
|
+
| `regex` | Output matches a regex pattern | `pattern` |
|
|
98
|
+
| `custom` | Your own Python function `fn(input, output) -> float` | `custom_fn` |
|
|
99
|
+
|
|
100
|
+
Works with any LLM provider via [LiteLLM](https://github.com/BerriAI/litellm):
|
|
101
|
+
- `gpt-4o-mini`, `gpt-4o`
|
|
102
|
+
- `claude-haiku-4-5-20251001`, `claude-sonnet-4-6`
|
|
103
|
+
- `mistral/mistral-small`
|
|
104
|
+
- Any local model via Ollama: `ollama/llama3`
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Gate modes
|
|
109
|
+
|
|
110
|
+
**`regression`** (default) — fail if score drops more than `regression_threshold` from recent baseline. Good for ongoing development.
|
|
111
|
+
|
|
112
|
+
**`hard`** — fail if score is below `hard_threshold`. Good for critical prompts with known minimum quality.
|
|
113
|
+
|
|
114
|
+
**`soft`** — never fail, warn only. Good for new prompts without established baselines.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Judge calibration
|
|
119
|
+
|
|
120
|
+
The unique feature. Before running evals, prompt-lock checks whether your LLM judge actually agrees with human evaluators:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
prompt-lock calibrate
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
128
|
+
│ Calibration Summary │
|
|
129
|
+
│ │
|
|
130
|
+
│ PASSED │
|
|
131
|
+
│ │
|
|
132
|
+
│ Agreement rate 87.5% (min: 80%) │
|
|
133
|
+
│ Spearman r 0.831 (min: 0.70) │
|
|
134
|
+
│ Bias +0.042 (positive = judge inflates scores) │
|
|
135
|
+
│ Examples 16 │
|
|
136
|
+
└─────────────────────────────────────────────────────────────┘
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
If calibration fails, `prompt-lock check` exits with code 2 and blocks deployment. Your CI pipeline doesn't trust an uncalibrated judge.
|
|
140
|
+
|
|
141
|
+
Create `tests/human_labels.jsonl`:
|
|
142
|
+
```jsonl
|
|
143
|
+
{"input": "What is 2+2?", "output": "The answer is 4.", "human_score": 1.0}
|
|
144
|
+
{"input": "What is 2+2?", "output": "It's roughly 5.", "human_score": 0.0}
|
|
145
|
+
{"input": "Explain Python.", "output": "Python is a high-level language.", "human_score": 0.9}
|
|
146
|
+
```
|
|
147
|
+
Minimum 5 examples. More is better.
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## GitHub Actions
|
|
152
|
+
|
|
153
|
+
```yaml
|
|
154
|
+
# .github/workflows/prompt-lock.yml
|
|
155
|
+
name: Prompt Regression Tests
|
|
156
|
+
|
|
157
|
+
on: [push, pull_request]
|
|
158
|
+
|
|
159
|
+
jobs:
|
|
160
|
+
prompt-lock:
|
|
161
|
+
runs-on: ubuntu-latest
|
|
162
|
+
steps:
|
|
163
|
+
- uses: actions/checkout@v4
|
|
164
|
+
with:
|
|
165
|
+
fetch-depth: 2 # needed for git diff detection
|
|
166
|
+
|
|
167
|
+
- uses: buildworld-ai/prompt-lock@v1
|
|
168
|
+
with:
|
|
169
|
+
config: .prompt-lock.yml
|
|
170
|
+
env:
|
|
171
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Or with other providers:
|
|
175
|
+
```yaml
|
|
176
|
+
env:
|
|
177
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## CLI reference
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
prompt-lock init # initialize config and example files
|
|
186
|
+
prompt-lock check # run regression checks (git-diff aware)
|
|
187
|
+
prompt-lock check --all-prompts # eval all prompts, not just changed ones
|
|
188
|
+
prompt-lock check --no-calibrate # skip calibration check
|
|
189
|
+
prompt-lock check -v # verbose: show per-test-case results
|
|
190
|
+
prompt-lock calibrate # run calibration and show detailed results
|
|
191
|
+
prompt-lock traces show # show recent eval runs from trace ledger
|
|
192
|
+
prompt-lock traces show -n 50 # show last 50 runs
|
|
193
|
+
prompt-lock traces diff abc123 def456 # compare scores between two commits
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## Trace ledger
|
|
199
|
+
|
|
200
|
+
Every eval run is recorded in a local SQLite database (`.prompt-lock/traces.db`) with the git commit SHA. This is how regression detection works — it compares current scores to recent passing baselines.
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
prompt-lock traces show
|
|
204
|
+
|
|
205
|
+
┌───────────────────────┬─────────┬─────────────────┬───────────┬───────┬──────┐
|
|
206
|
+
│ Timestamp │ Commit │ Prompt │ Type │ Score │ Pass │
|
|
207
|
+
├───────────────────────┼─────────┼─────────────────┼───────────┼───────┼──────┤
|
|
208
|
+
│ 2026-03-27T14:32:01 │ a1b2c3d │ prompts/sum.txt │ llm_judge │ 0.841 │ ✓ │
|
|
209
|
+
│ 2026-03-27T14:32:00 │ a1b2c3d │ prompts/sum.txt │ semantic │ 0.923 │ ✓ │
|
|
210
|
+
│ 2026-03-26T09:15:44 │ e4f5a6b │ prompts/sum.txt │ llm_judge │ 0.710 │ ✓ │
|
|
211
|
+
└───────────────────────┴─────────┴─────────────────┴───────────┴───────┴──────┘
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## Why not Promptfoo / LangSmith / DeepEval?
|
|
217
|
+
|
|
218
|
+
| Capability | prompt-lock | Promptfoo | LangSmith | DeepEval |
|
|
219
|
+
|-----------|:-----------:|:---------:|:---------:|:--------:|
|
|
220
|
+
| Git-diff aware (only eval changed prompts) | ✓ | ✗ | ✗ | ✗ |
|
|
221
|
+
| Judge calibration against human labels | ✓ | ✗ | partial | ✗ |
|
|
222
|
+
| Block CI if judge is miscalibrated | ✓ | ✗ | ✗ | ✗ |
|
|
223
|
+
| Regression gate (baseline comparison) | ✓ | ✓ | ✓ | ✓ |
|
|
224
|
+
| Commit-linked trace ledger | ✓ | ✗ | ✓ | ✗ |
|
|
225
|
+
| Framework-agnostic (LiteLLM) | ✓ | ✓ | ✗ | ✓ |
|
|
226
|
+
| Offline semantic similarity | ✓ | ✗ | ✗ | ✓ |
|
|
227
|
+
| Zero hosted infrastructure | ✓ | ✓ | ✗ | partial |
|
|
228
|
+
| `pip install` in 30 seconds | ✓ | ✗ | ✗ | ✓ |
|
|
229
|
+
|
|
230
|
+
Promptfoo was acquired by OpenAI in March 2026 — its roadmap is now OpenAI-aligned. prompt-lock is MIT licensed and provider-agnostic.
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## Contributing
|
|
235
|
+
|
|
236
|
+
Issues and PRs welcome. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## License
|
|
241
|
+
|
|
242
|
+
MIT. Built by [BuildWorld](https://github.com/buildworld-ai).
|
|
243
|
+
|
|
244
|
+
*Guards at the gaps. Nehemiah 4:13.*
|