llm-mutation 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_mutation-0.1.0/LICENSE +21 -0
- llm_mutation-0.1.0/PKG-INFO +134 -0
- llm_mutation-0.1.0/README.md +106 -0
- llm_mutation-0.1.0/llm_mutation/__init__.py +48 -0
- llm_mutation-0.1.0/llm_mutation/_calibrate.py +210 -0
- llm_mutation-0.1.0/llm_mutation/_cli.py +209 -0
- llm_mutation-0.1.0/llm_mutation/_engine.py +357 -0
- llm_mutation-0.1.0/llm_mutation/_models.py +308 -0
- llm_mutation-0.1.0/llm_mutation/_runner.py +150 -0
- llm_mutation-0.1.0/llm_mutation/_store.py +115 -0
- llm_mutation-0.1.0/pyproject.toml +56 -0
- llm_mutation-0.1.0/tests/__init__.py +0 -0
- llm_mutation-0.1.0/tests/test_llm_mutation.py +969 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 BuildWorld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-mutation
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Mutation testing for LLM prompts. Find the gaps in your eval suite before production does.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Rowusuduah/llm-mutation
|
|
6
|
+
Project-URL: Repository, https://github.com/Rowusuduah/llm-mutation
|
|
7
|
+
Project-URL: Issues, https://github.com/Rowusuduah/llm-mutation/issues
|
|
8
|
+
Author-email: Richmond Owusu Duah <Rowusuduah@users.noreply.github.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai,anthropic,ci-cd,eval,evaluation,llm,llmops,mutation-testing,prompt-testing,testing
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: black; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# llm-mutation
|
|
30
|
+
|
|
31
|
+
**Mutation testing for LLM prompts. Find the gaps in your eval suite before production does.**
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install llm-mutation
|
|
35
|
+
mutate run --prompt prompts/customer_service.txt --eval evals/test_cs.py
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## The Problem
|
|
39
|
+
|
|
40
|
+
You have an eval suite. It passes. You ship. Production breaks.
|
|
41
|
+
|
|
42
|
+
Your eval suite tested 50 specific cases you wrote. It was never tested itself. **llm-mutation tests whether your eval suite would notice if a key constraint was removed, a clause was dropped, or a scope was expanded.**
|
|
43
|
+
|
|
44
|
+
## Quickstart
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from llm_mutation import MutationEngine, MutantRunner, MutationReport
|
|
48
|
+
|
|
49
|
+
# 1. Generate semantic mutations of your prompt
|
|
50
|
+
engine = MutationEngine()
|
|
51
|
+
mutations = engine.generate("prompts/customer_service.txt")
|
|
52
|
+
|
|
53
|
+
# 2. Run your eval suite against each mutant
|
|
54
|
+
def my_eval_fn(prompt: str, test_cases: list) -> float:
|
|
55
|
+
# your existing eval logic — returns 0.0-1.0
|
|
56
|
+
...
|
|
57
|
+
|
|
58
|
+
runner = MutantRunner(eval_fn=my_eval_fn, test_cases=my_test_cases)
|
|
59
|
+
results = runner.run(mutations)
|
|
60
|
+
|
|
61
|
+
# 3. See your gaps
|
|
62
|
+
report = MutationReport.from_results(results, prompt, original_score=0.91)
|
|
63
|
+
print(report.summary())
|
|
64
|
+
# MUTATION SCORE: 71% (5/7 mutations killed)
|
|
65
|
+
# SURVIVING MUTATIONS:
|
|
66
|
+
# ✗ DropClause — "Direct pricing questions to sales@acmecorp.com." removed
|
|
67
|
+
# → ADD TEST CASE: "User asks 'What does the enterprise plan cost?'"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Six Deterministic Mutation Operators
|
|
71
|
+
|
|
72
|
+
| Operator | What it does |
|
|
73
|
+
|----------|--------------|
|
|
74
|
+
| `NegateConstraint` | Removes a prohibitive clause ("Never X") |
|
|
75
|
+
| `DropClause` | Removes a requirement ("Always X", "You must X") |
|
|
76
|
+
| `ScopeExpand` | Widens a scope restriction ("software only" → "products and services") |
|
|
77
|
+
| `ScopeNarrow` | Narrows a permission ("any topic" → "general topics only") |
|
|
78
|
+
| `ConditionInvert` | Removes a conditional behavior ("if A, then B") |
|
|
79
|
+
| `PhraseSwap` | Swaps a style phrase ("concise" ↔ "comprehensive") |
|
|
80
|
+
|
|
81
|
+
No LLM required for mutation generation — all operators are deterministic text transforms.
|
|
82
|
+
|
|
83
|
+
## Mutation Score
|
|
84
|
+
|
|
85
|
+
| Score | Verdict | Meaning |
|
|
86
|
+
|-------|---------|---------|
|
|
87
|
+
| >= 90% | STRONG | Eval suite is comprehensive |
|
|
88
|
+
| 80-89% | ADEQUATE | Good for CI gate |
|
|
89
|
+
| 70-79% | MARGINAL | Meaningful gaps |
|
|
90
|
+
| 60-69% | WEAK | Significant gaps |
|
|
91
|
+
| < 60% | DANGEROUS | Not fit for purpose |
|
|
92
|
+
|
|
93
|
+
**Recommended minimum for production CI gate: 80%**
|
|
94
|
+
|
|
95
|
+
## CLI
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
# Run mutation test
|
|
99
|
+
mutate run --prompt prompts/cs.txt --eval evals/test_cs.py --output report.json
|
|
100
|
+
|
|
101
|
+
# Generate report
|
|
102
|
+
mutate report --input report.json --format markdown
|
|
103
|
+
|
|
104
|
+
# CI gate (exit 1 if score < 80%)
|
|
105
|
+
mutate ci --input report.json --min-score 0.80
|
|
106
|
+
|
|
107
|
+
# Calibrate your eval suite
|
|
108
|
+
mutate calibrate --prompt prompts/cs.txt --eval evals/test_cs.py
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## GitHub Action
|
|
112
|
+
|
|
113
|
+
```yaml
|
|
114
|
+
- run: pip install llm-mutation
|
|
115
|
+
- name: Run mutation tests
|
|
116
|
+
run: |
|
|
117
|
+
mutate run --prompt prompts/cs.txt --eval evals/test_cs.py --output report.json
|
|
118
|
+
mutate ci --input report.json --min-score 0.80
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Pattern Foundation
|
|
122
|
+
|
|
123
|
+
Built on **PAT-045 — Judges 6:36-40 (The Gideon Fleece Inversion Pattern)**.
|
|
124
|
+
|
|
125
|
+
Gideon designed a two-condition invertible test: fleece wet/ground dry, then fleece dry/ground wet. He wasn't testing God's power — he was testing whether his testing mechanism could discriminate signal from coincidence.
|
|
126
|
+
|
|
127
|
+
**llm-mutation is the bowlful of water. Your mutation score is your measurement.**
|
|
128
|
+
|
|
129
|
+
Supporting: PAT-046 (Acts 17:11 — Berean Null Test) → `mutate calibrate`
|
|
130
|
+
Supporting: PAT-047 (Numbers 13:25-33 — Twelve Spies Divergence) → `mutate verify-judge`
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
MIT
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# llm-mutation
|
|
2
|
+
|
|
3
|
+
**Mutation testing for LLM prompts. Find the gaps in your eval suite before production does.**
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install llm-mutation
|
|
7
|
+
mutate run --prompt prompts/customer_service.txt --eval evals/test_cs.py
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
## The Problem
|
|
11
|
+
|
|
12
|
+
You have an eval suite. It passes. You ship. Production breaks.
|
|
13
|
+
|
|
14
|
+
Your eval suite tested 50 specific cases you wrote. It was never tested itself. **llm-mutation tests whether your eval suite would notice if a key constraint was removed, a clause was dropped, or a scope was expanded.**
|
|
15
|
+
|
|
16
|
+
## Quickstart
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from llm_mutation import MutationEngine, MutantRunner, MutationReport
|
|
20
|
+
|
|
21
|
+
# 1. Generate semantic mutations of your prompt
|
|
22
|
+
engine = MutationEngine()
|
|
23
|
+
mutations = engine.generate("prompts/customer_service.txt")
|
|
24
|
+
|
|
25
|
+
# 2. Run your eval suite against each mutant
|
|
26
|
+
def my_eval_fn(prompt: str, test_cases: list) -> float:
|
|
27
|
+
# your existing eval logic — returns 0.0-1.0
|
|
28
|
+
...
|
|
29
|
+
|
|
30
|
+
runner = MutantRunner(eval_fn=my_eval_fn, test_cases=my_test_cases)
|
|
31
|
+
results = runner.run(mutations)
|
|
32
|
+
|
|
33
|
+
# 3. See your gaps
|
|
34
|
+
report = MutationReport.from_results(results, prompt, original_score=0.91)
|
|
35
|
+
print(report.summary())
|
|
36
|
+
# MUTATION SCORE: 71% (5/7 mutations killed)
|
|
37
|
+
# SURVIVING MUTATIONS:
|
|
38
|
+
# ✗ DropClause — "Direct pricing questions to sales@acmecorp.com." removed
|
|
39
|
+
# → ADD TEST CASE: "User asks 'What does the enterprise plan cost?'"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Six Deterministic Mutation Operators
|
|
43
|
+
|
|
44
|
+
| Operator | What it does |
|
|
45
|
+
|----------|--------------|
|
|
46
|
+
| `NegateConstraint` | Removes a prohibitive clause ("Never X") |
|
|
47
|
+
| `DropClause` | Removes a requirement ("Always X", "You must X") |
|
|
48
|
+
| `ScopeExpand` | Widens a scope restriction ("software only" → "products and services") |
|
|
49
|
+
| `ScopeNarrow` | Narrows a permission ("any topic" → "general topics only") |
|
|
50
|
+
| `ConditionInvert` | Removes a conditional behavior ("if A, then B") |
|
|
51
|
+
| `PhraseSwap` | Swaps a style phrase ("concise" ↔ "comprehensive") |
|
|
52
|
+
|
|
53
|
+
No LLM required for mutation generation — all operators are deterministic text transforms.
|
|
54
|
+
|
|
55
|
+
## Mutation Score
|
|
56
|
+
|
|
57
|
+
| Score | Verdict | Meaning |
|
|
58
|
+
|-------|---------|---------|
|
|
59
|
+
| >= 90% | STRONG | Eval suite is comprehensive |
|
|
60
|
+
| 80-89% | ADEQUATE | Good for CI gate |
|
|
61
|
+
| 70-79% | MARGINAL | Meaningful gaps |
|
|
62
|
+
| 60-69% | WEAK | Significant gaps |
|
|
63
|
+
| < 60% | DANGEROUS | Not fit for purpose |
|
|
64
|
+
|
|
65
|
+
**Recommended minimum for production CI gate: 80%**
|
|
66
|
+
|
|
67
|
+
## CLI
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Run mutation test
|
|
71
|
+
mutate run --prompt prompts/cs.txt --eval evals/test_cs.py --output report.json
|
|
72
|
+
|
|
73
|
+
# Generate report
|
|
74
|
+
mutate report --input report.json --format markdown
|
|
75
|
+
|
|
76
|
+
# CI gate (exit 1 if score < 80%)
|
|
77
|
+
mutate ci --input report.json --min-score 0.80
|
|
78
|
+
|
|
79
|
+
# Calibrate your eval suite
|
|
80
|
+
mutate calibrate --prompt prompts/cs.txt --eval evals/test_cs.py
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## GitHub Action
|
|
84
|
+
|
|
85
|
+
```yaml
|
|
86
|
+
- run: pip install llm-mutation
|
|
87
|
+
- name: Run mutation tests
|
|
88
|
+
run: |
|
|
89
|
+
mutate run --prompt prompts/cs.txt --eval evals/test_cs.py --output report.json
|
|
90
|
+
mutate ci --input report.json --min-score 0.80
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Pattern Foundation
|
|
94
|
+
|
|
95
|
+
Built on **PAT-045 — Judges 6:36-40 (The Gideon Fleece Inversion Pattern)**.
|
|
96
|
+
|
|
97
|
+
Gideon designed a two-condition invertible test: fleece wet/ground dry, then fleece dry/ground wet. He wasn't testing God's power — he was testing whether his testing mechanism could discriminate signal from coincidence.
|
|
98
|
+
|
|
99
|
+
**llm-mutation is the bowlful of water. Your mutation score is your measurement.**
|
|
100
|
+
|
|
101
|
+
Supporting: PAT-046 (Acts 17:11 — Berean Null Test) → `mutate calibrate`
|
|
102
|
+
Supporting: PAT-047 (Numbers 13:25-33 — Twelve Spies Divergence) → `mutate verify-judge`
|
|
103
|
+
|
|
104
|
+
## License
|
|
105
|
+
|
|
106
|
+
MIT
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
llm-mutation — Mutation testing for LLM prompts.
|
|
3
|
+
|
|
4
|
+
Find the gaps in your eval suite before production does.
|
|
5
|
+
|
|
6
|
+
Quickstart:
|
|
7
|
+
pip install llm-mutation
|
|
8
|
+
mutate run --prompt prompts/cs.txt --eval evals/test_cs.py
|
|
9
|
+
|
|
10
|
+
Pattern: PAT-045 (Judges 6:36-40 — The Gideon Fleece Inversion Pattern)
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from ._models import (
|
|
15
|
+
Mutation,
|
|
16
|
+
MutantResult,
|
|
17
|
+
MutantVerdict,
|
|
18
|
+
MutationReport,
|
|
19
|
+
MutationOperator,
|
|
20
|
+
MutationScoreVerdict,
|
|
21
|
+
)
|
|
22
|
+
from ._engine import MutationEngine
|
|
23
|
+
from ._runner import MutantRunner
|
|
24
|
+
from ._store import MutationStore
|
|
25
|
+
from ._calibrate import run_calibration, CalibrationReport
|
|
26
|
+
|
|
27
|
+
__version__ = "0.1.0"
# Public API surface: names re-exported from the private submodules above.
__all__ = [
    # Models
    "Mutation",
    "MutantResult",
    "MutantVerdict",
    "MutationReport",
    "MutationOperator",
    "MutationScoreVerdict",
    # Core
    "MutationEngine",
    "MutantRunner",
    "MutationStore",
    # Calibration
    "run_calibration",
    "CalibrationReport",
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _cli_main() -> None:
    """Console-script entry point.

    The import is deliberately local so that plain ``import llm_mutation``
    does not pull in ``._cli`` (presumably to keep import-time cost down —
    confirm against pyproject entry points).
    """
    from ._cli import main
    main()
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Calibration module — Berean Null Test (PAT-046).
|
|
3
|
+
|
|
4
|
+
Before trusting your mutation score, verify that your eval suite can
|
|
5
|
+
actually detect known-severity mutations. If your eval suite can't catch
|
|
6
|
+
a HIGH-severity mutation (complete system prompt removal), the mutation
|
|
7
|
+
score is meaningless.
|
|
8
|
+
|
|
9
|
+
The `mutate calibrate` command runs 5 known-severity mutations against
|
|
10
|
+
the user's eval suite and reports a calibration score.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import Callable, Literal
|
|
16
|
+
|
|
17
|
+
from ._models import Mutation
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
CalibrationSeverity = Literal["HIGH", "MEDIUM", "LOW"]


@dataclass
class CalibrationCase:
    """One known-severity mutation used to probe the user's eval suite."""

    severity: CalibrationSeverity  # how damaging the mutation is assumed to be
    description: str  # human-readable label shown in CalibrationReport.summary()
    build_mutant: Callable[[str], str]  # (original_text) -> mutated_text
    recommendation: str  # advice surfaced when a HIGH-severity case is missed
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _remove_prohibitions(text: str) -> str:
|
|
32
|
+
import re
|
|
33
|
+
result = re.sub(
|
|
34
|
+
r"(?m)^[ \t]*(?:[Nn]ever|[Dd]o not|[Dd]on't|[Aa]void)\s+.+[.!]?\s*$",
|
|
35
|
+
"",
|
|
36
|
+
text,
|
|
37
|
+
)
|
|
38
|
+
return re.sub(r"\n{3,}", "\n\n", result).strip() or text
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _remove_requirements(text: str) -> str:
|
|
42
|
+
import re
|
|
43
|
+
result = re.sub(
|
|
44
|
+
r"(?m)^[ \t]*(?:[Aa]lways|[Yy]ou must|[Yy]ou should|[Mm]ake sure|[Ee]nsure)\s+.+[.!]?\s*$",
|
|
45
|
+
"",
|
|
46
|
+
text,
|
|
47
|
+
)
|
|
48
|
+
return re.sub(r"\n{3,}", "\n\n", result).strip() or text
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _remove_first_line(text: str) -> str:
|
|
52
|
+
lines = text.strip().split("\n")
|
|
53
|
+
if len(lines) <= 1:
|
|
54
|
+
return text
|
|
55
|
+
return "\n".join(lines[1:]).strip()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _remove_last_instruction(text: str) -> str:
|
|
59
|
+
import re
|
|
60
|
+
lines = text.strip().split("\n")
|
|
61
|
+
# Find last non-empty line
|
|
62
|
+
for i in range(len(lines) - 1, -1, -1):
|
|
63
|
+
if lines[i].strip():
|
|
64
|
+
new_lines = lines[:i] + lines[i + 1:]
|
|
65
|
+
result = "\n".join(new_lines).strip()
|
|
66
|
+
return result if result else text
|
|
67
|
+
return text
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Five canonical calibration mutations (known severity) — defined after helper functions
# so the build_mutant callables can reference them directly.
_CALIBRATION_CASES: list[CalibrationCase] = [
    CalibrationCase(
        severity="HIGH",
        description="Complete system prompt removal (all instructions stripped)",
        # Ignores the original text by design: the mutant IS the generic prompt.
        build_mutant=lambda text: "You are a helpful assistant.",
        recommendation=(
            "Your eval suite cannot detect complete prompt removal. "
            "Add a test case that would fail on a blank/generic assistant."
        ),
    ),
    CalibrationCase(
        severity="HIGH",
        description="All prohibitive clauses removed (all 'Never' / 'Do not' lines)",
        build_mutant=_remove_prohibitions,
        recommendation=(
            "Your eval suite misses prohibition removal. "
            "Add test cases that verify prohibited behaviors are actually refused."
        ),
    ),
    CalibrationCase(
        severity="MEDIUM",
        description="All requirement clauses removed (all 'Always' / 'You must' lines)",
        build_mutant=_remove_requirements,
        recommendation=(
            "Your eval suite misses requirement removal. "
            "Add test cases verifying each required behavior appears in responses."
        ),
    ),
    CalibrationCase(
        severity="MEDIUM",
        description="Role/persona instruction removed (first line of prompt)",
        build_mutant=_remove_first_line,
        recommendation=(
            "Your eval suite misses persona loss. "
            "Add a test case verifying the LLM adopts the specified role."
        ),
    ),
    CalibrationCase(
        severity="LOW",
        description="Single instruction clause dropped (last instruction line)",
        build_mutant=_remove_last_instruction,
        recommendation=(
            "Your eval suite misses single-clause removal. "
            "Each important instruction should have at least one dedicated test case."
        ),
    ),
]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class CalibrationResult:
    """Outcome of running one CalibrationCase against the user's eval suite."""

    case: CalibrationCase
    original_score: float  # eval score of the unmodified prompt
    mutant_score: float  # eval score of the mutated prompt
    delta: float  # original_score - mutant_score
    caught: bool  # True if delta >= delta_threshold
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
class CalibrationReport:
    """Aggregate result of a calibration run across all known-severity cases."""

    calibration_score: float  # fraction of calibration cases caught
    total_cases: int  # cases actually attempted (no-op mutations are skipped)
    caught: int
    missed: int
    results: list[CalibrationResult]
    warnings: list[str]  # missed-HIGH recommendations and case failures

    def summary(self) -> str:
        """Render a human-readable, multi-line summary of the calibration run.

        Fix: the header literal was an f-string with no placeholders (ruff
        F541); it is now a plain string — output is byte-identical.
        """
        pct = int(self.calibration_score * 100)
        lines = [
            "\nCALIBRATION RESULTS:",
            f"Tested {self.total_cases} known-severity mutations against your eval suite.",
        ]
        for r in self.results:
            icon = "\u2713" if r.caught else "\u2717"
            caught_str = "Caught" if r.caught else "Missed"
            lines.append(
                f"  {icon} Severity: {r.case.severity} \u2014 {r.case.description}"
                f" \u2014 {caught_str} (score: {r.original_score:.2f} \u2192 {r.mutant_score:.2f})"
            )
        lines.append("")
        lines.append(f"Calibration score: {pct}% ({self.caught}/{self.total_cases} known-severity mutations caught)")
        for w in self.warnings:
            lines.append(f"WARNING: {w}")
        return "\n".join(lines)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def run_calibration(
    eval_fn: Callable[[str, list], float],
    test_cases: list,
    prompt: str,
    delta_threshold: float = 0.15,
    runs_per_case: int = 1,
) -> CalibrationReport:
    """Run every canonical calibration case against the user's eval suite.

    A case is "caught" when the eval score drops by at least
    *delta_threshold* relative to the unmodified prompt. Cases whose
    mutation leaves the prompt unchanged are skipped, and a case that
    raises is recorded as a warning rather than aborting the run.
    """
    import statistics

    def _median_score(candidate: str) -> float:
        # Median over runs_per_case evaluations damps noisy eval suites.
        samples = [float(eval_fn(candidate, test_cases)) for _ in range(runs_per_case)]
        return statistics.median(samples)

    baseline = _median_score(prompt)
    outcomes: list[CalibrationResult] = []
    problems: list[str] = []

    for case in _CALIBRATION_CASES:
        try:
            mutated = case.build_mutant(prompt)
            if mutated == prompt:
                # No-op mutation: the prompt has no matching clauses to mutate.
                continue
            score_after = _median_score(mutated)
            drop = baseline - score_after
            detected = drop >= delta_threshold
            outcomes.append(
                CalibrationResult(
                    case=case,
                    original_score=baseline,
                    mutant_score=score_after,
                    delta=drop,
                    caught=detected,
                )
            )
            if case.severity == "HIGH" and not detected:
                problems.append(case.recommendation)
        except Exception as exc:
            # Best-effort: user-supplied eval code may fail arbitrarily.
            problems.append(f"Calibration case '{case.description}' failed: {exc}")

    detected_count = sum(r.caught for r in outcomes)
    attempted = len(outcomes)

    return CalibrationReport(
        calibration_score=detected_count / attempted if attempted else 0.0,
        total_cases=attempted,
        caught=detected_count,
        missed=attempted - detected_count,
        results=outcomes,
        warnings=problems,
    )
|