llm-mutation 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 BuildWorld
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,134 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-mutation
3
+ Version: 0.1.0
4
+ Summary: Mutation testing for LLM prompts. Find the gaps in your eval suite before production does.
5
+ Project-URL: Homepage, https://github.com/Rowusuduah/llm-mutation
6
+ Project-URL: Repository, https://github.com/Rowusuduah/llm-mutation
7
+ Project-URL: Issues, https://github.com/Rowusuduah/llm-mutation/issues
8
+ Author-email: Richmond Owusu Duah <Rowusuduah@users.noreply.github.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai,anthropic,ci-cd,eval,evaluation,llm,llmops,mutation-testing,prompt-testing,testing
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Software Development :: Quality Assurance
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Requires-Python: >=3.10
22
+ Provides-Extra: dev
23
+ Requires-Dist: black; extra == 'dev'
24
+ Requires-Dist: pytest-cov; extra == 'dev'
25
+ Requires-Dist: pytest>=7.0; extra == 'dev'
26
+ Requires-Dist: ruff; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # llm-mutation
30
+
31
+ **Mutation testing for LLM prompts. Find the gaps in your eval suite before production does.**
32
+
33
+ ```bash
34
+ pip install llm-mutation
35
+ mutate run --prompt prompts/customer_service.txt --eval evals/test_cs.py
36
+ ```
37
+
38
+ ## The Problem
39
+
40
+ You have an eval suite. It passes. You ship. Production breaks.
41
+
42
+ Your eval suite tested 50 specific cases you wrote. It was never tested itself. **llm-mutation tests whether your eval suite would notice if a key constraint was removed, a clause was dropped, or a scope was expanded.**
43
+
44
+ ## Quickstart
45
+
46
+ ```python
47
+ from llm_mutation import MutationEngine, MutantRunner, MutationReport
48
+
49
+ # 1. Generate semantic mutations of your prompt
50
+ engine = MutationEngine()
51
+ mutations = engine.generate("prompts/customer_service.txt")
52
+
53
+ # 2. Run your eval suite against each mutant
54
+ def my_eval_fn(prompt: str, test_cases: list) -> float:
55
+ # your existing eval logic — returns 0.0-1.0
56
+ ...
57
+
58
+ runner = MutantRunner(eval_fn=my_eval_fn, test_cases=my_test_cases)
59
+ results = runner.run(mutations)
60
+
61
+ # 3. See your gaps
62
+ report = MutationReport.from_results(results, prompt, original_score=0.91)
63
+ print(report.summary())
64
+ # MUTATION SCORE: 71% (5/7 mutations killed)
65
+ # SURVIVING MUTATIONS:
66
+ # ✗ DropClause — "Direct pricing questions to sales@acmecorp.com." removed
67
+ # → ADD TEST CASE: "User asks 'What does the enterprise plan cost?'"
68
+ ```
69
+
70
+ ## Six Deterministic Mutation Operators
71
+
72
+ | Operator | What it does |
73
+ |----------|--------------|
74
+ | `NegateConstraint` | Removes a prohibitive clause ("Never X") |
75
+ | `DropClause` | Removes a requirement ("Always X", "You must X") |
76
+ | `ScopeExpand` | Widens a scope restriction ("software only" → "products and services") |
77
+ | `ScopeNarrow` | Narrows a permission ("any topic" → "general topics only") |
78
+ | `ConditionInvert` | Removes a conditional behavior ("if A, then B") |
79
+ | `PhraseSwap` | Swaps a style phrase ("concise" ↔ "comprehensive") |
80
+
81
+ No LLM required for mutation generation — all operators are deterministic text transforms.
82
+
83
+ ## Mutation Score
84
+
85
+ | Score | Verdict | Meaning |
86
+ |-------|---------|---------|
87
+ | >= 90% | STRONG | Eval suite is comprehensive |
88
+ | 80-89% | ADEQUATE | Good for CI gate |
89
+ | 70-79% | MARGINAL | Meaningful gaps |
90
+ | 60-69% | WEAK | Significant gaps |
91
+ | < 60% | DANGEROUS | Not fit for purpose |
92
+
93
+ **Recommended minimum for production CI gate: 80%**
94
+
95
+ ## CLI
96
+
97
+ ```bash
98
+ # Run mutation test
99
+ mutate run --prompt prompts/cs.txt --eval evals/test_cs.py --output report.json
100
+
101
+ # Generate report
102
+ mutate report --input report.json --format markdown
103
+
104
+ # CI gate (exit 1 if score < 80%)
105
+ mutate ci --input report.json --min-score 0.80
106
+
107
+ # Calibrate your eval suite
108
+ mutate calibrate --prompt prompts/cs.txt --eval evals/test_cs.py
109
+ ```
110
+
111
+ ## GitHub Action
112
+
113
+ ```yaml
114
+ - run: pip install llm-mutation
115
+ - name: Run mutation tests
116
+ run: |
117
+ mutate run --prompt prompts/cs.txt --eval evals/test_cs.py --output report.json
118
+ mutate ci --input report.json --min-score 0.80
119
+ ```
120
+
121
+ ## Pattern Foundation
122
+
123
+ Built on **PAT-045 — Judges 6:36-40 (The Gideon Fleece Inversion Pattern)**.
124
+
125
+ Gideon designed a two-condition invertible test: fleece wet/ground dry, then fleece dry/ground wet. He wasn't testing God's power — he was testing whether his testing mechanism could discriminate signal from coincidence.
126
+
127
+ **llm-mutation is the bowlful of water. Your mutation score is your measurement.**
128
+
129
+ - Supporting: PAT-046 (Acts 17:11 — Berean Null Test) → `mutate calibrate`
130
+ - Supporting: PAT-047 (Numbers 13:25-33 — Twelve Spies Divergence) → `mutate verify-judge`
131
+
132
+ ## License
133
+
134
+ MIT
@@ -0,0 +1,106 @@
1
+ # llm-mutation
2
+
3
+ **Mutation testing for LLM prompts. Find the gaps in your eval suite before production does.**
4
+
5
+ ```bash
6
+ pip install llm-mutation
7
+ mutate run --prompt prompts/customer_service.txt --eval evals/test_cs.py
8
+ ```
9
+
10
+ ## The Problem
11
+
12
+ You have an eval suite. It passes. You ship. Production breaks.
13
+
14
+ Your eval suite tested 50 specific cases you wrote. It was never tested itself. **llm-mutation tests whether your eval suite would notice if a key constraint was removed, a clause was dropped, or a scope was expanded.**
15
+
16
+ ## Quickstart
17
+
18
+ ```python
19
+ from llm_mutation import MutationEngine, MutantRunner, MutationReport
20
+
21
+ # 1. Generate semantic mutations of your prompt
22
+ engine = MutationEngine()
23
+ mutations = engine.generate("prompts/customer_service.txt")
24
+
25
+ # 2. Run your eval suite against each mutant
26
+ def my_eval_fn(prompt: str, test_cases: list) -> float:
27
+ # your existing eval logic — returns 0.0-1.0
28
+ ...
29
+
30
+ runner = MutantRunner(eval_fn=my_eval_fn, test_cases=my_test_cases)
31
+ results = runner.run(mutations)
32
+
33
+ # 3. See your gaps
34
+ report = MutationReport.from_results(results, prompt, original_score=0.91)
35
+ print(report.summary())
36
+ # MUTATION SCORE: 71% (5/7 mutations killed)
37
+ # SURVIVING MUTATIONS:
38
+ # ✗ DropClause — "Direct pricing questions to sales@acmecorp.com." removed
39
+ # → ADD TEST CASE: "User asks 'What does the enterprise plan cost?'"
40
+ ```
41
+
42
+ ## Six Deterministic Mutation Operators
43
+
44
+ | Operator | What it does |
45
+ |----------|--------------|
46
+ | `NegateConstraint` | Removes a prohibitive clause ("Never X") |
47
+ | `DropClause` | Removes a requirement ("Always X", "You must X") |
48
+ | `ScopeExpand` | Widens a scope restriction ("software only" → "products and services") |
49
+ | `ScopeNarrow` | Narrows a permission ("any topic" → "general topics only") |
50
+ | `ConditionInvert` | Removes a conditional behavior ("if A, then B") |
51
+ | `PhraseSwap` | Swaps a style phrase ("concise" ↔ "comprehensive") |
52
+
53
+ No LLM required for mutation generation — all operators are deterministic text transforms.
54
+
55
+ ## Mutation Score
56
+
57
+ | Score | Verdict | Meaning |
58
+ |-------|---------|---------|
59
+ | >= 90% | STRONG | Eval suite is comprehensive |
60
+ | 80-89% | ADEQUATE | Good for CI gate |
61
+ | 70-79% | MARGINAL | Meaningful gaps |
62
+ | 60-69% | WEAK | Significant gaps |
63
+ | < 60% | DANGEROUS | Not fit for purpose |
64
+
65
+ **Recommended minimum for production CI gate: 80%**
66
+
67
+ ## CLI
68
+
69
+ ```bash
70
+ # Run mutation test
71
+ mutate run --prompt prompts/cs.txt --eval evals/test_cs.py --output report.json
72
+
73
+ # Generate report
74
+ mutate report --input report.json --format markdown
75
+
76
+ # CI gate (exit 1 if score < 80%)
77
+ mutate ci --input report.json --min-score 0.80
78
+
79
+ # Calibrate your eval suite
80
+ mutate calibrate --prompt prompts/cs.txt --eval evals/test_cs.py
81
+ ```
82
+
83
+ ## GitHub Action
84
+
85
+ ```yaml
86
+ - run: pip install llm-mutation
87
+ - name: Run mutation tests
88
+ run: |
89
+ mutate run --prompt prompts/cs.txt --eval evals/test_cs.py --output report.json
90
+ mutate ci --input report.json --min-score 0.80
91
+ ```
92
+
93
+ ## Pattern Foundation
94
+
95
+ Built on **PAT-045 — Judges 6:36-40 (The Gideon Fleece Inversion Pattern)**.
96
+
97
+ Gideon designed a two-condition invertible test: fleece wet/ground dry, then fleece dry/ground wet. He wasn't testing God's power — he was testing whether his testing mechanism could discriminate signal from coincidence.
98
+
99
+ **llm-mutation is the bowlful of water. Your mutation score is your measurement.**
100
+
101
+ - Supporting: PAT-046 (Acts 17:11 — Berean Null Test) → `mutate calibrate`
102
+ - Supporting: PAT-047 (Numbers 13:25-33 — Twelve Spies Divergence) → `mutate verify-judge`
103
+
104
+ ## License
105
+
106
+ MIT
@@ -0,0 +1,48 @@
1
+ """
2
+ llm-mutation — Mutation testing for LLM prompts.
3
+
4
+ Find the gaps in your eval suite before production does.
5
+
6
+ Quickstart:
7
+ pip install llm-mutation
8
+ mutate run --prompt prompts/cs.txt --eval evals/test_cs.py
9
+
10
+ Pattern: PAT-045 (Judges 6:36-40 — The Gideon Fleece Inversion Pattern)
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from ._models import (
15
+ Mutation,
16
+ MutantResult,
17
+ MutantVerdict,
18
+ MutationReport,
19
+ MutationOperator,
20
+ MutationScoreVerdict,
21
+ )
22
+ from ._engine import MutationEngine
23
+ from ._runner import MutantRunner
24
+ from ._store import MutationStore
25
+ from ._calibrate import run_calibration, CalibrationReport
26
+
27
+ __version__ = "0.1.0"
28
+ __all__ = [
29
+ # Models
30
+ "Mutation",
31
+ "MutantResult",
32
+ "MutantVerdict",
33
+ "MutationReport",
34
+ "MutationOperator",
35
+ "MutationScoreVerdict",
36
+ # Core
37
+ "MutationEngine",
38
+ "MutantRunner",
39
+ "MutationStore",
40
+ # Calibration
41
+ "run_calibration",
42
+ "CalibrationReport",
43
+ ]
44
+
45
+
46
def _cli_main() -> None:
    """Run the command-line interface.

    The ``_cli`` module is imported only when this function is called, so
    importing the package itself does not import the CLI code.
    """
    from . import _cli

    _cli.main()
@@ -0,0 +1,210 @@
1
+ """
2
+ Calibration module — Berean Null Test (PAT-046).
3
+
4
+ Before trusting your mutation score, verify that your eval suite can
5
+ actually detect known-severity mutations. If your eval suite can't catch
6
+ a HIGH-severity mutation (complete system prompt removal), the mutation
7
+ score is meaningless.
8
+
9
+ The `mutate calibrate` command runs 5 known-severity mutations against
10
+ the user's eval suite and reports a calibration score.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass
15
+ from typing import Callable, Literal
16
+
17
+ from ._models import Mutation
18
+
19
+
20
+ CalibrationSeverity = Literal["HIGH", "MEDIUM", "LOW"]
21
+
22
+
23
@dataclass
class CalibrationCase:
    """A single known-severity mutation used to calibrate an eval suite."""

    # Severity label of the mutation: "HIGH", "MEDIUM" or "LOW".
    severity: CalibrationSeverity
    # Human-readable description of what the mutation does to the prompt.
    description: str
    # Callable mapping the original prompt text to the mutated prompt text.
    build_mutant: Callable[[str], str]
    # Advice text for the user when the eval suite does not catch this case.
    recommendation: str
29
+
30
+
31
+ def _remove_prohibitions(text: str) -> str:
32
+ import re
33
+ result = re.sub(
34
+ r"(?m)^[ \t]*(?:[Nn]ever|[Dd]o not|[Dd]on't|[Aa]void)\s+.+[.!]?\s*$",
35
+ "",
36
+ text,
37
+ )
38
+ return re.sub(r"\n{3,}", "\n\n", result).strip() or text
39
+
40
+
41
+ def _remove_requirements(text: str) -> str:
42
+ import re
43
+ result = re.sub(
44
+ r"(?m)^[ \t]*(?:[Aa]lways|[Yy]ou must|[Yy]ou should|[Mm]ake sure|[Ee]nsure)\s+.+[.!]?\s*$",
45
+ "",
46
+ text,
47
+ )
48
+ return re.sub(r"\n{3,}", "\n\n", result).strip() or text
49
+
50
+
51
+ def _remove_first_line(text: str) -> str:
52
+ lines = text.strip().split("\n")
53
+ if len(lines) <= 1:
54
+ return text
55
+ return "\n".join(lines[1:]).strip()
56
+
57
+
58
+ def _remove_last_instruction(text: str) -> str:
59
+ import re
60
+ lines = text.strip().split("\n")
61
+ # Find last non-empty line
62
+ for i in range(len(lines) - 1, -1, -1):
63
+ if lines[i].strip():
64
+ new_lines = lines[:i] + lines[i + 1:]
65
+ result = "\n".join(new_lines).strip()
66
+ return result if result else text
67
+ return text
68
+
69
+
70
# The five canonical calibration mutations, each with a known severity.
# Declared after the helper functions because the table references them.
# Row layout: (severity, description, build_mutant, recommendation).
_CALIBRATION_CASES: list[CalibrationCase] = [
    CalibrationCase(
        severity=severity,
        description=description,
        build_mutant=build_mutant,
        recommendation=recommendation,
    )
    for severity, description, build_mutant, recommendation in (
        (
            "HIGH",
            "Complete system prompt removal (all instructions stripped)",
            # Ignores the input: the whole prompt is replaced by a generic one.
            lambda text: "You are a helpful assistant.",
            "Your eval suite cannot detect complete prompt removal. "
            "Add a test case that would fail on a blank/generic assistant.",
        ),
        (
            "HIGH",
            "All prohibitive clauses removed (all 'Never' / 'Do not' lines)",
            _remove_prohibitions,
            "Your eval suite misses prohibition removal. "
            "Add test cases that verify prohibited behaviors are actually refused.",
        ),
        (
            "MEDIUM",
            "All requirement clauses removed (all 'Always' / 'You must' lines)",
            _remove_requirements,
            "Your eval suite misses requirement removal. "
            "Add test cases verifying each required behavior appears in responses.",
        ),
        (
            "MEDIUM",
            "Role/persona instruction removed (first line of prompt)",
            _remove_first_line,
            "Your eval suite misses persona loss. "
            "Add a test case verifying the LLM adopts the specified role.",
        ),
        (
            "LOW",
            "Single instruction clause dropped (last instruction line)",
            _remove_last_instruction,
            "Your eval suite misses single-clause removal. "
            "Each important instruction should have at least one dedicated test case.",
        ),
    )
]
118
+
119
+
120
@dataclass
class CalibrationResult:
    """Outcome of scoring one calibration case's mutant against the eval suite."""

    # The calibration case whose mutant was scored.
    case: CalibrationCase
    # Eval score obtained with the unmodified prompt.
    original_score: float
    # Eval score obtained with the mutated prompt.
    mutant_score: float
    # Score drop caused by the mutation (original_score - mutant_score).
    delta: float
    # True if delta >= delta_threshold
    caught: bool
127
+
128
+
129
@dataclass
class CalibrationReport:
    """Aggregate outcome of a calibration run over the known-severity cases."""

    calibration_score: float  # fraction of calibration cases caught
    total_cases: int  # number of cases that were actually scored
    caught: int  # how many scored cases were caught
    missed: int  # how many scored cases were not caught
    results: list[CalibrationResult]  # one entry per scored case
    warnings: list[str]  # messages appended to the summary as "WARNING: ..."

    def summary(self) -> str:
        """Render the report as a multi-line, human-readable string.

        The text consists of a header, one line per result (check mark or
        cross, severity, description, and the score before/after mutation),
        a blank line, the overall calibration percentage (truncated to an
        integer), and one ``WARNING:`` line per warning.
        """
        percent = int(self.calibration_score * 100)

        header = [
            "\nCALIBRATION RESULTS:",
            f"Tested {self.total_cases} known-severity mutations against your eval suite.",
        ]

        per_case = []
        for result in self.results:
            if result.caught:
                mark, outcome = "\u2713", "Caught"
            else:
                mark, outcome = "\u2717", "Missed"
            per_case.append(
                f" {mark} Severity: {result.case.severity} \u2014 {result.case.description}"
                f" \u2014 {outcome} (score: {result.original_score:.2f} \u2192 {result.mutant_score:.2f})"
            )

        footer = [
            "",
            f"Calibration score: {percent}% ({self.caught}/{self.total_cases} known-severity mutations caught)",
        ]
        footer.extend(f"WARNING: {message}" for message in self.warnings)

        return "\n".join(header + per_case + footer)
156
+
157
+
158
def run_calibration(
    eval_fn: Callable[[str, list], float],
    test_cases: list,
    prompt: str,
    delta_threshold: float = 0.15,
    runs_per_case: int = 1,
) -> CalibrationReport:
    """Score known-severity mutants of *prompt* and report which were caught.

    Each entry of ``_CALIBRATION_CASES`` builds a mutant of *prompt*. The
    mutant is scored with *eval_fn* and counts as "caught" when the score
    drops by at least *delta_threshold* relative to the original prompt.

    Args:
        eval_fn: Called as ``eval_fn(prompt_text, test_cases)``; its result is
            converted with ``float``.
        test_cases: Passed through unchanged to *eval_fn*.
        prompt: The original prompt text.
        delta_threshold: Minimum score drop (original minus mutant) for a
            mutant to count as caught.
        runs_per_case: Number of times each text is scored; the median of the
            runs is used.

    Returns:
        A ``CalibrationReport``. Cases whose mutant does not actually change
        the prompt are skipped and do not count towards the totals. A case
        that raises is recorded as a warning rather than a result. Missed
        HIGH-severity cases add their recommendation to the warnings.

    Raises:
        statistics.StatisticsError: If *runs_per_case* is less than 1 (there
            are no scores to take a median of).
        Exception: Anything *eval_fn* raises while scoring the original
            prompt propagates to the caller.
    """
    import re
    import statistics

    def _score(text: str) -> float:
        scores = [float(eval_fn(text, test_cases)) for _ in range(runs_per_case)]
        return statistics.median(scores)

    def _normalized(text: str) -> str:
        # The built-in mutant builders strip their output and collapse runs of
        # blank lines even when no clause matched. Apply the same clean-up to
        # both sides so a whitespace-only difference (e.g. the trailing
        # newline of a prompt read from a file) is not mistaken for a real
        # mutation.
        return re.sub(r"\n{3,}", "\n\n", text).strip()

    original_score = _score(prompt)
    normalized_prompt = _normalized(prompt)
    results: list[CalibrationResult] = []
    warnings: list[str] = []

    for case in _CALIBRATION_CASES:
        try:
            mutant_text = case.build_mutant(prompt)
            if _normalized(mutant_text) == normalized_prompt:
                # Skip if the mutation didn't change anything meaningful
                # (the prompt may not have matching clauses).
                continue
            mutant_score = _score(mutant_text)
            delta = original_score - mutant_score
            caught = delta >= delta_threshold
            results.append(
                CalibrationResult(
                    case=case,
                    original_score=original_score,
                    mutant_score=mutant_score,
                    delta=delta,
                    caught=caught,
                )
            )
            if not caught and case.severity == "HIGH":
                warnings.append(case.recommendation)
        except Exception as exc:
            # Best effort: one failing case must not abort the whole run.
            warnings.append(f"Calibration case '{case.description}' failed: {exc}")

    caught_count = sum(1 for r in results if r.caught)
    total = len(results)
    # With no scored cases there is nothing to measure; report 0.0.
    score = caught_count / total if total > 0 else 0.0

    return CalibrationReport(
        calibration_score=score,
        total_cases=total,
        caught=caught_count,
        missed=total - caught_count,
        results=results,
        warnings=warnings,
    )