llm-guard-kit 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_guard_kit-0.1.0/PKG-INFO +257 -0
- llm_guard_kit-0.1.0/README.md +227 -0
- llm_guard_kit-0.1.0/llm_guard/__init__.py +39 -0
- llm_guard_kit-0.1.0/llm_guard_kit.egg-info/PKG-INFO +257 -0
- llm_guard_kit-0.1.0/llm_guard_kit.egg-info/SOURCES.txt +16 -0
- llm_guard_kit-0.1.0/llm_guard_kit.egg-info/dependency_links.txt +1 -0
- llm_guard_kit-0.1.0/llm_guard_kit.egg-info/requires.txt +8 -0
- llm_guard_kit-0.1.0/llm_guard_kit.egg-info/top_level.txt +2 -0
- llm_guard_kit-0.1.0/pyproject.toml +46 -0
- llm_guard_kit-0.1.0/qppg/__init__.py +64 -0
- llm_guard_kit-0.1.0/qppg/blindness.py +740 -0
- llm_guard_kit-0.1.0/qppg/core.py +391 -0
- llm_guard_kit-0.1.0/qppg/encoder.py +389 -0
- llm_guard_kit-0.1.0/qppg/guard.py +687 -0
- llm_guard_kit-0.1.0/qppg/novelty.py +390 -0
- llm_guard_kit-0.1.0/qppg/trust.py +451 -0
- llm_guard_kit-0.1.0/setup.cfg +4 -0
- llm_guard_kit-0.1.0/tests/test_guard.py +419 -0
+++ llm_guard_kit-0.1.0/PKG-INFO
@@ -0,0 +1,257 @@

Metadata-Version: 2.4
Name: llm-guard-kit
Version: 0.1.0
Summary: Predict, diagnose, and repair LLM failures automatically. AUROC 0.966–0.993.
Author: Avighan Majumder
License: MIT
Project-URL: Repository, https://github.com/avighan/qppg
Project-URL: Issues, https://github.com/avighan/qppg/issues
Keywords: llm,reliability,failure-prediction,anomaly-detection,knn,claude,openai
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Requires-Dist: numpy>=1.20
Requires-Dist: scikit-learn>=0.24
Requires-Dist: sentence-transformers>=2.2.0
Requires-Dist: anthropic>=0.7.0
Provides-Extra: dev
Requires-Dist: pytest>=7.0; extra == "dev"
Requires-Dist: matplotlib>=3.4; extra == "dev"

# llm-guard

**Predict, diagnose, and repair LLM failures automatically.**

[](https://pypi.org/project/llm-guard/)
[](https://pypi.org/project/llm-guard/)
[](https://opensource.org/licenses/MIT)

---

## What it does

`llm-guard` wraps any LLM call with a three-stage reliability layer:

1. **Predict** — scores every query for failure risk in <15ms before the LLM responds
2. **Diagnose** — clusters accumulated failures into a labeled error taxonomy
3. **Heal** — synthesises targeted repair instructions from failure patterns; applies them automatically on future queries

**Validated results** (Claude Haiku, internal benchmarks):

| Benchmark | Task type  | AUROC | Precision@10 |
|-----------|------------|-------|--------------|
| MATH-500  | Math       | 0.966 | 100%         |
| HumanEval | Code       | 0.993 | 100%         |
| TriviaQA  | Factual QA | 0.992 | 100%         |

Cost: <$0.25 to validate on 664 benchmark problems.

---

## Install

```bash
pip install llm-guard-kit
```

Requires Python 3.9+ and an Anthropic API key.

---

## Quick start — three calibration paths

### Path A: You have labeled correct examples

```python
from llm_guard import LLMGuard

guard = LLMGuard(api_key="sk-ant-...")

# Fit on questions your LLM is known to handle correctly
guard.fit(correct_questions=[
    "What is the capital of France?",
    "What is 12 * 15?",
    # ... 50+ examples recommended
])

result = guard.query("What is 15% of 240?")
print(result.answer)      # "36"
print(result.confidence)  # "high" | "medium" | "low"
print(result.risk_score)  # 0.12 (lower = more familiar = lower failure risk)
```

### Path B: No labels — use self-consistency

```python
guard = LLMGuard(api_key="sk-ant-...")

# Runs each question 5 times; those with 80%+ agreement are "probably correct"
guard.fit_from_consistency(
    questions=my_question_pool,  # 100–500 questions
    n_samples=5,
    agreement_threshold=0.8,
)

result = guard.query("Explain the water cycle.")
print(result.confidence)  # "high"
```
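
For intuition, here is a minimal sketch of the agreement test this path implies; the helper name is illustrative, not part of the package API:

```python
from collections import Counter

def is_consistent(answers, threshold=0.8):
    """True when the most common answer covers >= threshold of the samples."""
    _, top_count = Counter(answers).most_common(1)[0]
    return top_count / len(answers) >= threshold

# 4 of 5 samples agree -> 0.8 agreement -> kept as a "probably correct" question
is_consistent(["42", "42", "42", "42", "41"])  # True
```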

### Path C: Automated verifier (code, math, SQL, schema)

```python
def python_verifier(question, response):
    """Return True if the code response executes without raising."""
    # Caution: exec runs model output in-process; sandbox untrusted code in production.
    try:
        exec(compile(response, "<llm>", "exec"), {})
        return True
    except Exception:
        return False

guard = LLMGuard(api_key="sk-ant-...")
guard.fit_from_execution(
    questions=coding_questions,
    verifier_fn=python_verifier,
)

result = guard.query("Write a function that reverses a string.")
print(result.answer)
```

---

## Error Autopsy

Cluster accumulated failures into a labeled taxonomy (read-only, does not modify guard state):

```python
clusters = guard.diagnose(
    failed_questions=failed_qs,
    model_answers=model_answers,
    correct_answers=correct_answers,  # optional but enables suggested_fix
)

for c in clusters:
    print(f"Cluster {c['cluster_id']} ({c['size']} failures): {c['label']}")
    print(f"  Fix: {c.get('suggested_fix', 'n/a')}")
```

Example output:
```
Cluster 0 (12 failures): The model misreads multi-step word problems,
computing intermediate values correctly but applying them to the wrong sub-question.
  Fix: Explicitly label each sub-goal before computing.
Cluster 1 (8 failures): Off-by-one errors in loop boundary conditions.
  Fix: Always verify that loop indices match the stated range inclusivity.
```

---

## Prompt Healer

Learn from failures and auto-apply targeted repairs on future queries in the same error cluster:

```python
guard.learn_from_errors(
    failed_questions=failed_qs,
    model_answers=model_answers,
    correct_answers=correct_answers,
)

# Future queries near a known failure cluster get the repair instruction injected automatically
result = guard.query("If a train travels 60 mph for 2.5 hours, how far does it go?")
print(result.tool_used)   # "error_fix_0" ← repair tool was applied
print(result.confidence)  # "medium"
```
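
A rough sketch of what that automatic injection step could look like, assuming repairs are matched by embedding distance to a failure-cluster centroid (illustrative code, not the package's internals):

```python
import numpy as np

def inject_repair(prompt, prompt_emb, clusters, max_dist=0.35):
    """Prepend the nearest cluster's repair instruction when the query is close enough.

    clusters: list of (centroid_embedding, repair_instruction) pairs (assumed shape).
    """
    if not clusters:
        return prompt
    dists = [np.linalg.norm(prompt_emb - centroid) for centroid, _ in clusters]
    nearest = int(np.argmin(dists))
    if dists[nearest] <= max_dist:
        return clusters[nearest][1] + "\n\n" + prompt  # targeted fix first
    return prompt  # far from every known failure mode: leave the prompt alone
```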

---

## GuardResult fields

| Field          | Type        | Description                                     |
|----------------|-------------|-------------------------------------------------|
| `answer`       | str         | LLM response text                               |
| `risk_score`   | float       | Mean KNN distance; higher = more likely to fail |
| `confidence`   | str         | `"high"` / `"medium"` / `"low"`                 |
| `tool_used`    | str \| None | Repair tool ID if applied                       |
| `cluster_id`   | int \| None | Error cluster ID if matched                     |
| `was_retried`  | bool        | True if a resource-failure retry fired          |
| `raw_response` | str         | Full LLM response (same as `answer` currently)  |
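
The table above implies a result object of roughly this shape (a sketch for orientation, not the package's actual class definition; the defaults are assumptions):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class GuardResult:
    answer: str                       # LLM response text
    risk_score: float                 # mean KNN distance; higher = riskier
    confidence: str                   # "high" | "medium" | "low"
    tool_used: Optional[str] = None   # repair tool ID if applied
    cluster_id: Optional[int] = None  # error cluster ID if matched
    was_retried: bool = False         # a resource-failure retry fired
    raw_response: str = ""            # full LLM response (same as answer for now)
```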

---

## Constructor parameters

```python
guard = LLMGuard(
    api_key="sk-ant-...",                # Anthropic key (or set ANTHROPIC_API_KEY)
    model="claude-haiku-4-5-20251001",   # any Claude model
    embedding_model="all-MiniLM-L6-v2",  # sentence-transformers model
    n_neighbors=5,                       # k for KNN scoring
)
```

---

## How it works

The failure predictor uses **KNN anomaly scoring** on sentence-transformer embeddings:

1. During calibration, embed all known-correct questions → build a KNN index
2. At query time, embed the new question → compute mean distance to k nearest correct examples
3. High distance = unfamiliar territory = high failure risk (AUROC 0.966–0.993)

Risk thresholds are auto-calibrated from the training distribution (75th and 95th percentile), so they work across any domain without manual tuning.
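
Condensed into standalone code, the scoring loop looks roughly like this (a sketch using sentence-transformers and scikit-learn directly; `correct_questions` is the calibration set from Path A, and the threshold names are illustrative):

```python
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

encoder = SentenceTransformer("all-MiniLM-L6-v2")

# 1. Calibration: embed known-correct questions and build the KNN index
X = encoder.encode(correct_questions)
knn = NearestNeighbors(n_neighbors=5).fit(X)

# Auto-calibrate risk thresholds from the training distribution
train_dists, _ = knn.kneighbors(X)
t_medium, t_high = np.percentile(train_dists.mean(axis=1), [75, 95])

# 2.-3. Query time: mean distance to the k nearest correct examples
dists, _ = knn.kneighbors(encoder.encode(["What is 15% of 240?"]))
risk = dists.mean()  # > t_high -> "low" confidence; > t_medium -> "medium"
```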

**Failure-type detection** (applied at medium/high risk; a sketch follows this list):
- `stop_reason == "max_tokens"` → resource failure → retry with 2x tokens (no tool)
- Otherwise → reasoning failure → apply synthesised cluster repair tool
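
In Anthropic SDK terms, the resource-failure branch could look like this (a sketch assuming a plain `messages.create` call; the single-doubling retry policy shown is an assumption):

```python
import anthropic

client = anthropic.Anthropic(api_key="sk-ant-...")

def ask(question, max_tokens=1024):
    resp = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": question}],
    )
    if resp.stop_reason == "max_tokens":
        # Resource failure: the answer was truncated, so retry once with 2x tokens
        resp = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=max_tokens * 2,
            messages=[{"role": "user", "content": question}],
        )
    return resp.content[0].text
```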

---

## Limitations

- **Calibration quality matters.** `fit()` requires ≥6 correct examples; `fit_from_consistency()` works best when baseline accuracy is >70%. With very low baseline accuracy, few questions will agree across samples.
- **Embeddings are language-level.** The predictor detects unfamiliar *phrasing*, not unfamiliar *reasoning steps*. Two questions that look similar but require different reasoning may get similar scores.
- **Repair tools are heuristic.** `learn_from_errors()` synthesises prompt additions using the LLM — they help on average but are not guaranteed to fix every instance of a cluster.
- **Currently Anthropic-only.** OpenAI/other provider support is on the roadmap.
- **Not a security filter.** This tool predicts factual/reasoning failures, not prompt injection or jailbreaks.

---

## Roadmap

- [ ] OpenAI and Ollama provider support
- [ ] Async/streaming API
- [ ] Save/load guard state (`.save()` / `.load()`)
- [ ] Score-only mode (no LLM call required)
- [ ] Dashboard for failure cluster visualization

---

## License

MIT. See [LICENSE](LICENSE).

---

## Citation

If you use this in research:

```
Majumder, A. (2025). LLM Reliability Guard: KNN-based failure prediction
for large language models. AUROC 0.966–0.993 on math, code, and factual QA.
https://github.com/avighan/qppg
```

+++ llm_guard_kit-0.1.0/README.md
@@ -0,0 +1,227 @@

(Identical to the README body embedded in PKG-INFO above; omitted here.)
+++ llm_guard_kit-0.1.0/llm_guard/__init__.py
@@ -0,0 +1,39 @@

```python
"""
llm-guard: Predict, diagnose, and repair LLM failures automatically.

Validated AUROC: 0.966–0.993 across math, code, and factual QA benchmarks.
Latency overhead: <15ms per query after warm-up.

Quick start
-----------
from llm_guard import LLMGuard

guard = LLMGuard(api_key="sk-ant-...")

# Option A — you have known-correct examples
guard.fit(correct_questions=[...])

# Option B — no labels, use self-consistency
guard.fit_from_consistency(questions=[...], n_samples=5)

# Option C — automated verifier (code execution, math eval, SQL, etc.)
guard.fit_from_execution(questions=[...], verifier_fn=my_verifier)

# Run a query with automatic reliability scoring and repair
result = guard.query("What is 15% of 240?")
print(result.answer)      # "36"
print(result.confidence)  # "high" | "medium" | "low"
print(result.risk_score)  # 0.12 (low = familiar territory)

# Diagnose accumulated failures
clusters = guard.diagnose(failed_qs, model_answers, correct_answers)

# Learn from failures — future queries in those error clusters get auto-fixed
guard.learn_from_errors(failed_qs, model_answers, correct_answers)
"""

from qppg.guard import QPPGLLMGuard as LLMGuard
from qppg.guard import GuardResult

__version__ = "0.1.0"
__all__ = ["LLMGuard", "GuardResult"]
```