llm-evalgate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_evalgate-0.1.0/.github/workflows/ci.yml +30 -0
- llm_evalgate-0.1.0/.github/workflows/publish.yml +33 -0
- llm_evalgate-0.1.0/.gitignore +15 -0
- llm_evalgate-0.1.0/PKG-INFO +188 -0
- llm_evalgate-0.1.0/README.md +159 -0
- llm_evalgate-0.1.0/pyproject.toml +58 -0
- llm_evalgate-0.1.0/src/llm_evalkit/__init__.py +4 -0
- llm_evalgate-0.1.0/src/llm_evalkit/eval/__init__.py +4 -0
- llm_evalgate-0.1.0/src/llm_evalkit/eval/dimension.py +31 -0
- llm_evalgate-0.1.0/src/llm_evalkit/eval/dimensions/__init__.py +11 -0
- llm_evalgate-0.1.0/src/llm_evalkit/eval/dimensions/blocklist.py +32 -0
- llm_evalgate-0.1.0/src/llm_evalkit/eval/dimensions/factual.py +56 -0
- llm_evalgate-0.1.0/src/llm_evalkit/eval/dimensions/readability.py +26 -0
- llm_evalgate-0.1.0/src/llm_evalkit/eval/dimensions/schema.py +30 -0
- llm_evalgate-0.1.0/src/llm_evalkit/eval/harness.py +50 -0
- llm_evalgate-0.1.0/src/llm_evalkit/reliable/__init__.py +12 -0
- llm_evalgate-0.1.0/src/llm_evalkit/reliable/circuit.py +97 -0
- llm_evalgate-0.1.0/src/llm_evalkit/reliable/fallback.py +53 -0
- llm_evalgate-0.1.0/src/llm_evalkit/reliable/retry.py +45 -0
- llm_evalgate-0.1.0/tests/__init__.py +0 -0
- llm_evalgate-0.1.0/tests/test_dimensions.py +97 -0
- llm_evalgate-0.1.0/tests/test_harness.py +58 -0
- llm_evalgate-0.1.0/tests/test_reliable.py +151 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: ${{ matrix.python-version }}
|
|
22
|
+
|
|
23
|
+
- name: Install dependencies
|
|
24
|
+
run: pip install -e ".[dev]"
|
|
25
|
+
|
|
26
|
+
- name: Lint
|
|
27
|
+
run: ruff check .
|
|
28
|
+
|
|
29
|
+
- name: Test
|
|
30
|
+
run: pytest
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment: release
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
|
|
22
|
+
- name: Build
|
|
23
|
+
run: |
|
|
24
|
+
pip install build
|
|
25
|
+
python -m build
|
|
26
|
+
|
|
27
|
+
- name: Publish
|
|
28
|
+
env:
|
|
29
|
+
TWINE_USERNAME: __token__
|
|
30
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
|
|
31
|
+
run: |
|
|
32
|
+
pip install twine
|
|
33
|
+
twine upload --verbose dist/*
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-evalgate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deterministic eval gates and reliability primitives for LLM pipelines
|
|
5
|
+
Project-URL: Homepage, https://github.com/LesterALeong/llm-evalkit
|
|
6
|
+
Project-URL: Repository, https://github.com/LesterALeong/llm-evalkit
|
|
7
|
+
Project-URL: Issues, https://github.com/LesterALeong/llm-evalkit/issues
|
|
8
|
+
Author-email: Lester Leong <lester.leong89@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: agents,ai,eval,evaluation,llm,pipeline,reliability
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Requires-Dist: textstat>=0.7
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: hatch>=1.12; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# llm-evalkit
|
|
31
|
+
|
|
32
|
+
Deterministic eval gates and reliability primitives for LLM pipelines.
|
|
33
|
+
|
|
34
|
+
[](https://github.com/LesterALeong/llm-evalkit/actions/workflows/ci.yml)
|
|
35
|
+
[](https://pypi.org/project/llm-evalgate/)
|
|
36
|
+
[](https://pypi.org/project/llm-evalgate/)
|
|
37
|
+
[](LICENSE)
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
Most LLM eval tooling is either LLM-as-judge (non-deterministic, expensive, not CI-friendly) or a heavy enterprise suite. `llm-evalkit` is neither.
|
|
42
|
+
|
|
43
|
+
It gives you two things:
|
|
44
|
+
|
|
45
|
+
- **Eval gates**: code-only quality dimensions that run the same way every time. Drop them into any pipeline, run them in CI, get a pass/fail with a reason.
|
|
46
|
+
- **Reliability primitives**: retry with backoff, model fallback chains, and a circuit breaker. The building blocks for LLM pipelines that hold up in production.
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install llm-evalgate
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Quickstart
|
|
55
|
+
|
|
56
|
+
### Eval gates
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from llm_evalkit import EvalHarness
|
|
60
|
+
from llm_evalkit.eval.dimensions import BlocklistDimension, ReadabilityDimension, SchemaComplianceDimension
|
|
61
|
+
|
|
62
|
+
harness = EvalHarness([
|
|
63
|
+
BlocklistDimension(terms=["confidential", "internal use only"]),
|
|
64
|
+
ReadabilityDimension(threshold=0.3),
|
|
65
|
+
SchemaComplianceDimension(required_fields=["title:", "summary:"]),
|
|
66
|
+
])
|
|
67
|
+
|
|
68
|
+
report = harness.run(llm_output)
|
|
69
|
+
|
|
70
|
+
if not report.passed:
|
|
71
|
+
print(report)
|
|
72
|
+
# EvalReport: FAIL
|
|
73
|
+
# FAIL [blocklist] score=0.000 — prohibited terms found: ['confidential']
|
|
74
|
+
# PASS [readability] score=0.612 — Flesch ease=61.2, FK grade=8.4
|
|
75
|
+
# PASS [schema_compliance] score=1.000 — all 2 required fields present
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Custom dimension
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from llm_evalkit import Dimension, EvalHarness
|
|
82
|
+
|
|
83
|
+
class JsonDimension(Dimension):
|
|
84
|
+
def evaluate(self, text: str) -> tuple[float, str]:
|
|
85
|
+
import json
|
|
86
|
+
try:
|
|
87
|
+
json.loads(text)
|
|
88
|
+
return 1.0, "valid JSON"
|
|
89
|
+
except json.JSONDecodeError as e:
|
|
90
|
+
return 0.0, f"invalid JSON: {e}"
|
|
91
|
+
|
|
92
|
+
harness = EvalHarness([JsonDimension(threshold=1.0)])
|
|
93
|
+
report = harness.run('{"key": "value"}')
|
|
94
|
+
assert report.passed
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Retry
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from llm_evalkit.reliable import retry
|
|
101
|
+
|
|
102
|
+
@retry(max_attempts=3, backoff=2.0)
|
|
103
|
+
def call_llm(prompt: str) -> str:
|
|
104
|
+
return client.messages.create(...)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Fallback chain
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from llm_evalkit.reliable import with_fallback, with_fallback_chain
|
|
111
|
+
|
|
112
|
+
# two-model fallback
|
|
113
|
+
result = with_fallback(
|
|
114
|
+
primary=lambda: call_model("claude-opus-4-8", prompt),
|
|
115
|
+
fallback=lambda: call_model("claude-sonnet-4-6", prompt),
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# ordered chain — first success wins
|
|
119
|
+
result = with_fallback_chain([
|
|
120
|
+
lambda: call_model("claude-opus-4-8", prompt),
|
|
121
|
+
lambda: call_model("claude-sonnet-4-6", prompt),
|
|
122
|
+
lambda: call_model("claude-haiku-4-5", prompt),
|
|
123
|
+
])
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Circuit breaker
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from llm_evalkit.reliable import CircuitBreaker, CircuitOpenError
|
|
130
|
+
|
|
131
|
+
breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)
|
|
132
|
+
|
|
133
|
+
try:
|
|
134
|
+
with breaker:
|
|
135
|
+
result = call_llm(prompt)
|
|
136
|
+
except CircuitOpenError:
|
|
137
|
+
result = cached_response # serve from cache while circuit is open
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Built-in dimensions
|
|
141
|
+
|
|
142
|
+
| Dimension | What it checks | Default threshold |
|
|
143
|
+
|---|---|---|
|
|
144
|
+
| `BlocklistDimension` | No prohibited terms in output | 1.0 (zero tolerance) |
|
|
145
|
+
| `ReadabilityDimension` | Flesch Reading Ease score | 0.3 (college-level prose) |
|
|
146
|
+
| `SchemaComplianceDimension` | Required fields are present | 1.0 (all fields) |
|
|
147
|
+
| `FactualGroundingDimension` | Numeric claims traceable to evidence | 0.85 |
|
|
148
|
+
|
|
149
|
+
All dimensions follow the same interface: `evaluate(text) -> (score, detail)`. Writing a new one is ten lines.
|
|
150
|
+
|
|
151
|
+
## Why deterministic?
|
|
152
|
+
|
|
153
|
+
LLM-as-judge eval is useful for research. In production pipelines, you need:
|
|
154
|
+
|
|
155
|
+
- The same input to produce the same pass/fail result every run
|
|
156
|
+
- CI to catch regressions without burning tokens on every commit
|
|
157
|
+
- An audit trail that doesn't depend on a model that may drift
|
|
158
|
+
|
|
159
|
+
`llm-evalkit` eval dimensions are pure functions. No model calls, no network, no randomness.
|
|
160
|
+
|
|
161
|
+
## Composing with a pipeline
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from llm_evalkit import EvalHarness
|
|
165
|
+
from llm_evalkit.eval.dimensions import BlocklistDimension, ReadabilityDimension
|
|
166
|
+
from llm_evalkit.reliable import retry, with_fallback
|
|
167
|
+
|
|
168
|
+
harness = EvalHarness([
|
|
169
|
+
BlocklistDimension(terms=["[REDACTED]", "TODO"]),
|
|
170
|
+
ReadabilityDimension(threshold=0.2),
|
|
171
|
+
])
|
|
172
|
+
|
|
173
|
+
@retry(max_attempts=3, backoff=2.0)
|
|
174
|
+
def generate(prompt: str) -> str:
|
|
175
|
+
return with_fallback(
|
|
176
|
+
primary=lambda: call_model("claude-opus-4-8", prompt),
|
|
177
|
+
fallback=lambda: call_model("claude-sonnet-4-6", prompt),
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
output = generate(prompt)
|
|
181
|
+
report = harness.run(output)
|
|
182
|
+
if not report.passed:
|
|
183
|
+
raise ValueError(f"Output failed eval gate:\n{report}")
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## License
|
|
187
|
+
|
|
188
|
+
MIT
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# llm-evalkit
|
|
2
|
+
|
|
3
|
+
Deterministic eval gates and reliability primitives for LLM pipelines.
|
|
4
|
+
|
|
5
|
+
[CI](https://github.com/LesterALeong/llm-evalkit/actions/workflows/ci.yml)
|
|
6
|
+
[PyPI version](https://pypi.org/project/llm-evalgate/)
|
|
7
|
+
[Python versions](https://pypi.org/project/llm-evalgate/)
|
|
8
|
+
[License: MIT](LICENSE)
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
Most LLM eval tooling is either LLM-as-judge (non-deterministic, expensive, not CI-friendly) or a heavy enterprise suite. `llm-evalkit` is neither.
|
|
13
|
+
|
|
14
|
+
It gives you two things:
|
|
15
|
+
|
|
16
|
+
- **Eval gates**: code-only quality dimensions that run the same way every time. Drop them into any pipeline, run them in CI, get a pass/fail with a reason.
|
|
17
|
+
- **Reliability primitives**: retry with backoff, model fallback chains, and a circuit breaker. The building blocks for LLM pipelines that hold up in production.
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install llm-evalgate
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quickstart
|
|
26
|
+
|
|
27
|
+
### Eval gates
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from llm_evalkit import EvalHarness
|
|
31
|
+
from llm_evalkit.eval.dimensions import BlocklistDimension, ReadabilityDimension, SchemaComplianceDimension
|
|
32
|
+
|
|
33
|
+
harness = EvalHarness([
|
|
34
|
+
BlocklistDimension(terms=["confidential", "internal use only"]),
|
|
35
|
+
ReadabilityDimension(threshold=0.3),
|
|
36
|
+
SchemaComplianceDimension(required_fields=["title:", "summary:"]),
|
|
37
|
+
])
|
|
38
|
+
|
|
39
|
+
report = harness.run(llm_output)
|
|
40
|
+
|
|
41
|
+
if not report.passed:
|
|
42
|
+
print(report)
|
|
43
|
+
# EvalReport: FAIL
|
|
44
|
+
# FAIL [blocklist] score=0.000 — prohibited terms found: ['confidential']
|
|
45
|
+
# PASS [readability] score=0.612 — Flesch ease=61.2, FK grade=8.4
|
|
46
|
+
# PASS [schema_compliance] score=1.000 — all 2 required fields present
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Custom dimension
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from llm_evalkit import Dimension, EvalHarness
|
|
53
|
+
|
|
54
|
+
class JsonDimension(Dimension):
|
|
55
|
+
def evaluate(self, text: str) -> tuple[float, str]:
|
|
56
|
+
import json
|
|
57
|
+
try:
|
|
58
|
+
json.loads(text)
|
|
59
|
+
return 1.0, "valid JSON"
|
|
60
|
+
except json.JSONDecodeError as e:
|
|
61
|
+
return 0.0, f"invalid JSON: {e}"
|
|
62
|
+
|
|
63
|
+
harness = EvalHarness([JsonDimension(threshold=1.0)])
|
|
64
|
+
report = harness.run('{"key": "value"}')
|
|
65
|
+
assert report.passed
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Retry
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from llm_evalkit.reliable import retry
|
|
72
|
+
|
|
73
|
+
@retry(max_attempts=3, backoff=2.0)
|
|
74
|
+
def call_llm(prompt: str) -> str:
|
|
75
|
+
return client.messages.create(...)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Fallback chain
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from llm_evalkit.reliable import with_fallback, with_fallback_chain
|
|
82
|
+
|
|
83
|
+
# two-model fallback
|
|
84
|
+
result = with_fallback(
|
|
85
|
+
primary=lambda: call_model("claude-opus-4-8", prompt),
|
|
86
|
+
fallback=lambda: call_model("claude-sonnet-4-6", prompt),
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# ordered chain — first success wins
|
|
90
|
+
result = with_fallback_chain([
|
|
91
|
+
lambda: call_model("claude-opus-4-8", prompt),
|
|
92
|
+
lambda: call_model("claude-sonnet-4-6", prompt),
|
|
93
|
+
lambda: call_model("claude-haiku-4-5", prompt),
|
|
94
|
+
])
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Circuit breaker
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from llm_evalkit.reliable import CircuitBreaker, CircuitOpenError
|
|
101
|
+
|
|
102
|
+
breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
with breaker:
|
|
106
|
+
result = call_llm(prompt)
|
|
107
|
+
except CircuitOpenError:
|
|
108
|
+
result = cached_response # serve from cache while circuit is open
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Built-in dimensions
|
|
112
|
+
|
|
113
|
+
| Dimension | What it checks | Default threshold |
|
|
114
|
+
|---|---|---|
|
|
115
|
+
| `BlocklistDimension` | No prohibited terms in output | 1.0 (zero tolerance) |
|
|
116
|
+
| `ReadabilityDimension` | Flesch Reading Ease score | 0.3 (college-level prose) |
|
|
117
|
+
| `SchemaComplianceDimension` | Required fields are present | 1.0 (all fields) |
|
|
118
|
+
| `FactualGroundingDimension` | Numeric claims traceable to evidence | 0.85 |
|
|
119
|
+
|
|
120
|
+
All dimensions follow the same interface: `evaluate(text) -> (score, detail)`. Writing a new one is ten lines.
|
|
121
|
+
|
|
122
|
+
## Why deterministic?
|
|
123
|
+
|
|
124
|
+
LLM-as-judge eval is useful for research. In production pipelines, you need:
|
|
125
|
+
|
|
126
|
+
- The same input to produce the same pass/fail result every run
|
|
127
|
+
- CI to catch regressions without burning tokens on every commit
|
|
128
|
+
- An audit trail that doesn't depend on a model that may drift
|
|
129
|
+
|
|
130
|
+
`llm-evalkit` eval dimensions are pure functions. No model calls, no network, no randomness.
|
|
131
|
+
|
|
132
|
+
## Composing with a pipeline
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from llm_evalkit import EvalHarness
|
|
136
|
+
from llm_evalkit.eval.dimensions import BlocklistDimension, ReadabilityDimension
|
|
137
|
+
from llm_evalkit.reliable import retry, with_fallback
|
|
138
|
+
|
|
139
|
+
harness = EvalHarness([
|
|
140
|
+
BlocklistDimension(terms=["[REDACTED]", "TODO"]),
|
|
141
|
+
ReadabilityDimension(threshold=0.2),
|
|
142
|
+
])
|
|
143
|
+
|
|
144
|
+
@retry(max_attempts=3, backoff=2.0)
|
|
145
|
+
def generate(prompt: str) -> str:
|
|
146
|
+
return with_fallback(
|
|
147
|
+
primary=lambda: call_model("claude-opus-4-8", prompt),
|
|
148
|
+
fallback=lambda: call_model("claude-sonnet-4-6", prompt),
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
output = generate(prompt)
|
|
152
|
+
report = harness.run(output)
|
|
153
|
+
if not report.passed:
|
|
154
|
+
raise ValueError(f"Output failed eval gate:\n{report}")
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## License
|
|
158
|
+
|
|
159
|
+
MIT
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "llm-evalgate"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Deterministic eval gates and reliability primitives for LLM pipelines"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [{ name = "Lester Leong", email = "lester.leong89@gmail.com" }]
|
|
13
|
+
keywords = ["llm", "eval", "evaluation", "reliability", "ai", "agents", "pipeline"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"textstat>=0.7",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = [
|
|
32
|
+
"pytest>=8.0",
|
|
33
|
+
"pytest-cov>=5.0",
|
|
34
|
+
"ruff>=0.4",
|
|
35
|
+
"hatch>=1.12",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/LesterALeong/llm-evalkit"
|
|
40
|
+
Repository = "https://github.com/LesterALeong/llm-evalkit"
|
|
41
|
+
Issues = "https://github.com/LesterALeong/llm-evalkit/issues"
|
|
42
|
+
|
|
43
|
+
[tool.hatch.build.targets.wheel]
|
|
44
|
+
packages = ["src/llm_evalkit"]
|
|
45
|
+
|
|
46
|
+
[tool.ruff]
|
|
47
|
+
line-length = 100
|
|
48
|
+
target-version = "py39"
|
|
49
|
+
|
|
50
|
+
[tool.ruff.lint]
|
|
51
|
+
select = ["E", "F", "I", "UP"]
|
|
52
|
+
|
|
53
|
+
[tool.pytest.ini_options]
|
|
54
|
+
testpaths = ["tests"]
|
|
55
|
+
addopts = "--cov=llm_evalkit --cov-report=term-missing"
|
|
56
|
+
|
|
57
|
+
[tool.coverage.run]
|
|
58
|
+
source = ["src"]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class DimensionResult:
    """Immutable outcome of running a single dimension against a text."""

    # Score as returned by the dimension's ``evaluate``.
    score: float
    # Whether the score met the dimension's threshold.
    passed: bool
    # Explanation string returned alongside the score.
    detail: str
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Dimension(ABC):
    """Abstract base for one eval dimension.

    Concrete dimensions implement ``evaluate``. Callers such as the harness
    invoke ``run``, which compares the score with ``threshold`` and wraps
    the outcome in a ``DimensionResult``.
    """

    def __init__(self, threshold: float = 1.0, name: str | None = None) -> None:
        # A falsy name (``None`` or ``""``) falls back to the concrete class name.
        self.name = name if name else type(self).__name__
        self.threshold = threshold

    @abstractmethod
    def evaluate(self, text: str) -> tuple[float, str]:
        """Return (score, detail). Score is in [0.0, 1.0]."""

    def run(self, text: str) -> DimensionResult:
        """Evaluate ``text`` and mark it passed when score >= ``threshold``."""
        score, detail = self.evaluate(text)
        meets_threshold = score >= self.threshold
        return DimensionResult(score=score, passed=meets_threshold, detail=detail)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from .blocklist import BlocklistDimension
|
|
2
|
+
from .factual import FactualGroundingDimension
|
|
3
|
+
from .readability import ReadabilityDimension
|
|
4
|
+
from .schema import SchemaComplianceDimension
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"BlocklistDimension",
|
|
8
|
+
"FactualGroundingDimension",
|
|
9
|
+
"ReadabilityDimension",
|
|
10
|
+
"SchemaComplianceDimension",
|
|
11
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from ..dimension import Dimension
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BlocklistDimension(Dimension):
    """Reject text that contains any prohibited term.

    Terms are matched as literal substrings (each term is regex-escaped),
    ignoring case unless ``case_sensitive=True``. The score is binary:
    1.0 when nothing matches, 0.0 when at least one term is found. Handy
    for keeping confidential identifiers, brand names, or internal jargon
    out of LLM output.
    """

    def __init__(
        self,
        terms: list[str],
        threshold: float = 1.0,
        name: str = "blocklist",
        case_sensitive: bool = False,
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        re_flags = re.IGNORECASE if not case_sensitive else 0
        self._terms = terms
        # Compile once up front so ``evaluate`` only has to search.
        self._patterns = [re.compile(re.escape(term), re_flags) for term in terms]

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return ``(1.0, "clean")`` or ``(0.0, ...)`` listing the matched terms."""
        hits = []
        for term, pattern in zip(self._terms, self._patterns):
            if pattern.search(text):
                hits.append(term)
        if not hits:
            return 1.0, "clean"
        return 0.0, f"prohibited terms found: {hits}"
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from ..dimension import Dimension
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FactualGroundingDimension(Dimension):
    """Check that numeric claims in LLM output are traceable to evidence.

    For each number extracted from the text, we check whether a value
    within ``rel_tolerance`` of it appears in the evidence list. Score
    is the fraction of numeric claims that are grounded.

    If no evidence is supplied the dimension is skipped (returns 1.0).
    If the text contains no numbers, it also passes.

    Args:
        evidence: Known-good numeric values; ``None`` or an empty list
            disables the check.
        rel_tolerance: Maximum allowed ``|claim - evidence| / |evidence|``.
        threshold: Minimum grounded fraction required to pass.
        name: Key under which the result is reported.
    """

    def __init__(
        self,
        evidence: list[float] | None = None,
        rel_tolerance: float = 0.02,
        threshold: float = 0.85,
        name: str = "factual_grounding",
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        self._evidence = evidence or []
        self._rel_tolerance = rel_tolerance

    def _numbers_in_text(self, text: str) -> list[float]:
        """Extract unsigned numeric literals, treating ``,`` as a thousands separator."""
        raw = re.findall(r"[\d,]+(?:\.\d+)?", text)
        results = []
        for r in raw:
            try:
                results.append(float(r.replace(",", "")))
            except ValueError:
                # The pattern also matches comma-only runs (ordinary
                # punctuation such as the "," in "a, b"); those are not
                # numbers, so they are deliberately skipped.
                continue
        return results

    def _is_grounded(self, value: float) -> bool:
        """Return True when ``value`` is within tolerance of any evidence value."""
        for ev in self._evidence:
            if ev == 0:
                # Relative error against zero is undefined, so zero evidence
                # can only ground an exactly-zero claim.
                if value == 0:
                    return True
                continue
            if abs(value - ev) / abs(ev) <= self._rel_tolerance:
                return True
        return False

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the grounded fraction of numeric claims and a summary string."""
        if not self._evidence:
            return 1.0, "skipped (no evidence supplied)"
        numbers = self._numbers_in_text(text)
        if not numbers:
            return 1.0, "no numeric claims found"
        grounded = [n for n in numbers if self._is_grounded(n)]
        score = len(grounded) / len(numbers)
        return score, f"{len(grounded)}/{len(numbers)} numeric claims grounded"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import textstat
|
|
4
|
+
|
|
5
|
+
from ..dimension import Dimension
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ReadabilityDimension(Dimension):
    """Score text by its Flesch Reading Ease, normalised to [0, 1].

    The score is ``ease / 100`` clamped to the unit interval, so 1.0 means
    very easy and 0.0 means very difficult; the text passes when that score
    is at least ``threshold``. The default threshold of 0.3 accepts most
    professional prose up to ~college level. The Flesch-Kincaid grade is
    reported in the detail string only and does not affect pass/fail.
    """

    def __init__(self, threshold: float = 0.3, name: str = "readability") -> None:
        super().__init__(threshold=threshold, name=name)

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the normalised ease score and a detail string with both metrics."""
        is_blank = not text.strip()
        if is_blank:
            return 0.0, "empty text"
        ease = textstat.flesch_reading_ease(text)
        # Flesch ease runs from 100 (very easy) to 0 (very hard); rescale and clamp.
        normalised = ease / 100.0
        score = max(0.0, min(1.0, normalised))
        grade = textstat.flesch_kincaid_grade(text)
        detail = f"Flesch ease={ease:.1f}, FK grade={grade:.1f}"
        return score, detail
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from ..dimension import Dimension
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SchemaComplianceDimension(Dimension):
    """Verify that every required field string occurs in the text.

    Intended for structured LLM outputs (JSON, YAML, markdown with
    required sections) where a missing field is a hard failure.

    ``required_fields`` is a list of strings; each must appear verbatim
    (plain substring match) somewhere in the text. The score is the
    fraction of required fields that were found.
    """

    def __init__(
        self,
        required_fields: list[str],
        threshold: float = 1.0,
        name: str = "schema_compliance",
    ) -> None:
        super().__init__(threshold=threshold, name=name)
        self._required = required_fields

    def evaluate(self, text: str) -> tuple[float, str]:
        """Return the found fraction plus either the missing fields or a success note."""
        absent = [field for field in self._required if field not in text]
        total = len(self._required)
        if not absent:
            return 1.0, f"all {total} required fields present"
        return 1.0 - len(absent) / total, f"missing fields: {absent}"
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from .dimension import Dimension, DimensionResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class EvalReport:
    """Aggregate outcome of running every dimension of a harness on one text."""

    # True only when every dimension passed.
    passed: bool
    # Per-dimension results in execution order, keyed by dimension name.
    results: dict[str, DimensionResult]
    # The text that was evaluated.
    text: str

    @property
    def failures(self) -> dict[str, DimensionResult]:
        """Return the subset of ``results`` whose dimension did not pass."""
        return {name: r for name, r in self.results.items() if not r.passed}

    def __str__(self) -> str:
        """Render a header line followed by one status line per dimension."""
        lines = [f"EvalReport: {'PASS' if self.passed else 'FAIL'}"]
        for name, result in self.results.items():
            status = "PASS" if result.passed else "FAIL"
            lines.append(f" {status} [{name}] score={result.score:.3f} — {result.detail}")
        return "\n".join(lines)


class EvalHarness:
    """Run a list of dimensions against text and produce an EvalReport.

    Usage::

        harness = EvalHarness([
            ReadabilityDimension(threshold=0.7),
            BlocklistDimension(terms=["secret", "internal"]),
        ])
        report = harness.run(text)
        if not report.passed:
            raise ValueError(str(report))

    Dimensions that share a name are all reported: the first keeps its
    name and later ones get a ``#2``, ``#3``, ... suffix, so one result
    can never silently replace (and mask the failure of) another.
    """

    def __init__(self, dimensions: list[Dimension]) -> None:
        # Snapshot the input so later mutation of the caller's list cannot
        # change (or empty) the set of gates being enforced.
        dims = list(dimensions)
        if not dims:
            raise ValueError("EvalHarness requires at least one dimension.")
        self._dimensions = dims

    @staticmethod
    def _unique_key(name: str, taken: dict[str, DimensionResult]) -> str:
        """Return ``name``, or ``name#k`` for the smallest k >= 2 not yet in ``taken``."""
        if name not in taken:
            return name
        suffix = 2
        while f"{name}#{suffix}" in taken:
            suffix += 1
        return f"{name}#{suffix}"

    def run(self, text: str) -> EvalReport:
        """Run every dimension on ``text``; the report passes only if all pass."""
        results: dict[str, DimensionResult] = {}
        for dim in self._dimensions:
            results[self._unique_key(dim.name, results)] = dim.run(text)
        passed = all(r.passed for r in results.values())
        return EvalReport(passed=passed, results=results, text=text)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from .circuit import CircuitBreaker, CircuitOpenError, CircuitState
|
|
2
|
+
from .fallback import with_fallback, with_fallback_chain
|
|
3
|
+
from .retry import retry
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"retry",
|
|
7
|
+
"with_fallback",
|
|
8
|
+
"with_fallback_chain",
|
|
9
|
+
"CircuitBreaker",
|
|
10
|
+
"CircuitOpenError",
|
|
11
|
+
"CircuitState",
|
|
12
|
+
]
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CircuitState(Enum):
    """The three states a circuit breaker can be in."""

    # Normal operation.
    CLOSED = "closed"
    # Failing, rejecting calls.
    OPEN = "open"
    # Probing for recovery.
    HALF_OPEN = "half_open"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CircuitOpenError(Exception):
    """Signal that a call was attempted while the circuit is open."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CircuitBreaker:
|
|
18
|
+
"""Prevent cascading failures by stopping calls to a failing service.
|
|
19
|
+
|
|
20
|
+
Usage::
|
|
21
|
+
|
|
22
|
+
breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)
|
|
23
|
+
|
|
24
|
+
# context manager form
|
|
25
|
+
with breaker:
|
|
26
|
+
result = call_llm(prompt)
|
|
27
|
+
|
|
28
|
+
# or call form
|
|
29
|
+
result = breaker.call(lambda: call_llm(prompt))
|
|
30
|
+
|
|
31
|
+
State transitions:
|
|
32
|
+
- CLOSED -> OPEN: after ``failure_threshold`` consecutive failures
|
|
33
|
+
- OPEN -> HALF_OPEN: after ``recovery_timeout`` seconds
|
|
34
|
+
- HALF_OPEN -> CLOSED: on first success
|
|
35
|
+
- HALF_OPEN -> OPEN: on first failure
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def __init__(
|
|
39
|
+
self,
|
|
40
|
+
failure_threshold: int = 5,
|
|
41
|
+
recovery_timeout: float = 60.0,
|
|
42
|
+
exceptions: tuple[type[Exception], ...] = (Exception,),
|
|
43
|
+
) -> None:
|
|
44
|
+
self.failure_threshold = failure_threshold
|
|
45
|
+
self.recovery_timeout = recovery_timeout
|
|
46
|
+
self.exceptions = exceptions
|
|
47
|
+
|
|
48
|
+
self._state = CircuitState.CLOSED
|
|
49
|
+
self._failure_count = 0
|
|
50
|
+
self._opened_at: float | None = None
|
|
51
|
+
|
|
52
|
+
@property
|
|
53
|
+
def state(self) -> CircuitState:
|
|
54
|
+
if self._state is CircuitState.OPEN:
|
|
55
|
+
assert self._opened_at is not None
|
|
56
|
+
if time.monotonic() - self._opened_at >= self.recovery_timeout:
|
|
57
|
+
self._state = CircuitState.HALF_OPEN
|
|
58
|
+
return self._state
|
|
59
|
+
|
|
60
|
+
def _on_success(self) -> None:
|
|
61
|
+
self._failure_count = 0
|
|
62
|
+
self._state = CircuitState.CLOSED
|
|
63
|
+
self._opened_at = None
|
|
64
|
+
|
|
65
|
+
def _on_failure(self) -> None:
|
|
66
|
+
self._failure_count += 1
|
|
67
|
+
if self._failure_count >= self.failure_threshold:
|
|
68
|
+
self._state = CircuitState.OPEN
|
|
69
|
+
self._opened_at = time.monotonic()
|
|
70
|
+
|
|
71
|
+
def call(self, fn):
|
|
72
|
+
if self.state is CircuitState.OPEN:
|
|
73
|
+
raise CircuitOpenError(
|
|
74
|
+
f"Circuit is open. Retry after {self.recovery_timeout}s."
|
|
75
|
+
)
|
|
76
|
+
try:
|
|
77
|
+
result = fn()
|
|
78
|
+
self._on_success()
|
|
79
|
+
return result
|
|
80
|
+
except self.exceptions as exc:
|
|
81
|
+
self._on_failure()
|
|
82
|
+
raise exc
|
|
83
|
+
|
|
84
|
+
def __enter__(self):
    """Enter the guarded block, refusing entry while the circuit is OPEN.

    Raises:
        CircuitOpenError: If the circuit is currently OPEN.
    """
    current = self.state
    if current is not CircuitState.OPEN:
        return self
    raise CircuitOpenError(
        f"Circuit is open. Retry after {self.recovery_timeout}s."
    )
|
|
90
|
+
|
|
91
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """Record the outcome of the guarded block; never suppress exceptions.

    A clean exit counts as a success. An exception whose type is covered by
    ``self.exceptions`` counts as a failure. Any other exception is left
    unrecorded. The method always returns ``False`` so the exception, if
    any, keeps propagating.
    """
    if exc_type is None:
        self._on_success()
    elif issubclass(exc_type, self.exceptions):
        self._on_failure()
    return False
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from typing import TypeVar
|
|
5
|
+
|
|
6
|
+
T = TypeVar("T")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def with_fallback(
    primary: Callable[[], T],
    fallback: Callable[[], T],
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> T:
    """Return ``primary()``, or ``fallback()`` if the primary call fails.

    Only exceptions listed in ``exceptions`` trigger the fallback; anything
    else raised by ``primary`` propagates. Errors raised by ``fallback``
    itself are not handled.

    Usage::

        result = with_fallback(
            primary=lambda: call_opus(prompt),
            fallback=lambda: call_sonnet(prompt),
        )
    """
    try:
        outcome = primary()
    except exceptions:
        outcome = fallback()
    return outcome
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def with_fallback_chain(
    callables: list[Callable[[], T]],
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> T:
    """Call each entry of ``callables`` in turn until one succeeds.

    The result of the first successful call is returned. If every call
    fails with one of ``exceptions``, the exception from the final call is
    raised. Exceptions outside ``exceptions`` propagate immediately.

    Raises:
        ValueError: If ``callables`` is empty.

    Usage::

        result = with_fallback_chain([
            lambda: call_opus(prompt),
            lambda: call_sonnet(prompt),
            lambda: call_haiku(prompt),
        ])
    """
    if not callables:
        raise ValueError("callables list is empty")

    failure: Exception | None = None
    for candidate in callables:
        try:
            outcome = candidate()
        except exceptions as err:
            # Remember the most recent failure and move on to the next one.
            failure = err
            continue
        return outcome
    raise failure  # type: ignore[misc]
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from typing import Any, TypeVar
|
|
7
|
+
|
|
8
|
+
F = TypeVar("F", bound=Callable[..., Any])
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def retry(
    max_attempts: int = 3,
    backoff: float = 2.0,
    exceptions: tuple[type[Exception], ...] = (Exception,),
) -> Callable[[F], F]:
    """Build a decorator that retries a callable with exponential backoff.

    The wrapped callable is invoked up to ``max_attempts`` times. After each
    failed attempt except the last, the wrapper sleeps before trying again:
    ``backoff`` seconds after the first failure, with the delay multiplied
    by ``backoff`` after every sleep. Exceptions that are not listed in
    ``exceptions`` propagate immediately without a retry.

    Args:
        max_attempts: Total number of calls allowed, including the first.
        backoff: Initial delay in seconds, and the factor by which the
            delay grows between attempts.
        exceptions: Exception types that trigger a retry.

    Returns:
        A decorator that wraps a callable in the retry loop.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1, since the
            wrapped callable could then never be invoked.

    Usage::

        @retry(max_attempts=3, backoff=2.0)
        def call_llm(prompt: str) -> str:
            ...

        # or without decorator syntax:
        result = retry(max_attempts=3)(call_llm)(prompt)
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    def decorator(fn: F) -> F:
        @functools.wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            delay = backoff
            for attempt in range(1, max_attempts + 1):
                try:
                    return fn(*args, **kwargs)
                except exceptions:
                    if attempt == max_attempts:
                        # Out of attempts: surface the final failure with
                        # its original traceback.
                        raise
                    time.sleep(delay)
                    delay *= backoff
            # Not reachable: max_attempts >= 1, so the loop always either
            # returns or re-raises.

        return wrapper  # type: ignore[return-value]

    return decorator
|
|
File without changes
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from llm_evalkit.eval.dimensions import (
|
|
2
|
+
BlocklistDimension,
|
|
3
|
+
FactualGroundingDimension,
|
|
4
|
+
ReadabilityDimension,
|
|
5
|
+
SchemaComplianceDimension,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# --- BlocklistDimension ---
|
|
10
|
+
|
|
11
|
+
def test_blocklist_clean():
    """Text without any blocked term passes with a perfect score."""
    checker = BlocklistDimension(terms=["secret", "internal"])
    outcome = checker.run("This is a public document.")
    assert outcome.passed
    assert outcome.score == 1.0
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_blocklist_hit():
    """A blocked term fails the check, scores zero and is named in the detail."""
    checker = BlocklistDimension(terms=["secret", "internal"])
    outcome = checker.run("This is an internal document.")
    assert not outcome.passed
    assert outcome.score == 0.0
    assert "internal" in outcome.detail
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_blocklist_case_insensitive_default():
    """By default an upper-case term matches lower-case text."""
    checker = BlocklistDimension(terms=["SECRET"])
    outcome = checker.run("This contains secret info.")
    assert not outcome.passed
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_blocklist_case_sensitive():
    """With ``case_sensitive=True`` a differently-cased term does not match."""
    checker = BlocklistDimension(terms=["SECRET"], case_sensitive=True)
    outcome = checker.run("This contains secret info.")
    assert outcome.passed
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# --- SchemaComplianceDimension ---
|
|
39
|
+
|
|
40
|
+
def test_schema_all_present():
    """Text containing every required field passes with a perfect score."""
    checker = SchemaComplianceDimension(
        required_fields=["title:", "summary:", "date:"]
    )
    document = "title: Foo\nsummary: Bar\ndate: 2024-01-01"
    outcome = checker.run(document)
    assert outcome.passed
    assert outcome.score == 1.0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_schema_missing_fields():
    """Missing fields fail the check, lower the score and appear in the detail."""
    checker = SchemaComplianceDimension(
        required_fields=["title:", "summary:", "date:"]
    )
    document = "title: Foo"
    outcome = checker.run(document)
    assert not outcome.passed
    assert outcome.score < 1.0
    assert "summary:" in outcome.detail
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# --- ReadabilityDimension ---
|
|
58
|
+
|
|
59
|
+
def test_readability_simple_text_passes():
    """Short, simple sentences clear a low readability threshold."""
    checker = ReadabilityDimension(threshold=0.1)
    sample = "The cat sat on the mat. It was a fat cat."
    outcome = checker.run(sample)
    assert outcome.passed
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_readability_empty_text_fails():
    """Whitespace-only input does not pass the readability check."""
    checker = ReadabilityDimension(threshold=0.1)
    outcome = checker.run(" ")
    assert not outcome.passed
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# --- FactualGroundingDimension ---
|
|
73
|
+
|
|
74
|
+
def test_factual_no_evidence_skips():
    """Without evidence the check passes and reports that it was skipped."""
    checker = FactualGroundingDimension(evidence=None)
    outcome = checker.run("Revenue was $1.2 billion.")
    assert outcome.passed
    assert "skipped" in outcome.detail
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_factual_grounded():
    """A number that appears in the evidence is accepted."""
    checker = FactualGroundingDimension(evidence=[1200000000.0], threshold=0.85)
    outcome = checker.run("Revenue was 1200000000 dollars.")
    assert outcome.passed
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_factual_ungrounded():
    """A number absent from the evidence fails the check."""
    checker = FactualGroundingDimension(evidence=[999.0], threshold=0.85)
    outcome = checker.run("Revenue was 1200000000 dollars.")
    assert not outcome.passed
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def test_factual_no_numbers_passes():
    """Text with no numbers passes and the detail mentions "no numeric"."""
    checker = FactualGroundingDimension(evidence=[1000.0], threshold=0.85)
    outcome = checker.run("Revenue grew significantly.")
    assert outcome.passed
    assert "no numeric" in outcome.detail
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from llm_evalkit import Dimension, EvalHarness
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AlwaysPassDimension(Dimension):
    """Test double whose ``evaluate`` always reports a score of 1.0."""

    def evaluate(self, text: str) -> tuple[float, str]:
        score, detail = 1.0, "always passes"
        return score, detail
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AlwaysFailDimension(Dimension):
    """Test double whose ``evaluate`` always reports a score of 0.0."""

    def evaluate(self, text: str) -> tuple[float, str]:
        score, detail = 0.0, "always fails"
        return score, detail
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class HalfScoreDimension(Dimension):
    """Test double whose ``evaluate`` always reports a score of 0.5."""

    def evaluate(self, text: str) -> tuple[float, str]:
        score, detail = 0.5, "half score"
        return score, detail
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_harness_all_pass():
    """A harness whose dimensions all pass reports success with no failures."""
    dimensions = [AlwaysPassDimension(name="d1"), AlwaysPassDimension(name="d2")]
    summary = EvalHarness(dimensions).run("some text")
    assert summary.passed
    assert summary.failures == {}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_harness_one_fail():
    """One failing dimension fails the report and is the only failure listed."""
    dimensions = [AlwaysPassDimension(name="pass"), AlwaysFailDimension(name="fail")]
    summary = EvalHarness(dimensions).run("some text")
    assert not summary.passed
    assert "fail" in summary.failures
    assert "pass" not in summary.failures
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_harness_threshold():
    """A 0.5 score passes a 0.4 threshold and fails a 0.6 threshold."""
    lenient = EvalHarness([HalfScoreDimension(threshold=0.4, name="half")])
    lenient_report = lenient.run("text")
    assert lenient_report.passed

    strict = EvalHarness([HalfScoreDimension(threshold=0.6, name="half_strict")])
    strict_report = strict.run("text")
    assert not strict_report.passed
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_harness_empty_dimensions_raises():
    """Constructing a harness with no dimensions raises ``ValueError``."""
    no_dimensions = []
    with pytest.raises(ValueError):
        EvalHarness(no_dimensions)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_report_str_contains_pass_fail():
    """The report's string form shows both a PASS and a FAIL marker."""
    dimensions = [AlwaysPassDimension(name="p"), AlwaysFailDimension(name="f")]
    rendered = str(EvalHarness(dimensions).run("x"))
    assert "FAIL" in rendered
    assert "PASS" in rendered
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from llm_evalkit.reliable import (
|
|
4
|
+
CircuitBreaker,
|
|
5
|
+
CircuitOpenError,
|
|
6
|
+
CircuitState,
|
|
7
|
+
retry,
|
|
8
|
+
with_fallback,
|
|
9
|
+
with_fallback_chain,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# --- retry ---
|
|
14
|
+
|
|
15
|
+
def test_retry_succeeds_first_try():
    """A callable that succeeds immediately is invoked exactly once."""
    invocations = []

    def fn():
        invocations.append(1)
        return "ok"

    wrapped = retry(max_attempts=3, backoff=0.0)(fn)

    assert wrapped() == "ok"
    assert len(invocations) == 1
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_retry_succeeds_on_second_attempt():
    """A callable that fails once is retried and then succeeds."""
    invocations = []

    def fn():
        invocations.append(1)
        if len(invocations) >= 2:
            return "ok"
        raise ValueError("not yet")

    wrapped = retry(max_attempts=3, backoff=0.0)(fn)

    assert wrapped() == "ok"
    assert len(invocations) == 2
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_retry_exhausted_raises():
    """When every attempt fails, the callable's exception is raised."""

    def fn():
        raise RuntimeError("always fails")

    doomed = retry(max_attempts=3, backoff=0.0)(fn)

    with pytest.raises(RuntimeError):
        doomed()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_retry_only_catches_specified_exceptions():
    """An exception outside ``exceptions`` propagates to the caller."""

    def fn():
        raise TypeError("wrong type")

    guarded = retry(max_attempts=3, backoff=0.0, exceptions=(ValueError,))(fn)

    with pytest.raises(TypeError):
        guarded()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# --- with_fallback ---
|
|
60
|
+
|
|
61
|
+
def test_fallback_primary_succeeds():
    """When the primary succeeds its value is returned."""
    outcome = with_fallback(
        primary=lambda: "primary",
        fallback=lambda: "fallback",
    )
    assert outcome == "primary"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_fallback_primary_fails_uses_fallback():
    """When the primary raises, the fallback's value is returned."""

    def fail():
        raise RuntimeError("primary down")

    outcome = with_fallback(primary=fail, fallback=lambda: "fallback")
    assert outcome == "fallback"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_fallback_chain_first_succeeds():
    """The first callable's value wins when it succeeds."""
    chain = [lambda: "a", lambda: "b", lambda: "c"]
    outcome = with_fallback_chain(chain)
    assert outcome == "a"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_fallback_chain_first_two_fail():
    """Failing callables are each tried once before the third one succeeds."""
    invocations = []

    def fail():
        invocations.append(1)
        raise RuntimeError("down")

    outcome = with_fallback_chain([fail, fail, lambda: "c"])

    assert outcome == "c"
    assert len(invocations) == 2
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_fallback_chain_all_fail():
    """If every callable fails, the failure is raised to the caller."""

    def fail():
        raise RuntimeError("always")

    chain = [fail, fail]
    with pytest.raises(RuntimeError):
        with_fallback_chain(chain)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def test_fallback_chain_empty_raises():
    """An empty chain is rejected with ``ValueError``."""
    empty_chain = []
    with pytest.raises(ValueError):
        with_fallback_chain(empty_chain)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# --- CircuitBreaker ---
|
|
105
|
+
|
|
106
|
+
def test_circuit_starts_closed():
    """A freshly constructed breaker reports the CLOSED state."""
    breaker = CircuitBreaker()
    assert breaker.state is CircuitState.CLOSED
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def test_circuit_opens_after_threshold():
    """Reaching ``failure_threshold`` failures opens the circuit."""
    breaker = CircuitBreaker(failure_threshold=3)
    for _ in range(3):
        with pytest.raises(RuntimeError):
            with breaker:
                raise RuntimeError("fail")
    assert breaker.state is CircuitState.OPEN
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def test_circuit_open_raises_circuit_error():
    """Entering an open circuit raises ``CircuitOpenError``."""
    breaker = CircuitBreaker(failure_threshold=1)

    # A single failure is enough to trip a breaker with threshold 1.
    with pytest.raises(RuntimeError):
        with breaker:
            raise RuntimeError("fail")

    with pytest.raises(CircuitOpenError):
        with breaker:
            pass
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_circuit_resets_on_success():
    """A success after a failure clears the count and keeps the circuit closed."""
    breaker = CircuitBreaker(failure_threshold=3)

    with pytest.raises(RuntimeError):
        with breaker:
            raise RuntimeError("fail")
    assert breaker._failure_count == 1

    with breaker:
        pass
    assert breaker.state is CircuitState.CLOSED
    assert breaker._failure_count == 0
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def test_circuit_call_form():
    """``call`` returns the wrapped callable's result."""
    breaker = CircuitBreaker()
    outcome = breaker.call(lambda: "ok")
    assert outcome == "ok"
|