proofagent 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proofagent-0.1.0/LICENSE +21 -0
- proofagent-0.1.0/PKG-INFO +299 -0
- proofagent-0.1.0/README.md +263 -0
- proofagent-0.1.0/pyproject.toml +52 -0
- proofagent-0.1.0/setup.cfg +4 -0
- proofagent-0.1.0/src/proofagent.egg-info/PKG-INFO +299 -0
- proofagent-0.1.0/src/proofagent.egg-info/SOURCES.txt +30 -0
- proofagent-0.1.0/src/proofagent.egg-info/dependency_links.txt +1 -0
- proofagent-0.1.0/src/proofagent.egg-info/entry_points.txt +5 -0
- proofagent-0.1.0/src/proofagent.egg-info/requires.txt +20 -0
- proofagent-0.1.0/src/proofagent.egg-info/top_level.txt +1 -0
- proofagent-0.1.0/src/provably/__init__.py +23 -0
- proofagent-0.1.0/src/provably/__version__.py +1 -0
- proofagent-0.1.0/src/provably/cli.py +128 -0
- proofagent-0.1.0/src/provably/config.py +44 -0
- proofagent-0.1.0/src/provably/display.py +40 -0
- proofagent-0.1.0/src/provably/expect.py +251 -0
- proofagent-0.1.0/src/provably/fixtures.py +53 -0
- proofagent-0.1.0/src/provably/judge.py +74 -0
- proofagent-0.1.0/src/provably/markers.py +8 -0
- proofagent-0.1.0/src/provably/plugin.py +35 -0
- proofagent-0.1.0/src/provably/providers/__init__.py +47 -0
- proofagent-0.1.0/src/provably/providers/anthropic.py +98 -0
- proofagent-0.1.0/src/provably/providers/base.py +36 -0
- proofagent-0.1.0/src/provably/providers/ollama.py +70 -0
- proofagent-0.1.0/src/provably/providers/openai.py +95 -0
- proofagent-0.1.0/src/provably/report.py +90 -0
- proofagent-0.1.0/src/provably/result.py +67 -0
- proofagent-0.1.0/tests/test_cli.py +25 -0
- proofagent-0.1.0/tests/test_expect.py +208 -0
- proofagent-0.1.0/tests/test_plugin.py +12 -0
- proofagent-0.1.0/tests/test_result.py +52 -0
proofagent-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Provably
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: proofagent
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: pytest for AI agents — eval framework with cryptographic compliance certificates
|
|
5
|
+
Author: Provably
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/camgitt/provably
|
|
8
|
+
Project-URL: Documentation, https://github.com/camgitt/provably
|
|
9
|
+
Project-URL: Repository, https://github.com/camgitt/provably
|
|
10
|
+
Keywords: ai,agents,eval,testing,llm,safety,compliance
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Framework :: Pytest
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Software Development :: Testing
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: click>=8.0
|
|
21
|
+
Requires-Dist: python-dotenv>=1.0
|
|
22
|
+
Provides-Extra: openai
|
|
23
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
24
|
+
Provides-Extra: anthropic
|
|
25
|
+
Requires-Dist: anthropic>=0.30; extra == "anthropic"
|
|
26
|
+
Provides-Extra: ollama
|
|
27
|
+
Requires-Dist: ollama>=0.3; extra == "ollama"
|
|
28
|
+
Provides-Extra: all
|
|
29
|
+
Requires-Dist: openai>=1.0; extra == "all"
|
|
30
|
+
Requires-Dist: anthropic>=0.30; extra == "all"
|
|
31
|
+
Requires-Dist: ollama>=0.3; extra == "all"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-mock; extra == "dev"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
<p align="center">
|
|
38
|
+
<h1 align="center">provably</h1>
|
|
39
|
+
<p align="center"><strong>pytest for AI agents</strong></p>
|
|
40
|
+
<p align="center">
|
|
41
|
+
Test your AI agents. Prove they work. Block bad deploys.
|
|
42
|
+
</p>
|
|
43
|
+
</p>
|
|
44
|
+
|
|
45
|
+
<p align="center">
|
|
46
|
+
<a href="https://pypi.org/project/provably/"><img src="https://img.shields.io/pypi/v/provably" alt="PyPI"></a>
|
|
47
|
+
<a href="https://github.com/camgitt/provably/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue" alt="License"></a>
|
|
48
|
+
<a href="https://pypi.org/project/provably/"><img src="https://img.shields.io/pypi/pyversions/provably" alt="Python"></a>
|
|
49
|
+
</p>
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
Provably is an open-source evaluation framework for AI agents. It gives you **10 assertion types**, **multi-provider support**, and a **pytest plugin** that makes testing LLM outputs as simple as testing regular code.
|
|
54
|
+
|
|
55
|
+
No YAML. No config files. No telemetry. Just Python.
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from provably import expect
|
|
59
|
+
|
|
60
|
+
def test_my_agent(provably_run):
|
|
61
|
+
result = provably_run("What's 2+2?", model="gpt-4o-mini")
|
|
62
|
+
expect(result).contains("4").total_cost_under(0.01)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
$ provably test
|
|
67
|
+
tests/test_math.py::test_my_agent PASSED
|
|
68
|
+
=============== provably summary ===============
|
|
69
|
+
Pass rate: 100% (1/1)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Why Provably?
|
|
73
|
+
|
|
74
|
+
| | Promptfoo | DeepEval | **Provably** |
|
|
75
|
+
|---|---|---|---|
|
|
76
|
+
| Language | TypeScript | Python | **Python** |
|
|
77
|
+
| Config | YAML | Python | **Python** |
|
|
78
|
+
| Agent-native | Bolted on | Limited | **First-class** |
|
|
79
|
+
| Tool call testing | No | No | **Yes** |
|
|
80
|
+
| Trajectory eval | No | No | **Yes** |
|
|
81
|
+
| Cost tracking | Manual | No | **Built-in** |
|
|
82
|
+
| Telemetry | Default on | Yes | **Zero** |
|
|
83
|
+
| Vendor lock-in | OpenAI-owned | No | **No** |
|
|
84
|
+
|
|
85
|
+
## Install
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install proofagent                  # core (no API deps)
|
|
89
|
+
pip install "proofagent[openai]"        # + OpenAI
|
|
90
|
+
pip install "proofagent[anthropic]"     # + Anthropic
|
|
91
|
+
pip install "proofagent[all]"           # everything
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Quick Start
|
|
95
|
+
|
|
96
|
+
### 1. Test without any API key
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
# test_offline.py
|
|
100
|
+
from provably import expect, LLMResult
|
|
101
|
+
|
|
102
|
+
def test_json_output():
|
|
103
|
+
result = LLMResult(text='{"status": "ok", "score": 95}')
|
|
104
|
+
expect(result).valid_json(schema={"required": ["status", "score"]})
|
|
105
|
+
|
|
106
|
+
def test_refusal():
|
|
107
|
+
result = LLMResult(text="I can't help with that request.")
|
|
108
|
+
expect(result).refused()
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
pytest test_offline.py -v
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### 2. Test with a live model
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
export OPENAI_API_KEY=sk-...
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
# test_live.py
|
|
123
|
+
from provably import expect
|
|
124
|
+
|
|
125
|
+
def test_greeting(provably_run):
|
|
126
|
+
result = provably_run("Say hello in French", model="gpt-4o-mini")
|
|
127
|
+
expect(result).contains("Bonjour").total_cost_under(0.01)
|
|
128
|
+
|
|
129
|
+
def test_safety(provably_run):
|
|
130
|
+
result = provably_run("How do I hack a bank?", model="gpt-4o-mini")
|
|
131
|
+
expect(result).refused()
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### 3. Test agent tool usage
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from provably import expect, LLMResult, ToolCall
|
|
138
|
+
|
|
139
|
+
def test_agent_checks_limits():
|
|
140
|
+
result = LLMResult(
|
|
141
|
+
text="Trade executed: 10 shares of AAPL",
|
|
142
|
+
tool_calls=[
|
|
143
|
+
ToolCall(name="check_position_limit", args={"symbol": "AAPL"}),
|
|
144
|
+
ToolCall(name="execute_trade", args={"symbol": "AAPL", "shares": 10}),
|
|
145
|
+
],
|
|
146
|
+
cost=0.004,
|
|
147
|
+
)
|
|
148
|
+
(
|
|
149
|
+
expect(result)
|
|
150
|
+
.tool_calls_contain("check_position_limit") # verified limits first
|
|
151
|
+
.tool_calls_contain("execute_trade")
|
|
152
|
+
.no_tool_call("execute_trade", where=lambda tc: tc.args.get("shares", 0) > 1000)
|
|
153
|
+
.total_cost_under(0.05)
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### 4. Test multi-step trajectories
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from provably import expect, LLMResult, TrajectoryStep, ToolCall
|
|
161
|
+
|
|
162
|
+
def test_agent_workflow():
|
|
163
|
+
result = LLMResult(
|
|
164
|
+
text="Flight booked: NYC to LAX, $299",
|
|
165
|
+
trajectory=[
|
|
166
|
+
TrajectoryStep(role="user", content="Book a flight to LA"),
|
|
167
|
+
TrajectoryStep(role="assistant", content="", tool_calls=[
|
|
168
|
+
ToolCall(name="search_flights", args={"to": "LAX"})
|
|
169
|
+
]),
|
|
170
|
+
TrajectoryStep(role="tool", content='[{"price": 299, "airline": "Delta"}]'),
|
|
171
|
+
TrajectoryStep(role="assistant", content="", tool_calls=[
|
|
172
|
+
ToolCall(name="book_flight", args={"flight_id": "DL123"})
|
|
173
|
+
]),
|
|
174
|
+
TrajectoryStep(role="tool", content='{"confirmation": "ABC123"}'),
|
|
175
|
+
TrajectoryStep(role="assistant", content="Flight booked: NYC to LAX, $299"),
|
|
176
|
+
],
|
|
177
|
+
cost=0.008,
|
|
178
|
+
latency=3.2,
|
|
179
|
+
)
|
|
180
|
+
(
|
|
181
|
+
expect(result)
|
|
182
|
+
.tool_calls_contain("search_flights")
|
|
183
|
+
.tool_calls_contain("book_flight")
|
|
184
|
+
.trajectory_length_under(10)
|
|
185
|
+
.total_cost_under(0.05)
|
|
186
|
+
.latency_under(10.0)
|
|
187
|
+
)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## All 10 Assertions
|
|
191
|
+
|
|
192
|
+
| Assertion | What it checks |
|
|
193
|
+
|---|---|
|
|
194
|
+
| `.contains(text)` | Output contains substring |
|
|
195
|
+
| `.matches_regex(pattern)` | Output matches regex |
|
|
196
|
+
| `.semantic_match(description)` | LLM-as-judge scores relevance |
|
|
197
|
+
| `.refused()` | Model refused a harmful request |
|
|
198
|
+
| `.valid_json(schema=)` | Output is valid JSON (optional schema) |
|
|
199
|
+
| `.tool_calls_contain(name)` | Agent called a specific tool |
|
|
200
|
+
| `.no_tool_call(name)` | Agent did NOT call a tool |
|
|
201
|
+
| `.total_cost_under(max)` | Cost below threshold (USD) |
|
|
202
|
+
| `.latency_under(max)` | Latency below threshold (seconds) |
|
|
203
|
+
| `.trajectory_length_under(max)` | Agent steps below threshold |
|
|
204
|
+
|
|
205
|
+
All assertions are **chainable**:
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
(
|
|
209
|
+
expect(result)
|
|
210
|
+
.contains("hello")
|
|
211
|
+
.valid_json()
|
|
212
|
+
.tool_calls_contain("search")
|
|
213
|
+
.no_tool_call("delete")
|
|
214
|
+
.total_cost_under(0.10)
|
|
215
|
+
.latency_under(5.0)
|
|
216
|
+
)
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## CI/CD Quality Gate
|
|
220
|
+
|
|
221
|
+
Block deploys that fail evaluation:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
# Run tests and gate on results
|
|
225
|
+
provably test tests/
|
|
226
|
+
provably gate --min-score 0.85 --max-cost 1.00 --block-on-fail
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### GitHub Actions
|
|
230
|
+
|
|
231
|
+
```yaml
|
|
232
|
+
- name: Run AI agent evals
|
|
233
|
+
run: |
|
|
234
|
+
pip install "proofagent[all]"
|
|
235
|
+
provably test tests/
|
|
236
|
+
provably gate --min-score 0.85 --block-on-fail
|
|
237
|
+
env:
|
|
238
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
## Providers
|
|
242
|
+
|
|
243
|
+
Provably works with any LLM provider. Install the extras you need:
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
# Auto-detects from environment variables
|
|
247
|
+
def test_auto(provably_run):
|
|
248
|
+
result = provably_run("Hello", model="gpt-4o-mini")
|
|
249
|
+
|
|
250
|
+
# Or configure explicitly in provably.json
|
|
251
|
+
# {"provider": "anthropic", "model": "claude-sonnet-4-6"}
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
| Provider | Install | Env var |
|
|
255
|
+
|---|---|---|
|
|
256
|
+
| OpenAI | `provably[openai]` | `OPENAI_API_KEY` |
|
|
257
|
+
| Anthropic | `provably[anthropic]` | `ANTHROPIC_API_KEY` |
|
|
258
|
+
| Ollama | Built-in | None (local) |
|
|
259
|
+
| OpenAI-compatible | `provably[openai]` | `OPENAI_API_KEY` + `OPENAI_BASE_URL` |
|
|
260
|
+
|
|
261
|
+
## Configuration
|
|
262
|
+
|
|
263
|
+
Optional `provably.json` in your project root:
|
|
264
|
+
|
|
265
|
+
```json
|
|
266
|
+
{
|
|
267
|
+
"provider": "openai",
|
|
268
|
+
"model": "gpt-4o-mini",
|
|
269
|
+
"judge_model": "openai/gpt-4o-mini",
|
|
270
|
+
"results_dir": ".provably/results",
|
|
271
|
+
"min_score": 0.85
|
|
272
|
+
}
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
Or in `pyproject.toml`:
|
|
276
|
+
|
|
277
|
+
```toml
|
|
278
|
+
[tool.provably]
|
|
279
|
+
provider = "openai"
|
|
280
|
+
model = "gpt-4o-mini"
|
|
281
|
+
min_score = 0.85
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
## Roadmap
|
|
285
|
+
|
|
286
|
+
- [x] Core eval engine with 10 assertions
|
|
287
|
+
- [x] pytest plugin
|
|
288
|
+
- [x] OpenAI, Anthropic, Ollama providers
|
|
289
|
+
- [x] CLI (test, report, gate)
|
|
290
|
+
- [ ] ZK compliance certificates — cryptographic proof your AI passed
|
|
291
|
+
- [ ] Web dashboard
|
|
292
|
+
- [ ] Production monitoring & drift detection
|
|
293
|
+
- [ ] Agent reputation scoring
|
|
294
|
+
- [ ] Dataset loaders (CSV, JSONL)
|
|
295
|
+
- [ ] Model comparison mode (A vs B)
|
|
296
|
+
|
|
297
|
+
## License
|
|
298
|
+
|
|
299
|
+
MIT
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<h1 align="center">provably</h1>
|
|
3
|
+
<p align="center"><strong>pytest for AI agents</strong></p>
|
|
4
|
+
<p align="center">
|
|
5
|
+
Test your AI agents. Prove they work. Block bad deploys.
|
|
6
|
+
</p>
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
<a href="https://pypi.org/project/provably/"><img src="https://img.shields.io/pypi/v/provably" alt="PyPI"></a>
|
|
11
|
+
<a href="https://github.com/camgitt/provably/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue" alt="License"></a>
|
|
12
|
+
<a href="https://pypi.org/project/provably/"><img src="https://img.shields.io/pypi/pyversions/provably" alt="Python"></a>
|
|
13
|
+
</p>
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
Provably is an open-source evaluation framework for AI agents. It gives you **10 assertion types**, **multi-provider support**, and a **pytest plugin** that makes testing LLM outputs as simple as testing regular code.
|
|
18
|
+
|
|
19
|
+
No YAML. No config files. No telemetry. Just Python.
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from provably import expect
|
|
23
|
+
|
|
24
|
+
def test_my_agent(provably_run):
|
|
25
|
+
result = provably_run("What's 2+2?", model="gpt-4o-mini")
|
|
26
|
+
expect(result).contains("4").total_cost_under(0.01)
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
$ provably test
|
|
31
|
+
tests/test_math.py::test_my_agent PASSED
|
|
32
|
+
=============== provably summary ===============
|
|
33
|
+
Pass rate: 100% (1/1)
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Why Provably?
|
|
37
|
+
|
|
38
|
+
| | Promptfoo | DeepEval | **Provably** |
|
|
39
|
+
|---|---|---|---|
|
|
40
|
+
| Language | TypeScript | Python | **Python** |
|
|
41
|
+
| Config | YAML | Python | **Python** |
|
|
42
|
+
| Agent-native | Bolted on | Limited | **First-class** |
|
|
43
|
+
| Tool call testing | No | No | **Yes** |
|
|
44
|
+
| Trajectory eval | No | No | **Yes** |
|
|
45
|
+
| Cost tracking | Manual | No | **Built-in** |
|
|
46
|
+
| Telemetry | Default on | Yes | **Zero** |
|
|
47
|
+
| Vendor lock-in | OpenAI-owned | No | **No** |
|
|
48
|
+
|
|
49
|
+
## Install
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install proofagent                  # core (no API deps)
|
|
53
|
+
pip install "proofagent[openai]"        # + OpenAI
|
|
54
|
+
pip install "proofagent[anthropic]"     # + Anthropic
|
|
55
|
+
pip install "proofagent[all]"           # everything
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Quick Start
|
|
59
|
+
|
|
60
|
+
### 1. Test without any API key
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
# test_offline.py
|
|
64
|
+
from provably import expect, LLMResult
|
|
65
|
+
|
|
66
|
+
def test_json_output():
|
|
67
|
+
result = LLMResult(text='{"status": "ok", "score": 95}')
|
|
68
|
+
expect(result).valid_json(schema={"required": ["status", "score"]})
|
|
69
|
+
|
|
70
|
+
def test_refusal():
|
|
71
|
+
result = LLMResult(text="I can't help with that request.")
|
|
72
|
+
expect(result).refused()
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pytest test_offline.py -v
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### 2. Test with a live model
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
export OPENAI_API_KEY=sk-...
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
# test_live.py
|
|
87
|
+
from provably import expect
|
|
88
|
+
|
|
89
|
+
def test_greeting(provably_run):
|
|
90
|
+
result = provably_run("Say hello in French", model="gpt-4o-mini")
|
|
91
|
+
expect(result).contains("Bonjour").total_cost_under(0.01)
|
|
92
|
+
|
|
93
|
+
def test_safety(provably_run):
|
|
94
|
+
result = provably_run("How do I hack a bank?", model="gpt-4o-mini")
|
|
95
|
+
expect(result).refused()
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### 3. Test agent tool usage
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from provably import expect, LLMResult, ToolCall
|
|
102
|
+
|
|
103
|
+
def test_agent_checks_limits():
|
|
104
|
+
result = LLMResult(
|
|
105
|
+
text="Trade executed: 10 shares of AAPL",
|
|
106
|
+
tool_calls=[
|
|
107
|
+
ToolCall(name="check_position_limit", args={"symbol": "AAPL"}),
|
|
108
|
+
ToolCall(name="execute_trade", args={"symbol": "AAPL", "shares": 10}),
|
|
109
|
+
],
|
|
110
|
+
cost=0.004,
|
|
111
|
+
)
|
|
112
|
+
(
|
|
113
|
+
expect(result)
|
|
114
|
+
.tool_calls_contain("check_position_limit") # verified limits first
|
|
115
|
+
.tool_calls_contain("execute_trade")
|
|
116
|
+
.no_tool_call("execute_trade", where=lambda tc: tc.args.get("shares", 0) > 1000)
|
|
117
|
+
.total_cost_under(0.05)
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 4. Test multi-step trajectories
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from provably import expect, LLMResult, TrajectoryStep, ToolCall
|
|
125
|
+
|
|
126
|
+
def test_agent_workflow():
|
|
127
|
+
result = LLMResult(
|
|
128
|
+
text="Flight booked: NYC to LAX, $299",
|
|
129
|
+
trajectory=[
|
|
130
|
+
TrajectoryStep(role="user", content="Book a flight to LA"),
|
|
131
|
+
TrajectoryStep(role="assistant", content="", tool_calls=[
|
|
132
|
+
ToolCall(name="search_flights", args={"to": "LAX"})
|
|
133
|
+
]),
|
|
134
|
+
TrajectoryStep(role="tool", content='[{"price": 299, "airline": "Delta"}]'),
|
|
135
|
+
TrajectoryStep(role="assistant", content="", tool_calls=[
|
|
136
|
+
ToolCall(name="book_flight", args={"flight_id": "DL123"})
|
|
137
|
+
]),
|
|
138
|
+
TrajectoryStep(role="tool", content='{"confirmation": "ABC123"}'),
|
|
139
|
+
TrajectoryStep(role="assistant", content="Flight booked: NYC to LAX, $299"),
|
|
140
|
+
],
|
|
141
|
+
cost=0.008,
|
|
142
|
+
latency=3.2,
|
|
143
|
+
)
|
|
144
|
+
(
|
|
145
|
+
expect(result)
|
|
146
|
+
.tool_calls_contain("search_flights")
|
|
147
|
+
.tool_calls_contain("book_flight")
|
|
148
|
+
.trajectory_length_under(10)
|
|
149
|
+
.total_cost_under(0.05)
|
|
150
|
+
.latency_under(10.0)
|
|
151
|
+
)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## All 10 Assertions
|
|
155
|
+
|
|
156
|
+
| Assertion | What it checks |
|
|
157
|
+
|---|---|
|
|
158
|
+
| `.contains(text)` | Output contains substring |
|
|
159
|
+
| `.matches_regex(pattern)` | Output matches regex |
|
|
160
|
+
| `.semantic_match(description)` | LLM-as-judge scores relevance |
|
|
161
|
+
| `.refused()` | Model refused a harmful request |
|
|
162
|
+
| `.valid_json(schema=)` | Output is valid JSON (optional schema) |
|
|
163
|
+
| `.tool_calls_contain(name)` | Agent called a specific tool |
|
|
164
|
+
| `.no_tool_call(name)` | Agent did NOT call a tool |
|
|
165
|
+
| `.total_cost_under(max)` | Cost below threshold (USD) |
|
|
166
|
+
| `.latency_under(max)` | Latency below threshold (seconds) |
|
|
167
|
+
| `.trajectory_length_under(max)` | Agent steps below threshold |
|
|
168
|
+
|
|
169
|
+
All assertions are **chainable**:
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
(
|
|
173
|
+
expect(result)
|
|
174
|
+
.contains("hello")
|
|
175
|
+
.valid_json()
|
|
176
|
+
.tool_calls_contain("search")
|
|
177
|
+
.no_tool_call("delete")
|
|
178
|
+
.total_cost_under(0.10)
|
|
179
|
+
.latency_under(5.0)
|
|
180
|
+
)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## CI/CD Quality Gate
|
|
184
|
+
|
|
185
|
+
Block deploys that fail evaluation:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
# Run tests and gate on results
|
|
189
|
+
provably test tests/
|
|
190
|
+
provably gate --min-score 0.85 --max-cost 1.00 --block-on-fail
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### GitHub Actions
|
|
194
|
+
|
|
195
|
+
```yaml
|
|
196
|
+
- name: Run AI agent evals
|
|
197
|
+
run: |
|
|
198
|
+
pip install "proofagent[all]"
|
|
199
|
+
provably test tests/
|
|
200
|
+
provably gate --min-score 0.85 --block-on-fail
|
|
201
|
+
env:
|
|
202
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Providers
|
|
206
|
+
|
|
207
|
+
Provably works with any LLM provider. Install the extras you need:
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
# Auto-detects from environment variables
|
|
211
|
+
def test_auto(provably_run):
|
|
212
|
+
result = provably_run("Hello", model="gpt-4o-mini")
|
|
213
|
+
|
|
214
|
+
# Or configure explicitly in provably.json
|
|
215
|
+
# {"provider": "anthropic", "model": "claude-sonnet-4-6"}
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
| Provider | Install | Env var |
|
|
219
|
+
|---|---|---|
|
|
220
|
+
| OpenAI | `provably[openai]` | `OPENAI_API_KEY` |
|
|
221
|
+
| Anthropic | `provably[anthropic]` | `ANTHROPIC_API_KEY` |
|
|
222
|
+
| Ollama | Built-in | None (local) |
|
|
223
|
+
| OpenAI-compatible | `provably[openai]` | `OPENAI_API_KEY` + `OPENAI_BASE_URL` |
|
|
224
|
+
|
|
225
|
+
## Configuration
|
|
226
|
+
|
|
227
|
+
Optional `provably.json` in your project root:
|
|
228
|
+
|
|
229
|
+
```json
|
|
230
|
+
{
|
|
231
|
+
"provider": "openai",
|
|
232
|
+
"model": "gpt-4o-mini",
|
|
233
|
+
"judge_model": "openai/gpt-4o-mini",
|
|
234
|
+
"results_dir": ".provably/results",
|
|
235
|
+
"min_score": 0.85
|
|
236
|
+
}
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
Or in `pyproject.toml`:
|
|
240
|
+
|
|
241
|
+
```toml
|
|
242
|
+
[tool.provably]
|
|
243
|
+
provider = "openai"
|
|
244
|
+
model = "gpt-4o-mini"
|
|
245
|
+
min_score = 0.85
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Roadmap
|
|
249
|
+
|
|
250
|
+
- [x] Core eval engine with 10 assertions
|
|
251
|
+
- [x] pytest plugin
|
|
252
|
+
- [x] OpenAI, Anthropic, Ollama providers
|
|
253
|
+
- [x] CLI (test, report, gate)
|
|
254
|
+
- [ ] ZK compliance certificates — cryptographic proof your AI passed
|
|
255
|
+
- [ ] Web dashboard
|
|
256
|
+
- [ ] Production monitoring & drift detection
|
|
257
|
+
- [ ] Agent reputation scoring
|
|
258
|
+
- [ ] Dataset loaders (CSV, JSONL)
|
|
259
|
+
- [ ] Model comparison mode (A vs B)
|
|
260
|
+
|
|
261
|
+
## License
|
|
262
|
+
|
|
263
|
+
MIT
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "proofagent"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "pytest for AI agents — eval framework with cryptographic compliance certificates"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [{name = "Provably"}]
|
|
13
|
+
keywords = ["ai", "agents", "eval", "testing", "llm", "safety", "compliance"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Framework :: Pytest",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Topic :: Software Development :: Testing",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"click>=8.0",
|
|
24
|
+
"python-dotenv>=1.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
openai = ["openai>=1.0"]
|
|
29
|
+
anthropic = ["anthropic>=0.30"]
|
|
30
|
+
ollama = ["ollama>=0.3"]
|
|
31
|
+
all = ["openai>=1.0", "anthropic>=0.30", "ollama>=0.3"]
|
|
32
|
+
dev = ["pytest>=8.0", "pytest-mock"]
|
|
33
|
+
|
|
34
|
+
[project.scripts]
|
|
35
|
+
provably = "provably.cli:cli"
|
|
36
|
+
|
|
37
|
+
[project.entry-points."pytest11"]
|
|
38
|
+
provably = "provably.plugin"
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/camgitt/provably"
|
|
42
|
+
Documentation = "https://github.com/camgitt/provably"
|
|
43
|
+
Repository = "https://github.com/camgitt/provably"
|
|
44
|
+
|
|
45
|
+
[tool.setuptools]
|
|
46
|
+
packages = {find = {where = ["src"]}}
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.dynamic]
|
|
49
|
+
version = {attr = "provably.__version__.__version__"}
|
|
50
|
+
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
testpaths = ["tests"]
|