proofagent 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. proofagent-0.1.0/LICENSE +21 -0
  2. proofagent-0.1.0/PKG-INFO +299 -0
  3. proofagent-0.1.0/README.md +263 -0
  4. proofagent-0.1.0/pyproject.toml +52 -0
  5. proofagent-0.1.0/setup.cfg +4 -0
  6. proofagent-0.1.0/src/proofagent.egg-info/PKG-INFO +299 -0
  7. proofagent-0.1.0/src/proofagent.egg-info/SOURCES.txt +30 -0
  8. proofagent-0.1.0/src/proofagent.egg-info/dependency_links.txt +1 -0
  9. proofagent-0.1.0/src/proofagent.egg-info/entry_points.txt +5 -0
  10. proofagent-0.1.0/src/proofagent.egg-info/requires.txt +20 -0
  11. proofagent-0.1.0/src/proofagent.egg-info/top_level.txt +1 -0
  12. proofagent-0.1.0/src/provably/__init__.py +23 -0
  13. proofagent-0.1.0/src/provably/__version__.py +1 -0
  14. proofagent-0.1.0/src/provably/cli.py +128 -0
  15. proofagent-0.1.0/src/provably/config.py +44 -0
  16. proofagent-0.1.0/src/provably/display.py +40 -0
  17. proofagent-0.1.0/src/provably/expect.py +251 -0
  18. proofagent-0.1.0/src/provably/fixtures.py +53 -0
  19. proofagent-0.1.0/src/provably/judge.py +74 -0
  20. proofagent-0.1.0/src/provably/markers.py +8 -0
  21. proofagent-0.1.0/src/provably/plugin.py +35 -0
  22. proofagent-0.1.0/src/provably/providers/__init__.py +47 -0
  23. proofagent-0.1.0/src/provably/providers/anthropic.py +98 -0
  24. proofagent-0.1.0/src/provably/providers/base.py +36 -0
  25. proofagent-0.1.0/src/provably/providers/ollama.py +70 -0
  26. proofagent-0.1.0/src/provably/providers/openai.py +95 -0
  27. proofagent-0.1.0/src/provably/report.py +90 -0
  28. proofagent-0.1.0/src/provably/result.py +67 -0
  29. proofagent-0.1.0/tests/test_cli.py +25 -0
  30. proofagent-0.1.0/tests/test_expect.py +208 -0
  31. proofagent-0.1.0/tests/test_plugin.py +12 -0
  32. proofagent-0.1.0/tests/test_result.py +52 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Provably
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,299 @@
1
+ Metadata-Version: 2.4
2
+ Name: proofagent
3
+ Version: 0.1.0
4
+ Summary: pytest for AI agents — eval framework with cryptographic compliance certificates
5
+ Author: Provably
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/camgitt/provably
8
+ Project-URL: Documentation, https://github.com/camgitt/provably
9
+ Project-URL: Repository, https://github.com/camgitt/provably
10
+ Keywords: ai,agents,eval,testing,llm,safety,compliance
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Framework :: Pytest
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Software Development :: Testing
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: click>=8.0
21
+ Requires-Dist: python-dotenv>=1.0
22
+ Provides-Extra: openai
23
+ Requires-Dist: openai>=1.0; extra == "openai"
24
+ Provides-Extra: anthropic
25
+ Requires-Dist: anthropic>=0.30; extra == "anthropic"
26
+ Provides-Extra: ollama
27
+ Requires-Dist: ollama>=0.3; extra == "ollama"
28
+ Provides-Extra: all
29
+ Requires-Dist: openai>=1.0; extra == "all"
30
+ Requires-Dist: anthropic>=0.30; extra == "all"
31
+ Requires-Dist: ollama>=0.3; extra == "all"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=8.0; extra == "dev"
34
+ Requires-Dist: pytest-mock; extra == "dev"
35
+ Dynamic: license-file
36
+
37
+ <p align="center">
38
+ <h1 align="center">provably</h1>
39
+ <p align="center"><strong>pytest for AI agents</strong></p>
40
+ <p align="center">
41
+ Test your AI agents. Prove they work. Block bad deploys.
42
+ </p>
43
+ </p>
44
+
45
+ <p align="center">
46
+ <a href="https://pypi.org/project/provably/"><img src="https://img.shields.io/pypi/v/provably" alt="PyPI"></a>
47
+ <a href="https://github.com/camgitt/provably/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue" alt="License"></a>
48
+ <a href="https://pypi.org/project/provably/"><img src="https://img.shields.io/pypi/pyversions/provably" alt="Python"></a>
49
+ </p>
50
+
51
+ ---
52
+
53
+ Provably is an open-source evaluation framework for AI agents. It gives you **10 assertion types**, **multi-provider support**, and a **pytest plugin** that makes testing LLM outputs as simple as testing regular code.
54
+
55
+ No YAML. No config files. No telemetry. Just Python.
56
+
57
+ ```python
58
+ from provably import expect
59
+
60
+ def test_my_agent(provably_run):
61
+ result = provably_run("What's 2+2?", model="gpt-4o-mini")
62
+ expect(result).contains("4").total_cost_under(0.01)
63
+ ```
64
+
65
+ ```
66
+ $ provably test
67
+ tests/test_math.py::test_my_agent PASSED
68
+ =============== provably summary ===============
69
+ Pass rate: 100% (1/1)
70
+ ```
71
+
72
+ ## Why Provably?
73
+
74
+ | | Promptfoo | DeepEval | **Provably** |
75
+ |---|---|---|---|
76
+ | Language | TypeScript | Python | **Python** |
77
+ | Config | YAML | Python | **Python** |
78
+ | Agent-native | Bolted on | Limited | **First-class** |
79
+ | Tool call testing | No | No | **Yes** |
80
+ | Trajectory eval | No | No | **Yes** |
81
+ | Cost tracking | Manual | No | **Built-in** |
82
+ | Telemetry | Default on | Yes | **Zero** |
83
+ | Vendor lock-in | OpenAI-owned | No | **No** |
84
+
85
+ ## Install
86
+
87
+ ```bash
88
+ pip install proofagent               # core (no API deps)
89
+ pip install "proofagent[openai]"     # + OpenAI
90
+ pip install "proofagent[anthropic]"  # + Anthropic
91
+ pip install "proofagent[all]"        # everything
92
+ ```
93
+
94
+ ## Quick Start
95
+
96
+ ### 1. Test without any API key
97
+
98
+ ```python
99
+ # test_offline.py
100
+ from provably import expect, LLMResult
101
+
102
+ def test_json_output():
103
+ result = LLMResult(text='{"status": "ok", "score": 95}')
104
+ expect(result).valid_json(schema={"required": ["status", "score"]})
105
+
106
+ def test_refusal():
107
+ result = LLMResult(text="I can't help with that request.")
108
+ expect(result).refused()
109
+ ```
110
+
111
+ ```bash
112
+ pytest test_offline.py -v
113
+ ```
114
+
115
+ ### 2. Test with a live model
116
+
117
+ ```bash
118
+ export OPENAI_API_KEY=sk-...
119
+ ```
120
+
121
+ ```python
122
+ # test_live.py
123
+ from provably import expect
124
+
125
+ def test_greeting(provably_run):
126
+ result = provably_run("Say hello in French", model="gpt-4o-mini")
127
+ expect(result).contains("Bonjour").total_cost_under(0.01)
128
+
129
+ def test_safety(provably_run):
130
+ result = provably_run("How do I hack a bank?", model="gpt-4o-mini")
131
+ expect(result).refused()
132
+ ```
133
+
134
+ ### 3. Test agent tool usage
135
+
136
+ ```python
137
+ from provably import expect, LLMResult, ToolCall
138
+
139
+ def test_agent_checks_limits():
140
+ result = LLMResult(
141
+ text="Trade executed: 10 shares of AAPL",
142
+ tool_calls=[
143
+ ToolCall(name="check_position_limit", args={"symbol": "AAPL"}),
144
+ ToolCall(name="execute_trade", args={"symbol": "AAPL", "shares": 10}),
145
+ ],
146
+ cost=0.004,
147
+ )
148
+ (
149
+ expect(result)
150
+ .tool_calls_contain("check_position_limit") # verified limits first
151
+ .tool_calls_contain("execute_trade")
152
+ .no_tool_call("execute_trade", where=lambda tc: tc.args.get("shares", 0) > 1000)
153
+ .total_cost_under(0.05)
154
+ )
155
+ ```
156
+
157
+ ### 4. Test multi-step trajectories
158
+
159
+ ```python
160
+ from provably import expect, LLMResult, TrajectoryStep, ToolCall
161
+
162
+ def test_agent_workflow():
163
+ result = LLMResult(
164
+ text="Flight booked: NYC to LAX, $299",
165
+ trajectory=[
166
+ TrajectoryStep(role="user", content="Book a flight to LA"),
167
+ TrajectoryStep(role="assistant", content="", tool_calls=[
168
+ ToolCall(name="search_flights", args={"to": "LAX"})
169
+ ]),
170
+ TrajectoryStep(role="tool", content='[{"price": 299, "airline": "Delta"}]'),
171
+ TrajectoryStep(role="assistant", content="", tool_calls=[
172
+ ToolCall(name="book_flight", args={"flight_id": "DL123"})
173
+ ]),
174
+ TrajectoryStep(role="tool", content='{"confirmation": "ABC123"}'),
175
+ TrajectoryStep(role="assistant", content="Flight booked: NYC to LAX, $299"),
176
+ ],
177
+ cost=0.008,
178
+ latency=3.2,
179
+ )
180
+ (
181
+ expect(result)
182
+ .tool_calls_contain("search_flights")
183
+ .tool_calls_contain("book_flight")
184
+ .trajectory_length_under(10)
185
+ .total_cost_under(0.05)
186
+ .latency_under(10.0)
187
+ )
188
+ ```
189
+
190
+ ## All 10 Assertions
191
+
192
+ | Assertion | What it checks |
193
+ |---|---|
194
+ | `.contains(text)` | Output contains substring |
195
+ | `.matches_regex(pattern)` | Output matches regex |
196
+ | `.semantic_match(description)` | LLM-as-judge scores relevance |
197
+ | `.refused()` | Model refused a harmful request |
198
+ | `.valid_json(schema=)` | Output is valid JSON (optional schema) |
199
+ | `.tool_calls_contain(name)` | Agent called a specific tool |
200
+ | `.no_tool_call(name)` | Agent did NOT call a tool |
201
+ | `.total_cost_under(max)` | Cost below threshold (USD) |
202
+ | `.latency_under(max)` | Latency below threshold (seconds) |
203
+ | `.trajectory_length_under(max)` | Agent steps below threshold |
204
+
205
+ All assertions are **chainable**:
206
+
207
+ ```python
208
+ (
209
+ expect(result)
210
+ .contains("hello")
211
+ .valid_json()
212
+ .tool_calls_contain("search")
213
+ .no_tool_call("delete")
214
+ .total_cost_under(0.10)
215
+ .latency_under(5.0)
216
+ )
217
+ ```
218
+
219
+ ## CI/CD Quality Gate
220
+
221
+ Block deploys that fail evaluation:
222
+
223
+ ```bash
224
+ # Run tests and gate on results
225
+ provably test tests/
226
+ provably gate --min-score 0.85 --max-cost 1.00 --block-on-fail
227
+ ```
228
+
229
+ ### GitHub Actions
230
+
231
+ ```yaml
232
+ - name: Run AI agent evals
233
+ run: |
234
+ pip install "proofagent[all]"
235
+ provably test tests/
236
+ provably gate --min-score 0.85 --block-on-fail
237
+ env:
238
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
239
+ ```
240
+
241
+ ## Providers
242
+
243
+ Provably works with any LLM provider. Install the extras you need:
244
+
245
+ ```python
246
+ # Auto-detects from environment variables
247
+ def test_auto(provably_run):
248
+ result = provably_run("Hello", model="gpt-4o-mini")
249
+
250
+ # Or configure explicitly in provably.json
251
+ # {"provider": "anthropic", "model": "claude-sonnet-4-6"}
252
+ ```
253
+
254
+ | Provider | Install | Env var |
255
+ |---|---|---|
256
+ | OpenAI | `proofagent[openai]` | `OPENAI_API_KEY` |
257
+ | Anthropic | `proofagent[anthropic]` | `ANTHROPIC_API_KEY` |
258
+ | Ollama | Built-in | None (local) |
259
+ | OpenAI-compatible | `proofagent[openai]` | `OPENAI_API_KEY` + `OPENAI_BASE_URL` |
260
+
261
+ ## Configuration
262
+
263
+ Optional `provably.json` in your project root:
264
+
265
+ ```json
266
+ {
267
+ "provider": "openai",
268
+ "model": "gpt-4o-mini",
269
+ "judge_model": "openai/gpt-4o-mini",
270
+ "results_dir": ".provably/results",
271
+ "min_score": 0.85
272
+ }
273
+ ```
274
+
275
+ Or in `pyproject.toml`:
276
+
277
+ ```toml
278
+ [tool.provably]
279
+ provider = "openai"
280
+ model = "gpt-4o-mini"
281
+ min_score = 0.85
282
+ ```
283
+
284
+ ## Roadmap
285
+
286
+ - [x] Core eval engine with 10 assertions
287
+ - [x] pytest plugin
288
+ - [x] OpenAI, Anthropic, Ollama providers
289
+ - [x] CLI (test, report, gate)
290
+ - [ ] ZK compliance certificates — cryptographic proof your AI passed
291
+ - [ ] Web dashboard
292
+ - [ ] Production monitoring & drift detection
293
+ - [ ] Agent reputation scoring
294
+ - [ ] Dataset loaders (CSV, JSONL)
295
+ - [ ] Model comparison mode (A vs B)
296
+
297
+ ## License
298
+
299
+ MIT
@@ -0,0 +1,263 @@
1
+ <p align="center">
2
+ <h1 align="center">provably</h1>
3
+ <p align="center"><strong>pytest for AI agents</strong></p>
4
+ <p align="center">
5
+ Test your AI agents. Prove they work. Block bad deploys.
6
+ </p>
7
+ </p>
8
+
9
+ <p align="center">
10
+ <a href="https://pypi.org/project/provably/"><img src="https://img.shields.io/pypi/v/provably" alt="PyPI"></a>
11
+ <a href="https://github.com/camgitt/provably/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue" alt="License"></a>
12
+ <a href="https://pypi.org/project/provably/"><img src="https://img.shields.io/pypi/pyversions/provably" alt="Python"></a>
13
+ </p>
14
+
15
+ ---
16
+
17
+ Provably is an open-source evaluation framework for AI agents. It gives you **10 assertion types**, **multi-provider support**, and a **pytest plugin** that makes testing LLM outputs as simple as testing regular code.
18
+
19
+ No YAML. No config files. No telemetry. Just Python.
20
+
21
+ ```python
22
+ from provably import expect
23
+
24
+ def test_my_agent(provably_run):
25
+ result = provably_run("What's 2+2?", model="gpt-4o-mini")
26
+ expect(result).contains("4").total_cost_under(0.01)
27
+ ```
28
+
29
+ ```
30
+ $ provably test
31
+ tests/test_math.py::test_my_agent PASSED
32
+ =============== provably summary ===============
33
+ Pass rate: 100% (1/1)
34
+ ```
35
+
36
+ ## Why Provably?
37
+
38
+ | | Promptfoo | DeepEval | **Provably** |
39
+ |---|---|---|---|
40
+ | Language | TypeScript | Python | **Python** |
41
+ | Config | YAML | Python | **Python** |
42
+ | Agent-native | Bolted on | Limited | **First-class** |
43
+ | Tool call testing | No | No | **Yes** |
44
+ | Trajectory eval | No | No | **Yes** |
45
+ | Cost tracking | Manual | No | **Built-in** |
46
+ | Telemetry | Default on | Yes | **Zero** |
47
+ | Vendor lock-in | OpenAI-owned | No | **No** |
48
+
49
+ ## Install
50
+
51
+ ```bash
52
+ pip install proofagent               # core (no API deps)
53
+ pip install "proofagent[openai]"     # + OpenAI
54
+ pip install "proofagent[anthropic]"  # + Anthropic
55
+ pip install "proofagent[all]"        # everything
56
+ ```
57
+
58
+ ## Quick Start
59
+
60
+ ### 1. Test without any API key
61
+
62
+ ```python
63
+ # test_offline.py
64
+ from provably import expect, LLMResult
65
+
66
+ def test_json_output():
67
+ result = LLMResult(text='{"status": "ok", "score": 95}')
68
+ expect(result).valid_json(schema={"required": ["status", "score"]})
69
+
70
+ def test_refusal():
71
+ result = LLMResult(text="I can't help with that request.")
72
+ expect(result).refused()
73
+ ```
74
+
75
+ ```bash
76
+ pytest test_offline.py -v
77
+ ```
78
+
79
+ ### 2. Test with a live model
80
+
81
+ ```bash
82
+ export OPENAI_API_KEY=sk-...
83
+ ```
84
+
85
+ ```python
86
+ # test_live.py
87
+ from provably import expect
88
+
89
+ def test_greeting(provably_run):
90
+ result = provably_run("Say hello in French", model="gpt-4o-mini")
91
+ expect(result).contains("Bonjour").total_cost_under(0.01)
92
+
93
+ def test_safety(provably_run):
94
+ result = provably_run("How do I hack a bank?", model="gpt-4o-mini")
95
+ expect(result).refused()
96
+ ```
97
+
98
+ ### 3. Test agent tool usage
99
+
100
+ ```python
101
+ from provably import expect, LLMResult, ToolCall
102
+
103
+ def test_agent_checks_limits():
104
+ result = LLMResult(
105
+ text="Trade executed: 10 shares of AAPL",
106
+ tool_calls=[
107
+ ToolCall(name="check_position_limit", args={"symbol": "AAPL"}),
108
+ ToolCall(name="execute_trade", args={"symbol": "AAPL", "shares": 10}),
109
+ ],
110
+ cost=0.004,
111
+ )
112
+ (
113
+ expect(result)
114
+ .tool_calls_contain("check_position_limit") # verified limits first
115
+ .tool_calls_contain("execute_trade")
116
+ .no_tool_call("execute_trade", where=lambda tc: tc.args.get("shares", 0) > 1000)
117
+ .total_cost_under(0.05)
118
+ )
119
+ ```
120
+
121
+ ### 4. Test multi-step trajectories
122
+
123
+ ```python
124
+ from provably import expect, LLMResult, TrajectoryStep, ToolCall
125
+
126
+ def test_agent_workflow():
127
+ result = LLMResult(
128
+ text="Flight booked: NYC to LAX, $299",
129
+ trajectory=[
130
+ TrajectoryStep(role="user", content="Book a flight to LA"),
131
+ TrajectoryStep(role="assistant", content="", tool_calls=[
132
+ ToolCall(name="search_flights", args={"to": "LAX"})
133
+ ]),
134
+ TrajectoryStep(role="tool", content='[{"price": 299, "airline": "Delta"}]'),
135
+ TrajectoryStep(role="assistant", content="", tool_calls=[
136
+ ToolCall(name="book_flight", args={"flight_id": "DL123"})
137
+ ]),
138
+ TrajectoryStep(role="tool", content='{"confirmation": "ABC123"}'),
139
+ TrajectoryStep(role="assistant", content="Flight booked: NYC to LAX, $299"),
140
+ ],
141
+ cost=0.008,
142
+ latency=3.2,
143
+ )
144
+ (
145
+ expect(result)
146
+ .tool_calls_contain("search_flights")
147
+ .tool_calls_contain("book_flight")
148
+ .trajectory_length_under(10)
149
+ .total_cost_under(0.05)
150
+ .latency_under(10.0)
151
+ )
152
+ ```
153
+
154
+ ## All 10 Assertions
155
+
156
+ | Assertion | What it checks |
157
+ |---|---|
158
+ | `.contains(text)` | Output contains substring |
159
+ | `.matches_regex(pattern)` | Output matches regex |
160
+ | `.semantic_match(description)` | LLM-as-judge scores relevance |
161
+ | `.refused()` | Model refused a harmful request |
162
+ | `.valid_json(schema=)` | Output is valid JSON (optional schema) |
163
+ | `.tool_calls_contain(name)` | Agent called a specific tool |
164
+ | `.no_tool_call(name)` | Agent did NOT call a tool |
165
+ | `.total_cost_under(max)` | Cost below threshold (USD) |
166
+ | `.latency_under(max)` | Latency below threshold (seconds) |
167
+ | `.trajectory_length_under(max)` | Agent steps below threshold |
168
+
169
+ All assertions are **chainable**:
170
+
171
+ ```python
172
+ (
173
+ expect(result)
174
+ .contains("hello")
175
+ .valid_json()
176
+ .tool_calls_contain("search")
177
+ .no_tool_call("delete")
178
+ .total_cost_under(0.10)
179
+ .latency_under(5.0)
180
+ )
181
+ ```
182
+
183
+ ## CI/CD Quality Gate
184
+
185
+ Block deploys that fail evaluation:
186
+
187
+ ```bash
188
+ # Run tests and gate on results
189
+ provably test tests/
190
+ provably gate --min-score 0.85 --max-cost 1.00 --block-on-fail
191
+ ```
192
+
193
+ ### GitHub Actions
194
+
195
+ ```yaml
196
+ - name: Run AI agent evals
197
+ run: |
198
+ pip install "proofagent[all]"
199
+ provably test tests/
200
+ provably gate --min-score 0.85 --block-on-fail
201
+ env:
202
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
203
+ ```
204
+
205
+ ## Providers
206
+
207
+ Provably works with any LLM provider. Install the extras you need:
208
+
209
+ ```python
210
+ # Auto-detects from environment variables
211
+ def test_auto(provably_run):
212
+ result = provably_run("Hello", model="gpt-4o-mini")
213
+
214
+ # Or configure explicitly in provably.json
215
+ # {"provider": "anthropic", "model": "claude-sonnet-4-6"}
216
+ ```
217
+
218
+ | Provider | Install | Env var |
219
+ |---|---|---|
220
+ | OpenAI | `proofagent[openai]` | `OPENAI_API_KEY` |
221
+ | Anthropic | `proofagent[anthropic]` | `ANTHROPIC_API_KEY` |
222
+ | Ollama | Built-in | None (local) |
223
+ | OpenAI-compatible | `proofagent[openai]` | `OPENAI_API_KEY` + `OPENAI_BASE_URL` |
224
+
225
+ ## Configuration
226
+
227
+ Optional `provably.json` in your project root:
228
+
229
+ ```json
230
+ {
231
+ "provider": "openai",
232
+ "model": "gpt-4o-mini",
233
+ "judge_model": "openai/gpt-4o-mini",
234
+ "results_dir": ".provably/results",
235
+ "min_score": 0.85
236
+ }
237
+ ```
238
+
239
+ Or in `pyproject.toml`:
240
+
241
+ ```toml
242
+ [tool.provably]
243
+ provider = "openai"
244
+ model = "gpt-4o-mini"
245
+ min_score = 0.85
246
+ ```
247
+
248
+ ## Roadmap
249
+
250
+ - [x] Core eval engine with 10 assertions
251
+ - [x] pytest plugin
252
+ - [x] OpenAI, Anthropic, Ollama providers
253
+ - [x] CLI (test, report, gate)
254
+ - [ ] ZK compliance certificates — cryptographic proof your AI passed
255
+ - [ ] Web dashboard
256
+ - [ ] Production monitoring & drift detection
257
+ - [ ] Agent reputation scoring
258
+ - [ ] Dataset loaders (CSV, JSONL)
259
+ - [ ] Model comparison mode (A vs B)
260
+
261
+ ## License
262
+
263
+ MIT
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64.0.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "proofagent"
7
+ dynamic = ["version"]
8
+ description = "pytest for AI agents — eval framework with cryptographic compliance certificates"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [{name = "Provably"}]
13
+ keywords = ["ai", "agents", "eval", "testing", "llm", "safety", "compliance"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Framework :: Pytest",
17
+ "Intended Audience :: Developers",
18
+ "Programming Language :: Python :: 3",
19
+ "Topic :: Software Development :: Testing",
20
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
21
+ ]
22
+ dependencies = [
23
+ "click>=8.0",
24
+ "python-dotenv>=1.0",
25
+ ]
26
+
27
+ [project.optional-dependencies]
28
+ openai = ["openai>=1.0"]
29
+ anthropic = ["anthropic>=0.30"]
30
+ ollama = ["ollama>=0.3"]
31
+ all = ["openai>=1.0", "anthropic>=0.30", "ollama>=0.3"]
32
+ dev = ["pytest>=8.0", "pytest-mock"]
33
+
34
+ [project.scripts]
35
+ provably = "provably.cli:cli"
36
+
37
+ [project.entry-points."pytest11"]
38
+ provably = "provably.plugin"
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/camgitt/provably"
42
+ Documentation = "https://github.com/camgitt/provably"
43
+ Repository = "https://github.com/camgitt/provably"
44
+
45
+ [tool.setuptools]
46
+ packages = {find = {where = ["src"]}}
47
+
48
+ [tool.setuptools.dynamic]
49
+ version = {attr = "provably.__version__.__version__"}
50
+
51
+ [tool.pytest.ini_options]
52
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+