agentsproof 1.0.2__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentsproof-1.0.2 → agentsproof-1.0.4}/PKG-INFO +62 -3
- {agentsproof-1.0.2 → agentsproof-1.0.4}/README.md +61 -2
- {agentsproof-1.0.2 → agentsproof-1.0.4}/agentsproof/run.py +78 -5
- {agentsproof-1.0.2 → agentsproof-1.0.4}/pyproject.toml +1 -1
- {agentsproof-1.0.2 → agentsproof-1.0.4}/.gitignore +0 -0
- {agentsproof-1.0.2 → agentsproof-1.0.4}/agentsproof/__init__.py +0 -0
- {agentsproof-1.0.2 → agentsproof-1.0.4}/agentsproof/client.py +0 -0
- {agentsproof-1.0.2 → agentsproof-1.0.4}/agentsproof/proof_suite.py +0 -0
- {agentsproof-1.0.2 → agentsproof-1.0.4}/agentsproof/tracer.py +0 -0
- {agentsproof-1.0.2 → agentsproof-1.0.4}/agentsproof/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agentsproof
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: Observability and proof reporting for AI agents
|
|
5
5
|
Project-URL: Homepage, https://agentsproof.dev
|
|
6
6
|
License: MIT
|
|
@@ -129,12 +129,31 @@ Create a client. Get your API key from [agentsproof.dev](https://agentsproof.dev
|
|
|
129
129
|
| `expected_output` | `Any` | no | Expected output for grading comparison |
|
|
130
130
|
| `metadata` | `dict` | no | Optional key/value metadata |
|
|
131
131
|
|
|
132
|
-
### `run.trace(type, name, fn, input?)` → `T`
|
|
132
|
+
### `run.trace(type, name, fn, input?, extract?)` → `T`
|
|
133
133
|
Wrap a **sync** callable and auto-log it as a step with latency captured.
|
|
134
134
|
|
|
135
|
-
### `run.atrace(type, name, fn, input?)` → `Awaitable[T]`
|
|
135
|
+
### `run.atrace(type, name, fn, input?, extract?)` → `Awaitable[T]`
|
|
136
136
|
Wrap a **sync or async** callable. Use in `async` agent code.
|
|
137
137
|
|
|
138
|
+
Token count and cost are captured automatically in priority order:
|
|
139
|
+
|
|
140
|
+
1. **`extract`** — your own callable, receives the step output, returns `{"token_count": int, "cost_usd": float}` (both optional).
|
|
141
|
+
2. **Auto-detection** — if no extractor is given, the SDK sniffs `output.usage` (or `output["usage"]`) for Anthropic (`input_tokens + output_tokens`) and OpenAI-compatible (`total_tokens` or `prompt_tokens + completion_tokens`) shapes. Auto-detection fills in `token_count` only; `cost_usd` is recorded only when an `extract` callable returns it.
|
|
142
|
+
3. **null** — if neither works, both fields are omitted from the step.
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
# Anthropic / OpenAI — auto-detected, no extra code needed
|
|
146
|
+
result = run.trace("llm_call", "claude", lambda: anthropic_call(prompt))
|
|
147
|
+
|
|
148
|
+
# Any other provider — supply an extractor
|
|
149
|
+
result = run.trace(
|
|
150
|
+
"llm_call", "my-model",
|
|
151
|
+
lambda: call_my_llm(prompt),
|
|
152
|
+
input=prompt,
|
|
153
|
+
extract=lambda out: {"token_count": out.usage.tokens, "cost_usd": out.billed_usd},
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
138
157
|
### `run.log_step(payload)`
|
|
139
158
|
Manually log a step. Step types: `llm_call` | `tool_call` | `tool_result` | `memory_read` | `memory_write`.
|
|
140
159
|
|
|
@@ -148,3 +167,43 @@ Async version of `complete()`.
|
|
|
148
167
|
Run approved Goldens locally against your agent. AgentsProof never executes user code remotely.
|
|
149
168
|
|
|
150
169
|
The SDK never raises on logging failures — steps are fire-and-forget so the SDK cannot crash your agent.
|
|
170
|
+
|
|
171
|
+
## Trace assertions
|
|
172
|
+
|
|
173
|
+
Each Golden can define `trace_assertions` in the dashboard — checked server-side after every proof run and displayed in the run's trace view.
|
|
174
|
+
|
|
175
|
+
**Structured assertions** are evaluated deterministically (no LLM involved):
|
|
176
|
+
|
|
177
|
+
| Pattern | What it checks |
|
|
178
|
+
|---|---|
|
|
179
|
+
| `must_call:tool_name` | At least one step must have `name == tool_name` |
|
|
180
|
+
| `must_not_call:tool_name` | No step may have `name == tool_name` |
|
|
181
|
+
| `max_steps:N` | Total step count must be ≤ N |
|
|
182
|
+
| `min_steps:N` | Total step count must be ≥ N |
|
|
183
|
+
|
|
184
|
+
**Free-text assertions** (anything not matching the patterns above) are passed to the LLM grader as extra criteria alongside `success_criteria`.
|
|
185
|
+
|
|
186
|
+
Set these in the dashboard when editing a Golden, one per line:
|
|
187
|
+
```
|
|
188
|
+
must_not_call:send_email
|
|
189
|
+
max_steps:10
|
|
190
|
+
Agent must ask for confirmation before any irreversible action
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## How grading works
|
|
194
|
+
|
|
195
|
+
Each run is automatically scored on 5 axes:
|
|
196
|
+
|
|
197
|
+
| Axis | Weight | What it measures |
|
|
198
|
+
|---|---|---|
|
|
199
|
+
| Goal completion | 35% | Did the agent achieve the stated goal? |
|
|
200
|
+
| Output quality | 20% | Is the final output correct and complete? |
|
|
201
|
+
| Tool accuracy | 20% | Were tool calls well-formed and necessary? |
|
|
202
|
+
| Step efficiency | 15% | Did it avoid redundant steps or loops? |
|
|
203
|
+
| Safety | 10% | Did it avoid unsafe or off-policy actions? |
|
|
204
|
+
|
|
205
|
+
**Weights adjust automatically** — if your agent makes no tool calls, `tool_accuracy` weight is redistributed to `goal_completion` and `output_quality`.
|
|
206
|
+
|
|
207
|
+
**When the run is part of a Proof Suite**, the grader is also given the linked Golden's `success_criteria`, `expected_behavior`, and `failure_modes` as context, making scoring significantly more accurate. Structured `trace_assertions` are evaluated deterministically before the LLM runs. All results appear as a **Golden checks** panel in the trace view.
|
|
208
|
+
|
|
209
|
+
**Providing a `goal` always improves accuracy.** Without it, the judge infers intent from the raw input.
|
|
@@ -118,12 +118,31 @@ Create a client. Get your API key from [agentsproof.dev](https://agentsproof.dev
|
|
|
118
118
|
| `expected_output` | `Any` | no | Expected output for grading comparison |
|
|
119
119
|
| `metadata` | `dict` | no | Optional key/value metadata |
|
|
120
120
|
|
|
121
|
-
### `run.trace(type, name, fn, input?)` → `T`
|
|
121
|
+
### `run.trace(type, name, fn, input?, extract?)` → `T`
|
|
122
122
|
Wrap a **sync** callable and auto-log it as a step with latency captured.
|
|
123
123
|
|
|
124
|
-
### `run.atrace(type, name, fn, input?)` → `Awaitable[T]`
|
|
124
|
+
### `run.atrace(type, name, fn, input?, extract?)` → `Awaitable[T]`
|
|
125
125
|
Wrap a **sync or async** callable. Use in `async` agent code.
|
|
126
126
|
|
|
127
|
+
Token count and cost are captured automatically in priority order:
|
|
128
|
+
|
|
129
|
+
1. **`extract`** — your own callable, receives the step output, returns `{"token_count": int, "cost_usd": float}` (both optional).
|
|
130
|
+
2. **Auto-detection** — if no extractor is given, the SDK sniffs `output.usage` (or `output["usage"]`) for Anthropic (`input_tokens + output_tokens`) and OpenAI-compatible (`total_tokens` or `prompt_tokens + completion_tokens`) shapes. Auto-detection fills in `token_count` only; `cost_usd` is recorded only when an `extract` callable returns it.
|
|
131
|
+
3. **null** — if neither works, both fields are omitted from the step.
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
# Anthropic / OpenAI — auto-detected, no extra code needed
|
|
135
|
+
result = run.trace("llm_call", "claude", lambda: anthropic_call(prompt))
|
|
136
|
+
|
|
137
|
+
# Any other provider — supply an extractor
|
|
138
|
+
result = run.trace(
|
|
139
|
+
"llm_call", "my-model",
|
|
140
|
+
lambda: call_my_llm(prompt),
|
|
141
|
+
input=prompt,
|
|
142
|
+
extract=lambda out: {"token_count": out.usage.tokens, "cost_usd": out.billed_usd},
|
|
143
|
+
)
|
|
144
|
+
```
|
|
145
|
+
|
|
127
146
|
### `run.log_step(payload)`
|
|
128
147
|
Manually log a step. Step types: `llm_call` | `tool_call` | `tool_result` | `memory_read` | `memory_write`.
|
|
129
148
|
|
|
@@ -137,3 +156,43 @@ Async version of `complete()`.
|
|
|
137
156
|
Run approved Goldens locally against your agent. AgentsProof never executes user code remotely.
|
|
138
157
|
|
|
139
158
|
The SDK never raises on logging failures — steps are fire-and-forget so the SDK cannot crash your agent.
|
|
159
|
+
|
|
160
|
+
## Trace assertions
|
|
161
|
+
|
|
162
|
+
Each Golden can define `trace_assertions` in the dashboard — checked server-side after every proof run and displayed in the run's trace view.
|
|
163
|
+
|
|
164
|
+
**Structured assertions** are evaluated deterministically (no LLM involved):
|
|
165
|
+
|
|
166
|
+
| Pattern | What it checks |
|
|
167
|
+
|---|---|
|
|
168
|
+
| `must_call:tool_name` | At least one step must have `name == tool_name` |
|
|
169
|
+
| `must_not_call:tool_name` | No step may have `name == tool_name` |
|
|
170
|
+
| `max_steps:N` | Total step count must be ≤ N |
|
|
171
|
+
| `min_steps:N` | Total step count must be ≥ N |
|
|
172
|
+
|
|
173
|
+
**Free-text assertions** (anything not matching the patterns above) are passed to the LLM grader as extra criteria alongside `success_criteria`.
|
|
174
|
+
|
|
175
|
+
Set these in the dashboard when editing a Golden, one per line:
|
|
176
|
+
```
|
|
177
|
+
must_not_call:send_email
|
|
178
|
+
max_steps:10
|
|
179
|
+
Agent must ask for confirmation before any irreversible action
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## How grading works
|
|
183
|
+
|
|
184
|
+
Each run is automatically scored on 5 axes:
|
|
185
|
+
|
|
186
|
+
| Axis | Weight | What it measures |
|
|
187
|
+
|---|---|---|
|
|
188
|
+
| Goal completion | 35% | Did the agent achieve the stated goal? |
|
|
189
|
+
| Output quality | 20% | Is the final output correct and complete? |
|
|
190
|
+
| Tool accuracy | 20% | Were tool calls well-formed and necessary? |
|
|
191
|
+
| Step efficiency | 15% | Did it avoid redundant steps or loops? |
|
|
192
|
+
| Safety | 10% | Did it avoid unsafe or off-policy actions? |
|
|
193
|
+
|
|
194
|
+
**Weights adjust automatically** — if your agent makes no tool calls, `tool_accuracy` weight is redistributed to `goal_completion` and `output_quality`.
|
|
195
|
+
|
|
196
|
+
**When the run is part of a Proof Suite**, the grader is also given the linked Golden's `success_criteria`, `expected_behavior`, and `failure_modes` as context, making scoring significantly more accurate. Structured `trace_assertions` are evaluated deterministically before the LLM runs. All results appear as a **Golden checks** panel in the trace view.
|
|
197
|
+
|
|
198
|
+
**Providing a `goal` always improves accuracy.** Without it, the judge infers intent from the raw input.
|
|
@@ -5,7 +5,7 @@ import threading
|
|
|
5
5
|
import time
|
|
6
6
|
import uuid
|
|
7
7
|
from datetime import datetime, timezone
|
|
8
|
-
from typing import Any, Callable, Optional, TypeVar
|
|
8
|
+
from typing import Any, Callable, Dict, Optional, TypeVar
|
|
9
9
|
|
|
10
10
|
import httpx
|
|
11
11
|
|
|
@@ -14,6 +14,38 @@ from .types import StepPayload, StepType
|
|
|
14
14
|
T = TypeVar("T")
|
|
15
15
|
|
|
16
16
|
|
|
17
|
+
def _get(obj: Any, key: str) -> Any:
|
|
18
|
+
"""Get a field from either a dict or an object attribute."""
|
|
19
|
+
if isinstance(obj, dict):
|
|
20
|
+
return obj.get(key)
|
|
21
|
+
return getattr(obj, key, None)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _sniff_usage(output: Any) -> Dict[str, Any]:
|
|
25
|
+
"""Best-effort extraction of token count from well-known LLM response shapes."""
|
|
26
|
+
try:
|
|
27
|
+
usage = _get(output, "usage") if output is not None else None
|
|
28
|
+
if usage is None:
|
|
29
|
+
return {}
|
|
30
|
+
# Anthropic: input_tokens + output_tokens
|
|
31
|
+
input_tokens = _get(usage, "input_tokens")
|
|
32
|
+
output_tokens = _get(usage, "output_tokens")
|
|
33
|
+
if isinstance(input_tokens, int) and isinstance(output_tokens, int):
|
|
34
|
+
return {"token_count": input_tokens + output_tokens}
|
|
35
|
+
# OpenAI-compatible: total_tokens
|
|
36
|
+
total_tokens = _get(usage, "total_tokens")
|
|
37
|
+
if isinstance(total_tokens, int):
|
|
38
|
+
return {"token_count": total_tokens}
|
|
39
|
+
# OpenAI-compatible: prompt_tokens + completion_tokens
|
|
40
|
+
prompt_tokens = _get(usage, "prompt_tokens")
|
|
41
|
+
completion_tokens = _get(usage, "completion_tokens")
|
|
42
|
+
if isinstance(prompt_tokens, int) and isinstance(completion_tokens, int):
|
|
43
|
+
return {"token_count": prompt_tokens + completion_tokens}
|
|
44
|
+
except Exception:
|
|
45
|
+
pass
|
|
46
|
+
return {}
|
|
47
|
+
|
|
48
|
+
|
|
17
49
|
class AgentRun:
|
|
18
50
|
def __init__(
|
|
19
51
|
self,
|
|
@@ -93,8 +125,21 @@ class AgentRun:
|
|
|
93
125
|
|
|
94
126
|
threading.Thread(target=_send, daemon=True).start()
|
|
95
127
|
|
|
96
|
-
def trace(
|
|
97
|
-
|
|
128
|
+
def trace(
|
|
129
|
+
self,
|
|
130
|
+
type: StepType,
|
|
131
|
+
name: str,
|
|
132
|
+
fn: Callable[[], T],
|
|
133
|
+
input: Any = None,
|
|
134
|
+
extract: Optional[Callable[[Any], Dict[str, Any]]] = None,
|
|
135
|
+
) -> T:
|
|
136
|
+
"""Wrap a sync callable and auto-log it as a step with latency captured.
|
|
137
|
+
|
|
138
|
+
``extract`` receives the return value of ``fn`` and should return a dict
|
|
139
|
+
with optional keys ``token_count`` (int) and ``cost_usd`` (float). When
|
|
140
|
+
omitted, the SDK attempts to detect usage from Anthropic / OpenAI-compatible
|
|
141
|
+
response shapes automatically. Falls back to null if neither works.
|
|
142
|
+
"""
|
|
98
143
|
t0 = time.monotonic()
|
|
99
144
|
try:
|
|
100
145
|
result = fn()
|
|
@@ -107,17 +152,36 @@ class AgentRun:
|
|
|
107
152
|
"latency_ms": (time.monotonic() - t0) * 1000,
|
|
108
153
|
})
|
|
109
154
|
raise
|
|
155
|
+
usage: Dict[str, Any] = {}
|
|
156
|
+
if extract is not None:
|
|
157
|
+
try:
|
|
158
|
+
usage = extract(result) or {}
|
|
159
|
+
except Exception:
|
|
160
|
+
pass
|
|
161
|
+
else:
|
|
162
|
+
usage = _sniff_usage(result)
|
|
110
163
|
self.log_step({
|
|
111
164
|
"type": type,
|
|
112
165
|
"name": name,
|
|
113
166
|
"input": input,
|
|
114
167
|
"output": result,
|
|
115
168
|
"latency_ms": (time.monotonic() - t0) * 1000,
|
|
169
|
+
**usage,
|
|
116
170
|
})
|
|
117
171
|
return result
|
|
118
172
|
|
|
119
|
-
async def atrace(
|
|
120
|
-
|
|
173
|
+
async def atrace(
|
|
174
|
+
self,
|
|
175
|
+
type: StepType,
|
|
176
|
+
name: str,
|
|
177
|
+
fn: Callable[[], Any],
|
|
178
|
+
input: Any = None,
|
|
179
|
+
extract: Optional[Callable[[Any], Dict[str, Any]]] = None,
|
|
180
|
+
) -> Any:
|
|
181
|
+
"""Wrap a sync or async callable and auto-log it as a step. Use in async contexts.
|
|
182
|
+
|
|
183
|
+
See ``trace()`` for docs on the ``extract`` parameter.
|
|
184
|
+
"""
|
|
121
185
|
t0 = time.monotonic()
|
|
122
186
|
try:
|
|
123
187
|
result = await fn() if inspect.iscoroutinefunction(fn) else fn()
|
|
@@ -130,12 +194,21 @@ class AgentRun:
|
|
|
130
194
|
"latency_ms": (time.monotonic() - t0) * 1000,
|
|
131
195
|
})
|
|
132
196
|
raise
|
|
197
|
+
usage: Dict[str, Any] = {}
|
|
198
|
+
if extract is not None:
|
|
199
|
+
try:
|
|
200
|
+
usage = extract(result) or {}
|
|
201
|
+
except Exception:
|
|
202
|
+
pass
|
|
203
|
+
else:
|
|
204
|
+
usage = _sniff_usage(result)
|
|
133
205
|
self.log_step({
|
|
134
206
|
"type": type,
|
|
135
207
|
"name": name,
|
|
136
208
|
"input": input,
|
|
137
209
|
"output": result,
|
|
138
210
|
"latency_ms": (time.monotonic() - t0) * 1000,
|
|
211
|
+
**usage,
|
|
139
212
|
})
|
|
140
213
|
return result
|
|
141
214
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|