agentsproof 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentsproof/__init__.py +17 -0
- agentsproof/client.py +66 -0
- agentsproof/proof_suite.py +185 -0
- agentsproof/run.py +172 -0
- agentsproof/tracer.py +27 -0
- agentsproof/types.py +40 -0
- agentsproof-0.1.0.dist-info/METADATA +150 -0
- agentsproof-0.1.0.dist-info/RECORD +9 -0
- agentsproof-0.1.0.dist-info/WHEEL +4 -0
agentsproof/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .client import AgentsProof
|
|
2
|
+
from .run import AgentRun
|
|
3
|
+
from .tracer import atrace_llm, atrace_tool, trace_llm, trace_tool
|
|
4
|
+
from .types import GoldenCase, ProofSuiteResult, StepPayload, StepType
|
|
5
|
+
|
|
6
|
+
# Public API of the package; order matches the original export list.
__all__ = [
    # client and run objects
    "AgentsProof",
    "AgentRun",
    # tracing helpers (sync, then async)
    "trace_llm",
    "trace_tool",
    "atrace_llm",
    "atrace_tool",
    # typed payloads
    "StepPayload",
    "StepType",
    "GoldenCase",
    "ProofSuiteResult",
]
|
agentsproof/client.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable, Optional
|
|
4
|
+
|
|
5
|
+
from .proof_suite import arun_proof_suite, run_proof_suite
|
|
6
|
+
from .run import AgentRun
|
|
7
|
+
from .types import ProofSuiteResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AgentsProof:
    """SDK entry point: holds the API credentials and hands them to runs and proof suites."""

    def __init__(self, *, api_key: str, base_url: str = "https://agentsproof.dev/api") -> None:
        """Remember the API key and the base URL (trailing slashes stripped)."""
        self._base_url = base_url.rstrip("/")
        self._api_key = api_key

    def start_run(
        self,
        *,
        project_slug: str,
        input: Any,
        label: Optional[str] = None,
        goal: Optional[str] = None,
        expected_output: Any = None,
        metadata: Optional[dict] = None,
    ) -> AgentRun:
        """Create an ``AgentRun`` that uses this client's API key and base URL.

        Every argument is forwarded unchanged to ``AgentRun``.
        """
        run_options = {
            "project_slug": project_slug,
            "input": input,
            "label": label,
            "goal": goal,
            "expected_output": expected_output,
            "metadata": metadata,
            "api_key": self._api_key,
            "base_url": self._base_url,
        }
        return AgentRun(**run_options)

    def run_proof_suite(
        self,
        *,
        project_slug: str,
        suite_slug: str,
        handler: Callable[[Any, Any], Any],
    ) -> ProofSuiteResult:
        """Run approved Goldens locally against your agent (sync). Handler must be a regular function."""
        # Inside the method body this name resolves to the module-level
        # function imported from .proof_suite, not to this method.
        suite_options = {
            "project_slug": project_slug,
            "suite_slug": suite_slug,
            "handler": handler,
            "api_key": self._api_key,
            "base_url": self._base_url,
        }
        return run_proof_suite(**suite_options)

    async def arun_proof_suite(
        self,
        *,
        project_slug: str,
        suite_slug: str,
        handler: Callable[[Any, Any], Any],
    ) -> ProofSuiteResult:
        """Run approved Goldens locally against your agent (async). Handler can be sync or async."""
        suite_options = {
            "project_slug": project_slug,
            "suite_slug": suite_slug,
            "handler": handler,
            "api_key": self._api_key,
            "base_url": self._base_url,
        }
        outcome = await arun_proof_suite(**suite_options)
        return outcome
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
from typing import Any, Callable, Optional
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from .run import AgentRun
|
|
9
|
+
from .types import GoldenCase, ProofSuiteResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def run_proof_suite(
    *,
    project_slug: str,
    suite_slug: str,
    handler: Callable[[Any, Any], Any],
    api_key: str,
    base_url: str,
) -> ProofSuiteResult:
    """Run every case of a proof suite against a synchronous handler.

    Loads the suite's cases, calls ``handler(input, ctx)`` once per case,
    posts a pass/fail result for each case, then marks the proof run complete.
    A case passes when the handler returns without raising.

    Args:
        project_slug: Project identifier sent as the ``projectSlug`` query parameter.
        suite_slug: Suite identifier used in the request path.
        handler: Regular (non-async) callable taking ``(input, ctx)``.
            ``ctx.golden_case`` is the case dict and ``ctx.start_run(**overrides)``
            creates an ``AgentRun`` pre-filled from the case.
        api_key: Value sent in the ``x-api-key`` header.
        base_url: API base URL without a trailing slash.

    Returns:
        The JSON body of the ``/complete`` response.

    Raises:
        RuntimeError: If loading the suite or completing the proof run returns
            a non-success HTTP status.
    """
    headers = {"x-api-key": api_key}

    with httpx.Client() as client:
        res = client.get(
            f"{base_url}/proof-suites/{suite_slug}/cases",
            params={"projectSlug": project_slug},
            headers=headers,
            timeout=15,
        )
        if not res.is_success:
            raise RuntimeError(f"AgentsProof: failed to load proof suite — {res.status_code}")
        data = res.json()

    proof_run_id: str = data["proofRunId"]
    cases: list[GoldenCase] = data["cases"]

    for golden in cases:
        run_id: Optional[str] = None
        passed = False
        failure_summary: Optional[str] = None
        handler_run: Optional[AgentRun] = None

        def start_run(**overrides: Any) -> AgentRun:
            """Create an AgentRun for the current case; explicit overrides win over case values."""
            nonlocal handler_run
            # `or {}` tolerates an explicit metadata=None, which would
            # otherwise fail when unpacked with **.
            extra_metadata = overrides.get("metadata") or {}
            r = AgentRun(
                project_slug=project_slug,
                label=overrides.get("label", f"Proof case: {golden.get('name', '')}"),
                input=overrides.get("input", golden.get("input")),
                goal=overrides.get("goal", golden.get("goal")),
                expected_output=overrides.get("expected_output", golden.get("expected_output")),
                metadata={**extra_metadata, "goldenId": golden["id"], "proofRunId": proof_run_id},
                api_key=api_key,
                base_url=base_url,
            )
            # Remember the most recent run so its remote id can be reported.
            handler_run = r
            return r

        class Ctx:
            """Context object handed to the handler for this case."""

            golden_case = golden

            @staticmethod
            def start_run(**overrides: Any) -> AgentRun:
                return start_run(**overrides)

        try:
            outcome = handler(golden.get("input"), Ctx)
            if inspect.iscoroutine(outcome):
                # An async handler was passed to the sync runner: its body never
                # ran, so the case must not be reported as passed.
                outcome.close()
                raise TypeError(
                    "AgentsProof: run_proof_suite() requires a regular function handler; "
                    "use arun_proof_suite() for async handlers"
                )
            if handler_run is not None:
                run_id = handler_run.remote_run_id
            passed = True
        except Exception as err:
            # Fall back to the exception type when the message is empty.
            failure_summary = str(err) or type(err).__name__

        try:
            with httpx.Client() as client:
                client.post(
                    f"{base_url}/proof-runs/{proof_run_id}/case-results",
                    headers=headers,
                    json={
                        "goldenId": golden["id"],
                        "runId": run_id,
                        "score": None,
                        "passed": passed,
                        "failureSummary": failure_summary,
                    },
                    timeout=10,
                )
        except Exception:
            # Deliberate best-effort: a failed result upload must not abort
            # the remaining cases.
            pass

    with httpx.Client() as client:
        res = client.post(
            f"{base_url}/proof-runs/{proof_run_id}/complete",
            headers=headers,
            timeout=30,
        )
        if not res.is_success:
            raise RuntimeError(f"AgentsProof: failed to complete proof suite — {res.status_code}")
        return res.json()
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
async def arun_proof_suite(
    *,
    project_slug: str,
    suite_slug: str,
    handler: Callable[[Any, Any], Any],
    api_key: str,
    base_url: str,
) -> ProofSuiteResult:
    """Run every case of a proof suite against a sync or async handler.

    Loads the suite's cases, calls ``handler(input, ctx)`` once per case
    (awaiting the result when it is awaitable), posts a pass/fail result for
    each case, then marks the proof run complete. A case passes when the
    handler finishes without raising.

    Args:
        project_slug: Project identifier sent as the ``projectSlug`` query parameter.
        suite_slug: Suite identifier used in the request path.
        handler: Callable taking ``(input, ctx)``; may be sync or async.
            ``ctx.golden_case`` is the case dict and ``ctx.start_run(**overrides)``
            creates an ``AgentRun`` pre-filled from the case.
        api_key: Value sent in the ``x-api-key`` header.
        base_url: API base URL without a trailing slash.

    Returns:
        The JSON body of the ``/complete`` response.

    Raises:
        RuntimeError: If loading the suite or completing the proof run returns
            a non-success HTTP status.
    """
    headers = {"x-api-key": api_key}

    async with httpx.AsyncClient() as client:
        res = await client.get(
            f"{base_url}/proof-suites/{suite_slug}/cases",
            params={"projectSlug": project_slug},
            headers=headers,
            timeout=15,
        )
        if not res.is_success:
            raise RuntimeError(f"AgentsProof: failed to load proof suite — {res.status_code}")
        data = res.json()

    proof_run_id: str = data["proofRunId"]
    cases: list[GoldenCase] = data["cases"]

    for golden in cases:
        run_id: Optional[str] = None
        passed = False
        failure_summary: Optional[str] = None
        handler_run: Optional[AgentRun] = None

        def start_run(**overrides: Any) -> AgentRun:
            """Create an AgentRun for the current case; explicit overrides win over case values."""
            nonlocal handler_run
            # `or {}` tolerates an explicit metadata=None, which would
            # otherwise fail when unpacked with **.
            extra_metadata = overrides.get("metadata") or {}
            r = AgentRun(
                project_slug=project_slug,
                label=overrides.get("label", f"Proof case: {golden.get('name', '')}"),
                input=overrides.get("input", golden.get("input")),
                goal=overrides.get("goal", golden.get("goal")),
                expected_output=overrides.get("expected_output", golden.get("expected_output")),
                metadata={**extra_metadata, "goldenId": golden["id"], "proofRunId": proof_run_id},
                api_key=api_key,
                base_url=base_url,
            )
            # Remember the most recent run so its remote id can be reported.
            handler_run = r
            return r

        class Ctx:
            """Context object handed to the handler for this case."""

            golden_case = golden

            @staticmethod
            def start_run(**overrides: Any) -> AgentRun:
                return start_run(**overrides)

        try:
            result = handler(golden.get("input"), Ctx)
            # isawaitable also covers Tasks/Futures and objects defining
            # __await__, not only coroutine objects.
            if inspect.isawaitable(result):
                await result
            if handler_run is not None:
                run_id = handler_run.remote_run_id
            passed = True
        except Exception as err:
            # Fall back to the exception type when the message is empty.
            failure_summary = str(err) or type(err).__name__

        try:
            async with httpx.AsyncClient() as client:
                await client.post(
                    f"{base_url}/proof-runs/{proof_run_id}/case-results",
                    headers=headers,
                    json={
                        "goldenId": golden["id"],
                        "runId": run_id,
                        "score": None,
                        "passed": passed,
                        "failureSummary": failure_summary,
                    },
                    timeout=10,
                )
        except Exception:
            # Deliberate best-effort: a failed result upload must not abort
            # the remaining cases.
            pass

    async with httpx.AsyncClient() as client:
        res = await client.post(
            f"{base_url}/proof-runs/{proof_run_id}/complete",
            headers=headers,
            timeout=30,
        )
        if not res.is_success:
            raise RuntimeError(f"AgentsProof: failed to complete proof suite — {res.status_code}")
        return res.json()
|
agentsproof/run.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
import uuid
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from typing import Any, Callable, Optional, TypeVar
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from .types import StepPayload, StepType
|
|
13
|
+
|
|
14
|
+
T = TypeVar("T")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AgentRun:
    """A single agent run registered with the AgentsProof API.

    Constructing an instance performs a blocking ``POST {base_url}/runs`` and
    stores the returned ``runId``. Steps are then logged with ``log_step`` /
    ``trace`` / ``atrace`` and the run is finished with ``complete`` /
    ``acomplete``.
    """

    def __init__(
        self,
        *,
        project_slug: str,
        input: Any,
        api_key: str,
        base_url: str,
        label: Optional[str] = None,
        goal: Optional[str] = None,
        expected_output: Any = None,
        metadata: Optional[dict] = None,
    ) -> None:
        """Create the run locally and register it remotely.

        Raises whatever ``_init_remote`` raises (transport errors, or the
        error produced by ``raise_for_status()`` on a non-success response).
        """
        # Short client-side id, sent to the API as ``clientRunId``.
        self.run_id = uuid.uuid4().hex[:12]
        self._api_key = api_key
        self._base_url = base_url
        self._steps: list = []
        self._started_at = time.monotonic()
        self._remote_run_id: Optional[str] = None

        self._init_remote(
            project_slug=project_slug,
            input=input,
            label=label,
            goal=goal,
            expected_output=expected_output,
            metadata=metadata or {},
        )

    def _init_remote(
        self,
        *,
        project_slug: str,
        input: Any,
        label: Optional[str],
        goal: Optional[str],
        expected_output: Any,
        metadata: dict,
    ) -> None:
        """Register the run with the API and store the ``runId`` it returns."""
        with httpx.Client() as client:
            res = client.post(
                f"{self._base_url}/runs",
                headers={"x-api-key": self._api_key},
                json={
                    "label": label,
                    "input": input,
                    "projectSlug": project_slug,
                    "clientRunId": self.run_id,
                    "goal": goal,
                    "expectedOutput": expected_output,
                    "metadata": metadata,
                },
                timeout=10,
            )
            res.raise_for_status()
            self._remote_run_id = res.json()["runId"]

    def log_step(self, payload: StepPayload) -> None:
        """Record a step locally and post it to the API in the background.

        The step is the payload plus ``step_index`` (its position in the local
        step list) and ``created_at`` (UTC ISO timestamp). It is sent from a
        daemon thread and any error while sending is swallowed.
        """
        step = {
            **payload,
            "step_index": len(self._steps),
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        self._steps.append(step)

        def _send() -> None:
            try:
                with httpx.Client() as client:
                    client.post(
                        f"{self._base_url}/runs/{self._remote_run_id}/steps",
                        headers={"x-api-key": self._api_key},
                        json=step,
                        timeout=10,
                    )
            except Exception:
                pass  # SDK must never crash the agent

        threading.Thread(target=_send, daemon=True).start()

    def _log_traced(self, type: StepType, name: str, input: Any, output: Any, started: float) -> None:
        """Log one traced call, with latency measured from ``started`` (a ``time.monotonic()`` value)."""
        self.log_step({
            "type": type,
            "name": name,
            "input": input,
            "output": output,
            "latency_ms": (time.monotonic() - started) * 1000,
        })

    def trace(self, type: StepType, name: str, fn: Callable[[], T], input: Any = None) -> T:
        """Wrap a sync callable and auto-log it as a step with latency captured.

        If ``fn`` raises, a step whose output is ``{"error": str(err)}`` is
        logged and the exception is re-raised unchanged.
        """
        t0 = time.monotonic()
        try:
            result = fn()
        except Exception as err:
            self._log_traced(type, name, input, {"error": str(err)}, t0)
            raise
        self._log_traced(type, name, input, result, t0)
        return result

    async def atrace(self, type: StepType, name: str, fn: Callable[[], Any], input: Any = None) -> Any:
        """Wrap a sync or async callable and auto-log it as a step. Use in async contexts.

        ``fn`` is called with no arguments. If what it returns is awaitable
        (e.g. ``fn`` is ``async def``, or a lambda wrapping an async call), the
        result is awaited before being logged and returned. If the call or the
        await raises, an error step is logged and the exception is re-raised.
        """
        t0 = time.monotonic()
        try:
            result = fn()
            # Decide on the returned value, not on fn itself: a plain lambda
            # such as `lambda: client.call(...)` is not a coroutine function
            # but still returns a coroutine that must be awaited.
            if inspect.isawaitable(result):
                result = await result
        except Exception as err:
            self._log_traced(type, name, input, {"error": str(err)}, t0)
            raise
        self._log_traced(type, name, input, result, t0)
        return result

    def complete(self, output: Any) -> dict:
        """Finish the run and trigger grading. Returns {"publicUrl": "..."}.

        Unlike step logging, errors here propagate: ``raise_for_status()`` is
        called on the response.
        """
        with httpx.Client() as client:
            res = client.post(
                f"{self._base_url}/runs/{self._remote_run_id}/complete",
                headers={"x-api-key": self._api_key},
                json={"output": output},
                timeout=30,
            )
            res.raise_for_status()
            return res.json()

    async def acomplete(self, output: Any) -> dict:
        """Async version of complete()."""
        async with httpx.AsyncClient() as client:
            res = await client.post(
                f"{self._base_url}/runs/{self._remote_run_id}/complete",
                headers={"x-api-key": self._api_key},
                json={"output": output},
                timeout=30,
            )
            res.raise_for_status()
            return res.json()

    @property
    def elapsed_ms(self) -> float:
        """Milliseconds elapsed since this object was constructed."""
        return (time.monotonic() - self._started_at) * 1000

    @property
    def remote_run_id(self) -> Optional[str]:
        """The ``runId`` returned by the API when the run was registered."""
        return self._remote_run_id
|
agentsproof/tracer.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable, TypeVar
|
|
4
|
+
|
|
5
|
+
from .run import AgentRun
|
|
6
|
+
|
|
7
|
+
T = TypeVar("T")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def trace_llm(run: AgentRun, name: str, fn: Callable[[], T], input: Any = None) -> T:
    """Trace a sync LLM call on *run*, recording it as an ``llm_call`` step."""
    step_kind = "llm_call"
    outcome = run.trace(step_kind, name, fn, input)
    return outcome
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def trace_tool(run: AgentRun, name: str, fn: Callable[[], T], input: Any = None) -> T:
    """Trace a sync tool call on *run*, recording it as a ``tool_call`` step."""
    step_kind = "tool_call"
    outcome = run.trace(step_kind, name, fn, input)
    return outcome
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
async def atrace_llm(run: AgentRun, name: str, fn: Callable[[], Any], input: Any = None) -> Any:
    """Trace a sync or async LLM call on *run*, recording it as an ``llm_call`` step."""
    step_kind = "llm_call"
    outcome = await run.atrace(step_kind, name, fn, input)
    return outcome
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
async def atrace_tool(run: AgentRun, name: str, fn: Callable[[], Any], input: Any = None) -> Any:
    """Trace a sync or async tool call on *run*, recording it as a ``tool_call`` step."""
    step_kind = "tool_call"
    outcome = await run.atrace(step_kind, name, fn, input)
    return outcome
|
agentsproof/types.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable, Coroutine, Dict, List, Literal, Optional, Union
|
|
4
|
+
|
|
5
|
+
StepType = Literal["llm_call", "tool_call", "tool_result", "memory_read", "memory_write"]
|
|
6
|
+
|
|
7
|
+
# Using plain dicts with TypedDict for IDE support without runtime overhead
|
|
8
|
+
try:
|
|
9
|
+
from typing import TypedDict
|
|
10
|
+
except ImportError:
|
|
11
|
+
from typing_extensions import TypedDict
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class StepPayload(TypedDict, total=False):
    """Fields a caller may supply when logging a step; every key is optional."""

    # What kind of step this is and a display name for it.
    type: StepType
    name: str

    # What went into the step and what came out of it.
    input: Any
    output: Any

    # Optional measurements attached to the step.
    latency_ms: float
    token_count: int
    cost_usd: float
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class GoldenCase(TypedDict, total=False):
    """One case of a proof suite as handed to a handler; every key is optional."""

    # Identity of the case.
    id: str
    name: str

    # What the agent is given and what it is expected to achieve.
    input: Any
    goal: str
    expected_output: Any
    expected_behavior: str

    # Grading configuration attached to the case.
    success_criteria: List[str]
    trace_assertions: List[str]
    custom_grader_ids: List[str]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ProofSuiteResult(TypedDict, total=False):
    """Summary of a completed proof suite run; every key is optional.

    The proof-suite runners return the API's JSON body unmodified, and the
    package README shows that body with camelCase keys, so those keys are
    declared here. The snake_case keys are retained so existing annotations
    that reference them keep type-checking.
    """

    # Keys as shown in the README's example result.
    passedCases: int
    failedCases: int
    overallScore: float
    publicUrl: str

    # Original snake_case declarations, kept for backward compatibility.
    # NOTE(review): nothing visible in this package produces these keys —
    # confirm against the API before relying on them.
    passed_cases: int
    failed_cases: int
    overall_score: float
    public_url: str
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentsproof
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Observability and proof reporting for AI agents
|
|
5
|
+
Project-URL: Homepage, https://agentsproof.dev
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: agents,ai,llm,observability,tracing
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Requires-Dist: httpx>=0.27.0
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# agentsproof
|
|
13
|
+
|
|
14
|
+
Drop the SDK into your Python agent, define what "good" means, and get a shareable proof report.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install agentsproof
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Quick start — single run (sync)
|
|
23
|
+
|
|
24
|
+
Works with any Python agent — OpenAI, Anthropic, LangChain, LlamaIndex, or plain functions.
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import os
|
|
28
|
+
from agentsproof import AgentsProof
|
|
29
|
+
|
|
30
|
+
ap = AgentsProof(api_key=os.environ["AGENTSPROOF_API_KEY"])
|
|
31
|
+
|
|
32
|
+
def run_my_agent(user_query: str):
|
|
33
|
+
run = ap.start_run(
|
|
34
|
+
project_slug="my-coding-agent",
|
|
35
|
+
label="Answer coding question",
|
|
36
|
+
input={"query": user_query},
|
|
37
|
+
goal="Search the web for relevant docs and return a working code solution",
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Wrap any callable — the SDK captures latency and output automatically
|
|
41
|
+
plan = run.trace("llm_call", "gpt-4o", lambda: openai_call(user_query), input=user_query)
|
|
42
|
+
|
|
43
|
+
results = run.trace("tool_call", "web_search", lambda: web_search(plan))
|
|
44
|
+
|
|
45
|
+
final_answer = run.trace("llm_call", "gpt-4o", lambda: openai_call(results))
|
|
46
|
+
|
|
47
|
+
result = run.complete({"answer": final_answer})
|
|
48
|
+
print(f"Report: {result['publicUrl']}")
|
|
49
|
+
# → https://agentsproof.dev/r/abc123
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick start — async agent
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import asyncio
|
|
56
|
+
import os
|
|
57
|
+
from agentsproof import AgentsProof
|
|
58
|
+
|
|
59
|
+
ap = AgentsProof(api_key=os.environ["AGENTSPROOF_API_KEY"])
|
|
60
|
+
|
|
61
|
+
async def run_my_agent(user_query: str):
|
|
62
|
+
run = ap.start_run(
|
|
63
|
+
project_slug="my-coding-agent",
|
|
64
|
+
input={"query": user_query},
|
|
65
|
+
goal="Return a working code solution",
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# Use atrace() for async callables
|
|
69
|
+
plan = await run.atrace("llm_call", "gpt-4o", lambda: async_openai_call(user_query))
|
|
70
|
+
results = await run.atrace("tool_call", "web_search", lambda: async_web_search(plan))
|
|
71
|
+
final_answer = await run.atrace("llm_call", "gpt-4o", lambda: async_openai_call(results))
|
|
72
|
+
|
|
73
|
+
result = await run.acomplete({"answer": final_answer})
|
|
74
|
+
print(f"Report: {result['publicUrl']}")
|
|
75
|
+
|
|
76
|
+
asyncio.run(run_my_agent("How do I reverse a list in Python?"))
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Proof Suites — regression testing
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
import os
|
|
83
|
+
from agentsproof import AgentsProof
|
|
84
|
+
|
|
85
|
+
ap = AgentsProof(api_key=os.environ["AGENTSPROOF_API_KEY"])
|
|
86
|
+
|
|
87
|
+
def handler(input, ctx):
|
|
88
|
+
run = ctx.start_run()
|
|
89
|
+
result = my_agent(input)
|
|
90
|
+
run.complete({"answer": result})
|
|
91
|
+
|
|
92
|
+
result = ap.run_proof_suite(
|
|
93
|
+
project_slug="my-coding-agent",
|
|
94
|
+
suite_slug="core-behaviors",
|
|
95
|
+
handler=handler,
|
|
96
|
+
)
|
|
97
|
+
print(result)
|
|
98
|
+
# → {"passedCases": 17, "failedCases": 1, "overallScore": 0.91, "publicUrl": "..."}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Async proof suite
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
async def async_handler(input, ctx):
|
|
105
|
+
run = ctx.start_run()
|
|
106
|
+
result = await my_async_agent(input)
|
|
107
|
+
await run.acomplete({"answer": result})
|
|
108
|
+
|
|
109
|
+
result = await ap.arun_proof_suite(
|
|
110
|
+
project_slug="my-coding-agent",
|
|
111
|
+
suite_slug="core-behaviors",
|
|
112
|
+
handler=async_handler,
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## API
|
|
117
|
+
|
|
118
|
+
### `AgentsProof(api_key, base_url?)`
|
|
119
|
+
Create a client. Get your API key from [agentsproof.dev](https://agentsproof.dev).
|
|
120
|
+
|
|
121
|
+
### `client.start_run(...)` → `AgentRun`
|
|
122
|
+
|
|
123
|
+
| Param | Type | Required | Description |
|
|
124
|
+
|---|---|---|---|
|
|
125
|
+
| `project_slug` | `str` | yes | Your project identifier |
|
|
126
|
+
| `input` | `Any` | yes | The initial input or prompt to the agent |
|
|
127
|
+
| `label` | `str` | no | Human-readable label for this run |
|
|
128
|
+
| `goal` | `str` | no | What this run should accomplish |
|
|
129
|
+
| `expected_output` | `Any` | no | Expected output for grading comparison |
|
|
130
|
+
| `metadata` | `dict` | no | Optional key/value metadata |
|
|
131
|
+
|
|
132
|
+
### `run.trace(type, name, fn, input?)` → `T`
|
|
133
|
+
Wrap a **sync** callable and auto-log it as a step with latency captured.
|
|
134
|
+
|
|
135
|
+
### `run.atrace(type, name, fn, input?)` → `Awaitable[T]`
|
|
136
|
+
Wrap a **sync or async** callable. Use in `async` agent code.
|
|
137
|
+
|
|
138
|
+
### `run.log_step(payload)`
|
|
139
|
+
Manually log a step. Step types: `llm_call` | `tool_call` | `tool_result` | `memory_read` | `memory_write`.
|
|
140
|
+
|
|
141
|
+
### `run.complete(output)` → `{"publicUrl": str}`
|
|
142
|
+
Finish the run, trigger grading, and get back the public report URL.
|
|
143
|
+
|
|
144
|
+
### `run.acomplete(output)` → `Awaitable[{"publicUrl": str}]`
|
|
145
|
+
Async version of `complete()`.
|
|
146
|
+
|
|
147
|
+
### `client.run_proof_suite(...)` / `client.arun_proof_suite(...)`
|
|
148
|
+
Run approved Goldens locally against your agent. AgentsProof never executes user code remotely.
|
|
149
|
+
|
|
150
|
+
Step logging never raises — steps are sent fire-and-forget, so a logging failure cannot crash your agent. Note that `start_run()` and `complete()` / `acomplete()` do raise if the API request fails.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
agentsproof/__init__.py,sha256=3uSqAQCT-h9cqZkJBF1xIZ4nV5tdSQ6BsLqDDm0MkLY,395
|
|
2
|
+
agentsproof/client.py,sha256=yqZBU-bdksXzQphord9-OuFDy92G7dzqVb3UcfpEQ_I,1956
|
|
3
|
+
agentsproof/proof_suite.py,sha256=Eg1kQ65JePcP7pui_8pzAMVkjGrXNLWBNEnqNaRWLEc,6350
|
|
4
|
+
agentsproof/run.py,sha256=ZrZuNZstF4GBdatBMFnrYrA7dAiiyjUPYTWEZ2WtFFQ,5401
|
|
5
|
+
agentsproof/tracer.py,sha256=xFjOkeaSQz6swML-kY9fMaiTXsnzcJfKmH9PNLhJ68Y,971
|
|
6
|
+
agentsproof/types.py,sha256=EaLiw6By8enmCL0u7-oDoCVAL7stAdG3-HiUlhBVy48,947
|
|
7
|
+
agentsproof-0.1.0.dist-info/METADATA,sha256=QPbBFUR134vZQ-kAYWerVOnExPw3nCZUvwIR3wYeAm4,4627
|
|
8
|
+
agentsproof-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
agentsproof-0.1.0.dist-info/RECORD,,
|