replayd 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replayd-0.1.0/.gitignore +44 -0
- replayd-0.1.0/LICENSE +21 -0
- replayd-0.1.0/PKG-INFO +247 -0
- replayd-0.1.0/README.md +210 -0
- replayd-0.1.0/examples/basic_example.py +118 -0
- replayd-0.1.0/pyproject.toml +32 -0
- replayd-0.1.0/replayd/__init__.py +4 -0
- replayd-0.1.0/replayd/capture.py +92 -0
- replayd-0.1.0/replayd/core.py +147 -0
- replayd-0.1.0/replayd/grader.py +147 -0
- replayd-0.1.0/replayd/models.py +158 -0
- replayd-0.1.0/replayd/replay.py +85 -0
- replayd-0.1.0/replayd/storage.py +64 -0
- replayd-0.1.0/scripts/regression_check.py +44 -0
- replayd-0.1.0/tests/test_core.py +198 -0
replayd-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
*.so
|
|
7
|
+
*.egg
|
|
8
|
+
*.egg-info/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
.eggs/
|
|
12
|
+
*.whl
|
|
13
|
+
|
|
14
|
+
# Virtual environments
|
|
15
|
+
.venv/
|
|
16
|
+
venv/
|
|
17
|
+
env/
|
|
18
|
+
ENV/
|
|
19
|
+
|
|
20
|
+
# replayd runtime data — never commit captured runs or tests from real agents
|
|
21
|
+
.replayd/
|
|
22
|
+
|
|
23
|
+
# Environment variables — never commit secrets
|
|
24
|
+
.env
|
|
25
|
+
.env.*
|
|
26
|
+
|
|
27
|
+
# Testing & coverage
|
|
28
|
+
.pytest_cache/
|
|
29
|
+
.coverage
|
|
30
|
+
htmlcov/
|
|
31
|
+
coverage.xml
|
|
32
|
+
|
|
33
|
+
# Type checking
|
|
34
|
+
.mypy_cache/
|
|
35
|
+
.ruff_cache/
|
|
36
|
+
|
|
37
|
+
# IDE
|
|
38
|
+
.vscode/
|
|
39
|
+
.idea/
|
|
40
|
+
*.swp
|
|
41
|
+
*.swo
|
|
42
|
+
|
|
43
|
+
# macOS
|
|
44
|
+
.DS_Store
|
replayd-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Stonepath Labs
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
replayd-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: replayd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Turn failed AI agent runs into replayable regression tests
|
|
5
|
+
Project-URL: Homepage, https://stonepathlab.net
|
|
6
|
+
Project-URL: Repository, https://github.com/TaimoorKhan10/replayd
|
|
7
|
+
Author-email: Taimoor Khan <taimoorkhaniajaznabi@gmail.com>
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2026 Stonepath Labs
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in all
|
|
20
|
+
copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
|
+
SOFTWARE.
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Python: >=3.10
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: anthropic>=0.40.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
34
|
+
Provides-Extra: semantic
|
|
35
|
+
Requires-Dist: anthropic>=0.40.0; extra == 'semantic'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# replayd
|
|
39
|
+
|
|
40
|
+
[PyPI version](https://pypi.org/project/replayd/)
|
|
41
|
+
[Python versions](https://pypi.org/project/replayd/)
|
|
42
|
+
[License: MIT](LICENSE)
|
|
43
|
+
|
|
44
|
+
**Turn failed AI agent runs into replayable regression tests.**
|
|
45
|
+
|
|
46
|
+
When an AI agent fails in production, that failure becomes a test that runs before every future deployment. If the same failure returns after a prompt, model, or tool change, the release is blocked.
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
pip install replayd
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## The problem
|
|
55
|
+
|
|
56
|
+
AI agents regress silently. A team fixes a bug, changes a prompt or model, and the same bug quietly returns. Traditional software has regression tests and CI/CD to catch this. AI agents have nothing equivalent.
|
|
57
|
+
|
|
58
|
+
replayd is the open source fix. It replays known failures before you ship so the same mistake cannot return.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Quickstart
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from replayd import Replayd
|
|
66
|
+
|
|
67
|
+
rp = Replayd()
|
|
68
|
+
|
|
69
|
+
# 1. Capture a run — assign run.output inside the block
|
|
70
|
+
with rp.capture(input=user_input, model="gpt-4o") as run:
|
|
71
|
+
run.output = your_agent.run(user_input)
|
|
72
|
+
|
|
73
|
+
# 2. Mark it as failed
|
|
74
|
+
rp.mark_failed(run.id, reason="agent approved refund after policy limit")
|
|
75
|
+
|
|
76
|
+
# 3. Save as a regression test
|
|
77
|
+
rp.save_test(
|
|
78
|
+
run.id,
|
|
79
|
+
forbidden_actions=["approve_refund"],
|
|
80
|
+
expected_action="escalate",
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# 4. Later — after changing your prompt or model — replay all tests
|
|
84
|
+
results = rp.replay_all(agent=your_agent_fn)
|
|
85
|
+
|
|
86
|
+
for r in results:
|
|
87
|
+
print(r.verdict, r.reason)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## See it working
|
|
93
|
+
|
|
94
|
+
Run the included example (`python examples/basic_example.py`) and you get:
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
Capturing a refund-approval agent run...
|
|
98
|
+
agent called: approve_refund(amount=1200, customer_id=cust-42) [policy limit is $500]
|
|
99
|
+
output: {'action': 'approve_refund', 'amount': 1200}
|
|
100
|
+
|
|
101
|
+
Marking run as failed...
|
|
102
|
+
reason: agent approved refund of $1200, exceeding $500 policy limit
|
|
103
|
+
|
|
104
|
+
Saving as regression test...
|
|
105
|
+
forbidden: approve_refund | expected: escalate
|
|
106
|
+
|
|
107
|
+
-----------------------------------------
|
|
108
|
+
Replay #1 -- buggy agent (regression should be caught)
|
|
109
|
+
[FAIL] Forbidden action 'approve_refund' was called during replay.
|
|
110
|
+
|
|
111
|
+
Replay #2 -- fixed agent (regression should be resolved)
|
|
112
|
+
[PASS] No forbidden actions called; all expected actions present.
|
|
113
|
+
-----------------------------------------
|
|
114
|
+
1 failure caught. 1 resolved.
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
The failure was captured, saved, replayed against a broken agent (FAIL), and replayed again against the fixed agent (PASS). That is the full loop.
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Recording tool calls
|
|
122
|
+
|
|
123
|
+
replayd cannot intercept tool calls automatically. Record each one yourself by calling `run_ctx.record_tool_call(...)`, for example from a wrapper around your agent's tool dispatcher:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
def my_agent(input, run_ctx):
|
|
127
|
+
result = call_tool("search", {"query": input["query"]})
|
|
128
|
+
run_ctx.record_tool_call("search", {"query": input["query"]}, result)
|
|
129
|
+
# ... rest of agent logic
|
|
130
|
+
return final_output
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Pass this two-argument callable to `replay_all`:
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
results = rp.replay_all(agent=my_agent)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Grading
|
|
142
|
+
|
|
143
|
+
replayd does **not** grade on exact output matching. LLMs are non-deterministic — the same correct behavior will produce different output text every run, so exact matching creates false failures. The wrong tool being called, however, is a fact. replayd grades on facts.
|
|
144
|
+
|
|
145
|
+
| Failure type | Grading method |
|
|
146
|
+
|---|---|
|
|
147
|
+
| Wrong tool called, wrong argument, wrong state | Deterministic assertion — no LLM needed, never flaky |
|
|
148
|
+
| Policy violated, wrong reasoning, bad decision | LLM-as-judge via `grader_prompt` |
|
|
149
|
+
|
|
150
|
+
The structural check always runs first. If a forbidden action fires, the test fails immediately without calling the LLM.
|
|
151
|
+
|
|
152
|
+
### Semantic grading
|
|
153
|
+
|
|
154
|
+
For failures that can only be evaluated by reading the output:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
rp.save_test(
|
|
158
|
+
run.id,
|
|
159
|
+
grader_prompt="Did the agent approve a refund that exceeds the $500 policy limit?",
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Requires:
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
pip install "replayd[semantic]"
|
|
167
|
+
export ANTHROPIC_API_KEY=sk-...
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Storage
|
|
173
|
+
|
|
174
|
+
Runs and tests are stored as JSON files in `.replayd/` in your working directory:
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
.replayd/
|
|
178
|
+
runs/<run-id>.json ← full record of each captured run
|
|
179
|
+
tests/<test-id>.json ← saved regression tests
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
No database. No hosted backend. To share tests with your team, check `.replayd/tests/` into version control. The `.gitignore` included in this repo excludes the whole `.replayd/` directory by default, so the tests have to be added explicitly (for example with `git add -f .replayd/tests/`). Commit only the `tests/` subfolder, not captured runs.
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## CI integration
|
|
187
|
+
|
|
188
|
+
Save a script at `scripts/regression_check.py` in your repo:
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
import sys
|
|
192
|
+
from replayd import Replayd
|
|
193
|
+
from your_agent import agent_fn # your agent wrapped as (input, run_ctx) -> output
|
|
194
|
+
|
|
195
|
+
rp = Replayd()
|
|
196
|
+
results = rp.replay_all(agent=agent_fn)
|
|
197
|
+
|
|
198
|
+
failures = [r for r in results if not r]
|
|
199
|
+
for f in failures:
|
|
200
|
+
print(f"FAIL [{f.test.failure_reason}]: {f.reason}")
|
|
201
|
+
|
|
202
|
+
if failures:
|
|
203
|
+
sys.exit(1)
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
Then in your workflow:
|
|
207
|
+
|
|
208
|
+
```yaml
|
|
209
|
+
# .github/workflows/regression.yml
|
|
210
|
+
- name: Run regression tests
|
|
211
|
+
run: python scripts/regression_check.py
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## What replayd is not
|
|
217
|
+
|
|
218
|
+
replayd is not an observability tool. LangSmith, Braintrust, and Arize tell you what happened after the fact. replayd is an **active release gate** — it replays known failures before you ship. Passive vs active. That is the distinction.
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Part of TAQ by Stonepath Labs
|
|
223
|
+
|
|
224
|
+
replayd is the open source core of [TAQ](https://stonepathlab.net) — the full AI release control platform.
|
|
225
|
+
|
|
226
|
+
TAQ adds: a dashboard, hosted backend, team access controls, release gate enforcement, and audit logs. replayd gets your team started with the concept. TAQ is what you run it on in production.
|
|
227
|
+
|
|
228
|
+
**[stonepathlab.net](https://stonepathlab.net)**
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Contributing
|
|
233
|
+
|
|
234
|
+
Bug reports and pull requests are welcome. Open an issue on GitHub to discuss anything before sending a large PR.
|
|
235
|
+
|
|
236
|
+
replayd has no runtime dependencies — `pip install -e ".[dev]"` installs the package in editable mode together with everything needed to run the tests:
|
|
237
|
+
|
|
238
|
+
```
|
|
239
|
+
pip install -e ".[dev]"
|
|
240
|
+
pytest
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## License
|
|
246
|
+
|
|
247
|
+
MIT — see [LICENSE](LICENSE).
|
replayd-0.1.0/README.md
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# replayd
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/replayd/)
|
|
4
|
+
[Python versions](https://pypi.org/project/replayd/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
|
|
7
|
+
**Turn failed AI agent runs into replayable regression tests.**
|
|
8
|
+
|
|
9
|
+
When an AI agent fails in production, that failure becomes a test that runs before every future deployment. If the same failure returns after a prompt, model, or tool change, the release is blocked.
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
pip install replayd
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## The problem
|
|
18
|
+
|
|
19
|
+
AI agents regress silently. A team fixes a bug, changes a prompt or model, and the same bug quietly returns. Traditional software has regression tests and CI/CD to catch this. AI agents have nothing equivalent.
|
|
20
|
+
|
|
21
|
+
replayd is the open source fix. It replays known failures before you ship so the same mistake cannot return.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quickstart
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from replayd import Replayd
|
|
29
|
+
|
|
30
|
+
rp = Replayd()
|
|
31
|
+
|
|
32
|
+
# 1. Capture a run — assign run.output inside the block
|
|
33
|
+
with rp.capture(input=user_input, model="gpt-4o") as run:
|
|
34
|
+
run.output = your_agent.run(user_input)
|
|
35
|
+
|
|
36
|
+
# 2. Mark it as failed
|
|
37
|
+
rp.mark_failed(run.id, reason="agent approved refund after policy limit")
|
|
38
|
+
|
|
39
|
+
# 3. Save as a regression test
|
|
40
|
+
rp.save_test(
|
|
41
|
+
run.id,
|
|
42
|
+
forbidden_actions=["approve_refund"],
|
|
43
|
+
expected_action="escalate",
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# 4. Later — after changing your prompt or model — replay all tests
|
|
47
|
+
results = rp.replay_all(agent=your_agent_fn)
|
|
48
|
+
|
|
49
|
+
for r in results:
|
|
50
|
+
print(r.verdict, r.reason)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## See it working
|
|
56
|
+
|
|
57
|
+
Run the included example (`python examples/basic_example.py`) and you get:
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
Capturing a refund-approval agent run...
|
|
61
|
+
agent called: approve_refund(amount=1200, customer_id=cust-42) [policy limit is $500]
|
|
62
|
+
output: {'action': 'approve_refund', 'amount': 1200}
|
|
63
|
+
|
|
64
|
+
Marking run as failed...
|
|
65
|
+
reason: agent approved refund of $1200, exceeding $500 policy limit
|
|
66
|
+
|
|
67
|
+
Saving as regression test...
|
|
68
|
+
forbidden: approve_refund | expected: escalate
|
|
69
|
+
|
|
70
|
+
-----------------------------------------
|
|
71
|
+
Replay #1 -- buggy agent (regression should be caught)
|
|
72
|
+
[FAIL] Forbidden action 'approve_refund' was called during replay.
|
|
73
|
+
|
|
74
|
+
Replay #2 -- fixed agent (regression should be resolved)
|
|
75
|
+
[PASS] No forbidden actions called; all expected actions present.
|
|
76
|
+
-----------------------------------------
|
|
77
|
+
1 failure caught. 1 resolved.
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
The failure was captured, saved, replayed against a broken agent (FAIL), and replayed again against the fixed agent (PASS). That is the full loop.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Recording tool calls
|
|
85
|
+
|
|
86
|
+
replayd cannot intercept tool calls automatically. Record each one yourself by calling `run_ctx.record_tool_call(...)`, for example from a wrapper around your agent's tool dispatcher:
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
def my_agent(input, run_ctx):
|
|
90
|
+
result = call_tool("search", {"query": input["query"]})
|
|
91
|
+
run_ctx.record_tool_call("search", {"query": input["query"]}, result)
|
|
92
|
+
# ... rest of agent logic
|
|
93
|
+
return final_output
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Pass this two-argument callable to `replay_all`:
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
results = rp.replay_all(agent=my_agent)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Grading
|
|
105
|
+
|
|
106
|
+
replayd does **not** grade on exact output matching. LLMs are non-deterministic — the same correct behavior will produce different output text every run, so exact matching creates false failures. The wrong tool being called, however, is a fact. replayd grades on facts.
|
|
107
|
+
|
|
108
|
+
| Failure type | Grading method |
|
|
109
|
+
|---|---|
|
|
110
|
+
| Wrong tool called, wrong argument, wrong state | Deterministic assertion — no LLM needed, never flaky |
|
|
111
|
+
| Policy violated, wrong reasoning, bad decision | LLM-as-judge via `grader_prompt` |
|
|
112
|
+
|
|
113
|
+
The structural check always runs first. If a forbidden action fires, the test fails immediately without calling the LLM.
|
|
114
|
+
|
|
115
|
+
### Semantic grading
|
|
116
|
+
|
|
117
|
+
For failures that can only be evaluated by reading the output:
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
rp.save_test(
|
|
121
|
+
run.id,
|
|
122
|
+
grader_prompt="Did the agent approve a refund that exceeds the $500 policy limit?",
|
|
123
|
+
)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Requires:
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
pip install "replayd[semantic]"
|
|
130
|
+
export ANTHROPIC_API_KEY=sk-...
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Storage
|
|
136
|
+
|
|
137
|
+
Runs and tests are stored as JSON files in `.replayd/` in your working directory:
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
.replayd/
|
|
141
|
+
runs/<run-id>.json ← full record of each captured run
|
|
142
|
+
tests/<test-id>.json ← saved regression tests
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
No database. No hosted backend. To share tests with your team, check `.replayd/tests/` into version control. The `.gitignore` included in this repo excludes the whole `.replayd/` directory by default, so the tests have to be added explicitly (for example with `git add -f .replayd/tests/`). Commit only the `tests/` subfolder, not captured runs.
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## CI integration
|
|
150
|
+
|
|
151
|
+
Save a script at `scripts/regression_check.py` in your repo:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
import sys
|
|
155
|
+
from replayd import Replayd
|
|
156
|
+
from your_agent import agent_fn # your agent wrapped as (input, run_ctx) -> output
|
|
157
|
+
|
|
158
|
+
rp = Replayd()
|
|
159
|
+
results = rp.replay_all(agent=agent_fn)
|
|
160
|
+
|
|
161
|
+
failures = [r for r in results if not r]
|
|
162
|
+
for f in failures:
|
|
163
|
+
print(f"FAIL [{f.test.failure_reason}]: {f.reason}")
|
|
164
|
+
|
|
165
|
+
if failures:
|
|
166
|
+
sys.exit(1)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Then in your workflow:
|
|
170
|
+
|
|
171
|
+
```yaml
|
|
172
|
+
# .github/workflows/regression.yml
|
|
173
|
+
- name: Run regression tests
|
|
174
|
+
run: python scripts/regression_check.py
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## What replayd is not
|
|
180
|
+
|
|
181
|
+
replayd is not an observability tool. LangSmith, Braintrust, and Arize tell you what happened after the fact. replayd is an **active release gate** — it replays known failures before you ship. Passive vs active. That is the distinction.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Part of TAQ by Stonepath Labs
|
|
186
|
+
|
|
187
|
+
replayd is the open source core of [TAQ](https://stonepathlab.net) — the full AI release control platform.
|
|
188
|
+
|
|
189
|
+
TAQ adds: a dashboard, hosted backend, team access controls, release gate enforcement, and audit logs. replayd gets your team started with the concept. TAQ is what you run it on in production.
|
|
190
|
+
|
|
191
|
+
**[stonepathlab.net](https://stonepathlab.net)**
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Contributing
|
|
196
|
+
|
|
197
|
+
Bug reports and pull requests are welcome. Open an issue on GitHub to discuss anything before sending a large PR.
|
|
198
|
+
|
|
199
|
+
replayd has no runtime dependencies — `pip install -e ".[dev]"` installs the package in editable mode together with everything needed to run the tests:
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
pip install -e ".[dev]"
|
|
203
|
+
pytest
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
MIT — see [LICENSE](LICENSE).
|
replayd-0.1.0/examples/basic_example.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
End-to-end example of replayd.
|
|
3
|
+
|
|
4
|
+
Run from the repo root with:
|
|
5
|
+
pip install -e .
|
|
6
|
+
python examples/basic_example.py
|
|
7
|
+
|
|
8
|
+
Or without installing:
|
|
9
|
+
PYTHONPATH=. python examples/basic_example.py
|
|
10
|
+
|
|
11
|
+
The example simulates a refund-approval agent that has a bug: it approves
|
|
12
|
+
refunds above the $500 policy limit. We capture the failure, save it as a
|
|
13
|
+
regression test, then replay it against both the buggy agent (expects FAIL)
|
|
14
|
+
and the fixed agent (expects PASS).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from replayd import Replayd
|
|
18
|
+
from replayd.capture import RunContext
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Mock agents
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
def buggy_agent(input: dict, run_ctx: RunContext) -> dict:
    """Mock agent that approves every refund, whatever the amount.

    Records a single ``approve_refund`` tool call on ``run_ctx`` and returns
    an ``approve_refund`` action carrying the requested amount. The amount
    defaults to 0 when the input has no ``"amount"`` key; no policy limit is
    checked, which is the bug this example demonstrates.
    """
    requested = input.get("amount", 0)
    refund_args = {"amount": requested, "customer_id": input.get("customer_id")}

    run_ctx.record_tool_call(
        name="approve_refund",
        arguments=refund_args,
        result={"approved": True},
    )
    return {"action": "approve_refund", "amount": requested}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def fixed_agent(input: dict, run_ctx: RunContext) -> dict:
    """Mock agent that escalates refunds above the 500 policy limit.

    Exactly one tool call is recorded on ``run_ctx``: ``escalate`` when the
    requested amount is greater than the limit, ``approve_refund`` otherwise.
    The returned dict names the action taken and echoes the amount, which
    defaults to 0 when the input has no ``"amount"`` key.
    """
    policy_limit = 500
    requested = input.get("amount", 0)

    # Choose the tool and its payload first, then record and return once.
    if requested > policy_limit:
        tool = "escalate"
        tool_args = {"reason": "refund exceeds policy limit", "amount": requested}
        tool_result = {"ticket_id": "ESC-001"}
    else:
        tool = "approve_refund"
        tool_args = {"amount": requested, "customer_id": input.get("customer_id")}
        tool_result = {"approved": True}

    run_ctx.record_tool_call(name=tool, arguments=tool_args, result=tool_result)
    return {"action": tool, "amount": requested}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
# Main
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
def _print_replay_results(results) -> list[str]:
    """Print one ``[VERDICT] reason`` line per replay result; return the labels."""
    labels = []
    for r in results:
        # Any verdict other than "fail" is reported as PASS.
        label = "FAIL" if r.verdict.value == "fail" else "PASS"
        print(f" [{label}] {r.reason}")
        labels.append(label)
    return labels


def main():
    """Run the capture -> mark failed -> save test -> replay walkthrough.

    WARNING: this deletes any existing ``.replayd`` directory in the current
    working directory before it starts, so the demo always begins from an
    empty store. Do not run it from a directory whose ``.replayd`` data you
    want to keep.

    The closing summary line is computed from the actual replay verdicts
    instead of being hard-coded, so it cannot claim a result that the
    replays did not produce.
    """
    import os
    import shutil

    # Start from a clean slate so earlier demo runs do not add extra tests.
    if os.path.exists(".replayd"):
        shutil.rmtree(".replayd")

    rp = Replayd()
    user_input = {"customer_id": "cust-42", "amount": 1200, "reason": "defective product"}

    # --- Capture ------------------------------------------------------------
    print("Capturing a refund-approval agent run...")
    with rp.capture(input=user_input, model="mock-v1") as run:
        run.output = buggy_agent(user_input, run)

    for tc in rp.get_run(run.id).tool_calls:
        args = ", ".join(f"{k}={v}" for k, v in tc.arguments.items())
        print(f" agent called: {tc.name}({args}) [policy limit is $500]")
    print(f" output: {run.output}")

    # --- Mark failed --------------------------------------------------------
    print("\nMarking run as failed...")
    failure_reason = "agent approved refund of $1200, exceeding $500 policy limit"
    rp.mark_failed(run.id, reason=failure_reason)
    print(f" reason: {failure_reason}")

    # --- Save as test -------------------------------------------------------
    print("\nSaving as regression test...")
    rp.save_test(
        run.id,
        forbidden_actions=["approve_refund"],
        expected_action="escalate",
    )
    print(" forbidden: approve_refund | expected: escalate")

    print()
    print("-" * 41)

    # --- Replay: buggy agent ------------------------------------------------
    print("Replay #1 -- buggy agent (regression should be caught)")
    caught = _print_replay_results(rp.replay_all(agent=buggy_agent)).count("FAIL")

    print()

    # --- Replay: fixed agent ------------------------------------------------
    print("Replay #2 -- fixed agent (regression should be resolved)")
    resolved = _print_replay_results(rp.replay_all(agent=fixed_agent)).count("PASS")

    print("-" * 41)
    noun = "failure" if caught == 1 else "failures"
    print(f"{caught} {noun} caught. {resolved} resolved.")


if __name__ == "__main__":
    main()
|
replayd-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "replayd"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Turn failed AI agent runs into replayable regression tests"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { file = "LICENSE" }
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
dependencies = []
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Taimoor Khan", email = "taimoorkhaniajaznabi@gmail.com" }
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.optional-dependencies]
|
|
18
|
+
semantic = ["anthropic>=0.40.0"]
|
|
19
|
+
dev = [
|
|
20
|
+
"pytest>=8.0",
|
|
21
|
+
"anthropic>=0.40.0",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://stonepathlab.net"
|
|
26
|
+
Repository = "https://github.com/TaimoorKhan10/replayd"
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = ["replayd"]
|
|
30
|
+
|
|
31
|
+
[tool.pytest.ini_options]
|
|
32
|
+
testpaths = ["tests"]
|