mimic-recording 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mimic_recording-1.0.0/LICENSE +21 -0
- mimic_recording-1.0.0/PKG-INFO +248 -0
- mimic_recording-1.0.0/README.md +209 -0
- mimic_recording-1.0.0/pyproject.toml +69 -0
- mimic_recording-1.0.0/setup.cfg +4 -0
- mimic_recording-1.0.0/src/mimic/__init__.py +27 -0
- mimic_recording-1.0.0/src/mimic/assertions.py +182 -0
- mimic_recording-1.0.0/src/mimic/cli.py +548 -0
- mimic_recording-1.0.0/src/mimic/diff.py +107 -0
- mimic_recording-1.0.0/src/mimic/integrations/__init__.py +17 -0
- mimic_recording-1.0.0/src/mimic/integrations/anthropic.py +71 -0
- mimic_recording-1.0.0/src/mimic/integrations/openai.py +132 -0
- mimic_recording-1.0.0/src/mimic/models.py +125 -0
- mimic_recording-1.0.0/src/mimic/recorder.py +313 -0
- mimic_recording-1.0.0/src/mimic/replay.py +40 -0
- mimic_recording-1.0.0/src/mimic/step.py +169 -0
- mimic_recording-1.0.0/src/mimic/storage.py +79 -0
- mimic_recording-1.0.0/src/mimic_recording.egg-info/PKG-INFO +248 -0
- mimic_recording-1.0.0/src/mimic_recording.egg-info/SOURCES.txt +28 -0
- mimic_recording-1.0.0/src/mimic_recording.egg-info/dependency_links.txt +1 -0
- mimic_recording-1.0.0/src/mimic_recording.egg-info/entry_points.txt +2 -0
- mimic_recording-1.0.0/src/mimic_recording.egg-info/requires.txt +16 -0
- mimic_recording-1.0.0/src/mimic_recording.egg-info/top_level.txt +1 -0
- mimic_recording-1.0.0/tests/test_async_and_steps.py +139 -0
- mimic_recording-1.0.0/tests/test_cli.py +68 -0
- mimic_recording-1.0.0/tests/test_core.py +252 -0
- mimic_recording-1.0.0/tests/test_diff.py +64 -0
- mimic_recording-1.0.0/tests/test_edge_cases.py +143 -0
- mimic_recording-1.0.0/tests/test_integrations.py +122 -0
- mimic_recording-1.0.0/tests/test_storage.py +71 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mimic Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mimic-recording
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: The pytest for AI agents. Record, replay, assert, and diff agent behavior.
|
|
5
|
+
Author-email: Mimic <team@mimic.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mimic-ai/mimic
|
|
8
|
+
Project-URL: Documentation, https://docs.mimic.dev
|
|
9
|
+
Project-URL: Repository, https://github.com/mimic-ai/mimic
|
|
10
|
+
Project-URL: Issues, https://github.com/mimic-ai/mimic/issues
|
|
11
|
+
Keywords: ai,agents,testing,llm,evals,replay,observability
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pydantic>=2.0
|
|
26
|
+
Requires-Dist: click>=8.0
|
|
27
|
+
Requires-Dist: rich>=13.0
|
|
28
|
+
Requires-Dist: pyyaml>=6.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
33
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
34
|
+
Provides-Extra: anthropic
|
|
35
|
+
Requires-Dist: anthropic>=0.18.0; extra == "anthropic"
|
|
36
|
+
Provides-Extra: openai
|
|
37
|
+
Requires-Dist: openai>=1.0.0; extra == "openai"
|
|
38
|
+
Dynamic: license-file
|
|
39
|
+
|
|
40
|
+
# Mimic
|
|
41
|
+
|
|
42
|
+
> The pytest for AI agents. Record, replay, assert, and diff agent behavior.
|
|
43
|
+
|
|
44
|
+
[PyPI](https://pypi.org/project/mimic-recording/)
|
|
45
|
+
[License: MIT](LICENSE)
|
|
46
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
47
|
+
|
|
48
|
+
Mimic is an open-source library that lets you **record** an AI agent's behavior, **replay** it deterministically, **assert** properties about it, and **diff** runs across versions. It's the missing testing layer for the agent era.
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from mimic import Mimic, assert_that, replay
|
|
52
|
+
from mimic.integrations.openai import tracked_completion
|
|
53
|
+
|
|
54
|
+
mimic = Mimic()
|
|
55
|
+
client = OpenAI()
|
|
56
|
+
|
|
57
|
+
@mimic.record("customer-support-agent", model="gpt-4o")
|
|
58
|
+
def answer(question: str) -> str:
|
|
59
|
+
resp = client.chat.completions.create(
|
|
60
|
+
model="gpt-4o",
|
|
61
|
+
messages=[{"role": "user", "content": question}],
|
|
62
|
+
)
|
|
63
|
+
tracked_completion(resp) # auto-captures tokens + cost
|
|
64
|
+
return resp.choices[0].message.content
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Verified performance
|
|
68
|
+
|
|
69
|
+
| Scenario | Record mode | Replay mode | Savings |
|
|
70
|
+
|---|---|---|---|
|
|
71
|
+
| 5-test multi-step agent suite | 360 ms | 50 ms | **7× faster** |
|
|
72
|
+
| 1000 CI runs of the same suite | ~$2 in LLM cost | **$0** | **100%** |
|
|
73
|
+
|
|
74
|
+
Run `mimic benchmark --runs 1000` on your own recordings to see your numbers.
|
|
75
|
+
|
|
76
|
+
## Why Mimic?
|
|
77
|
+
|
|
78
|
+
Every team building AI agents hits the same wall:
|
|
79
|
+
|
|
80
|
+
- **"I changed a prompt. Did I break anything?"** — You don't know.
|
|
81
|
+
- **"I switched from GPT-4 to Claude. Is it 2x more expensive?"** — You don't know.
|
|
82
|
+
- **"Did this agent ever call `delete_file` in production?"** — You don't know.
|
|
83
|
+
- **"Why did the agent fail on Tuesday at 3pm?"** — You don't know.
|
|
84
|
+
|
|
85
|
+
Mimic turns those unknowns into testable, replayable, diffable artifacts. Think **Sentry recordings + pytest assertions + git blame**, purpose-built for LLM agents.
|
|
86
|
+
|
|
87
|
+
## Features
|
|
88
|
+
|
|
89
|
+
- ✅ **Record** any callable — sync or async, LLM calls, tool use, multi-step agents
|
|
90
|
+
- ✅ **Replay** runs offline with **zero API cost**, byte-for-byte deterministic
|
|
91
|
+
- ✅ **Assert** behavioral properties: cost, latency, tool usage, output content
|
|
92
|
+
- ✅ **Diff** two runs to see exactly what changed
|
|
93
|
+
- ✅ **Auto-track LLM costs** for OpenAI, Anthropic, Gemini (zero-config)
|
|
94
|
+
- ✅ **Multi-step agents** with per-step recording, cost, and metadata
|
|
95
|
+
- ✅ **Privacy mode** (`capture_args=False`, `capture_return=False`)
|
|
96
|
+
- ✅ **Storage-agnostic** — filesystem by default, pluggable for S3/Postgres
|
|
97
|
+
- ✅ **Zero LLM vendor lock-in** — works with any model
|
|
98
|
+
- ✅ **Beautiful CLI** — `mimic run / list / show / diff / report / benchmark`
|
|
99
|
+
- ✅ **CI-ready** — GitHub Actions template + pre-commit hook included
|
|
100
|
+
|
|
101
|
+
## Install
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install mimic-recording
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Or with optional integrations:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pip install "mimic-recording[openai]"
|
|
111
|
+
pip install "mimic-recording[anthropic]"
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Quick start
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
mkdir my-agent && cd my-agent
|
|
118
|
+
mimic init
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
This creates a project skeleton:
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
my-agent/
|
|
125
|
+
├── mimic.yaml # Project config
|
|
126
|
+
├── tests/
|
|
127
|
+
│ └── test_agent.py # Your recorded tests
|
|
128
|
+
└── .mimic/ # Recorded runs (gitignored by default)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Edit `tests/test_agent.py`:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from mimic import Mimic, assert_that, replay
|
|
135
|
+
|
|
136
|
+
mimic = Mimic()
|
|
137
|
+
|
|
138
|
+
@mimic.record("my-agent", model="gpt-4o")
|
|
139
|
+
def answer(question: str) -> str:
|
|
140
|
+
# ... your LLM call here ...
|
|
141
|
+
return "..."
|
|
142
|
+
|
|
143
|
+
def test_agent():
|
|
144
|
+
answer("hello")
|
|
145
|
+
recorded = replay("my-agent")
|
|
146
|
+
assert_that(recorded).finished_without_errors()
|
|
147
|
+
assert_that(recorded).cost_less_than(usd=0.05)
|
|
148
|
+
assert_that(recorded).did_not_call_tool("delete_database")
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Run it:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
mimic run tests/ # records + runs (costs $$)
|
|
155
|
+
MIMIC_MODE=replay mimic run tests/ # replays only (free, deterministic)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Multi-step agents
|
|
159
|
+
|
|
160
|
+
For ReAct, multi-agent, or any agent with multiple LLM/tool calls, record each step:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
@mimic.record("research-agent")
|
|
164
|
+
async def research(question: str) -> str:
|
|
165
|
+
# Step 1: plan
|
|
166
|
+
with mimic.step("plan", model="gpt-4o-mini") as s:
|
|
167
|
+
resp = await llm.complete(model="gpt-4o-mini", messages=[...])
|
|
168
|
+
tracked_completion(resp)
|
|
169
|
+
s.metadata["plan_steps"] = 3
|
|
170
|
+
|
|
171
|
+
# Step 2: search
|
|
172
|
+
with mimic.step("search") as s:
|
|
173
|
+
results = await web_search(question)
|
|
174
|
+
s.metadata["result_count"] = len(results)
|
|
175
|
+
|
|
176
|
+
# Step 3: synthesize
|
|
177
|
+
with mimic.step("synthesize", model="gpt-4o") as s:
|
|
178
|
+
resp = await llm.complete(model="gpt-4o", messages=[...])
|
|
179
|
+
tracked_completion(resp)
|
|
180
|
+
|
|
181
|
+
return summary
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## Assertions
|
|
185
|
+
|
|
186
|
+
The full set of assertions (each returns `self`, so calls can be chained fluently):
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
assert_that(run).finished_without_errors()
|
|
190
|
+
assert_that(run).had_error() # inverse
|
|
191
|
+
assert_that(run).cost_less_than(usd=0.05)
|
|
192
|
+
assert_that(run).completed_under(ms=2000)
|
|
193
|
+
assert_that(run).output_contains("substring")
|
|
194
|
+
assert_that(run).output_matches(r"regex")
|
|
195
|
+
assert_that(run).output_equals(value)
|
|
196
|
+
assert_that(run).called_tool("search")
|
|
197
|
+
assert_that(run).did_not_call_tool("delete_database")
|
|
198
|
+
assert_that(run).called_tools(["search", "synthesize"])
|
|
199
|
+
assert_that(run).had_exactly(3)
|
|
200
|
+
assert_that(run).had_at_least(2)
|
|
201
|
+
assert_that(run).used_model("gpt-4o")
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## How it works
|
|
205
|
+
|
|
206
|
+
Mimic sits **outside** your agent code, watching the inputs and outputs of any function you decorate. The first time the function runs, Mimic records the full execution into a content-addressable store. Subsequent test runs use the stored record instead of calling the LLM, making them fast, free, and deterministic.
|
|
207
|
+
|
|
208
|
+
For multi-step agents, Mimic records each step separately, so you can replay just the broken step without re-running the whole agent.
|
|
209
|
+
|
|
210
|
+
## CI integration
|
|
211
|
+
|
|
212
|
+
Drop the included `.github/workflows/test.yml` into your repo. It runs your test suite in replay mode (no LLM cost) and validates that no cost was incurred.
|
|
213
|
+
|
|
214
|
+
Manual re-recording is a separate job, triggered on `workflow_dispatch` or a schedule.
|
|
215
|
+
|
|
216
|
+
## The recording format
|
|
217
|
+
|
|
218
|
+
Mimic recordings are plain JSON conforming to a documented schema — see [`RECORDING_FORMAT.md`](RECORDING_FORMAT.md). The format is **vendor-neutral**: you can build readers, web UIs, or analysis tools without depending on the Mimic library.
|
|
219
|
+
|
|
220
|
+
## The $100M thesis
|
|
221
|
+
|
|
222
|
+
Mimic sits at the intersection of three exploding markets:
|
|
223
|
+
|
|
224
|
+
1. **AI agent development** — 10M+ developers will build agents by 2027.
|
|
225
|
+
2. **AI observability** — already a $2B+ market, dominated by closed vendors (LangSmith, Helicone, Langfuse).
|
|
226
|
+
3. **AI safety & compliance** — every enterprise deploying agents needs guardrails, audit trails, and replay.
|
|
227
|
+
|
|
228
|
+
The land-and-expand model is proven (Sentry, Supabase, GitLab, Vercel, PostHog): open source core → community growth → enterprise tier with self-hosted, SSO, audit logs, and SOC2.
|
|
229
|
+
|
|
230
|
+
See [`BUSINESS_PLAN.md`](BUSINESS_PLAN.md) for the full strategy.
|
|
231
|
+
|
|
232
|
+
## Roadmap
|
|
233
|
+
|
|
234
|
+
- [x] v0.1 — Record/replay/assert core
|
|
235
|
+
- [x] v0.2 — Async + multi-step + OpenAI/Anthropic cost tracking
|
|
236
|
+
- [ ] v0.3 — Web UI for browsing recorded runs
|
|
237
|
+
- [ ] v0.4 — TypeScript SDK
|
|
238
|
+
- [ ] v0.5 — Auto-generated regression tests from production traces
|
|
239
|
+
- [ ] v0.6 — Multi-agent parent/child traces
|
|
240
|
+
- [ ] v1.0 — Enterprise self-hosted edition
|
|
241
|
+
|
|
242
|
+
## Contributing
|
|
243
|
+
|
|
244
|
+
We love contributions. See [`CONTRIBUTING.md`](CONTRIBUTING.md).
|
|
245
|
+
|
|
246
|
+
## License
|
|
247
|
+
|
|
248
|
+
MIT — see [`LICENSE`](LICENSE).
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# Mimic
|
|
2
|
+
|
|
3
|
+
> The pytest for AI agents. Record, replay, assert, and diff agent behavior.
|
|
4
|
+
|
|
5
|
+
[PyPI](https://pypi.org/project/mimic-recording/)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
8
|
+
|
|
9
|
+
Mimic is an open-source library that lets you **record** an AI agent's behavior, **replay** it deterministically, **assert** properties about it, and **diff** runs across versions. It's the missing testing layer for the agent era.
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from mimic import Mimic, assert_that, replay
|
|
13
|
+
from mimic.integrations.openai import tracked_completion
|
|
14
|
+
|
|
15
|
+
mimic = Mimic()
|
|
16
|
+
client = OpenAI()
|
|
17
|
+
|
|
18
|
+
@mimic.record("customer-support-agent", model="gpt-4o")
|
|
19
|
+
def answer(question: str) -> str:
|
|
20
|
+
resp = client.chat.completions.create(
|
|
21
|
+
model="gpt-4o",
|
|
22
|
+
messages=[{"role": "user", "content": question}],
|
|
23
|
+
)
|
|
24
|
+
tracked_completion(resp) # auto-captures tokens + cost
|
|
25
|
+
return resp.choices[0].message.content
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Verified performance
|
|
29
|
+
|
|
30
|
+
| Scenario | Record mode | Replay mode | Savings |
|
|
31
|
+
|---|---|---|---|
|
|
32
|
+
| 5-test multi-step agent suite | 360 ms | 50 ms | **7× faster** |
|
|
33
|
+
| 1000 CI runs of the same suite | ~$2 in LLM cost | **$0** | **100%** |
|
|
34
|
+
|
|
35
|
+
Run `mimic benchmark --runs 1000` on your own recordings to see your numbers.
|
|
36
|
+
|
|
37
|
+
## Why Mimic?
|
|
38
|
+
|
|
39
|
+
Every team building AI agents hits the same wall:
|
|
40
|
+
|
|
41
|
+
- **"I changed a prompt. Did I break anything?"** — You don't know.
|
|
42
|
+
- **"I switched from GPT-4 to Claude. Is it 2x more expensive?"** — You don't know.
|
|
43
|
+
- **"Did this agent ever call `delete_file` in production?"** — You don't know.
|
|
44
|
+
- **"Why did the agent fail on Tuesday at 3pm?"** — You don't know.
|
|
45
|
+
|
|
46
|
+
Mimic turns those unknowns into testable, replayable, diffable artifacts. Think **Sentry recordings + pytest assertions + git blame**, purpose-built for LLM agents.
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
- ✅ **Record** any callable — sync or async, LLM calls, tool use, multi-step agents
|
|
51
|
+
- ✅ **Replay** runs offline with **zero API cost**, byte-for-byte deterministic
|
|
52
|
+
- ✅ **Assert** behavioral properties: cost, latency, tool usage, output content
|
|
53
|
+
- ✅ **Diff** two runs to see exactly what changed
|
|
54
|
+
- ✅ **Auto-track LLM costs** for OpenAI, Anthropic, Gemini (zero-config)
|
|
55
|
+
- ✅ **Multi-step agents** with per-step recording, cost, and metadata
|
|
56
|
+
- ✅ **Privacy mode** (`capture_args=False`, `capture_return=False`)
|
|
57
|
+
- ✅ **Storage-agnostic** — filesystem by default, pluggable for S3/Postgres
|
|
58
|
+
- ✅ **Zero LLM vendor lock-in** — works with any model
|
|
59
|
+
- ✅ **Beautiful CLI** — `mimic run / list / show / diff / report / benchmark`
|
|
60
|
+
- ✅ **CI-ready** — GitHub Actions template + pre-commit hook included
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install mimic-recording
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Or with optional integrations:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install "mimic-recording[openai]"
|
|
72
|
+
pip install "mimic-recording[anthropic]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Quick start
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
mkdir my-agent && cd my-agent
|
|
79
|
+
mimic init
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
This creates a project skeleton:
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
my-agent/
|
|
86
|
+
├── mimic.yaml # Project config
|
|
87
|
+
├── tests/
|
|
88
|
+
│ └── test_agent.py # Your recorded tests
|
|
89
|
+
└── .mimic/ # Recorded runs (gitignored by default)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Edit `tests/test_agent.py`:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from mimic import Mimic, assert_that, replay
|
|
96
|
+
|
|
97
|
+
mimic = Mimic()
|
|
98
|
+
|
|
99
|
+
@mimic.record("my-agent", model="gpt-4o")
|
|
100
|
+
def answer(question: str) -> str:
|
|
101
|
+
# ... your LLM call here ...
|
|
102
|
+
return "..."
|
|
103
|
+
|
|
104
|
+
def test_agent():
|
|
105
|
+
answer("hello")
|
|
106
|
+
recorded = replay("my-agent")
|
|
107
|
+
assert_that(recorded).finished_without_errors()
|
|
108
|
+
assert_that(recorded).cost_less_than(usd=0.05)
|
|
109
|
+
assert_that(recorded).did_not_call_tool("delete_database")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Run it:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
mimic run tests/ # records + runs (costs $$)
|
|
116
|
+
MIMIC_MODE=replay mimic run tests/ # replays only (free, deterministic)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Multi-step agents
|
|
120
|
+
|
|
121
|
+
For ReAct, multi-agent, or any agent with multiple LLM/tool calls, record each step:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
@mimic.record("research-agent")
|
|
125
|
+
async def research(question: str) -> str:
|
|
126
|
+
# Step 1: plan
|
|
127
|
+
with mimic.step("plan", model="gpt-4o-mini") as s:
|
|
128
|
+
resp = await llm.complete(model="gpt-4o-mini", messages=[...])
|
|
129
|
+
tracked_completion(resp)
|
|
130
|
+
s.metadata["plan_steps"] = 3
|
|
131
|
+
|
|
132
|
+
# Step 2: search
|
|
133
|
+
with mimic.step("search") as s:
|
|
134
|
+
results = await web_search(question)
|
|
135
|
+
s.metadata["result_count"] = len(results)
|
|
136
|
+
|
|
137
|
+
# Step 3: synthesize
|
|
138
|
+
with mimic.step("synthesize", model="gpt-4o") as s:
|
|
139
|
+
resp = await llm.complete(model="gpt-4o", messages=[...])
|
|
140
|
+
tracked_completion(resp)
|
|
141
|
+
|
|
142
|
+
return summary
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Assertions
|
|
146
|
+
|
|
147
|
+
The full set of assertions (each returns `self`, so calls can be chained fluently):
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
assert_that(run).finished_without_errors()
|
|
151
|
+
assert_that(run).had_error() # inverse
|
|
152
|
+
assert_that(run).cost_less_than(usd=0.05)
|
|
153
|
+
assert_that(run).completed_under(ms=2000)
|
|
154
|
+
assert_that(run).output_contains("substring")
|
|
155
|
+
assert_that(run).output_matches(r"regex")
|
|
156
|
+
assert_that(run).output_equals(value)
|
|
157
|
+
assert_that(run).called_tool("search")
|
|
158
|
+
assert_that(run).did_not_call_tool("delete_database")
|
|
159
|
+
assert_that(run).called_tools(["search", "synthesize"])
|
|
160
|
+
assert_that(run).had_exactly(3)
|
|
161
|
+
assert_that(run).had_at_least(2)
|
|
162
|
+
assert_that(run).used_model("gpt-4o")
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## How it works
|
|
166
|
+
|
|
167
|
+
Mimic sits **outside** your agent code, watching the inputs and outputs of any function you decorate. The first time the function runs, Mimic records the full execution into a content-addressable store. Subsequent test runs use the stored record instead of calling the LLM, making them fast, free, and deterministic.
|
|
168
|
+
|
|
169
|
+
For multi-step agents, Mimic records each step separately, so you can replay just the broken step without re-running the whole agent.
|
|
170
|
+
|
|
171
|
+
## CI integration
|
|
172
|
+
|
|
173
|
+
Drop the included `.github/workflows/test.yml` into your repo. It runs your test suite in replay mode (no LLM cost) and validates that no cost was incurred.
|
|
174
|
+
|
|
175
|
+
Manual re-recording is a separate job, triggered on `workflow_dispatch` or a schedule.
|
|
176
|
+
|
|
177
|
+
## The recording format
|
|
178
|
+
|
|
179
|
+
Mimic recordings are plain JSON conforming to a documented schema — see [`RECORDING_FORMAT.md`](RECORDING_FORMAT.md). The format is **vendor-neutral**: you can build readers, web UIs, or analysis tools without depending on the Mimic library.
|
|
180
|
+
|
|
181
|
+
## The $100M thesis
|
|
182
|
+
|
|
183
|
+
Mimic sits at the intersection of three exploding markets:
|
|
184
|
+
|
|
185
|
+
1. **AI agent development** — 10M+ developers will build agents by 2027.
|
|
186
|
+
2. **AI observability** — already a $2B+ market, dominated by closed vendors (LangSmith, Helicone, Langfuse).
|
|
187
|
+
3. **AI safety & compliance** — every enterprise deploying agents needs guardrails, audit trails, and replay.
|
|
188
|
+
|
|
189
|
+
The land-and-expand model is proven (Sentry, Supabase, GitLab, Vercel, PostHog): open source core → community growth → enterprise tier with self-hosted, SSO, audit logs, and SOC2.
|
|
190
|
+
|
|
191
|
+
See [`BUSINESS_PLAN.md`](BUSINESS_PLAN.md) for the full strategy.
|
|
192
|
+
|
|
193
|
+
## Roadmap
|
|
194
|
+
|
|
195
|
+
- [x] v0.1 — Record/replay/assert core
|
|
196
|
+
- [x] v0.2 — Async + multi-step + OpenAI/Anthropic cost tracking
|
|
197
|
+
- [ ] v0.3 — Web UI for browsing recorded runs
|
|
198
|
+
- [ ] v0.4 — TypeScript SDK
|
|
199
|
+
- [ ] v0.5 — Auto-generated regression tests from production traces
|
|
200
|
+
- [ ] v0.6 — Multi-agent parent/child traces
|
|
201
|
+
- [ ] v1.0 — Enterprise self-hosted edition
|
|
202
|
+
|
|
203
|
+
## Contributing
|
|
204
|
+
|
|
205
|
+
We love contributions. See [`CONTRIBUTING.md`](CONTRIBUTING.md).
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
MIT — see [`LICENSE`](LICENSE).
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "mimic-recording"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "The pytest for AI agents. Record, replay, assert, and diff agent behavior."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [{name = "Mimic", email = "team@mimic.dev"}]
|
|
13
|
+
keywords = ["ai", "agents", "testing", "llm", "evals", "replay", "observability"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 5 - Production/Stable",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Software Development :: Testing",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"pydantic>=2.0",
|
|
28
|
+
"click>=8.0",
|
|
29
|
+
"rich>=13.0",
|
|
30
|
+
"pyyaml>=6.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=7.0",
|
|
36
|
+
"pytest-cov>=4.0",
|
|
37
|
+
"ruff>=0.1.0",
|
|
38
|
+
"mypy>=1.0",
|
|
39
|
+
]
|
|
40
|
+
anthropic = ["anthropic>=0.18.0"]
|
|
41
|
+
openai = ["openai>=1.0.0"]
|
|
42
|
+
|
|
43
|
+
[project.scripts]
|
|
44
|
+
mimic = "mimic.cli:main"
|
|
45
|
+
|
|
46
|
+
[project.urls]
|
|
47
|
+
Homepage = "https://github.com/mimic-ai/mimic"
|
|
48
|
+
Documentation = "https://docs.mimic.dev"
|
|
49
|
+
Repository = "https://github.com/mimic-ai/mimic"
|
|
50
|
+
Issues = "https://github.com/mimic-ai/mimic/issues"
|
|
51
|
+
|
|
52
|
+
[tool.setuptools.packages.find]
|
|
53
|
+
where = ["src"]
|
|
54
|
+
|
|
55
|
+
[tool.setuptools.package-data]
|
|
56
|
+
mimic = ["py.typed"]
|
|
57
|
+
|
|
58
|
+
[tool.pytest.ini_options]
|
|
59
|
+
testpaths = ["tests"]
|
|
60
|
+
python_files = ["test_*.py"]
|
|
61
|
+
addopts = "-v --tb=short"
|
|
62
|
+
|
|
63
|
+
[tool.ruff]
|
|
64
|
+
line-length = 100
|
|
65
|
+
target-version = "py39"
|
|
66
|
+
|
|
67
|
+
[tool.mypy]
|
|
68
|
+
python_version = "3.9"
|
|
69
|
+
strict = false
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Mimic — The pytest for AI agents.
|
|
2
|
+
|
|
3
|
+
Record, replay, assert, and diff agent behavior.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from mimic.assertions import AssertChain, assert_that
|
|
7
|
+
from mimic.diff import diff_runs
|
|
8
|
+
from mimic.models import RunRecord, StepRecord
|
|
9
|
+
from mimic.recorder import Mimic, record
|
|
10
|
+
from mimic.replay import replay
|
|
11
|
+
from mimic.step import ActiveStep, AsyncActiveStep
|
|
12
|
+
|
|
13
|
+
__version__ = "1.0.0"
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Mimic",
|
|
17
|
+
"record",
|
|
18
|
+
"replay",
|
|
19
|
+
"assert_that",
|
|
20
|
+
"AssertChain",
|
|
21
|
+
"diff_runs",
|
|
22
|
+
"RunRecord",
|
|
23
|
+
"StepRecord",
|
|
24
|
+
"ActiveStep",
|
|
25
|
+
"AsyncActiveStep",
|
|
26
|
+
"__version__",
|
|
27
|
+
]
|