traceforge-0.2.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- traceforge-0.2.0/LICENSE +21 -0
- traceforge-0.2.0/PKG-INFO +236 -0
- traceforge-0.2.0/README.md +204 -0
- traceforge-0.2.0/pyproject.toml +51 -0
- traceforge-0.2.0/setup.cfg +4 -0
- traceforge-0.2.0/src/traceforge/__init__.py +3 -0
- traceforge-0.2.0/src/traceforge/attribution.py +358 -0
- traceforge-0.2.0/src/traceforge/cli.py +588 -0
- traceforge-0.2.0/src/traceforge/evaluator.py +338 -0
- traceforge-0.2.0/src/traceforge/fuzzer.py +132 -0
- traceforge-0.2.0/src/traceforge/harness.py +228 -0
- traceforge-0.2.0/src/traceforge/history.py +110 -0
- traceforge-0.2.0/src/traceforge/html_report.py +245 -0
- traceforge-0.2.0/src/traceforge/invariants.py +477 -0
- traceforge-0.2.0/src/traceforge/judge.py +85 -0
- traceforge-0.2.0/src/traceforge/loader.py +97 -0
- traceforge-0.2.0/src/traceforge/minrepro.py +107 -0
- traceforge-0.2.0/src/traceforge/mock_tools.py +27 -0
- traceforge-0.2.0/src/traceforge/models.py +354 -0
- traceforge-0.2.0/src/traceforge/mutators.py +140 -0
- traceforge-0.2.0/src/traceforge/replay.py +92 -0
- traceforge-0.2.0/src/traceforge/reporter.py +220 -0
- traceforge-0.2.0/src/traceforge/trace_ir.py +32 -0
- traceforge-0.2.0/src/traceforge/trace_store.py +132 -0
- traceforge-0.2.0/src/traceforge/utils.py +15 -0
- traceforge-0.2.0/src/traceforge.egg-info/PKG-INFO +236 -0
- traceforge-0.2.0/src/traceforge.egg-info/SOURCES.txt +45 -0
- traceforge-0.2.0/src/traceforge.egg-info/dependency_links.txt +1 -0
- traceforge-0.2.0/src/traceforge.egg-info/entry_points.txt +2 -0
- traceforge-0.2.0/src/traceforge.egg-info/requires.txt +13 -0
- traceforge-0.2.0/src/traceforge.egg-info/top_level.txt +1 -0
- traceforge-0.2.0/tests/test_attribution.py +336 -0
- traceforge-0.2.0/tests/test_evaluator.py +282 -0
- traceforge-0.2.0/tests/test_fuzzer.py +168 -0
- traceforge-0.2.0/tests/test_harness.py +184 -0
- traceforge-0.2.0/tests/test_history.py +104 -0
- traceforge-0.2.0/tests/test_invariants.py +352 -0
- traceforge-0.2.0/tests/test_judge.py +96 -0
- traceforge-0.2.0/tests/test_loader.py +181 -0
- traceforge-0.2.0/tests/test_minrepro.py +249 -0
- traceforge-0.2.0/tests/test_mock_tools.py +56 -0
- traceforge-0.2.0/tests/test_models.py +290 -0
- traceforge-0.2.0/tests/test_mutators.py +145 -0
- traceforge-0.2.0/tests/test_replay.py +185 -0
- traceforge-0.2.0/tests/test_reporter.py +143 -0
- traceforge-0.2.0/tests/test_trace_ir.py +126 -0
- traceforge-0.2.0/tests/test_trace_store.py +130 -0
traceforge-0.2.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Abhimanyu Bhagwati

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
traceforge-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,236 @@
Metadata-Version: 2.4
Name: traceforge
Version: 0.2.0
Summary: Test harness for AI agents that call tools. Record, replay, fuzz, and debug.
Author-email: Abhimanyu Bhagwati <abhimanyu@vt.edu>
License: MIT
Project-URL: Homepage, https://github.com/AbhimanyuBhagwati/TraceForge
Project-URL: Repository, https://github.com/AbhimanyuBhagwati/TraceForge
Project-URL: Issues, https://github.com/AbhimanyuBhagwati/TraceForge/issues
Keywords: ai,agents,testing,llm,ollama,evaluation,fuzzing,replay,debugging
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Software Development :: Testing
Classifier: Topic :: Software Development :: Debuggers
Requires-Python: >=3.12
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: dev
Requires-Dist: pytest>=8.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
Requires-Dist: pytest-cov>=4.0; extra == "dev"
Provides-Extra: dependencies
Requires-Dist: click>=8.0; extra == "dependencies"
Requires-Dist: ollama>=0.1.6; extra == "dependencies"
Requires-Dist: pydantic>=2.0; extra == "dependencies"
Requires-Dist: pyyaml>=6.0; extra == "dependencies"
Requires-Dist: rich>=13.0; extra == "dependencies"
Requires-Dist: jinja2>=3.0; extra == "dependencies"
Dynamic: license-file

# TraceForge

A test harness for AI agents that call tools.

If you're building agents with tool-calling (on Ollama, local models, etc.) and you're tired of staring at logs trying to figure out why your agent called the wrong tool or returned garbage — this is for you.

## What it does

You write a YAML file describing what your agent should do. TraceForge runs it, records everything, and then lets you analyze the recordings without re-running the model.

```yaml
name: calculator_agent
agent:
  model: qwen2.5:7b-instruct
  system_prompt: "You are a calculator assistant."
tools:
  - name: calculate
    description: "Perform a math calculation"
    parameters:
      type: object
      properties:
        expression: { type: string }
      required: [expression]
    mock_responses: [{ result: 42 }]

steps:
  - user_message: "What is 6 times 7?"
    expectations:
      - type: tool_called
        tool: calculate
      - type: response_contains
        values: ["42"]
```

```
$ traceforge run ./scenarios/ --runs 10

╭───────────────────── TraceForge Report ──────────────────────╮
│ SCENARIO              PASS    FAIL    RATE   CONSIST   AVG MS│
│ OK calculator_agent   10/10   0/10    100%   1.00      1,059 │
│ XX multi_step_math    0/10    10/10   0%     1.00      3,598 │
│ OK simple_chat        10/10   0/10    100%   1.00        898 │
│ OK weather_agent      10/10   0/10    100%   1.00      1,246 │
│                                                              │
│ OVERALL: 75.0% pass rate                                     │
╰──────────────────────────────────────────────────────────────╯
```

## The idea

Running an LLM is expensive and slow. But once you have a recording of what it did, you can re-evaluate it instantly, fuzz it, minimize it, and analyze it — all offline.

TraceForge records every agent run as an immutable, content-addressed trace (SHA-256 hashed; a minimal sketch of that idea follows this list). Then it gives you tools to work with those traces:

- **Replay** — re-evaluate a trace with different expectations, no model needed
- **Fuzz** — mutate tool responses (nulls, type swaps, empty strings) and see what breaks your agent
- **MinRepro** — your agent runs 4 steps and fails; delta debugging finds the 1 step that actually matters
- **Mine** — automatically discover behavioral rules from passing traces ("calculate is always called at step 0", "expression is always non-empty")
- **Attribute** — when something fails, run counterfactual experiments to find out why ("the agent is sensitive to tool output values, not format")
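
To make "content-addressed" concrete, here is a minimal sketch of the idea: hash a canonical JSON encoding of the run and use the digest as the trace ID. Illustrative only; the field names (`scenario`, `steps`, `tool_calls`) are assumptions for the example, not TraceForge's actual schema.

```python
import hashlib
import json

def trace_id(trace: dict) -> str:
    """Content-address a trace: identical content always yields the same ID."""
    # Canonical encoding (sorted keys, fixed separators) makes the hash
    # independent of dict ordering and whitespace.
    canonical = json.dumps(trace, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

trace = {
    "scenario": "calculator_agent",
    "steps": [{
        "user_message": "What is 6 times 7?",
        "tool_calls": [{"name": "calculate",
                        "args": {"expression": "6*7"},
                        "result": {"result": 42}}],
        "response": "6 times 7 is 42.",
    }],
}
print(trace_id(trace))  # stable 64-char hex digest; any edit changes it
```

Because the ID derives from content, re-recording an identical run adds nothing new, and two traces with the same ID are guaranteed to be the same trace.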

## Install

```bash
pip install traceforge
```

Or from source:

```bash
git clone https://github.com/AbhimanyuBhagwati/TraceForge.git
cd TraceForge
pip install -e ".[dev]"
```

You'll need [Ollama](https://ollama.com/) running locally with a model pulled:

```bash
ollama pull qwen2.5:7b-instruct
```

## Quick start

```bash
# Create example scenarios
traceforge init

# Run them
traceforge run ./examples/scenarios/ --runs 5

# See what you've got
traceforge traces
traceforge info

# Replay a trace offline (no model call)
traceforge replay <trace-id>

# Fuzz tool responses
traceforge fuzz ./examples/scenarios/

# Find minimal failing case
traceforge minrepro <failing-trace-id> --scenario ./examples/scenarios/

# Discover behavioral patterns
traceforge mine calculator_agent -v

# Find root cause of failure
traceforge attribute <failing-trace-id> --scenario ./examples/scenarios/
```

## How it works

```
YAML scenario
      |
      v
traceforge run        -> traces (content-addressed, stored locally)
      |
      v
traceforge replay     -> re-evaluate offline
traceforge fuzz       -> break tool responses, find fragility
traceforge minrepro   -> shrink failing trace to minimal case
traceforge mine       -> discover behavioral rules from traces
traceforge attribute  -> counterfactual analysis of failures
      |
      v
CLI output / HTML report / JSON export
```

Everything after `run` works on stored traces. Run the model once, analyze as many times as you want.
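
As one example of working on stored traces, the `minrepro` stage in the diagram can be pictured as a greedy delta-debugging loop: drop any step whose removal keeps the failure, and repeat until nothing more can be dropped. A sketch only, assuming a hypothetical `still_fails` predicate that replays a candidate step list; this is not the package's actual algorithm.

```python
from typing import Callable

def minimize_steps(steps: list, still_fails: Callable[[list], bool]) -> list:
    """Greedy one-at-a-time reduction: remove any step whose removal
    preserves the failure, until no single step can be removed."""
    assert still_fails(steps), "start from a failing step list"
    reduced = list(steps)
    changed = True
    while changed:
        changed = False
        for i in range(len(reduced)):
            candidate = reduced[:i] + reduced[i + 1:]
            if candidate and still_fails(candidate):
                reduced = candidate
                changed = True
                break
    return reduced

# Toy predicate: the run fails whenever step "c" is present.
print(minimize_steps(["a", "b", "c", "d"], lambda s: "c" in s))  # ['c']
```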

## Expectations

10 built-in expectation types you can use in your YAML:

| Type | What it checks |
|------|---------------|
| `tool_called` | Agent called this tool |
| `tool_not_called` | Agent didn't call this tool |
| `tool_args_contain` | Tool was called with these arguments |
| `response_contains` | Agent's response includes these strings |
| `response_not_contains` | Agent's response doesn't include this |
| `response_matches_regex` | Response matches a regex |
| `llm_judge` | Another LLM evaluates the response |
| `latency_under` | Step completed within N ms |
| `no_tool_errors` | No tool calls returned errors |
| `tool_call_count` | Tool was called exactly/at least/at most N times |
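
Because traces are stored data, these checks can run with no model in the loop. Here is a hedged sketch of two of them, `tool_called` and `response_contains`, evaluated against a plain-dict step; the trace shape is assumed for illustration, and the real logic lives in `evaluator.py`.

```python
def check(expectation: dict, step: dict) -> bool:
    """Evaluate one expectation against one recorded step (illustrative)."""
    kind = expectation["type"]
    if kind == "tool_called":
        # Pass if any recorded tool call in this step used the named tool.
        return any(c["name"] == expectation["tool"]
                   for c in step.get("tool_calls", []))
    if kind == "response_contains":
        # Pass only if every expected string appears in the final response.
        return all(v in step.get("response", "")
                   for v in expectation["values"])
    raise ValueError(f"unknown expectation type: {kind}")

step = {"tool_calls": [{"name": "calculate", "args": {"expression": "6*7"}}],
        "response": "6 times 7 is 42."}
print(check({"type": "tool_called", "tool": "calculate"}, step))      # True
print(check({"type": "response_contains", "values": ["42"]}, step))   # True
```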

## Invariant mining

Instead of writing expectations by hand, let TraceForge figure them out:

```bash
$ traceforge mine calculator_agent -v

╭────────────── Invariant Mining Report ───────────────╮
│ Traces analyzed: 15 (15 passing, 0 failing)          │
│ Invariants discovered: 5                             │
│                                                      │
│ - 'calculate' is always called at step 0             │
│ - 'calculate' is called 1-5 times per run            │
│ - 'calculate.expression' is always non-empty         │
│ - Step 0 response length is 30-48 chars              │
│ - Step 0 latency is under 3916ms                     │
╰──────────────────────────────────────────────────────╯
```

Run enough traces and the miner will find rules that hold in all passing traces but break in failing ones. Those are your bugs.
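
A toy version of the mining step, under the same assumed trace shape as the earlier sketches: compute a statistic per passing trace, then report the range it always stayed inside.

```python
def mine_call_count(traces: list[dict], tool: str) -> str:
    """Derive a call-count invariant, e.g. "'calculate' is called 1-5 times
    per run", from the range observed across all passing traces."""
    counts = []
    for trace in traces:
        # Count how often `tool` was called anywhere in this trace.
        n = sum(c["name"] == tool
                for step in trace["steps"]
                for c in step.get("tool_calls", []))
        counts.append(n)
    return f"'{tool}' is called {min(counts)}-{max(counts)} times per run"

passing = [
    {"steps": [{"tool_calls": [{"name": "calculate"}]}]},
    {"steps": [{"tool_calls": [{"name": "calculate"},
                               {"name": "calculate"}]}]},
]
print(mine_call_count(passing, "calculate"))
# 'calculate' is called 1-2 times per run
```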

## Causal attribution

When a trace fails, TraceForge can run counterfactual experiments — change one thing at a time, re-run the agent, and see what flips the outcome.

```bash
$ traceforge attribute <trace-id> --scenario ./scenarios/

╭────────────── Causal Attribution Report ─────────────╮
│ Failing step: 2 | Interventions: 23 | Flips: 7       │
│                                                      │
│ CAUSAL FACTOR            SENSITIVITY                 │
│ tool_output_value        40%                         │
│ tool_output_format       0%                          │
│ system_prompt_clause     0%                          │
╰──────────────────────────────────────────────────────╯
```

"40% of value changes flipped the outcome. Format and prompt don't matter." Now you know where to look.

## Requirements

- Python 3.12+
- Ollama running locally
- A pulled model (tested with `qwen2.5:7b-instruct`)

## Tests

```bash
pytest tests/ -v
```

183 tests; they run in about a second.

## License

MIT
traceforge-0.2.0/README.md
ADDED
@@ -0,0 +1,204 @@
(Content identical to the long description embedded in PKG-INFO above; not repeated here.)
traceforge-0.2.0/pyproject.toml
ADDED
@@ -0,0 +1,51 @@
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "traceforge"
version = "0.2.0"
description = "Test harness for AI agents that call tools. Record, replay, fuzz, and debug."
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.12"
authors = [{name = "Abhimanyu Bhagwati", email = "abhimanyu@vt.edu"}]
keywords = ["ai", "agents", "testing", "llm", "ollama", "evaluation", "fuzzing", "replay", "debugging"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Topic :: Software Development :: Testing",
    "Topic :: Software Development :: Debuggers",
]

[project.urls]
Homepage = "https://github.com/AbhimanyuBhagwati/TraceForge"
Repository = "https://github.com/AbhimanyuBhagwati/TraceForge"
Issues = "https://github.com/AbhimanyuBhagwati/TraceForge/issues"

[project.optional-dependencies]
dev = [
    "pytest>=8.0",
    "pytest-asyncio>=0.23",
    "pytest-cov>=4.0",
]

dependencies = [
    "click>=8.0",
    "ollama>=0.1.6",
    "pydantic>=2.0",
    "pyyaml>=6.0",
    "rich>=13.0",
    "jinja2>=3.0",
]

[project.scripts]
traceforge = "traceforge.cli:main"

[tool.setuptools.packages.find]
where = ["src"]

[tool.pytest.ini_options]
testpaths = ["tests"]