traceforge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- traceforge/__init__.py +3 -0
- traceforge/attribution.py +358 -0
- traceforge/cli.py +588 -0
- traceforge/evaluator.py +338 -0
- traceforge/fuzzer.py +132 -0
- traceforge/harness.py +228 -0
- traceforge/history.py +110 -0
- traceforge/html_report.py +245 -0
- traceforge/invariants.py +477 -0
- traceforge/judge.py +85 -0
- traceforge/loader.py +97 -0
- traceforge/minrepro.py +107 -0
- traceforge/mock_tools.py +27 -0
- traceforge/models.py +354 -0
- traceforge/mutators.py +140 -0
- traceforge/replay.py +92 -0
- traceforge/reporter.py +220 -0
- traceforge/trace_ir.py +32 -0
- traceforge/trace_store.py +132 -0
- traceforge/utils.py +15 -0
- traceforge-0.2.0.dist-info/METADATA +236 -0
- traceforge-0.2.0.dist-info/RECORD +26 -0
- traceforge-0.2.0.dist-info/WHEEL +5 -0
- traceforge-0.2.0.dist-info/entry_points.txt +2 -0
- traceforge-0.2.0.dist-info/licenses/LICENSE +21 -0
- traceforge-0.2.0.dist-info/top_level.txt +1 -0
traceforge/attribution.py
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
"""Causal attribution engine — counterfactual replay to find WHY agents fail."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from traceforge.models import (
|
|
7
|
+
CausalReport,
|
|
8
|
+
CounterfactualResult,
|
|
9
|
+
Intervention,
|
|
10
|
+
InterventionType,
|
|
11
|
+
Scenario,
|
|
12
|
+
TraceIR,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class InterventionGenerator:
    """Generates counterfactual interventions for a failing trace.

    Each private helper inspects one axis of the scenario/trace — tool
    output formats, values, and fields; context length; system-prompt
    sentences; tool schemas — and proposes ``Intervention`` objects that
    each describe a single, isolated perturbation.
    """

    def generate_all(
        self, scenario: Scenario, trace: TraceIR, failing_step: int
    ) -> list[Intervention]:
        """Collect candidate interventions across every axis.

        Args:
            scenario: Scenario the trace was produced from.
            trace: The failing execution trace.
            failing_step: Index of the first failing step in ``trace``.

        Returns:
            Flat list of candidate interventions (possibly empty).
        """
        interventions: list[Intervention] = []
        interventions.extend(self._tool_output_format_interventions(trace, failing_step))
        interventions.extend(self._tool_output_value_interventions(trace, failing_step))
        interventions.extend(self._tool_output_field_interventions(trace, failing_step))
        interventions.extend(self._context_truncation_interventions(trace, failing_step))
        interventions.extend(self._system_prompt_clause_interventions(scenario))
        interventions.extend(self._tool_schema_interventions(scenario, failing_step))
        return interventions

    def _tool_output_format_interventions(
        self, trace: TraceIR, step_idx: int
    ) -> list[Intervention]:
        """Propose number<->string type swaps for tool response fields."""
        interventions: list[Intervention] = []
        if step_idx >= len(trace.steps):
            return interventions
        step = trace.steps[step_idx]
        for tc in step.tool_calls:
            for key, value in tc.response.items():
                # bool is a subclass of int, so the plain (int, float) check
                # would also match booleans and emit a bogus True -> "True"
                # format change; booleans belong to the value-flip generator.
                if isinstance(value, (int, float)) and not isinstance(value, bool):
                    interventions.append(Intervention(
                        intervention_type=InterventionType.TOOL_OUTPUT_FORMAT,
                        description=f"Changed '{key}' from {type(value).__name__} to string",
                        target_step=step_idx,
                        target_tool=tc.tool_name,
                        target_field=key,
                        original_value=value,
                        modified_value=str(value),
                    ))
                elif isinstance(value, str):
                    # Only numeric-looking strings can become numbers.
                    try:
                        num = float(value)
                    except ValueError:
                        continue
                    interventions.append(Intervention(
                        intervention_type=InterventionType.TOOL_OUTPUT_FORMAT,
                        description=f"Changed '{key}' from string to number",
                        target_step=step_idx,
                        target_tool=tc.tool_name,
                        target_field=key,
                        original_value=value,
                        modified_value=num,
                    ))
        return interventions

    def _tool_output_value_interventions(
        self, trace: TraceIR, step_idx: int
    ) -> list[Intervention]:
        """Propose value perturbations: flip booleans, rescale numbers."""
        interventions: list[Intervention] = []
        if step_idx >= len(trace.steps):
            return interventions
        step = trace.steps[step_idx]
        for tc in step.tool_calls:
            for key, value in tc.response.items():
                # Booleans are checked first so the numeric branch below
                # never sees them (bool is an int subclass).
                if isinstance(value, bool):
                    interventions.append(Intervention(
                        intervention_type=InterventionType.TOOL_OUTPUT_VALUE,
                        description=f"Flipped '{key}' from {value} to {not value}",
                        target_step=step_idx,
                        target_tool=tc.tool_name,
                        target_field=key,
                        original_value=value,
                        modified_value=not value,
                    ))
                elif isinstance(value, (int, float)) and value != 0:
                    # Zero is skipped: negating/doubling/halving 0 is a no-op.
                    for new_val, desc in [
                        (0, "zero"),
                        (-value, "negated"),
                        (value * 2, "doubled"),
                        (value / 2, "halved"),
                    ]:
                        interventions.append(Intervention(
                            intervention_type=InterventionType.TOOL_OUTPUT_VALUE,
                            description=f"Changed '{key}' to {desc} ({new_val})",
                            target_step=step_idx,
                            target_tool=tc.tool_name,
                            target_field=key,
                            original_value=value,
                            modified_value=new_val,
                        ))
        return interventions

    def _tool_output_field_interventions(
        self, trace: TraceIR, step_idx: int
    ) -> list[Intervention]:
        """Propose removing each response field and adding an unexpected one."""
        interventions: list[Intervention] = []
        if step_idx >= len(trace.steps):
            return interventions
        step = trace.steps[step_idx]
        for tc in step.tool_calls:
            for key in tc.response:
                interventions.append(Intervention(
                    intervention_type=InterventionType.TOOL_OUTPUT_FIELDS,
                    description=f"Removed field '{key}' from {tc.tool_name} response",
                    target_step=step_idx,
                    target_tool=tc.tool_name,
                    target_field=key,
                    original_value=tc.response[key],
                    # Sentinel understood by the intervention applier.
                    modified_value="__REMOVE__",
                ))
            # Distractor field: tests whether the agent is derailed by
            # extra, irrelevant keys in a tool response.
            interventions.append(Intervention(
                intervention_type=InterventionType.TOOL_OUTPUT_FIELDS,
                description=f"Added unexpected field 'debug_info' to {tc.tool_name} response",
                target_step=step_idx,
                target_tool=tc.tool_name,
                target_field="debug_info",
                original_value=None,
                modified_value="internal debug data - ignore this",
            ))
        return interventions

    def _context_truncation_interventions(
        self, trace: TraceIR, step_idx: int
    ) -> list[Intervention]:
        """Propose truncating the conversation context before the failing step."""
        interventions: list[Intervention] = []
        if step_idx > 0:
            # De-duplicate candidate sizes: step_idx // 2 coincides with 1 or 2
            # for small step indices, which previously produced identical
            # duplicate interventions.
            for keep_last in sorted({1, 2, max(1, step_idx // 2)}):
                if keep_last < step_idx:
                    interventions.append(Intervention(
                        intervention_type=InterventionType.CONTEXT_TRUNCATION,
                        description=f"Truncated context to last {keep_last} steps (from {step_idx})",
                        target_step=step_idx,
                        original_value=step_idx,
                        modified_value=keep_last,
                    ))
        return interventions

    def _system_prompt_clause_interventions(
        self, scenario: Scenario
    ) -> list[Intervention]:
        """Propose dropping each system-prompt sentence, one at a time."""
        interventions: list[Intervention] = []
        prompt = scenario.agent.system_prompt or ""
        # Naive sentence split on "."; good enough for ablation purposes.
        sentences = [s.strip() for s in prompt.split(".") if s.strip()]
        for i, sentence in enumerate(sentences):
            reduced = ". ".join(s for j, s in enumerate(sentences) if j != i) + "."
            interventions.append(Intervention(
                intervention_type=InterventionType.SYSTEM_PROMPT_CLAUSE,
                description=f"Removed prompt sentence {i}: '{sentence[:50]}...'",
                original_value=prompt,
                modified_value=reduced,
            ))
        return interventions

    def _tool_schema_interventions(
        self, scenario: Scenario, step_idx: int
    ) -> list[Intervention]:
        """Propose renaming each tool parameter in its JSON schema.

        ``step_idx`` is unused here but kept for a uniform helper signature.
        """
        interventions: list[Intervention] = []
        for tool in scenario.agent.tools:
            props = tool.parameters.get("properties", {})
            for param_name in props:
                interventions.append(Intervention(
                    intervention_type=InterventionType.TOOL_SCHEMA_CHANGE,
                    description=f"Renamed '{param_name}' to '{param_name}_v2' in {tool.name} schema",
                    target_tool=tool.name,
                    target_field=param_name,
                    original_value=param_name,
                    modified_value=f"{param_name}_v2",
                ))
        return interventions
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class CausalAttributionEngine:
|
|
183
|
+
"""Runs counterfactual experiments to determine WHY an agent fails."""
|
|
184
|
+
|
|
185
|
+
def __init__(self, harness, evaluator, trace_store, judge=None):
|
|
186
|
+
self.harness = harness
|
|
187
|
+
self.evaluator = evaluator
|
|
188
|
+
self.store = trace_store
|
|
189
|
+
self.judge = judge
|
|
190
|
+
self.generator = InterventionGenerator()
|
|
191
|
+
|
|
192
|
+
def attribute(
|
|
193
|
+
self,
|
|
194
|
+
trace_id: str,
|
|
195
|
+
scenario: Scenario,
|
|
196
|
+
confirmation_runs: int = 3,
|
|
197
|
+
max_interventions: int = 50,
|
|
198
|
+
) -> CausalReport:
|
|
199
|
+
trace = self.store.load(trace_id)
|
|
200
|
+
|
|
201
|
+
# Find the first failing step
|
|
202
|
+
baseline_result = self.evaluator.evaluate(trace, scenario)
|
|
203
|
+
failing_step = self._find_first_failing_step(baseline_result)
|
|
204
|
+
if failing_step is None:
|
|
205
|
+
raise ValueError(f"Trace {trace_id} does not appear to fail any step")
|
|
206
|
+
|
|
207
|
+
# Generate interventions
|
|
208
|
+
all_interventions = self.generator.generate_all(scenario, trace, failing_step)
|
|
209
|
+
interventions = all_interventions[:max_interventions]
|
|
210
|
+
|
|
211
|
+
# Run counterfactual experiments
|
|
212
|
+
results = []
|
|
213
|
+
for intervention in interventions:
|
|
214
|
+
cf_result = self._run_counterfactual(
|
|
215
|
+
scenario, trace, intervention, failing_step, confirmation_runs
|
|
216
|
+
)
|
|
217
|
+
results.append(cf_result)
|
|
218
|
+
|
|
219
|
+
flips = [r for r in results if r.flipped]
|
|
220
|
+
causal_factors = self._rank_factors(results)
|
|
221
|
+
summary = self._generate_summary(scenario, trace, failing_step, causal_factors)
|
|
222
|
+
|
|
223
|
+
return CausalReport(
|
|
224
|
+
scenario_name=scenario.name,
|
|
225
|
+
failing_trace_id=trace_id,
|
|
226
|
+
failing_step=failing_step,
|
|
227
|
+
total_interventions=len(results),
|
|
228
|
+
total_flips=len(flips),
|
|
229
|
+
interventions=results,
|
|
230
|
+
causal_factors=causal_factors,
|
|
231
|
+
summary=summary,
|
|
232
|
+
)
|
|
233
|
+
|
|
234
|
+
def _run_counterfactual(
|
|
235
|
+
self, scenario, trace, intervention, failing_step, runs
|
|
236
|
+
) -> CounterfactualResult:
|
|
237
|
+
modified_scenario = self._apply_intervention(scenario, trace, intervention)
|
|
238
|
+
|
|
239
|
+
pass_count = 0
|
|
240
|
+
last_trace_id = ""
|
|
241
|
+
for _ in range(runs):
|
|
242
|
+
try:
|
|
243
|
+
traces = self.harness.run_scenario(modified_scenario, runs=1)
|
|
244
|
+
if traces:
|
|
245
|
+
result = self.evaluator.evaluate(traces[0], modified_scenario)
|
|
246
|
+
if result.passed:
|
|
247
|
+
pass_count += 1
|
|
248
|
+
last_trace_id = traces[0].trace_id
|
|
249
|
+
except Exception:
|
|
250
|
+
pass
|
|
251
|
+
|
|
252
|
+
cf_passed = pass_count > runs / 2
|
|
253
|
+
confidence = max(pass_count, runs - pass_count) / runs
|
|
254
|
+
|
|
255
|
+
return CounterfactualResult(
|
|
256
|
+
intervention=intervention,
|
|
257
|
+
original_passed=False,
|
|
258
|
+
counterfactual_passed=cf_passed,
|
|
259
|
+
flipped=cf_passed,
|
|
260
|
+
trace_id=last_trace_id,
|
|
261
|
+
confidence=confidence,
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
def _apply_intervention(self, scenario, trace, intervention):
|
|
265
|
+
modified = scenario.model_copy(deep=True)
|
|
266
|
+
|
|
267
|
+
match intervention.intervention_type:
|
|
268
|
+
case InterventionType.TOOL_OUTPUT_FORMAT | InterventionType.TOOL_OUTPUT_VALUE:
|
|
269
|
+
for tool in modified.agent.tools:
|
|
270
|
+
if tool.name == intervention.target_tool and tool.mock_responses:
|
|
271
|
+
for resp in tool.mock_responses:
|
|
272
|
+
if intervention.target_field in resp:
|
|
273
|
+
resp[intervention.target_field] = intervention.modified_value
|
|
274
|
+
|
|
275
|
+
case InterventionType.TOOL_OUTPUT_FIELDS:
|
|
276
|
+
for tool in modified.agent.tools:
|
|
277
|
+
if tool.name == intervention.target_tool and tool.mock_responses:
|
|
278
|
+
for resp in tool.mock_responses:
|
|
279
|
+
if intervention.modified_value == "__REMOVE__":
|
|
280
|
+
resp.pop(intervention.target_field, None)
|
|
281
|
+
else:
|
|
282
|
+
resp[intervention.target_field] = intervention.modified_value
|
|
283
|
+
|
|
284
|
+
case InterventionType.CONTEXT_TRUNCATION:
|
|
285
|
+
keep = intervention.modified_value
|
|
286
|
+
if isinstance(keep, int) and keep < len(modified.steps):
|
|
287
|
+
modified.steps = modified.steps[-keep:]
|
|
288
|
+
|
|
289
|
+
case InterventionType.SYSTEM_PROMPT_CLAUSE:
|
|
290
|
+
modified.agent.system_prompt = intervention.modified_value
|
|
291
|
+
|
|
292
|
+
case InterventionType.TOOL_SCHEMA_CHANGE:
|
|
293
|
+
for tool in modified.agent.tools:
|
|
294
|
+
if tool.name == intervention.target_tool:
|
|
295
|
+
props = tool.parameters.get("properties", {})
|
|
296
|
+
if intervention.target_field in props:
|
|
297
|
+
props[intervention.modified_value] = props.pop(
|
|
298
|
+
intervention.target_field
|
|
299
|
+
)
|
|
300
|
+
required = tool.parameters.get("required", [])
|
|
301
|
+
if intervention.target_field in required:
|
|
302
|
+
idx = required.index(intervention.target_field)
|
|
303
|
+
required[idx] = intervention.modified_value
|
|
304
|
+
|
|
305
|
+
modified.runs = 1
|
|
306
|
+
return modified
|
|
307
|
+
|
|
308
|
+
def _rank_factors(self, results: list[CounterfactualResult]) -> list[dict]:
|
|
309
|
+
type_counts = defaultdict(lambda: {"total": 0, "flips": 0})
|
|
310
|
+
|
|
311
|
+
for r in results:
|
|
312
|
+
t = r.intervention.intervention_type.value
|
|
313
|
+
type_counts[t]["total"] += 1
|
|
314
|
+
if r.flipped:
|
|
315
|
+
type_counts[t]["flips"] += 1
|
|
316
|
+
|
|
317
|
+
factors = []
|
|
318
|
+
for factor_type, counts in type_counts.items():
|
|
319
|
+
if counts["total"] > 0:
|
|
320
|
+
sensitivity = counts["flips"] / counts["total"]
|
|
321
|
+
factors.append({
|
|
322
|
+
"factor": factor_type,
|
|
323
|
+
"sensitivity": round(sensitivity, 3),
|
|
324
|
+
"flips": counts["flips"],
|
|
325
|
+
"total": counts["total"],
|
|
326
|
+
"description": (
|
|
327
|
+
f"{counts['flips']}/{counts['total']} interventions of type "
|
|
328
|
+
f"'{factor_type}' flipped the outcome"
|
|
329
|
+
),
|
|
330
|
+
})
|
|
331
|
+
|
|
332
|
+
factors.sort(key=lambda f: f["sensitivity"], reverse=True)
|
|
333
|
+
return factors
|
|
334
|
+
|
|
335
|
+
def _generate_summary(self, scenario, trace, failing_step, factors) -> str:
|
|
336
|
+
if not factors:
|
|
337
|
+
return "No causal factors identified. The failure may be intrinsic to the model's capabilities."
|
|
338
|
+
|
|
339
|
+
top = factors[0]
|
|
340
|
+
lines = [
|
|
341
|
+
f"Causal analysis of '{scenario.name}' failure at step {failing_step}:",
|
|
342
|
+
"",
|
|
343
|
+
f"Primary cause: {top['factor']} (sensitivity: {top['sensitivity']:.0%})",
|
|
344
|
+
f" {top['description']}",
|
|
345
|
+
]
|
|
346
|
+
if len(factors) > 1:
|
|
347
|
+
lines.append("")
|
|
348
|
+
lines.append("Secondary factors:")
|
|
349
|
+
for f in factors[1:3]:
|
|
350
|
+
lines.append(f" - {f['factor']}: {f['sensitivity']:.0%} sensitivity")
|
|
351
|
+
|
|
352
|
+
return "\n".join(lines)
|
|
353
|
+
|
|
354
|
+
def _find_first_failing_step(self, run_result) -> Optional[int]:
|
|
355
|
+
for sr in run_result.step_results:
|
|
356
|
+
if not sr.all_passed:
|
|
357
|
+
return sr.step_index
|
|
358
|
+
return None
|