traceforge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
traceforge/__init__.py ADDED
@@ -0,0 +1,3 @@
1
"""TraceForge: Deterministic replay, fuzzing & failure minimization for local AI agents."""

# Package version string; keep in sync with the wheel/distribution metadata (0.2.0).
__version__ = "0.2.0"
@@ -0,0 +1,358 @@
1
+ """Causal attribution engine — counterfactual replay to find WHY agents fail."""
2
+
3
+ from collections import defaultdict
4
+ from typing import Optional
5
+
6
+ from traceforge.models import (
7
+ CausalReport,
8
+ CounterfactualResult,
9
+ Intervention,
10
+ InterventionType,
11
+ Scenario,
12
+ TraceIR,
13
+ )
14
+
15
+
16
class InterventionGenerator:
    """Generates counterfactual interventions for a failing trace.

    Each intervention is a single, targeted perturbation — changing a tool
    output's type or value, dropping/adding a response field, truncating the
    replayed context, removing a system-prompt sentence, or renaming a tool
    parameter in its schema — that can be replayed to test whether it flips
    the failure outcome.
    """

    def generate_all(
        self, scenario: Scenario, trace: TraceIR, failing_step: int
    ) -> list[Intervention]:
        """Return every candidate intervention for *failing_step* of *trace*.

        Aggregates all six intervention families; ordering determines which
        interventions survive a caller-side `max_interventions` cutoff.
        """
        interventions: list[Intervention] = []
        interventions.extend(self._tool_output_format_interventions(trace, failing_step))
        interventions.extend(self._tool_output_value_interventions(trace, failing_step))
        interventions.extend(self._tool_output_field_interventions(trace, failing_step))
        interventions.extend(self._context_truncation_interventions(trace, failing_step))
        interventions.extend(self._system_prompt_clause_interventions(scenario))
        interventions.extend(self._tool_schema_interventions(scenario, failing_step))
        return interventions

    def _tool_output_format_interventions(
        self, trace: TraceIR, step_idx: int
    ) -> list[Intervention]:
        """Swap the TYPE of each tool-response field: number -> string, and
        numeric-looking string -> number."""
        interventions: list[Intervention] = []
        if step_idx >= len(trace.steps):
            return interventions
        step = trace.steps[step_idx]
        for tc in step.tool_calls:
            for key, value in tc.response.items():
                if isinstance(value, (int, float)):
                    # Numeric (including bool, an int subclass) -> its string form.
                    interventions.append(Intervention(
                        intervention_type=InterventionType.TOOL_OUTPUT_FORMAT,
                        description=f"Changed '{key}' from {type(value).__name__} to string",
                        target_step=step_idx,
                        target_tool=tc.tool_name,
                        target_field=key,
                        original_value=value,
                        modified_value=str(value),
                    ))
                elif isinstance(value, str):
                    # Only strings that parse as floats get the reverse swap;
                    # keep the try body minimal so only float() is guarded.
                    try:
                        num = float(value)
                    except ValueError:
                        continue
                    interventions.append(Intervention(
                        intervention_type=InterventionType.TOOL_OUTPUT_FORMAT,
                        description=f"Changed '{key}' from string to number",
                        target_step=step_idx,
                        target_tool=tc.tool_name,
                        target_field=key,
                        original_value=value,
                        modified_value=num,
                    ))
        return interventions

    def _tool_output_value_interventions(
        self, trace: TraceIR, step_idx: int
    ) -> list[Intervention]:
        """Perturb the VALUE of each tool-response field: flip booleans;
        zero/negate/double/halve nonzero numbers."""
        interventions: list[Intervention] = []
        if step_idx >= len(trace.steps):
            return interventions
        step = trace.steps[step_idx]
        for tc in step.tool_calls:
            for key, value in tc.response.items():
                # bool must be checked before int/float (bool subclasses int).
                if isinstance(value, bool):
                    interventions.append(Intervention(
                        intervention_type=InterventionType.TOOL_OUTPUT_VALUE,
                        description=f"Flipped '{key}' from {value} to {not value}",
                        target_step=step_idx,
                        target_tool=tc.tool_name,
                        target_field=key,
                        original_value=value,
                        modified_value=not value,
                    ))
                elif isinstance(value, (int, float)) and value != 0:
                    for new_val, desc in [
                        (0, "zero"),
                        (-value, "negated"),
                        (value * 2, "doubled"),
                        (value / 2, "halved"),
                    ]:
                        interventions.append(Intervention(
                            intervention_type=InterventionType.TOOL_OUTPUT_VALUE,
                            description=f"Changed '{key}' to {desc} ({new_val})",
                            target_step=step_idx,
                            target_tool=tc.tool_name,
                            target_field=key,
                            original_value=value,
                            modified_value=new_val,
                        ))
        return interventions

    def _tool_output_field_interventions(
        self, trace: TraceIR, step_idx: int
    ) -> list[Intervention]:
        """Remove each existing response field, and inject one unexpected
        'debug_info' field, per tool call at the failing step."""
        interventions: list[Intervention] = []
        if step_idx >= len(trace.steps):
            return interventions
        step = trace.steps[step_idx]
        for tc in step.tool_calls:
            for key in tc.response:
                # "__REMOVE__" is the sentinel the applier interprets as
                # "delete this field" (see CausalAttributionEngine).
                interventions.append(Intervention(
                    intervention_type=InterventionType.TOOL_OUTPUT_FIELDS,
                    description=f"Removed field '{key}' from {tc.tool_name} response",
                    target_step=step_idx,
                    target_tool=tc.tool_name,
                    target_field=key,
                    original_value=tc.response[key],
                    modified_value="__REMOVE__",
                ))
            interventions.append(Intervention(
                intervention_type=InterventionType.TOOL_OUTPUT_FIELDS,
                description=f"Added unexpected field 'debug_info' to {tc.tool_name} response",
                target_step=step_idx,
                target_tool=tc.tool_name,
                target_field="debug_info",
                original_value=None,
                modified_value="internal debug data - ignore this",
            ))
        return interventions

    def _context_truncation_interventions(
        self, trace: TraceIR, step_idx: int
    ) -> list[Intervention]:
        """Truncate the replayed context to the last K steps.

        Candidate K values are deduplicated: the heuristics (1, 2, and
        step_idx // 2) collide for small step_idx (e.g. step_idx == 2 yields
        [1, 2, 1]), which previously emitted duplicate interventions and
        wasted confirmation runs downstream.
        """
        interventions: list[Intervention] = []
        if step_idx > 0:
            for keep_last in sorted({1, 2, max(1, step_idx // 2)}):
                if keep_last < step_idx:
                    interventions.append(Intervention(
                        intervention_type=InterventionType.CONTEXT_TRUNCATION,
                        description=f"Truncated context to last {keep_last} steps (from {step_idx})",
                        target_step=step_idx,
                        original_value=step_idx,
                        modified_value=keep_last,
                    ))
        return interventions

    def _system_prompt_clause_interventions(
        self, scenario: Scenario
    ) -> list[Intervention]:
        """Remove each sentence of the system prompt, one at a time.

        Sentences are split naively on '.'; abbreviations/decimals will
        over-split, which is acceptable for perturbation purposes.
        """
        interventions: list[Intervention] = []
        prompt = scenario.agent.system_prompt or ""
        sentences = [s.strip() for s in prompt.split(".") if s.strip()]
        for i, sentence in enumerate(sentences):
            remaining = [s for j, s in enumerate(sentences) if j != i]
            # Removing the only sentence yields an empty prompt rather than a
            # stray "." (the previous code appended "." to an empty join).
            reduced = ". ".join(remaining) + "." if remaining else ""
            interventions.append(Intervention(
                intervention_type=InterventionType.SYSTEM_PROMPT_CLAUSE,
                description=f"Removed prompt sentence {i}: '{sentence[:50]}...'",
                original_value=prompt,
                modified_value=reduced,
            ))
        return interventions

    def _tool_schema_interventions(
        self, scenario: Scenario, step_idx: int
    ) -> list[Intervention]:
        """Rename each declared tool parameter to '<name>_v2' in its schema."""
        interventions: list[Intervention] = []
        for tool in scenario.agent.tools:
            props = tool.parameters.get("properties", {})
            for param_name in props:
                interventions.append(Intervention(
                    intervention_type=InterventionType.TOOL_SCHEMA_CHANGE,
                    description=f"Renamed '{param_name}' to '{param_name}_v2' in {tool.name} schema",
                    target_tool=tool.name,
                    target_field=param_name,
                    original_value=param_name,
                    modified_value=f"{param_name}_v2",
                ))
        return interventions
180
+
181
+
182
class CausalAttributionEngine:
    """Runs counterfactual experiments to determine WHY an agent fails.

    Workflow: load a failing trace, find its first failing step, generate
    candidate interventions, re-run the scenario under each intervention
    several times, and rank intervention types by how often they flip the
    failure into a pass.
    """

    def __init__(self, harness, evaluator, trace_store, judge=None):
        # harness: executes scenarios (run_scenario(scenario, runs=...)) and
        #   returns a list of traces.
        # evaluator: scores a trace against a scenario (evaluate(trace, scenario)).
        # trace_store: loads previously recorded traces by id (load(trace_id)).
        # judge: optional; stored but not referenced anywhere in this class —
        #   NOTE(review): confirm whether it is consumed elsewhere.
        self.harness = harness
        self.evaluator = evaluator
        self.store = trace_store
        self.judge = judge
        self.generator = InterventionGenerator()

    def attribute(
        self,
        trace_id: str,
        scenario: Scenario,
        confirmation_runs: int = 3,
        max_interventions: int = 50,
    ) -> CausalReport:
        """Build a CausalReport for the failing trace *trace_id*.

        confirmation_runs: how many times each counterfactual is replayed
            (majority vote decides pass/fail; must be >= 1).
        max_interventions: hard cap on how many generated interventions are
            actually executed (taken in generation order).
        Raises ValueError if the trace has no failing step.
        """
        trace = self.store.load(trace_id)

        # Find the first failing step
        baseline_result = self.evaluator.evaluate(trace, scenario)
        failing_step = self._find_first_failing_step(baseline_result)
        if failing_step is None:
            raise ValueError(f"Trace {trace_id} does not appear to fail any step")

        # Generate interventions
        all_interventions = self.generator.generate_all(scenario, trace, failing_step)
        interventions = all_interventions[:max_interventions]

        # Run counterfactual experiments
        results = []
        for intervention in interventions:
            cf_result = self._run_counterfactual(
                scenario, trace, intervention, failing_step, confirmation_runs
            )
            results.append(cf_result)

        flips = [r for r in results if r.flipped]
        causal_factors = self._rank_factors(results)
        summary = self._generate_summary(scenario, trace, failing_step, causal_factors)

        return CausalReport(
            scenario_name=scenario.name,
            failing_trace_id=trace_id,
            failing_step=failing_step,
            total_interventions=len(results),
            total_flips=len(flips),
            interventions=results,
            causal_factors=causal_factors,
            summary=summary,
        )

    def _run_counterfactual(
        self, scenario, trace, intervention, failing_step, runs
    ) -> CounterfactualResult:
        """Replay the scenario *runs* times under one intervention.

        A harness/evaluator exception counts as a failed run (best-effort:
        deliberately swallowed so one flaky replay cannot abort the whole
        attribution). Pass/fail is decided by strict majority; confidence is
        the majority fraction (0.5..1.0).
        """
        modified_scenario = self._apply_intervention(scenario, trace, intervention)

        pass_count = 0
        last_trace_id = ""
        for _ in range(runs):
            try:
                traces = self.harness.run_scenario(modified_scenario, runs=1)
                if traces:
                    result = self.evaluator.evaluate(traces[0], modified_scenario)
                    if result.passed:
                        pass_count += 1
                    last_trace_id = traces[0].trace_id
            except Exception:
                pass

        cf_passed = pass_count > runs / 2
        confidence = max(pass_count, runs - pass_count) / runs

        return CounterfactualResult(
            intervention=intervention,
            # attribute() only runs on traces already known to fail, so the
            # original outcome is always False here.
            original_passed=False,
            counterfactual_passed=cf_passed,
            flipped=cf_passed,
            trace_id=last_trace_id,
            confidence=confidence,
        )

    def _apply_intervention(self, scenario, trace, intervention):
        """Return a deep copy of *scenario* with one intervention applied.

        Mutates only the copy (pydantic model_copy(deep=True)); the original
        scenario and trace are untouched. Tool-output interventions are
        applied by rewriting the tool's mock_responses so the replay harness
        serves the perturbed values.
        """
        modified = scenario.model_copy(deep=True)

        match intervention.intervention_type:
            case InterventionType.TOOL_OUTPUT_FORMAT | InterventionType.TOOL_OUTPUT_VALUE:
                # Overwrite the targeted field in every mock response that has it.
                for tool in modified.agent.tools:
                    if tool.name == intervention.target_tool and tool.mock_responses:
                        for resp in tool.mock_responses:
                            if intervention.target_field in resp:
                                resp[intervention.target_field] = intervention.modified_value

            case InterventionType.TOOL_OUTPUT_FIELDS:
                # "__REMOVE__" is the generator's sentinel for field deletion;
                # anything else is an injected/overwritten field value.
                for tool in modified.agent.tools:
                    if tool.name == intervention.target_tool and tool.mock_responses:
                        for resp in tool.mock_responses:
                            if intervention.modified_value == "__REMOVE__":
                                resp.pop(intervention.target_field, None)
                            else:
                                resp[intervention.target_field] = intervention.modified_value

            case InterventionType.CONTEXT_TRUNCATION:
                # Keep only the last `keep` scenario steps.
                keep = intervention.modified_value
                if isinstance(keep, int) and keep < len(modified.steps):
                    modified.steps = modified.steps[-keep:]

            case InterventionType.SYSTEM_PROMPT_CLAUSE:
                modified.agent.system_prompt = intervention.modified_value

            case InterventionType.TOOL_SCHEMA_CHANGE:
                # Rename the parameter in both `properties` and `required`.
                for tool in modified.agent.tools:
                    if tool.name == intervention.target_tool:
                        props = tool.parameters.get("properties", {})
                        if intervention.target_field in props:
                            props[intervention.modified_value] = props.pop(
                                intervention.target_field
                            )
                        required = tool.parameters.get("required", [])
                        if intervention.target_field in required:
                            idx = required.index(intervention.target_field)
                            required[idx] = intervention.modified_value

        # Each counterfactual replay is a single run; _run_counterfactual
        # handles the repetition itself.
        modified.runs = 1
        return modified

    def _rank_factors(self, results: list[CounterfactualResult]) -> list[dict]:
        """Aggregate results per intervention type and rank by flip rate.

        sensitivity = flips / total for that type; list is sorted descending.
        """
        type_counts = defaultdict(lambda: {"total": 0, "flips": 0})

        for r in results:
            t = r.intervention.intervention_type.value
            type_counts[t]["total"] += 1
            if r.flipped:
                type_counts[t]["flips"] += 1

        factors = []
        for factor_type, counts in type_counts.items():
            if counts["total"] > 0:
                sensitivity = counts["flips"] / counts["total"]
                factors.append({
                    "factor": factor_type,
                    "sensitivity": round(sensitivity, 3),
                    "flips": counts["flips"],
                    "total": counts["total"],
                    "description": (
                        f"{counts['flips']}/{counts['total']} interventions of type "
                        f"'{factor_type}' flipped the outcome"
                    ),
                })

        factors.sort(key=lambda f: f["sensitivity"], reverse=True)
        return factors

    def _generate_summary(self, scenario, trace, failing_step, factors) -> str:
        """Render a short human-readable summary: primary cause plus up to
        two secondary factors."""
        if not factors:
            return "No causal factors identified. The failure may be intrinsic to the model's capabilities."

        top = factors[0]
        lines = [
            f"Causal analysis of '{scenario.name}' failure at step {failing_step}:",
            "",
            f"Primary cause: {top['factor']} (sensitivity: {top['sensitivity']:.0%})",
            f"  {top['description']}",
        ]
        if len(factors) > 1:
            lines.append("")
            lines.append("Secondary factors:")
            for f in factors[1:3]:
                lines.append(f"  - {f['factor']}: {f['sensitivity']:.0%} sensitivity")

        return "\n".join(lines)

    def _find_first_failing_step(self, run_result) -> Optional[int]:
        """Return the index of the first step whose checks did not all pass,
        or None if every step passed."""
        for sr in run_result.step_results:
            if not sr.all_passed:
                return sr.step_index
        return None