agent-apprenticeship 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +217 -0
  3. package/bin/agent-apprenticeship.js +131 -0
  4. package/package.json +30 -0
  5. package/pyproject.toml +23 -0
  6. package/src/agent_apprenticeship_trace/__init__.py +2 -0
  7. package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
  8. package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
  9. package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
  10. package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
  11. package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
  12. package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
  13. package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
  14. package/src/agent_apprenticeship_trace/certification.py +580 -0
  15. package/src/agent_apprenticeship_trace/cli.py +2979 -0
  16. package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
  17. package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
  18. package/src/agent_apprenticeship_trace/config.py +609 -0
  19. package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
  20. package/src/agent_apprenticeship_trace/env.py +46 -0
  21. package/src/agent_apprenticeship_trace/evaluator.py +64 -0
  22. package/src/agent_apprenticeship_trace/grader.py +194 -0
  23. package/src/agent_apprenticeship_trace/integration_status.py +193 -0
  24. package/src/agent_apprenticeship_trace/io.py +20 -0
  25. package/src/agent_apprenticeship_trace/learning.py +627 -0
  26. package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
  27. package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
  28. package/src/agent_apprenticeship_trace/loop.py +111 -0
  29. package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
  30. package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
  31. package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
  32. package/src/agent_apprenticeship_trace/progress.py +223 -0
  33. package/src/agent_apprenticeship_trace/public_run.py +1109 -0
  34. package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
  35. package/src/agent_apprenticeship_trace/recipes.py +129 -0
  36. package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
  37. package/src/agent_apprenticeship_trace/revision.py +21 -0
  38. package/src/agent_apprenticeship_trace/role_runners.py +7 -0
  39. package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
  40. package/src/agent_apprenticeship_trace/schemas.py +273 -0
  41. package/src/agent_apprenticeship_trace/session_events.py +99 -0
  42. package/src/agent_apprenticeship_trace/task_intake.py +112 -0
  43. package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
  44. package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
  45. package/src/agent_apprenticeship_trace/training_signals.py +30 -0
  46. package/src/agent_apprenticeship_trace/validation.py +210 -0
  47. package/src/agent_apprenticeship_trace/verifier.py +55 -0
@@ -0,0 +1,669 @@
1
+ from __future__ import annotations
2
+ import json, re
3
+ from pathlib import Path
4
+ from typing import Any
5
+ from pydantic import BaseModel, Field
6
+ from .schemas import AgentTrace, ActualOutputs
7
+ from .io import write_json
8
+
9
# Top-level keys of the canonical trace schema.
# NOTE(review): not referenced in the visible part of this module; presumably
# consumed by normalize_agent_trace further down - confirm.
CANONICAL_TOP = {'schema_version','trace_id','collection_id','prior_trace_id','trace_mode','task','task_id','task_family_id','attempt_id','attempt_kind','attempt_status','agent_tools','started_at','ended_at','system_prompt','system_prompt_hash','skills','memory','agent_config','learning','termination_reason','steps','actual_outputs','artifacts','metadata_json'}
# Keys of a canonical step. _normalize_step treats any raw step key outside
# this set (and outside a short alias list) as an unknown field and preserves
# it under metadata_json['original_fields'].
CANONICAL_STEP = {'step','turn','actor','action','operation','tool','execution_mode','parallel_group','observation','input','input_source','output','state_change','reasoning','caused_by','causal_type','causal_note','alternatives_considered','success','step_outcome','error_type','error_message','message_role','feedback_type','feedback_content','started_at','ended_at','retry_of','artifact_refs','metadata_json'}
# Accepted step_outcome values (see _normalize_outcome).
VALID_OUTCOMES={'progress','neutral','blocked','failed','corrected','completed'}
# Accepted action values (see _action).
VALID_ACTIONS={'user_message','agent_step','output','error'}
# Accepted operation values (see _operation).
VALID_OPS={'plan','analyze','search','read','write','edit','execute','verify','download','install','ask_user','answer','select','grade','evaluate','revise','other'}
# Accepted causal_type values (see _normalize_causal_type).
VALID_CAUSAL_TYPES={'user_request','follow_up_user_request','answer_to_agent_question','execution_of_plan','dependency_on_tool_result','retry_after_failure','correction_response','approval_response','verification_of_prior_step','dependency_on_multiple_prior_steps','delegation_to_subagent','delegated_work','used_subagent_result','handoff_from_subagent','parallel_work','other'}
# Non-canonical causal_type spellings and the canonical value each maps to.
CAUSAL_MAP={'sequential':'execution_of_plan','sequence':'execution_of_plan','previous_step':'execution_of_plan'}
# Non-canonical operation/action labels and the canonical operation each maps
# to. Lookup in _operation is exact-match on the stripped raw value.
OP_MAP={
    'inspect_workspace_files':'read','inspect_workspace':'read','read_policy':'read','read_input_csvs':'read','create_artifacts_directory':'execute','create_reconciliation_script':'write','run_initial_reconciliation':'execute','inspect_initial_outputs':'read','generate_reconciliation_artifacts':'execute','inspect_reconciled_payments':'read','validate_json_files':'verify','validate_artifacts':'verify','run':'execute','shell':'execute','bash':'execute','create_file':'write','write_file':'write','modify_file':'edit','read_file':'read','inspect':'read','think':'analyze','reason':'analyze','summarize':'analyze','finalize':'answer','final_response':'answer'}
# Operation groupings used by _tool and _derive_observation.
READ_OPS={'read','search'}; WRITE_OPS={'write','edit'}; EXEC_OPS={'execute','install','download'}; VERIFY_OPS={'verify'}
# Accepted attempt_kind values (see _canonical_attempt_kind).
VALID_ATTEMPT_KINDS={'baseline','revised','apprentice_without_lessons','apprentice_with_lessons','other'}
# Accepted trace_mode values (see _canonical_trace_mode).
VALID_TRACE_MODES={'live','retraced','hybrid'}
# Accepted termination_reason values (see _canonical_termination_reason).
VALID_TERMINATION_REASONS={'task_complete','verifier_passed','verifier_failed','max_iterations_reached','agent_blocked','timeout','error_unrecoverable','partial_then_stopped','provider_usage_limit','other'}
# Accepted attempt_status values (see infer_attempt_status).
VALID_ATTEMPT_STATUSES={'completed','failed','blocked','fallback','partial'}
23
+
24
class TraceNormalizationContext(BaseModel):
    """Caller-supplied identity and overrides used while normalizing one trace."""
    # Copied onto the TraceNormalizationReport created by normalize_agent_trace.
    task_id: str
    # Copied onto the TraceNormalizationReport created by normalize_agent_trace.
    attempt_id: str
    # Copied onto the report; also the fallback used by _canonical_attempt_kind
    # when the raw trace carries no usable attempt kind.
    attempt_kind: str
    # NOTE(review): presumably the task description text; its use is outside
    # the visible part of this module - confirm.
    task: str
    # When set, _valid_actual_outputs returns this instead of whatever the raw
    # trace supplied (the raw value is kept in metadata).
    actual_outputs: ActualOutputs | None = None
    # NOTE(review): presumably prefixes generated trace ids; usage not visible here.
    trace_id_prefix: str = 'trace'
31
+
32
class TraceNormalizationReport(BaseModel):
    """Bookkeeping produced while normalizing a raw agent trace.

    Fields not annotated with a specific writer below are not assigned in the
    visible part of this module; they are presumably filled in by
    normalize_agent_trace or its callers.
    """
    # Identity, seeded from TraceNormalizationContext in normalize_agent_trace.
    task_id: str
    attempt_id: str
    attempt_kind: str
    # References to the raw / normalized / canonical trace artifacts.
    raw_trace_ref: str | None = None
    normalized_trace_ref: str | None = None
    canonical_trace_ref: str | None = None
    # Outcome flags for the normalization run.
    trace_schema_valid: bool = False
    trace_normalized: bool = False
    trace_lossless: bool = True
    fallback_trace: bool = False
    # Step counts before and after normalization.
    raw_step_count: int = 0
    normalized_step_count: int = 0
    discarded_step_count: int = 0
    # Error / partial-result flags.
    raw_trace_parse_error: bool = False
    trace_normalization_error: bool = False
    trace_normalization_partial: bool = False
    validation_errors: list[str] = Field(default_factory=list)
    # Appended to by _safe_int, _normalize_step and normalize_agent_trace.
    warnings: list[str] = Field(default_factory=list)
    field_mappings: dict[str,str] = Field(default_factory=dict)
    # Incremented by _normalize_step when a dict-valued input is flattened.
    input_dict_to_string_count: int = 0
    # Incremented by _normalize_step when a dict-valued output is flattened.
    output_dict_to_string_count: int = 0
    # Incremented by _normalize_causal_type when a value had to be remapped.
    causal_type_repair_count: int = 0
    # Incremented by _operation, _operation_from_step_context and _normalize_step.
    operation_repair_count: int = 0
    # Incremented by _action when a non-canonical action was remapped.
    action_repair_count: int = 0
    metadata_json: dict[str, Any] = Field(default_factory=dict)
58
+
59
class NormalizationResult(BaseModel):
    """Bundle of a normalized trace and the report describing how it was produced.

    NOTE(review): not constructed in the visible part of this module; the field
    meanings below are read from names and types only - confirm against callers.
    """
    # The normalized trace dictionary, or None when none was produced.
    normalized_trace: dict[str, Any] | None = None
    # Bookkeeping for the normalization run (required).
    report: TraceNormalizationReport
    # Presumably signals that the caller must build a fallback trace.
    fallback_required: bool = False
    # Presumably the parse error message, when the raw trace could not be parsed.
    parse_error: str | None = None
64
+
65
+ def _compact(v: Any, max_len: int=1200) -> str:
66
+ if v is None: return ''
67
+ if isinstance(v, str): return v[:max_len]
68
+ try: s=json.dumps(v, sort_keys=True)
69
+ except Exception: s=str(v)
70
+ return s[:max_len]
71
+
72
+ def _as_list(v: Any) -> list[Any]:
73
+ if v is None: return []
74
+ if isinstance(v, list): return v
75
+ return [v]
76
+
77
def _operation(raw_action: str | None, raw_op: str | None, meta: dict[str,Any], report: TraceNormalizationReport | None = None) -> str:
    """Map a raw operation (or, failing that, the raw action) to a canonical operation.

    The stripped raw value is looked up in ``OP_MAP`` first; otherwise it is
    kept when it is already in ``VALID_OPS`` and replaced by ``'other'`` when
    it is not. Whenever a non-empty raw value had to be changed, the original
    is stored in ``meta['original_operation']`` and the report's
    ``operation_repair_count`` is incremented.
    """
    raw_value = str(raw_op or raw_action or '').strip()
    if raw_value in OP_MAP:
        canonical = OP_MAP[raw_value]
    elif raw_value in VALID_OPS:
        canonical = raw_value
    else:
        canonical = 'other'

    was_repaired = bool(raw_value) and (raw_value != canonical or raw_value not in VALID_OPS)
    if was_repaired:
        meta['original_operation'] = raw_value
        if report is not None:
            report.operation_repair_count += 1

    # Guard against an OP_MAP entry that points at a non-canonical value.
    if canonical not in VALID_OPS:
        return 'other'
    return canonical
85
+
86
def _action(raw_action: str | None, op: str, meta: dict[str,Any], report: TraceNormalizationReport | None = None) -> str:
    """Map a raw action label to one of the canonical ``VALID_ACTIONS``.

    A non-empty raw action is always preserved in ``meta['original_action']``.
    Labels that are already canonical (case-insensitively) are returned in
    lower case. Otherwise the label becomes ``'error'`` when it names a
    failure, ``'output'`` when it is a final response or the operation is
    ``'answer'``, and ``'agent_step'`` in every other case; a non-empty label
    that was remapped increments ``report.action_repair_count``.
    """
    original = str(raw_action or '').strip()
    lowered = original.lower()
    if original:
        meta['original_action'] = original
    if lowered in VALID_ACTIONS:
        return lowered

    mentions_failure = any(marker in lowered for marker in ['error','fail','exception'])
    if lowered in {'error','failure','exception'} or (op == 'other' and mentions_failure):
        canonical = 'error'
    elif lowered in {'final_response','finalize'} or op == 'answer':
        canonical = 'output'
    else:
        canonical = 'agent_step'

    if original and report is not None:
        report.action_repair_count += 1
    return canonical
102
+
103
+ def _string_or_none(v: Any, meta: dict[str, Any] | None = None, key: str | None = None) -> str | None:
104
+ if v is None:
105
+ return None
106
+ if isinstance(v, str):
107
+ return v
108
+ if meta is not None and key is not None:
109
+ meta[key]=v
110
+ return _compact(v)
111
+
112
+ def _output_to_string(v: Any) -> str | None:
113
+ if v is None:
114
+ return None
115
+ if isinstance(v, str):
116
+ return v
117
+ if isinstance(v, dict):
118
+ parts=[]
119
+ for k in ['exit_code','stdout','stdout_summary','stderr','stderr_summary']:
120
+ if k in v and v[k] not in (None, ''):
121
+ parts.append(f'{k}: {_compact(v[k], 900)}')
122
+ return '\n'.join(parts) if parts else _compact(v)
123
+ return _compact(v)
124
+
125
+ def _normalize_causal_type(v: Any, meta: dict[str, Any], report: TraceNormalizationReport | None = None) -> str | None:
126
+ if v is None:
127
+ return None
128
+ raw=str(v).strip()
129
+ mapped=CAUSAL_MAP.get(raw, raw if raw in VALID_CAUSAL_TYPES else 'other')
130
+ if raw != mapped or raw not in VALID_CAUSAL_TYPES:
131
+ meta['original_causal_type']=v
132
+ if report is not None:
133
+ report.causal_type_repair_count += 1
134
+ return mapped
135
+
136
+
137
+ def _command_tokens(command: str) -> list[str]:
138
+ return [part.strip().lower() for part in command.replace('&&', '\n').replace(';', '\n').split('\n') if part.strip()]
139
+
140
+ def _first_command_word(part: str) -> str:
141
+ words=part.split()
142
+ while words and ('=' in words[0] or words[0] in {'env','time','uv','poetry','pipenv','xargs','sudo'}):
143
+ words=words[1:]
144
+ return words[0] if words else ''
145
+
146
+
147
+
148
+ def _has_phrase(text: str | None, phrase: str) -> bool:
149
+ """Return true when a normalized phrase appears in text.
150
+
151
+ Multi-word phrases use substring matching. Single-word phrases require
152
+ token/word boundaries so generic language like "next move" does not
153
+ accidentally become a file-write operation.
154
+ """
155
+ import re
156
+
157
+ if not text or not phrase:
158
+ return False
159
+
160
+ text_l = str(text).lower()
161
+ phrase_l = str(phrase).lower().strip()
162
+ if not phrase_l:
163
+ return False
164
+
165
+ if " " in phrase_l:
166
+ return phrase_l in text_l
167
+
168
+ return re.search(r"(?<![a-z0-9_])" + re.escape(phrase_l) + r"(?![a-z0-9_])", text_l) is not None
169
+
170
+
171
+ def _operation_from_phrase(value: str | None) -> str | None:
172
+ """Infer canonical operation from descriptive action/operation/tool text.
173
+
174
+ Canonical operations are intentionally small:
175
+ read, write, execute, verify, other.
176
+ Return None when intent is unclear so command-level inference can still run.
177
+ """
178
+ if not value:
179
+ return None
180
+
181
+ low = str(value).lower().replace("_", " ").replace("-", " ").strip()
182
+ if not low or low == "other":
183
+ return None
184
+
185
+ # Avoid mapping generic reasoning language to file operations.
186
+ if "next move" in low or low.startswith(("ponder ", "think ", "reason ")):
187
+ return None
188
+
189
+ if low in {"read", "write", "execute", "verify"}:
190
+ return low
191
+
192
+ verify_markers = (
193
+ "validate", "validation", "verify", "verification", "check", "test",
194
+ "pytest", "compileall", "json tool", "schema", "checksum", "hash",
195
+ "inspect exported", "inspect workbook", "formula scan", "scan workbook",
196
+ "render", "visual qa", "preview", "qa", "review output",
197
+ "spreadsheet validation", "xlsx validation", "csv validation",
198
+ "markdown validation", "artifact validation",
199
+ "attempted spreadsheet library import", "checked local office",
200
+ "package availability", "tool check",
201
+ )
202
+ if any(_has_phrase(low, marker) for marker in verify_markers):
203
+ return "verify"
204
+
205
+ write_markers = (
206
+ "write", "create", "edit", "patch", "apply patch", "file creation",
207
+ "file edit", "write artifact", "create artifact", "edit artifact",
208
+ "mkdir", "copy", "cp ", "mv ", "builder script",
209
+ "write final trace", "persist", "save", "generated artifacts",
210
+ )
211
+ if any(_has_phrase(low, marker) for marker in write_markers):
212
+ return "write"
213
+
214
+ execute_markers = (
215
+ "execute", "run", "rerun", "re run", "executed", "script execution",
216
+ "builder", "python", "python3", "node", "npm", "bash", "sh ",
217
+ "calculation script", "link workspace", "environment setup",
218
+ "symlink", "ln -s",
219
+ )
220
+ if any(_has_phrase(low, marker) for marker in execute_markers):
221
+ return "execute"
222
+
223
+ read_markers = (
224
+ "read", "inspect", "inventory", "list", "listing", "file listing",
225
+ "file inspection", "read context", "inspect task", "inspect inputs",
226
+ "inspect workspace", "load", "loaded", "sed", "cat", "head", "tail",
227
+ "less", "ls", "find", "rg", "grep", "git status", "unzip",
228
+ )
229
+ if any(_has_phrase(low, marker) for marker in read_markers):
230
+ return "read"
231
+
232
+ return None
233
+
234
+
235
+ def _operation_from_command(command: str | None, meta: dict[str, Any], report: TraceNormalizationReport | None = None) -> str | None:
236
+ c=(command or '').strip()
237
+ if not c:
238
+ return None
239
+ parts=_command_tokens(c) or [c.lower()]
240
+ low='\n'.join(parts)
241
+ firsts={_first_command_word(p) for p in parts}
242
+ verify_markers=['json.tool','pytest','compileall','schema','validate','validation','validator',' check','check ','checksum','sha256sum','shasum','md5sum','openpyxl','xlsx validation','csv validation','json validation','markdown validation','test ']
243
+ write_markers=['apply_patch','file edit','edit file','write artifact','write_artifact','create file','file creation','write','create','edit','patch','builder script','write final trace','persist','save']
244
+ execute_markers=['execute','run','rerun','re run','executed','script execution','builder','python','python3','node','npm','bash','sh','calculation script','environment setup','symlink','ln -s']
245
+ read_markers=['read','inspect','inventory','list','listing','file listing','file inspection','read context','inspect task','inspect inputs','inspect workspace','load','loaded','sed','cat','head','tail','less','ls','find','rg','grep','git status','unzip']
246
+ if any(m in low for m in verify_markers) or any(f in {'pytest','py.test'} for f in firsts):
247
+ return 'verify'
248
+ if 'git status' in low:
249
+ return 'read'
250
+ if any(m in low for m in ['apply_patch','file edit','edit file','write artifact','write_artifact','create file','file creation']):
251
+ return 'write'
252
+ if any(f in {'sed','cat','head','tail','less','ls','find','rg','grep','file','pwd'} for f in firsts):
253
+ return 'read'
254
+ if any('unzip -l' in p or 'zipinfo' in p for p in parts) or ' inventory' in low or 'listing' in low:
255
+ return 'read'
256
+ if any(f in {'mkdir','cp','mv','touch','tee'} for f in firsts):
257
+ return 'write'
258
+ if any(f in {'python','python3','node','npm','npx','bash','sh','zsh'} for f in firsts):
259
+ return 'execute'
260
+ if any(_has_phrase(low, m) for m in verify_markers):
261
+ return 'verify'
262
+ if any(_has_phrase(low, m) for m in write_markers):
263
+ return 'write'
264
+ if any(_has_phrase(low, m) for m in read_markers):
265
+ return 'read'
266
+ return None
267
+
268
def _operation_from_step_context(raw_action: Any, raw_op: Any, tool: Any, command: Any, meta: dict[str, Any], report: TraceNormalizationReport | None = None) -> str | None:
    """Infer an operation from descriptive step fields.

    The raw operation, raw action, tool and command are run through
    ``_operation_from_phrase`` in that order. The first non-empty result is
    returned, and ``report.operation_repair_count`` is incremented for it.
    ``None`` is returned when no field yields an operation. ``meta`` is not
    used.
    """
    for candidate in (raw_op, raw_action, tool, command):
        text = None if candidate is None else str(candidate)
        inferred = _operation_from_phrase(text)
        if not inferred:
            continue
        if report is not None:
            report.operation_repair_count += 1
        return inferred
    return None
277
+
278
def infer_attempt_status(trace: dict[str, Any], report: dict[str, Any] | None = None) -> str:
    """Derive the attempt status for a trace dictionary.

    An existing valid ``attempt_status`` is returned untouched. Otherwise the
    status is inferred, in order, from: the termination reason
    (``task_complete`` -> completed, ``agent_blocked`` -> blocked, hard
    failure reasons or a failed ``actual_outputs.status`` -> failed),
    fallback markers in the trace metadata or the report (-> fallback), and
    finally the step outcomes (any blocked step -> blocked; failures ->
    failed, or partial when at least one step succeeded; any steps at all ->
    partial). A trace with no steps and no other signal is ``failed``.
    """
    current = trace.get('attempt_status')
    if current in VALID_ATTEMPT_STATUSES:
        return current

    reason = trace.get('termination_reason')
    trace_meta = trace.get('metadata_json')
    if not isinstance(trace_meta, dict):
        trace_meta = {}
    report = report or {}

    if reason == 'task_complete':
        return 'completed'
    if reason == 'agent_blocked':
        return 'blocked'

    outputs = trace.get('actual_outputs')
    if not isinstance(outputs, dict):
        outputs = {}
    hard_failure = reason in {'timeout','error_unrecoverable','provider_usage_limit','verifier_failed'}
    if hard_failure or outputs.get('status') in {'failed','timeout','error'}:
        return 'failed'

    is_fallback = (
        report.get('fallback_trace')
        or trace_meta.get('fallback_trace')
        or trace_meta.get('fallback_trace_created')
        or trace_meta.get('fallback_reason')
        or report.get('fallback_reason')
        or (isinstance(report.get('metadata_json'), dict) and report['metadata_json'].get('fallback_reason'))
    )
    if is_fallback:
        return 'fallback'

    steps = trace.get('steps') or []
    step_dicts = [entry for entry in steps if isinstance(entry, dict)]
    if any(entry.get('step_outcome') == 'blocked' for entry in step_dicts):
        return 'blocked'
    if any(entry.get('step_outcome') == 'failed' or entry.get('success') is False for entry in step_dicts):
        some_success = any(entry.get('success') is True for entry in step_dicts)
        return 'partial' if some_success else 'failed'
    if steps:
        return 'partial'
    return 'failed'
302
+
303
def normalize_trace_for_export(trace: dict[str, Any], report: dict[str, Any] | None = None) -> dict[str, Any]:
    """Return a shallow copy of *trace* prepared for export.

    Each dict step is copied; when its operation is missing or ``'other'``
    and the step's command (``input``, ``command`` or
    ``metadata_json['command']``) implies one, that operation is filled in.
    Non-dict steps are passed through unchanged. ``attempt_status`` is then
    set via ``infer_attempt_status``. The input trace is not mutated.
    """
    exported = dict(trace)
    rewritten = []
    for entry in exported.get('steps') or []:
        if not isinstance(entry, dict):
            rewritten.append(entry)
            continue
        step = dict(entry)
        existing_meta = step.get('metadata_json')
        step_meta = dict(existing_meta or {}) if isinstance(existing_meta, dict) else {}
        command = step.get('input') or step.get('command') or step_meta.get('command')
        command_text = None if command is None else str(command)
        inferred = _operation_from_command(command_text, step_meta)
        current = step.get('operation')
        if inferred and (not current or current == 'other'):
            step['operation'] = inferred
        step['metadata_json'] = step_meta
        rewritten.append(step)
    exported['steps'] = rewritten
    exported['attempt_status'] = infer_attempt_status(exported, report)
    return exported
320
+
321
+ def _tool(command: str | None, raw: dict[str,Any], op: str) -> str | None:
322
+ c=(command or '').strip()
323
+ low=c.lower()
324
+ if low.startswith('python ') or low.startswith('python3 ') or low == 'python' or low == 'python3': return 'python'
325
+ if low.startswith('apply_patch') or low == 'apply_patch': return 'apply_patch'
326
+ if c and any(tok in low for tok in ['sed','cat','ls','rg','find','pwd','mkdir','bash','zsh','sh ','python','touch','cp ','mv ','rm ']): return 'Bash'
327
+ if op in READ_OPS: return 'file_read'
328
+ if op in WRITE_OPS: return 'file_write'
329
+ return raw.get('tool')
330
+
331
+ def _derive_observation(step_num: int, op: str) -> str:
332
+ if step_num == 1: return 'Attempt workspace and task inputs were available.'
333
+ if op in READ_OPS: return 'Input file was available for inspection.'
334
+ if op in EXEC_OPS or op in WRITE_OPS: return 'Prior inputs and task requirements were available.'
335
+ if op in VERIFY_OPS: return 'Generated outputs were available for validation.'
336
+ return 'Prior workflow context was available.'
337
+
338
+ def _normalize_outcome(v: Any, meta: dict[str,Any]) -> str | None:
339
+ if v is None: return None
340
+ low=str(v).lower().strip()
341
+ if low in VALID_OUTCOMES: return low
342
+ meta['original_step_outcome']=v
343
+ if low in {'ok','success','succeeded','done'}: return 'completed'
344
+ if low in {'fail','failure','errored','error'}: return 'failed'
345
+ if low in {'block','blocked'}: return 'blocked'
346
+ return 'neutral'
347
+
348
+
349
+ def _safe_int(v: Any, default: int, field: str, meta: dict[str, Any], report: TraceNormalizationReport | None = None) -> int:
350
+ try:
351
+ if isinstance(v, bool):
352
+ raise ValueError('boolean is not numeric')
353
+ if v is None or v == '':
354
+ return default
355
+ return int(v)
356
+ except Exception:
357
+ if v is not None:
358
+ meta[f'original_{field}']=v
359
+ warning=f'{field} value {v!r} was not numeric; defaulted to {default}'
360
+ meta.setdefault('normalization_warnings', []).append(warning)
361
+ if report is not None:
362
+ report.warnings.append(warning)
363
+ return default
364
+
365
def _normalize_step(raw_step: Any, ordinal: int, used_steps: set[int], report: TraceNormalizationReport) -> dict[str,Any]:
    """Convert one raw step record into a canonical step dictionary.

    The raw record is never discarded: it is stored whole under
    ``metadata_json['raw_step']`` and any value that is flattened or remapped
    is also preserved under its own metadata key.

    Args:
        raw_step: The raw step; a non-dict value is wrapped as ``{'value': raw_step}``.
        ordinal: Position of the step in the raw list. The code treats 1 as
            the first step (no ``caused_by`` default is generated for it).
        used_steps: Step numbers already assigned; updated in place.
        report: Normalization report; its warnings and repair counters are
            updated in place.

    Returns:
        A dict with every canonical step key. For ``user_message`` actions the
        agent-only fields (operation, tool, observation, reasoning, output,
        success, step_outcome, execution_mode) are ``None``.

    Fixes relative to the previous implementation:
        * a renumbered step can no longer collide with a number already in use;
        * invalid-``caused_by`` warnings are appended to, instead of
          overwriting, warnings recorded earlier in ``metadata_json``;
        * a string ``artifact_refs`` is kept as one reference instead of being
          split into characters;
        * ``caused_by`` entries are screened with ``str.isdecimal`` so values
          ``int()`` cannot parse are dropped instead of raising;
        * the tool is derived from the stringified command, so a non-string
          ``command`` no longer raises.
    """
    raw = raw_step if isinstance(raw_step, dict) else {'value': raw_step}
    meta = dict(raw.get('metadata_json') or {}) if isinstance(raw.get('metadata_json'), dict) else {}
    meta['raw_step'] = raw
    alias_keys = {'step_number','index','command','inputs','outputs','state_changes','artifacts','artifact_paths','decision_summary','stdout_summary','exit_code'}
    unknown = {k: v for k, v in raw.items() if k not in CANONICAL_STEP and k not in alias_keys}
    if unknown:
        meta['original_fields'] = unknown

    # ---- step number -------------------------------------------------------
    orig_step = raw.get('step', raw.get('step_number', raw.get('index')))
    step_num = _safe_int(orig_step, ordinal, 'step', meta, report)
    if step_num < 1 or step_num in used_steps:
        meta['original_step_number'] = orig_step
        step_num = ordinal
        # The ordinal may itself have been claimed by an earlier step that
        # carried an explicit number; move on to the next free number.
        while step_num in used_steps:
            step_num += 1
        report.warnings.append(f'step {ordinal} renumbered from {orig_step!r}')
    used_steps.add(step_num)

    # ---- command discovery -------------------------------------------------
    raw_action = raw.get('action')
    command = raw.get('command')
    inputs = raw.get('inputs')
    raw_input = raw.get('input')
    commands_list = None
    if isinstance(raw_input, dict):
        commands_list = raw_input.get('commands')
    if commands_list is None and isinstance(inputs, dict):
        commands_list = inputs.get('commands')
    inferred_command = (
        command
        or (raw_input.get('cmd') if isinstance(raw_input, dict) else None)
        or (raw_input if isinstance(raw_input, str) else None)
        or (inputs.get('cmd') if isinstance(inputs, dict) else None)
        or (inputs if isinstance(inputs, str) else None)
    )
    if inferred_command is None and isinstance(commands_list, list):
        inferred_command = '\n'.join(str(c) for c in commands_list)

    # ---- operation and action ----------------------------------------------
    # NOTE(review): each helper below may bump report.operation_repair_count,
    # so one repaired step can be counted more than once. Left unchanged.
    op = _operation(str(raw_action) if raw_action is not None else None, raw.get('operation'), meta, report)
    ctx_op = _operation_from_step_context(raw_action, raw.get('operation'), raw.get('tool'), inferred_command, meta, report)
    cmd_op = _operation_from_command(str(inferred_command) if inferred_command is not None else None, meta, report) or ctx_op
    if cmd_op and (raw.get('operation') is None or op == 'other'):
        if op != cmd_op and report is not None:
            report.operation_repair_count += 1
        op = cmd_op
    action = _action(str(raw_action) if raw_action is not None else None, op, meta, report)
    # An "error" label on a step that explicitly succeeded and carries no
    # error details is treated as an ordinary agent step.
    if action == 'error' and raw.get('success') is True and not raw.get('error_type') and not raw.get('error_message'):
        action = 'agent_step'
    is_user = action == 'user_message'

    # ---- input -------------------------------------------------------------
    input_val: str | None
    if command is not None:
        input_val = str(command)
        meta['command'] = command
        if inputs is not None:
            meta['inputs'] = inputs
        if isinstance(raw_input, dict):
            meta.setdefault('input', raw_input)
            report.input_dict_to_string_count += 1
    elif isinstance(raw_input, dict):
        meta['inputs'] = raw_input
        report.input_dict_to_string_count += 1
        input_val = str(raw_input.get('cmd')) if raw_input.get('cmd') is not None else _compact(raw_input)
    elif raw_input is not None:
        input_val = _string_or_none(raw_input, meta, 'input')
    elif inputs is not None:
        meta['inputs'] = inputs
        if isinstance(inputs, dict) and inputs.get('cmd') is not None:
            input_val = str(inputs.get('cmd'))
        else:
            input_val = _compact(inputs)
        if isinstance(inputs, dict):
            report.input_dict_to_string_count += 1
    else:
        input_val = None

    # ---- output ------------------------------------------------------------
    raw_output = raw.get('output')
    outputs = raw.get('outputs')
    output_val: str | None = None
    if isinstance(raw_output, dict):
        meta['outputs'] = raw_output
        output_val = _output_to_string(raw_output)
        report.output_dict_to_string_count += 1
    elif raw_output is not None:
        output_val = _string_or_none(raw_output, meta, 'output')
    if outputs is not None:
        meta['outputs'] = outputs
        if isinstance(outputs, dict):
            report.output_dict_to_string_count += 1
        output_val = output_val or _output_to_string(outputs)
    if raw.get('stdout_summary') is not None:
        meta['stdout_summary'] = raw.get('stdout_summary')
        output_val = output_val or str(raw.get('stdout_summary'))

    # ---- state change and artifacts ----------------------------------------
    state_change = _string_or_none(raw.get('state_change'), meta, 'state_change')
    if raw.get('state_changes') is not None:
        sc = raw.get('state_changes')
        meta['state_changes'] = sc
        if isinstance(sc, list):
            state_change = state_change or ('; '.join(str(x) for x in sc) if sc else None)
        else:
            state_change = state_change or _compact(sc)

    raw_refs = raw.get('artifact_refs')
    if not raw_refs:
        artifact_refs = []
    elif isinstance(raw_refs, (str, bytes)):
        # A single reference given as a bare string must not be split into characters.
        artifact_refs = [raw_refs]
    else:
        try:
            artifact_refs = list(raw_refs)
        except TypeError:
            artifact_refs = [raw_refs]
    if raw.get('artifacts') is not None:
        meta['original_artifacts'] = raw.get('artifacts')
        artifact_refs.extend(str(x) for x in _as_list(raw.get('artifacts')))
    if raw.get('artifact_paths') is not None:
        meta['original_artifact_paths'] = raw.get('artifact_paths')
        artifact_refs.extend(str(x) for x in _as_list(raw.get('artifact_paths')))
    if raw.get('exit_code') is not None:
        meta['exit_code'] = raw.get('exit_code')
    if raw.get('decision_summary') is not None:
        meta['decision_summary'] = raw.get('decision_summary')

    # ---- observation and reasoning -----------------------------------------
    observation = _string_or_none(raw.get('observation'), meta, 'observation')
    if observation is None and not is_user:
        observation = _derive_observation(step_num, op)
        meta['observation_derived'] = True
    reasoning = None if is_user else _string_or_none(raw.get('reasoning') or raw.get('decision_summary'), meta, 'reasoning')

    # ---- causality ---------------------------------------------------------
    caused = raw.get('caused_by')
    warnings = []
    if caused is None:
        # No explicit cause: every step after the first follows the previous one.
        caused_by = None if ordinal == 1 else [ordinal-1]
        causal_type = None if ordinal == 1 else 'execution_of_plan'
        causal_note = None if ordinal == 1 else 'Sequential workflow step following the prior action.'
    else:
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        caused_by = [int(x) for x in _as_list(caused) if isinstance(x, int) or str(x).isdecimal()]
        valid = [x for x in caused_by if 1 <= x < step_num]
        if len(valid) != len(caused_by):
            warnings.append('invalid caused_by references removed')
        caused_by = valid or None
        causal_type = _normalize_causal_type(raw.get('causal_type') or ('execution_of_plan' if caused_by else None), meta, report)
        causal_note = _string_or_none(raw.get('causal_note'), meta, 'causal_note')
    if caused is None and causal_type is not None:
        causal_type = _normalize_causal_type(causal_type, meta, report)
    if warnings:
        # Extend rather than assign: _safe_int may already have recorded a warning.
        meta.setdefault('normalization_warnings', []).extend(warnings)
        report.warnings.extend(warnings)

    # ---- assemble ----------------------------------------------------------
    actor = raw.get('actor') or ('user' if is_user else 'agent:worker')
    if is_user:
        actor = 'user'
    # input_val equals str(command) whenever a command was given, so it is
    # always a safe string (or None) to hand to _tool.
    step = {
        'step': step_num,
        'turn': _safe_int(raw.get('turn'), 1, 'turn', meta, report),
        'actor': actor,
        'action': action,
        'operation': None if is_user else op,
        'tool': None if is_user else _tool(input_val, raw, op),
        'execution_mode': raw.get('execution_mode') if raw.get('execution_mode') in {'serial','parallel'} else None,
        'parallel_group': _string_or_none(raw.get('parallel_group'), meta, 'parallel_group'),
        'observation': None if is_user else observation,
        'input': input_val,
        'input_source': raw.get('input_source') if isinstance(raw.get('input_source'), dict) else None,
        'output': None if is_user else output_val,
        'state_change': state_change,
        'reasoning': None if is_user else reasoning,
        'caused_by': caused_by,
        'causal_type': causal_type,
        'causal_note': causal_note,
        'alternatives_considered': _string_or_none(raw.get('alternatives_considered'), meta, 'alternatives_considered'),
        'success': None if is_user else (raw.get('success') if isinstance(raw.get('success'), bool) else None),
        'step_outcome': None if is_user else _normalize_outcome(raw.get('step_outcome'), meta),
        'error_type': _string_or_none(raw.get('error_type'), meta, 'error_type'),
        'error_message': _string_or_none(raw.get('error_message'), meta, 'error_message'),
        'message_role': raw.get('message_role') if is_user else None,
        'feedback_type': raw.get('feedback_type') if is_user else None,
        'feedback_content': _string_or_none(raw.get('feedback_content'), meta, 'feedback_content') if is_user else None,
        'started_at': _string_or_none(raw.get('started_at'), meta, 'started_at'),
        'ended_at': _string_or_none(raw.get('ended_at'), meta, 'ended_at'),
        'retry_of': raw.get('retry_of') if isinstance(raw.get('retry_of'), int) else None,
        'artifact_refs': artifact_refs,
        'metadata_json': meta,
    }
    if is_user:
        step.update({'operation':None,'tool':None,'execution_mode':None,'observation':None,'reasoning':None,'success':None,'step_outcome':None,'output':None})
    return step
499
+
500
+ def _model_dump(obj: Any) -> dict[str, Any]:
501
+ if obj is None:
502
+ return None
503
+ if hasattr(obj, 'model_dump'):
504
+ try:
505
+ return obj.model_dump(mode='json')
506
+ except TypeError:
507
+ return obj.model_dump()
508
+ return obj
509
+
510
def _valid_actual_outputs(raw_actual: Any, context: TraceNormalizationContext, metadata: dict[str, Any]) -> dict[str, Any] | None:
    """Choose the ``actual_outputs`` payload for the normalized trace.

    Outputs supplied on the context always win. Otherwise a dict from the raw
    trace is accepted only if it validates as ``ActualOutputs``. Whenever a
    raw value is overridden or rejected it is kept in
    ``metadata['original_actual_outputs']``.
    """
    override = context.actual_outputs
    if override is not None:
        if raw_actual is not None:
            metadata['original_actual_outputs'] = raw_actual
        return _model_dump(override)
    if raw_actual is None:
        return None
    if isinstance(raw_actual, dict):
        try:
            return ActualOutputs.model_validate(raw_actual).model_dump()
        except Exception:
            # Invalid payload: preserve it for inspection and report no outputs.
            metadata['original_actual_outputs'] = raw_actual
            return None
    metadata['original_actual_outputs'] = raw_actual
    return None
524
+
525
def _canonical_attempt_kind(raw: dict[str, Any], context: TraceNormalizationContext, metadata: dict[str, Any]) -> str:
    """Resolve the attempt kind to a member of ``VALID_ATTEMPT_KINDS``.

    The candidate is the first truthy value among the raw trace's
    ``attempt_kind``, its ``attempt`` and the context's ``attempt_kind``. An
    invalid candidate is preserved in ``metadata['original_attempt_kind']``
    and replaced by the context's attempt kind when that is valid, else by
    ``'other'``.
    """
    candidate = raw.get('attempt_kind') or raw.get('attempt') or context.attempt_kind
    if candidate in VALID_ATTEMPT_KINDS:
        return candidate
    if candidate is not None:
        metadata['original_attempt_kind'] = candidate
    fallback = context.attempt_kind
    if fallback in VALID_ATTEMPT_KINDS:
        return fallback
    return 'other'
532
+
533
def _canonical_trace_mode(raw: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Resolve the trace mode for the normalized trace.

    A missing or falsy ``raw['trace_mode']`` defaults to ``'live'``. A value
    found in ``VALID_TRACE_MODES`` is returned as-is; any other value is
    stored under ``metadata['original_trace_mode']`` and ``'live'`` is
    returned.
    """
    val = raw.get('trace_mode') or 'live'
    try:
        recognised = val in VALID_TRACE_MODES
    except TypeError:
        # Raw traces are untrusted: an unhashable value (list/dict) cannot be
        # looked up in a set, so treat it as unrecognised instead of crashing.
        recognised = False
    if recognised:
        return val
    metadata['original_trace_mode'] = val
    return 'live'
539
+
540
def _canonical_termination_reason(raw: dict[str, Any], success_like: bool, metadata: dict[str, Any]) -> str:
    """Resolve the termination reason for the normalized trace.

    A ``raw['termination_reason']`` found in ``VALID_TERMINATION_REASONS``
    is returned as-is. Any other non-``None`` value is stored under
    ``metadata['original_termination_reason']``, and the result falls back
    to ``'task_complete'`` when ``success_like`` is true, else ``'other'``.
    """
    val = raw.get('termination_reason')
    try:
        recognised = val in VALID_TERMINATION_REASONS
    except TypeError:
        # Raw traces are untrusted: an unhashable value (list/dict) cannot be
        # looked up in a set, so treat it as unrecognised instead of crashing.
        recognised = False
    if recognised:
        return val
    if val is not None:
        metadata['original_termination_reason'] = val
    return 'task_complete' if success_like else 'other'
547
+
548
def _canonical_agent_tools(raw: dict[str, Any], metadata: dict[str, Any]) -> list[str]:
    """Return the agent tool list for the normalized trace.

    ``raw['agent_tools']`` is returned unchanged when it is a list made up
    only of strings. Otherwise a built-in default list is returned, and any
    non-``None`` rejected value is stored under
    ``metadata['original_agent_tools']``.
    """
    tools = raw.get('agent_tools')
    is_string_list = isinstance(tools, list) and all(isinstance(name, str) for name in tools)
    if not is_string_list:
        if tools is not None:
            metadata['original_agent_tools'] = tools
        tools = ['codex_cli', 'Bash', 'python', 'file_read', 'file_write', 'apply_patch']
    return tools
555
+
556
def _canonical_artifacts(raw: dict[str, Any], metadata: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the canonical ``artifacts`` list, which is always empty.

    A truthy ``raw['artifacts']`` is not carried into the result; it is
    stored under ``metadata['original_artifacts']`` instead.
    """
    supplied = raw.get('artifacts')
    if supplied:
        metadata['original_artifacts'] = supplied
    return []
562
+
563
+ def _extract_raw_steps(raw: dict[str, Any]) -> tuple[list[Any], str | None]:
564
+ for key in ['steps','trace_steps','trace','records','events','actions']:
565
+ val=raw.get(key)
566
+ if isinstance(val, list):
567
+ return val, key
568
+ if isinstance(val, dict):
569
+ nested=val.get('steps') or val.get('events') or val.get('actions')
570
+ if isinstance(nested, list):
571
+ return nested, key
572
+ return [], None
573
+
574
def normalize_agent_trace(raw: dict[str, Any], context: TraceNormalizationContext) -> dict[str, Any]:
    """Build a canonical trace dict from a loosely structured raw trace.

    Steps are located with ``_extract_raw_steps``, normalized one by one and
    renumbered to 1..N when their numbers are not already in that order.
    Top-level fields are coerced through the ``_canonical_*`` helpers and
    ``_string_or_none``; top-level keys outside ``CANONICAL_TOP`` are kept
    under ``metadata_json``. The result is validated and the outcome is
    stored in ``metadata_json['schema_valid']``.

    Args:
        raw: Raw trace mapping.
        context: Supplies fallback task/attempt identifiers and, optionally,
            actual outputs.

    Returns:
        The normalized trace dict. The ``TraceNormalizationReport`` built
        along the way is local to this call and is not returned.
    """
    report = TraceNormalizationReport(
        task_id=context.task_id,
        attempt_id=context.attempt_id,
        attempt_kind=context.attempt_kind,
    )
    raw_steps, raw_step_source = _extract_raw_steps(raw)
    if not isinstance(raw_steps, list):
        raw_steps = []
        report.warnings.append('raw steps was not a list')

    # Start from a copy of any metadata the raw trace already carried.
    existing_metadata = raw.get('metadata_json')
    metadata = dict(existing_metadata) if isinstance(existing_metadata, dict) else {}
    if raw_step_source:
        metadata['raw_trace_step_source'] = raw_step_source

    # 'attempt', 'role' and 'schema_name' are handled individually, so they
    # are excluded from the pass-through of unknown fields.
    handled_separately = {'attempt', 'role', 'schema_name'}
    unknown_top = {
        key: value
        for key, value in raw.items()
        if not (key in CANONICAL_TOP or key in handled_separately)
    }
    if unknown_top:
        metadata['original_fields'] = unknown_top
    role = raw.get('role')
    if role is not None:
        metadata['original_role'] = role
    schema_name = raw.get('schema_name')
    if schema_name is not None:
        metadata['schema_name'] = schema_name

    used = set()
    steps = [
        _normalize_step(raw_step, position, used, report)
        for position, raw_step in enumerate(raw_steps, start=1)
    ]
    # Ensure final monotonic sequence without dropping; the step's previous
    # number is kept in its metadata unless one is already recorded there.
    step_numbers = [normalized['step'] for normalized in steps]
    if step_numbers != list(range(1, len(steps) + 1)):
        for position, normalized in enumerate(steps, start=1):
            normalized['metadata_json'].setdefault('original_step_number', normalized['step'])
            normalized['step'] = position
        report.warnings.append('steps renumbered to canonical 1..N order')

    success_like = bool(raw.get('actual_outputs') or any(normalized.get('success') for normalized in steps))
    metadata['raw_trace_preserved'] = True
    metadata['discarded_step_count'] = 0
    metadata['trace_normalization_status'] = 'partial' if report.warnings else 'normalized'

    def text(key: str) -> Any:
        # Shorthand for the repeated string coercion of a top-level field.
        return _string_or_none(raw.get(key), metadata, key)

    skills = raw.get('skills')
    agent_config = raw.get('agent_config')
    trace = {
        'schema_version': str(raw.get('schema_version') or 'aa-trace-v0.1'),
        'trace_id': str(raw.get('trace_id') or f'{context.trace_id_prefix}_{context.attempt_id}_normalized'),
        'collection_id': text('collection_id'),
        'prior_trace_id': text('prior_trace_id'),
        'trace_mode': _canonical_trace_mode(raw, metadata),
        'task': text('task') or context.task,
        'task_id': text('task_id') or context.task_id,
        'task_family_id': text('task_family_id'),
        'attempt_id': text('attempt_id') or context.attempt_id,
        'attempt_kind': _canonical_attempt_kind(raw, context, metadata),
        'agent_tools': _canonical_agent_tools(raw, metadata),
        'started_at': text('started_at'),
        'ended_at': text('ended_at'),
        'system_prompt': text('system_prompt'),
        'system_prompt_hash': text('system_prompt_hash'),
        'skills': skills if isinstance(skills, list) else None,
        'memory': text('memory'),
        'agent_config': agent_config if isinstance(agent_config, dict) else None,
        'learning': text('learning'),
        'termination_reason': _canonical_termination_reason(raw, success_like, metadata),
        'steps': steps,
        'actual_outputs': _valid_actual_outputs(raw.get('actual_outputs'), context, metadata),
        'artifacts': _canonical_artifacts(raw, metadata),
        'metadata_json': metadata,
    }
    trace['attempt_status'] = infer_attempt_status(trace)

    report.raw_step_count = len(raw_steps)
    report.normalized_step_count = len(steps)
    report.discarded_step_count = 0
    valid, errors = _validate_normalized_trace(trace)
    report.trace_schema_valid = valid
    trace['metadata_json']['schema_valid'] = valid
    if not valid:
        report.trace_normalization_partial = True
        report.validation_errors.extend(errors)
    report.trace_normalized = True
    report.trace_lossless = (
        report.raw_step_count == report.normalized_step_count
        and report.discarded_step_count == 0
    )
    return trace
612
+
613
def _validation_error_messages(exc: Exception) -> list[str]:
    """Turn a validation exception into a list of message strings.

    When ``exc`` has an ``errors`` attribute, each entry of ``exc.errors()``
    is passed through ``_compact`` with a limit of 2000. If that attribute
    is missing, or anything fails while using it, the result is
    ``[str(exc)]``.
    """
    if not hasattr(exc, 'errors'):
        return [str(exc)]
    try:
        return [_compact(detail, 2000) for detail in exc.errors()]
    except Exception:
        # Best effort: fall back to the plain exception text.
        return [str(exc)]
620
+
621
def _validate_normalized_trace(trace: dict[str, Any]) -> tuple[bool, list[str]]:
    """Validate ``trace`` against ``AgentTrace``.

    Returns:
        ``(True, [])`` when validation succeeds, otherwise ``(False, messages)``
        with the messages produced by ``_validation_error_messages``.
    """
    try:
        AgentTrace.model_validate(trace)
    except Exception as exc:
        return False, _validation_error_messages(exc)
    return True, []
627
+
628
def repair_agent_trace_file(raw_trace_path: Path, context: TraceNormalizationContext) -> NormalizationResult:
    """Load a raw trace JSON file and normalize it into a canonical trace.

    A top-level JSON list is treated as the step list. A JSON object whose
    steps live under a key other than ``'steps'`` gets them copied to
    ``'steps'``, with the source key recorded in
    ``metadata_json['raw_trace_shape']``.

    Args:
        raw_trace_path: Path of the raw trace JSON file (read as UTF-8).
        context: Fallback identifiers and outputs used during normalization.

    Returns:
        A ``NormalizationResult``. If the file cannot be read, parsed or
        shaped, ``normalized_trace`` is ``None``, ``fallback_required`` is
        true and ``parse_error`` holds the error text. Otherwise it carries
        the normalized trace and a report; a schema-invalid trace is still
        returned, marked ``'partial'``.
    """
    report = TraceNormalizationReport(
        task_id=context.task_id,
        attempt_id=context.attempt_id,
        attempt_kind=context.attempt_kind,
        raw_trace_ref=raw_trace_path.name,
    )
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        raw = json.loads(raw_trace_path.read_text(encoding='utf-8'))
        if isinstance(raw, list):
            raw = {'steps': raw, 'metadata_json': {'raw_trace_shape': 'list', 'raw_trace_list': raw}}
        elif isinstance(raw, dict):
            raw_steps, source = _extract_raw_steps(raw)
            if source and source != 'steps':
                existing_metadata = raw.get('metadata_json')
                md = dict(existing_metadata) if isinstance(existing_metadata, dict) else {}
                md.setdefault('raw_trace_shape', source)
                raw['metadata_json'] = md
                raw['steps'] = raw_steps
        else:
            raise ValueError('raw trace JSON was not an object or step list')
    except Exception as e:
        # Deliberate catch-all: any load/shape failure is reported to the
        # caller as a parse error that requires a fallback trace.
        report.raw_trace_parse_error = True
        report.validation_errors.append(str(e))
        return NormalizationResult(normalized_trace=None, report=report, fallback_required=True, parse_error=str(e))

    trace = normalize_agent_trace(raw, context)
    valid, errors = _validate_normalized_trace(trace)
    raw_steps, _source = _extract_raw_steps(raw)
    raw_step_count = len(raw_steps)
    normalized_step_count = len(trace.get('steps') or [])
    comp = TraceNormalizationReport(
        task_id=context.task_id,
        attempt_id=context.attempt_id,
        attempt_kind=context.attempt_kind,
        raw_trace_ref=raw_trace_path.name,
        normalized_trace_ref='agent_trace.normalized.json',
        canonical_trace_ref='agent_trace.json',
        trace_schema_valid=valid,
        trace_normalized=True,
        trace_lossless=True,
        fallback_trace=False,
        raw_step_count=raw_step_count,
        normalized_step_count=normalized_step_count,
        discarded_step_count=0,
        trace_normalization_partial=(not valid) or trace.get('metadata_json', {}).get('trace_normalization_status') == 'partial',
    )

    # normalize_agent_trace does not return its report, so re-run the
    # per-step normalization into a temporary report to retain repair counters.
    counter_report = TraceNormalizationReport(task_id=context.task_id, attempt_id=context.attempt_id, attempt_kind=context.attempt_kind)
    used = set()
    for i, step in enumerate(raw_steps, 1):
        _normalize_step(step, i, used, counter_report)
    comp.input_dict_to_string_count = counter_report.input_dict_to_string_count
    comp.output_dict_to_string_count = counter_report.output_dict_to_string_count
    comp.causal_type_repair_count = counter_report.causal_type_repair_count
    comp.operation_repair_count = counter_report.operation_repair_count
    comp.action_repair_count = counter_report.action_repair_count
    comp.warnings.extend(counter_report.warnings)

    comp.trace_lossless = comp.raw_step_count == comp.normalized_step_count and comp.discarded_step_count == 0
    if not comp.trace_lossless:
        comp.validation_errors.append('normalized step count is less than raw step count')
    if not valid:
        comp.validation_errors.extend(errors)
    trace.setdefault('metadata_json', {})['schema_valid'] = valid
    # NOTE(review): this status is recomputed from the step-level warnings
    # only. Warnings raised inside normalize_agent_trace itself (e.g. the
    # renumbering warning) are not in comp.warnings, so the value written here
    # can disagree with comp.trace_normalization_partial -- confirm intent.
    trace['metadata_json']['trace_normalization_status'] = 'normalized' if valid and not comp.warnings else 'partial'
    return NormalizationResult(normalized_trace=trace, report=comp, fallback_required=False)