agent-apprenticeship 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +217 -0
- package/bin/agent-apprenticeship.js +131 -0
- package/package.json +30 -0
- package/pyproject.toml +23 -0
- package/src/agent_apprenticeship_trace/__init__.py +2 -0
- package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
- package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
- package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
- package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
- package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
- package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
- package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
- package/src/agent_apprenticeship_trace/certification.py +580 -0
- package/src/agent_apprenticeship_trace/cli.py +2979 -0
- package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
- package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
- package/src/agent_apprenticeship_trace/config.py +609 -0
- package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
- package/src/agent_apprenticeship_trace/env.py +46 -0
- package/src/agent_apprenticeship_trace/evaluator.py +64 -0
- package/src/agent_apprenticeship_trace/grader.py +194 -0
- package/src/agent_apprenticeship_trace/integration_status.py +193 -0
- package/src/agent_apprenticeship_trace/io.py +20 -0
- package/src/agent_apprenticeship_trace/learning.py +627 -0
- package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
- package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
- package/src/agent_apprenticeship_trace/loop.py +111 -0
- package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
- package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
- package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
- package/src/agent_apprenticeship_trace/progress.py +223 -0
- package/src/agent_apprenticeship_trace/public_run.py +1109 -0
- package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
- package/src/agent_apprenticeship_trace/recipes.py +129 -0
- package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
- package/src/agent_apprenticeship_trace/revision.py +21 -0
- package/src/agent_apprenticeship_trace/role_runners.py +7 -0
- package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
- package/src/agent_apprenticeship_trace/schemas.py +273 -0
- package/src/agent_apprenticeship_trace/session_events.py +99 -0
- package/src/agent_apprenticeship_trace/task_intake.py +112 -0
- package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
- package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
- package/src/agent_apprenticeship_trace/training_signals.py +30 -0
- package/src/agent_apprenticeship_trace/validation.py +210 -0
- package/src/agent_apprenticeship_trace/verifier.py +55 -0
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import shutil, subprocess
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import re
|
|
5
|
+
from .schemas import AgentTrace, AgentTraceStep, ActualOutputs, RawTaskRecord, TaskIntakeSpec
|
|
6
|
+
from .trace_prompt import build_worker_prompt
|
|
7
|
+
from .io import write_json
|
|
8
|
+
from .config import get_settings
|
|
9
|
+
from .env import redact_secrets
|
|
10
|
+
from .contract_diagnostics import build_contract_diagnostics, diagnostics_text
|
|
11
|
+
from .trace_normalizer import TraceNormalizationContext, repair_agent_trace_file, TraceNormalizationReport
|
|
12
|
+
from .actual_outputs_normalizer import ActualOutputsNormalizationContext, repair_actual_outputs_file, write_actual_outputs_normalization
|
|
13
|
+
from .public_sanitizer import classify_provider_failure, public_error_summary
|
|
14
|
+
|
|
15
|
+
class AttemptResult(dict):
    """Plain ``dict`` subclass returned by the attempt runners in this module."""
|
|
16
|
+
|
|
17
|
+
# Note appended to the captured stderr when Codex rejects an untrusted workspace.
CODEX_TRUST_RETRY_MESSAGE = (
    "Codex refused to run because the workspace is not a trusted Git directory. "
    "Retrying with --skip-git-repo-check if supported."
)
|
|
18
|
+
|
|
19
|
+
def _attempt_dir(package_root: Path, attempt_kind: str) -> Path:
    """Return ``<package_root>/attempts/<attempt_kind>``, creating it on demand.

    The ``artifacts`` and ``input`` sub-directories are created as well
    (missing parents included); directories that already exist are left as-is.
    """
    attempt_root = package_root / 'attempts' / attempt_kind
    for child in ('artifacts', 'input'):
        attempt_root.joinpath(child).mkdir(parents=True, exist_ok=True)
    return attempt_root
|
|
24
|
+
|
|
25
|
+
def _copy_attempt_inputs(package_root: Path, attempt_dir: Path) -> list[str]:
    """Copy ``<package_root>/input`` into ``<attempt_dir>/input``.

    Regular files are copied with ``shutil.copy2``; directories are copied
    recursively with ``shutil.copytree`` (merging into an existing target).
    Entries that are neither a file nor a directory are skipped.

    Args:
        package_root: Directory that may contain an ``input`` directory.
        attempt_dir: Attempt workspace; its ``input`` directory is created
            if it does not exist yet.

    Returns:
        Names of the copied top-level entries, sorted by name. Empty when
        ``<package_root>/input`` is missing or is not a directory.
    """
    src = package_root / 'input'
    dst = attempt_dir / 'input'
    dst.mkdir(parents=True, exist_ok=True)
    # ``exists()`` alone would accept a stray *file* called ``input`` and then
    # ``iterdir()`` would raise NotADirectoryError.
    if not src.is_dir():
        return []
    copied = []
    # ``iterdir()`` yields entries in arbitrary order; sort so the returned
    # list is reproducible from run to run.
    for entry in sorted(src.iterdir(), key=lambda p: p.name):
        target = dst / entry.name
        if entry.is_file():
            shutil.copy2(entry, target)
        elif entry.is_dir():
            shutil.copytree(entry, target, dirs_exist_ok=True)
        else:
            continue
        copied.append(entry.name)
    return copied
|
|
36
|
+
|
|
37
|
+
def parse_expected_deliverables(text: str | None) -> list[str]:
    """Extract deliverable file names from a free-form description.

    The text is split on ``+``, ``,``, ``;``, newlines and the standalone word
    ``and``. Each fragment is stripped of surrounding whitespace and of the
    characters `` ` * • - `` and contributes at most one file name: the first
    token ending in a recognised extension (matched case-insensitively).

    Args:
        text: Description to scan; ``None`` or an empty string yields ``[]``.

    Returns:
        File names in order of first appearance, without duplicates.
    """
    if not text:
        return []
    found = []
    for fragment in re.split(r'\s*(?:\+|,|;|\band\b|\n)\s*', str(text)):
        fragment = fragment.strip().strip('`*•- ')
        if not fragment:
            continue
        # The trailing \b stops a short extension from matching the prefix of
        # a longer one (``trace.jsonl`` used to come back as ``trace.json``).
        hit = re.search(
            r'[A-Za-z0-9_.-]+\.(?:csv|xlsx|xls|jsonl|json|md|txt|pdf|html|xml|py|zip)\b',
            fragment,
            re.I,
        )
        if hit:
            found.append(hit.group(0))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))
|
|
50
|
+
|
|
51
|
+
def _deliverables(raw: RawTaskRecord, spec: TaskIntakeSpec) -> list[str]:
    """Work out which deliverable file names an attempt should produce.

    The record's ``expected_deliverable`` (attribute first, then the
    ``raw_payload`` entry) is parsed with ``parse_expected_deliverables``.
    If that yields nothing, every output requirement (from ``raw_payload`` or,
    failing that, from ``spec``) that mentions a known file extension is
    parsed the same way. ``['output.txt']`` is returned when no name is found.
    """
    declared = raw.expected_deliverable or raw.raw_payload.get('expected_deliverable')
    names = parse_expected_deliverables(declared)
    if names:
        return names

    requirements = raw.raw_payload.get('output_requirements') or spec.output_requirements or []
    names = [
        name
        for requirement in map(str, requirements)
        if re.search(r'\.(?:csv|xlsx|xls|json|jsonl|md|txt|pdf|html|xml|py|zip)\b', requirement, re.I)
        for name in parse_expected_deliverables(requirement)
    ]
    return names if names else ['output.txt']
|
|
63
|
+
|
|
64
|
+
def _fallback_actual(spec: TaskIntakeSpec, attempt_kind: str, error_type: str, error_message: str) -> ActualOutputs:
    """Build a ``failed`` ``ActualOutputs`` record that points at the raw logs.

    ``classify_provider_failure(error_message)`` is consulted first: its
    ``error_type`` (when present) overrides *error_type*, its entries are
    merged into ``metadata_json``, and a non-empty classification switches the
    stored message from ``redact_secrets`` to ``public_error_summary``.
    """
    classification = classify_provider_failure(error_message)
    resolved_type = classification.get('error_type') or error_type
    if classification:
        resolved_message = public_error_summary(error_message)
    else:
        resolved_message = redact_secrets(error_message)

    base = f'attempts/{attempt_kind}'
    prompt_ref = f'{base}/prompt.md'
    stdout_ref = f'{base}/stdout.txt'
    stderr_ref = f'{base}/stderr.txt'
    final_ref = f'{base}/final_message.txt'

    return ActualOutputs(
        task_id=spec.task_id,
        attempt_id=f'{spec.task_id}_{attempt_kind}',
        attempt_kind=attempt_kind,
        status='failed',
        output_summary='Attempt failed or did not produce a valid agent trace; raw logs were preserved.',
        primary_output_ref=None,
        deliverable_refs=[],
        final_message_ref=final_ref,
        artifact_refs=[prompt_ref, stdout_ref, stderr_ref, final_ref],
        files_created=[],
        files_modified=[],
        files_deleted=[],
        stdout_ref=stdout_ref,
        stderr_ref=stderr_ref,
        raw_log_refs=[stdout_ref, stderr_ref, final_ref],
        error_type=resolved_type,
        error_message=resolved_message,
        metadata_json={
            'trace_valid': False,
            'fallback_actual_outputs_created': True,
            **classification,
        },
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _apprentice_operational_error(error: object | None, stdout: str = "", stderr: str = "", returncode: int | None = None) -> str | None:
    """Describe why an apprentice run failed operationally, or return ``None``.

    *error*, *stdout* and *stderr* are joined and lower-cased, then checked in
    this order; the first match wins:

    1. missing ``codex`` executable (``FileNotFoundError`` or matching text)
    2. timeout (``subprocess.TimeoutExpired`` or "timed out"/"timeout" text)
    3. Codex workspace trust refusal
    4. Codex local state permission problem
    5. authentication / credential markers
    6. quota / billing / usage-limit markers
    7. a *returncode* that is neither ``None`` nor ``0``
    8. any other truthy *error* (its text passed through ``redact_secrets``)
    """
    haystack = f"{error or ''}\n{stdout or ''}\n{stderr or ''}".lower()

    def mentions(needles) -> bool:
        return any(needle in haystack for needle in needles)

    missing_binary = (
        "no such file or directory: 'codex'",
        "no such file or directory: codex",
    )
    if isinstance(error, FileNotFoundError) or mentions(missing_binary):
        return "Apprentice Agent command not found: codex"

    if isinstance(error, subprocess.TimeoutExpired) or mentions(("timed out", "timeout")):
        return "Apprentice Agent timed out while running Codex."

    if _is_codex_workspace_trust_error(stdout, stderr, error):
        return "Codex workspace trust error: Codex refused to run because the workspace is not a trusted Git directory. Use a Codex CLI with --skip-git-repo-check support or run from a trusted Git repository."

    if _is_codex_local_state_permission_error(stdout, stderr, error):
        return "Codex local state permission error: Codex could not initialize its local state or app-server client. Check that CODEX_HOME is writable and Codex can access its state directory."

    auth_markers = (
        "not authenticated",
        "not logged in",
        "login required",
        "please login",
        "please log in",
        "authentication failed",
        "auth failed",
        "unauthorized",
        "http 401",
        "status 401",
        "status code 401",
        "401 unauthorized",
        "missing api key",
        "api key not configured",
        "invalid api key",
        "credential error",
        "credentials not configured",
    )
    if mentions(auth_markers):
        return "Apprentice Agent setup error: Codex is not authenticated or configured."

    quota_markers = (
        "quota",
        "rate limit",
        "billing",
        "insufficient quota",
        "insufficient credits",
        "out of credits",
        "credit limit",
        "usage limit",
    )
    if mentions(quota_markers):
        return "Apprentice Agent provider quota or usage limit reached."

    if returncode is not None and returncode != 0:
        return f"Apprentice Agent exited before producing required outputs (exit code {returncode})."

    if error:
        return f"Apprentice Agent operational error: {redact_secrets(str(error))}"

    return None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _is_codex_workspace_trust_error(stdout: str = "", stderr: str = "", error: object | None = None) -> bool:
    """Tell whether the run output contains a Codex workspace trust refusal.

    *error*, *stdout* and *stderr* are joined, lower-cased and searched for
    any of the known phrases below.
    """
    combined = f"{error or ''}\n{stdout or ''}\n{stderr or ''}".lower()
    trust_phrases = (
        "not inside a trusted directory",
        "--skip-git-repo-check was not specified",
        "trusted git directory",
    )
    return any(phrase in combined for phrase in trust_phrases)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _is_codex_local_state_permission_error(stdout: str = "", stderr: str = "", error: object | None = None) -> bool:
    """Tell whether the run output shows Codex failing to set up its local state.

    True when the lower-cased text of *error*, *stdout* and *stderr* contains
    "readonly database" together with one of the state-db markers, or the
    in-process app-server client failure together with "operation not
    permitted".
    """
    combined = f"{error or ''}\n{stdout or ''}\n{stderr or ''}".lower()

    state_markers = (
        "codex_state",
        "failed to open state db",
        "failed to initialize state runtime",
    )
    if "readonly database" in combined and any(marker in combined for marker in state_markers):
        return True

    return (
        "failed to initialize in-process app-server client" in combined
        and "operation not permitted" in combined
    )
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def codex_exec_help(command: str = "codex") -> str:
    """Return the help text printed by the Codex CLI, best effort.

    ``<command> exec --help`` is run first; if launching it raises,
    ``<command> --help`` is tried instead. The result is the captured stdout
    and stderr joined by a newline. Each invocation is limited to 5 seconds.
    An empty string is returned when both invocations raise.
    """
    for argv in ([command, "exec", "--help"], [command, "--help"]):
        try:
            completed = subprocess.run(argv, cwd=None, text=True, capture_output=True, timeout=5)
            out_text = getattr(completed, "stdout", "") or ""
            err_text = getattr(completed, "stderr", "") or ""
            return out_text + "\n" + err_text
        except Exception:
            # Deliberately best effort: move on to the next variant.
            continue
    return ""
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _codex_exec_supports(flag: str, command: str = "codex") -> bool:
    """Report whether *flag* appears anywhere in ``codex_exec_help(command)``."""
    help_text = codex_exec_help(command)
    return flag in help_text
|
|
154
|
+
|
|
155
|
+
def _fallback_trace(spec: TaskIntakeSpec, attempt_kind: str, prompt: str, actual: ActualOutputs, error_type: str, error_message: str, codex_sandbox: str) -> AgentTrace:
    """Build a minimal two-step failure ``AgentTrace`` for an attempt.

    Step 1 records the user request; step 2 is a failed ``error`` step that
    carries the sanitised error text and references the raw log files. The
    provider-failure classification of *error_message* may override
    *error_type* and is merged into the trace's ``metadata_json``.
    """
    classification = classify_provider_failure(error_message)
    attempt_id = f'{spec.task_id}_{attempt_kind}'
    role = 'reviser' if attempt_kind == 'revised' else 'worker'
    actor = f'agent:{role}'
    if classification:
        safe_text = public_error_summary(error_message)
    else:
        # Only the last 3000 characters of the redacted text are kept.
        safe_text = redact_secrets(error_message)[-3000:]
    error_type = classification.get('error_type') or error_type

    base = f'attempts/{attempt_kind}'
    log_refs = [
        f'{base}/prompt.md',
        f'{base}/stdout.txt',
        f'{base}/stderr.txt',
        f'{base}/final_message.txt',
    ]

    request_step = AgentTraceStep(
        step=1,
        turn=1,
        actor='user',
        action='user_message',
        input=spec.normalized_instruction,
        message_role='direct_request',
    )
    failure_step = AgentTraceStep(
        step=2,
        turn=1,
        actor=actor,
        action='error',
        operation='other',
        tool='codex_cli',
        observation='Codex attempt ended without a valid agent_trace.json or actual_outputs.json.',
        input='Validate required output contract for ./agent_trace.json and ./actual_outputs.json.',
        output=safe_text,
        state_change='A minimal failure trace and failed actual_outputs.json were written by the runner.',
        reasoning='The package must preserve failure evidence without fabricating a successful detailed trace.',
        caused_by=[1],
        causal_type='dependency_on_tool_result',
        success=False,
        step_outcome='failed',
        error_type=error_type,
        error_message=safe_text,
        artifact_refs=log_refs,
    )
    steps = [request_step, failure_step]

    return AgentTrace(
        trace_id=f'trace_{attempt_id}_fallback',
        collection_id=None,
        trace_mode='live',
        task=spec.normalized_instruction,
        task_id=spec.task_id,
        attempt_id=attempt_id,
        attempt_kind=attempt_kind if attempt_kind in ['baseline', 'revised'] else 'other',
        agent_tools=['codex_cli', 'Bash', 'python', 'file_read', 'file_write'],
        system_prompt=prompt,
        system_prompt_hash=None,
        skills=['agent_trace_skill'],
        learning='When a live agent cannot write outputs, preserve logs and create a minimal failure trace.',
        termination_reason='agent_blocked',
        steps=steps,
        actual_outputs=actual,
        artifacts=[],
        metadata_json={
            'trace_valid': False,
            'fallback_trace_created': True,
            'codex_sandbox': codex_sandbox,
            **classification,
        },
    )
|
|
166
|
+
|
|
167
|
+
def _write_fallback_report(d: Path, spec: TaskIntakeSpec, attempt_kind: str, reason: str, raw_parse_error: bool=False) -> None:
    """Write ``trace_normalization_report.json`` for a fallback trace into *d*.

    The report marks the trace as a fallback with two normalized steps and no
    raw steps, and stores *reason* both as the only validation error and as
    ``metadata_json['fallback_reason']``. The raw trace is referenced only if
    ``agent_trace.raw.json`` exists in *d*.
    """
    raw_name = 'agent_trace.raw.json'
    raw_ref = raw_name if (d / raw_name).exists() else None
    report = TraceNormalizationReport(
        task_id=spec.task_id,
        attempt_id=f'{spec.task_id}_{attempt_kind}',
        attempt_kind=attempt_kind,
        raw_trace_ref=raw_ref,
        normalized_trace_ref=None,
        canonical_trace_ref='agent_trace.json',
        trace_schema_valid=True,
        trace_normalized=False,
        trace_lossless=True,
        fallback_trace=True,
        raw_step_count=0,
        normalized_step_count=2,
        discarded_step_count=0,
        raw_trace_parse_error=raw_parse_error,
        validation_errors=[reason],
        metadata_json={'fallback_reason': reason},
    )
    write_json(d / 'trace_normalization_report.json', report)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def ensure_attempt_outputs(package_root: Path, spec: TaskIntakeSpec, attempt_kind: str, prompt: str, codex_sandbox: str, validation_error: Exception | None=None) -> tuple[AgentTrace, ActualOutputs, bool]:
    """Make sure an attempt directory holds usable outputs and a usable trace.

    ``actual_outputs.json`` is repaired via ``repair_actual_outputs_file``; if
    that yields nothing, a failed fallback record is written instead. Then:

    * if ``agent_trace.json`` exists, it is preserved as
      ``agent_trace.raw.json`` (unless that file is already there) and
      normalized; a normalized trace with steps is written back and returned;
    * otherwise, or when normalization produced no steps, a minimal fallback
      trace plus a fallback normalization report are written.

    Returns:
        ``(trace, actual_outputs, trace_valid)``. ``trace_valid`` is the
        report's ``trace_schema_valid`` for a normalized trace and ``False``
        for every fallback trace.
    """
    d = _attempt_dir(package_root, attempt_kind)
    attempt_id = f'{spec.task_id}_{attempt_kind}'
    actual_path = d / 'actual_outputs.json'
    trace_path = d / 'agent_trace.json'
    raw_path = d / 'agent_trace.raw.json'
    err = validation_error or FileNotFoundError('agent_trace.json missing or invalid')

    def log_digest(headline: str) -> str:
        """Combine *headline* with the saved final message and stderr, if any."""
        final_path = d / 'final_message.txt'
        stderr_path = d / 'stderr.txt'
        final = final_path.read_text(errors='ignore') if final_path.exists() else ''
        stderr = stderr_path.read_text(errors='ignore') if stderr_path.exists() else ''
        return f'{headline}\n\nfinal_message:\n{final}\n\nstderr:\n{stderr}'

    # --- actual_outputs.json -------------------------------------------------
    spec_as_raw = RawTaskRecord(
        raw_task_id=spec.task_id,
        source_kind='normalized_spec',
        raw_title=spec.normalized_title,
        raw_description=spec.normalized_instruction,
        raw_payload={'expected_deliverable': spec.expected_agent_deliverable},
    )
    required_artifacts = _deliverables(spec_as_raw, spec)
    actual_ctx = ActualOutputsNormalizationContext(
        task_id=spec.task_id,
        attempt_id=attempt_id,
        attempt_kind=attempt_kind,
        package_root=package_root,
        required_artifacts=required_artifacts,
    )
    actual_result = repair_actual_outputs_file(actual_path, actual_ctx)
    if actual_result.actual_outputs is None:
        actual = _fallback_actual(spec, attempt_kind, type(err).__name__, str(err))
        write_json(actual_path, actual)
        write_json(d / 'actual_outputs_normalization_report.json', actual_result.report)
    else:
        write_actual_outputs_normalization(d, actual_result)
        actual = ActualOutputs.model_validate(actual_result.actual_outputs)

    # --- agent_trace.json: nothing on disk -----------------------------------
    if not trace_path.exists():
        msg = log_digest('missing_trace')
        trace = _fallback_trace(spec, attempt_kind, prompt, actual, 'FileNotFoundError', msg, codex_sandbox)
        trace.metadata_json['fallback_reason'] = classify_provider_failure(msg).get('fallback_reason') or 'missing_trace'
        write_json(trace_path, trace)
        _write_fallback_report(d, spec, attempt_kind, 'missing_trace')
        return trace, actual, False

    # --- agent_trace.json: present, try to normalize -------------------------
    original_text = trace_path.read_text(errors='ignore')
    if not raw_path.exists():
        raw_path.write_text(original_text)
    context = TraceNormalizationContext(
        task_id=spec.task_id,
        attempt_id=attempt_id,
        attempt_kind=attempt_kind,
        task=spec.normalized_instruction,
        actual_outputs=actual,
    )
    result = repair_agent_trace_file(raw_path, context)

    normalized = result.normalized_trace
    if normalized and (result.report.raw_step_count or normalized.get('steps')):
        try:
            AgentTrace.model_validate_json(raw_path.read_text())
        except Exception:
            # The raw file does not validate as-is: keep a copy of it.
            (d / 'agent_trace.invalid.json').write_text(raw_path.read_text())
        write_json(d / 'agent_trace.normalized.json', normalized)
        write_json(d / 'agent_trace.json', normalized)
        write_json(d / 'trace_normalization_report.json', result.report)
        return AgentTrace.model_validate(normalized), actual, bool(result.report.trace_schema_valid)

    reason = result.parse_error or 'parseable trace contained no steps'
    msg = log_digest(reason)
    trace = _fallback_trace(spec, attempt_kind, prompt, actual, 'TraceNormalizationError', msg, codex_sandbox)
    default_reason = 'unparseable_trace' if result.fallback_required else 'missing_steps'
    trace.metadata_json['fallback_reason'] = classify_provider_failure(msg).get('fallback_reason') or default_reason
    write_json(trace_path, trace)
    _write_fallback_report(d, spec, attempt_kind, trace.metadata_json['fallback_reason'], result.fallback_required)
    return trace, actual, False
|
|
213
|
+
|
|
214
|
+
def deterministic_attempt(package_root: Path, raw: RawTaskRecord, spec: TaskIntakeSpec, attempt_kind='baseline', feedback: str | None=None) -> AttemptResult:
    """Produce a canned, always-successful attempt without calling any agent.

    Writes the prompt and placeholder log files, one small text artifact per
    expected deliverable under ``artifacts/``, the (repaired)
    ``actual_outputs.json``, three copies of a fixed three-step trace and a
    trace normalization report. *feedback* is accepted but not used here.

    Returns:
        ``AttemptResult`` with ``attempt_dir``, ``trace_valid=True``, ``trace``
        and ``actual_outputs``.
    """
    attempt_id = f'{spec.task_id}_{attempt_kind}'
    d = _attempt_dir(package_root, attempt_kind)
    input_files = _copy_attempt_inputs(package_root, d)
    deliverables = _deliverables(raw, spec)
    settings = get_settings()
    prompt = build_worker_prompt(
        spec.normalized_instruction,
        '',
        attempt_kind,
        input_files,
        deliverables,
        settings.sensitive_info_masking,
        workspace_path=str(d),
    )

    # Placeholder logs so the attempt directory has the same layout as a live run.
    (d / 'prompt.md').write_text(prompt)
    (d / 'stdout.txt').write_text('deterministic runner completed\n')
    (d / 'stderr.txt').write_text('')
    (d / 'final_message.txt').write_text(f'Deterministic {attempt_kind} attempt complete.\n')

    base = f'attempts/{attempt_kind}'
    stdout_ref = f'{base}/stdout.txt'
    stderr_ref = f'{base}/stderr.txt'

    files = []
    for name in deliverables:
        if '.' in name and '/' not in name:
            fname = name
        else:
            fname = name.lower().replace(' ', '_') + '.txt'
        (d / 'artifacts' / fname).write_text(f'{attempt_kind} deterministic artifact for {name}\n')
        files.append(f'{base}/artifacts/{fname}')

    actual = ActualOutputs(
        task_id=spec.task_id,
        attempt_id=attempt_id,
        attempt_kind=attempt_kind,
        status='success',
        output_summary=f'Deterministic {attempt_kind} outputs for {spec.normalized_title}',
        primary_output_ref=files[0] if files else None,
        deliverable_refs=files,
        final_message_ref=f'{base}/final_message.txt',
        artifact_refs=files,
        files_created=files,
        files_modified=[],
        files_deleted=[],
        stdout_ref=stdout_ref,
        stderr_ref=stderr_ref,
        raw_log_refs=[stdout_ref, stderr_ref],
        error_type=None,
        error_message=None,
        metadata_json={
            'runner': 'deterministic',
            'expected_deliverable_items': deliverables,
            'produced_deliverable_items': [Path(f).name for f in files],
        },
    )

    role = 'reviser' if attempt_kind == 'revised' else 'worker'
    actor = f'agent:{role}'
    steps = [
        AgentTraceStep(
            step=1,
            turn=1,
            actor='user',
            action='user_message',
            input=spec.normalized_instruction,
            message_role='direct_request',
        ),
        AgentTraceStep(
            step=2,
            turn=1,
            actor=actor,
            action='agent_step',
            operation='plan',
            tool=None,
            observation='Task instructions, ./input files, and worker-visible rubric are available.',
            input='Review task and required deliverables.',
            output='Plan created.',
            state_change='The attempt plan identified required artifacts.',
            reasoning='A short plan reduces missed deliverables.',
            caused_by=[1],
            causal_type='user_request',
            success=True,
            step_outcome='progress',
        ),
        AgentTraceStep(
            step=3,
            turn=1,
            actor=actor,
            action='agent_step',
            operation='write',
            tool='file_write',
            observation='Deliverable names are known.',
            input=', '.join(files),
            output='Artifacts written.',
            state_change='Required deterministic artifacts were created under ./artifacts.',
            reasoning='Writing explicit files satisfies the artifact contract.',
            caused_by=[2],
            causal_type='execution_of_plan',
            success=True,
            step_outcome='completed',
            artifact_refs=files,
        ),
    ]
    trace = AgentTrace(
        trace_id=f'trace_{attempt_id}',
        collection_id=None,
        trace_mode='live',
        task=spec.normalized_instruction,
        task_id=spec.task_id,
        attempt_id=attempt_id,
        attempt_kind=attempt_kind if attempt_kind in ['baseline', 'revised'] else 'other',
        agent_tools=['deterministic_runner', 'file_write'],
        system_prompt=prompt,
        system_prompt_hash=None,
        skills=['agent_trace_skill'],
        learning='Materialize required artifacts and keep evaluation outside the trace.',
        termination_reason='task_complete',
        steps=steps,
        actual_outputs=actual,
        artifacts=[],
        metadata_json={'trace_valid': True},
    )

    # Round-trip the outputs through the same repair step used for live runs.
    actual_file = d / 'actual_outputs.json'
    write_json(actual_file, actual)
    actual_ctx = ActualOutputsNormalizationContext(
        task_id=spec.task_id,
        attempt_id=attempt_id,
        attempt_kind=attempt_kind,
        package_root=package_root,
        required_artifacts=deliverables,
    )
    actual_result = repair_actual_outputs_file(actual_file, actual_ctx)
    write_actual_outputs_normalization(d, actual_result)
    actual = ActualOutputs.model_validate(actual_result.actual_outputs)

    # The same trace is stored as raw, normalized and canonical, in that order.
    for trace_name in ('agent_trace.raw.json', 'agent_trace.normalized.json', 'agent_trace.json'):
        write_json(d / trace_name, trace)

    step_total = len(steps)
    report = TraceNormalizationReport(
        task_id=spec.task_id,
        attempt_id=attempt_id,
        attempt_kind=attempt_kind,
        raw_trace_ref='agent_trace.raw.json',
        normalized_trace_ref='agent_trace.normalized.json',
        canonical_trace_ref='agent_trace.json',
        trace_schema_valid=True,
        trace_normalized=False,
        trace_lossless=True,
        fallback_trace=False,
        raw_step_count=step_total,
        normalized_step_count=step_total,
        discarded_step_count=0,
    )
    write_json(d / 'trace_normalization_report.json', report)
    return AttemptResult(attempt_dir=str(d), trace_valid=True, trace=trace, actual_outputs=actual)
|
|
246
|
+
|
|
247
|
+
def codex_command(
    prompt: str,
    sandbox: str,
    workspace: Path | str | None = None,
    *,
    command: str = "codex",
    skip_git_repo_check_supported: bool | None = None,
    ask_for_approval_supported: bool | None = None,
) -> list[str]:
    """Assemble the argv list for a ``<command> exec`` invocation.

    Args:
        prompt: Prompt text; always the last argument.
        sandbox: Value passed to ``--sandbox``.
        workspace: When not ``None``, passed to ``--cd`` as a string.
        command: Executable to invoke.
        skip_git_repo_check_supported: Whether to add ``--skip-git-repo-check``.
            ``None`` means "look for the flag in the CLI help text".
        ask_for_approval_supported: Whether to add ``--ask-for-approval never``.
            ``None`` means "look for the flag in the CLI help text".

    Returns:
        The argument list: executable, ``exec``, optional ``--cd``,
        ``--sandbox``, the optional flags, then *prompt*.
    """
    argv = [command, 'exec']
    if workspace is not None:
        argv += ['--cd', str(workspace)]
    argv += ['--sandbox', sandbox]

    # The approval flag is resolved (and possibly probed) before the skip flag.
    use_ask = ask_for_approval_supported
    if use_ask is None:
        use_ask = _codex_exec_supports('--ask-for-approval', command)
    if use_ask:
        argv += ['--ask-for-approval', 'never']

    use_skip = skip_git_repo_check_supported
    if use_skip is None:
        use_skip = _codex_exec_supports('--skip-git-repo-check', command)
    if use_skip:
        argv.append('--skip-git-repo-check')

    argv.append(prompt)
    return argv
|
|
270
|
+
|
|
271
|
+
def run_codex_attempt(package_root: Path, raw: RawTaskRecord, spec: TaskIntakeSpec, attempt_kind='baseline', timeout=900) -> AttemptResult:
    """Run one attempt through the Codex CLI and record everything it produced.

    Steps: prepare the attempt directory and prompt, probe the CLI help for
    optional flags, run the command in the attempt directory, save redacted
    stdout/stderr/final message, append contract diagnostics when an expected
    output file is missing, then let ``ensure_attempt_outputs`` repair or
    synthesise the trace and outputs. Run metadata and any operational error
    description are stored on both before they are written back.

    Returns:
        ``AttemptResult`` with ``attempt_dir``, ``trace_valid``, ``trace``,
        ``actual_outputs`` and ``codex_sandbox``.
    """
    d = _attempt_dir(package_root, attempt_kind)
    input_files = _copy_attempt_inputs(package_root, d)
    deliverables = _deliverables(raw, spec)
    settings = get_settings()
    sandbox = settings.codex_sandbox or 'workspace-write'
    codex_executable = settings.worker_agent_command or 'codex'

    rubric_path = package_root / 'rubric' / 'worker_visible_rubric.md'
    rubric_md = rubric_path.read_text() if rubric_path.exists() else ''
    prompt = build_worker_prompt(
        spec.normalized_instruction,
        rubric_md,
        attempt_kind,
        input_files,
        deliverables,
        settings.sensitive_info_masking,
        workspace_path=str(d),
    )
    (d / 'prompt.md').write_text(prompt)

    skip_supported = _codex_exec_supports('--skip-git-repo-check', codex_executable)
    ask_supported = _codex_exec_supports('--ask-for-approval', codex_executable)
    cmd = codex_command(
        prompt,
        sandbox,
        d,
        command=codex_executable,
        skip_git_repo_check_supported=skip_supported,
        ask_for_approval_supported=ask_supported,
    )

    stdout_file = d / 'stdout.txt'
    stderr_file = d / 'stderr.txt'
    final_file = d / 'final_message.txt'

    run_error = None
    returncode = None
    stdout = ''
    stderr = ''
    try:
        cp = subprocess.run(cmd, cwd=d, text=True, capture_output=True, timeout=timeout)
        returncode = cp.returncode
        stdout = cp.stdout or ''
        stderr = cp.stderr or ''
        if cp.returncode != 0 and _is_codex_workspace_trust_error(stdout, stderr):
            if not skip_supported:
                stderr = f"{stderr}\n{CODEX_TRUST_RETRY_MESSAGE}\nCodex CLI help did not list --skip-git-repo-check, so Agent Apprenticeship could not retry safely."
            elif '--skip-git-repo-check' not in cmd:
                # NOTE(review): cmd is built with skip_supported above, so the
                # flag looks always present here when skip_supported is true --
                # this retry path may be unreachable; confirm.
                stderr = f"{stderr}\n{CODEX_TRUST_RETRY_MESSAGE}"
                retry_cmd = codex_command(
                    prompt,
                    sandbox,
                    d,
                    command=codex_executable,
                    skip_git_repo_check_supported=True,
                    ask_for_approval_supported=ask_supported,
                )
                cp = subprocess.run(retry_cmd, cwd=d, text=True, capture_output=True, timeout=timeout)
                cmd = retry_cmd
                returncode = cp.returncode
                stdout = (stdout or '') + "\n" + (cp.stdout or '')
                stderr = (stderr or '') + "\n" + (cp.stderr or '')
        stdout_file.write_text(redact_secrets(stdout))
        stderr_file.write_text(redact_secrets(stderr))
        # The final message is the last 4000 characters of stdout, or of
        # stderr when stdout is empty.
        final_file.write_text(redact_secrets((stdout or stderr or '')[-4000:]))
        if cp.returncode != 0:
            described = _apprentice_operational_error(None, stdout, stderr, cp.returncode)
            run_error = RuntimeError(described or f'Codex exited with code {cp.returncode}.')
    except Exception as e:
        run_error = e
        stdout_file.write_text('')
        stderr_file.write_text(redact_secrets(str(e)))
        final_file.write_text('Codex run failed before producing validated trace.')

    # Diagnostics are gathered only when an expected output file is missing.
    contract_diagnostics = None
    if not ((d / 'agent_trace.json').exists() and (d / 'actual_outputs.json').exists()):
        contract_diagnostics = build_contract_diagnostics(d, command=cmd, working_directory=d, agent_display_name='Codex', prompt=prompt)
        with final_file.open('a') as f:
            f.write('\n\n' + diagnostics_text(contract_diagnostics))

    trace, actual, trace_valid = ensure_attempt_outputs(package_root, spec, attempt_kind, prompt, sandbox, run_error)

    trace.metadata_json['codex_sandbox'] = sandbox
    # The prompt is replaced by a placeholder in the recorded command.
    trace.metadata_json['codex_command'] = ['<prompt>' if part == prompt else part for part in cmd]
    trace.metadata_json['codex_skip_git_repo_check_supported'] = skip_supported
    trace.metadata_json['codex_skip_git_repo_check_used'] = '--skip-git-repo-check' in cmd
    trace.metadata_json['trace_valid'] = trace_valid
    if contract_diagnostics:
        trace.metadata_json['apprentice_agent_contract_diagnostics'] = contract_diagnostics
    write_json(d / 'agent_trace.json', trace)

    if actual.metadata_json is None:
        actual.metadata_json = {}
    actual.metadata_json['codex_sandbox'] = sandbox
    if contract_diagnostics:
        actual.metadata_json['apprentice_agent_contract_diagnostics'] = contract_diagnostics

    op_error = _apprentice_operational_error(run_error, stdout, stderr, returncode)
    exited_nonzero = returncode not in (None, 0)
    if op_error and exited_nonzero and trace_valid and actual.status == 'success':
        op_error = f"Apprentice Agent exited nonzero after producing required outputs (exit code {returncode})."
    if op_error or not trace_valid:
        actual.metadata_json['apprentice_agent_operational_error'] = op_error or 'Apprentice Agent did not produce a valid agent_trace.json; raw logs were preserved.'
        actual.error_message = op_error or actual.error_message
        trace.metadata_json['apprentice_agent_operational_error'] = actual.metadata_json['apprentice_agent_operational_error']

    write_json(d / 'actual_outputs.json', actual)
    write_json(d / 'agent_trace.json', trace)
    return AttemptResult(attempt_dir=str(d), trace_valid=trace_valid, trace=trace, actual_outputs=actual, codex_sandbox=sandbox)
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def run_custom_attempt(package_root: Path, raw: RawTaskRecord, spec: TaskIntakeSpec, attempt_kind='baseline', timeout: int | None=None) -> AttemptResult:
    """Run one attempt using the user-configured custom Apprentice Agent command.

    The function builds the worker prompt, writes it to ``prompt.md`` inside
    the attempt directory, expands the configured command template and runs
    the result through the shell with the attempt directory as ``cwd``.
    Redacted stdout/stderr and a short final message are written next to the
    prompt, and ``agent_trace.json`` / ``actual_outputs.json`` are rewritten
    with extra metadata before returning.

    Args:
        package_root: Task package directory. The worker-visible rubric is
            read from ``package_root/'rubric'/'worker_visible_rubric.md'``
            when that file exists, and ``{run_dir}`` expands to
            ``package_root.parent.parent``.
        raw: Raw task record; only forwarded to ``_deliverables``.
        spec: Task intake spec; ``spec.normalized_instruction`` feeds both the
            prompt and the ``{task_instruction}`` placeholder.
        attempt_kind: Label forwarded to ``_attempt_dir``,
            ``build_worker_prompt`` and ``ensure_attempt_outputs``.
        timeout: Timeout in seconds for the subprocess. Any falsy value
            (``None`` or ``0``) falls back to ``settings.task_timeout_seconds``.

    Returns:
        An ``AttemptResult``. When no command template is configured it
        carries ``custom_worker_error``; otherwise it carries
        ``custom_worker_returncode`` (``None`` if the process never
        completed, e.g. it could not be started or timed out).
    """
    settings=get_settings()
    d=_attempt_dir(package_root, attempt_kind)
    input_files=_copy_attempt_inputs(package_root, d)
    deliverables=_deliverables(raw, spec)
    # A missing rubric file is not an error; the prompt is built with an empty rubric.
    rubric_md=(package_root/'rubric'/'worker_visible_rubric.md').read_text() if (package_root/'rubric'/'worker_visible_rubric.md').exists() else ''
    prompt=build_worker_prompt(spec.normalized_instruction, rubric_md, attempt_kind, input_files, deliverables, settings.sensitive_info_masking, workspace_path=str(d))
    prompt_file=d/'prompt.md'
    prompt_file.write_text(prompt)
    template=settings.custom_worker_command_template
    if not template:
        # Configuration error path: nothing is executed, but the same set of
        # log files and JSON outputs is still written for this attempt.
        err=RuntimeError('Custom Apprentice Agent is configured without a command template.')
        (d/'stdout.txt').write_text('')
        (d/'stderr.txt').write_text(str(err))
        (d/'final_message.txt').write_text('Custom Apprentice Agent configuration error.')
        contract_diagnostics = build_contract_diagnostics(d, command='custom-agent', working_directory=d, agent_display_name=settings.custom_worker_display_name or 'Custom', prompt=prompt)
        with (d/'final_message.txt').open('a') as f:
            f.write('\n\n' + diagnostics_text(contract_diagnostics))
        # presumably ensure_attempt_outputs loads or synthesizes the trace and
        # outputs for this attempt; its exact behavior is not visible here.
        trace, actual, trace_valid=ensure_attempt_outputs(package_root, spec, attempt_kind, prompt, 'custom', err)
        if actual.metadata_json is None:
            actual.metadata_json = {}
        actual.metadata_json['apprentice_agent_contract_diagnostics'] = contract_diagnostics
        write_json(d/'actual_outputs.json', actual)
        trace.metadata_json['apprentice_agent_contract_diagnostics'] = contract_diagnostics
        write_json(d/'agent_trace.json', trace)
        return AttemptResult(attempt_dir=str(d), trace_valid=trace_valid, trace=trace, actual_outputs=actual, custom_worker_error=str(err))
    # Placeholders supported in the command template, written as ``{name}``.
    replacements={
        'workspace': str(d),
        'prompt_file': str(prompt_file),
        'run_dir': str(package_root.parent.parent),
        'task_instruction': spec.normalized_instruction,
    }
    command=template
    # NOTE(review): values are substituted verbatim (no shell quoting) and the
    # result is executed with shell=True below, so shell metacharacters in
    # e.g. the task instruction are interpreted by the shell. Confirm the
    # instruction text is trusted, or quote values before substitution.
    for key, value in replacements.items():
        command=command.replace('{'+key+'}', value)
    run_error=None
    cp=None
    try:
        cp=subprocess.run(command, cwd=d, text=True, capture_output=True, timeout=timeout or settings.task_timeout_seconds, shell=True)
        stdout=redact_secrets(cp.stdout or '')
        stderr=redact_secrets(cp.stderr or '')
        (d/'stdout.txt').write_text(stdout)
        (d/'stderr.txt').write_text(stderr)
        # Final message: last 4000 characters of stdout, or of stderr when stdout is empty.
        (d/'final_message.txt').write_text(redact_secrets((cp.stdout or cp.stderr or '')[-4000:]))
        if cp.returncode != 0:
            # The message is chosen from the exit code alone; whether the
            # output files exist is only checked further down.
            run_error=RuntimeError(f'Apprentice Agent exited before producing required outputs (exit code {cp.returncode}).')
    except Exception as e:
        # Covers failures of subprocess.run itself (including its timeout)
        # as well as errors from the log writes above. stdout.txt is reset to
        # empty here, so any partial output is not kept.
        run_error=e
        (d/'stdout.txt').write_text('')
        (d/'stderr.txt').write_text(redact_secrets(str(e)))
        (d/'final_message.txt').write_text('Custom Apprentice Agent failed before producing validated trace.')
    contract_diagnostics = None
    # Diagnostics are only built when the agent left at least one of the two
    # required output files missing.
    if not (d/'agent_trace.json').exists() or not (d/'actual_outputs.json').exists():
        contract_diagnostics = build_contract_diagnostics(d, command=command, working_directory=d, agent_display_name=settings.custom_worker_display_name or 'Custom', prompt=prompt)
        with (d/'final_message.txt').open('a') as f:
            f.write('\n\n' + diagnostics_text(contract_diagnostics))
    trace, actual, trace_valid=ensure_attempt_outputs(package_root, spec, attempt_kind, prompt, 'custom', run_error)
    trace.metadata_json['custom_worker_display_name']=settings.custom_worker_display_name
    trace.metadata_json['custom_worker_command_template']=template
    trace.metadata_json['trace_valid']=trace_valid
    if contract_diagnostics:
        trace.metadata_json['apprentice_agent_contract_diagnostics']=contract_diagnostics
    write_json(d/'agent_trace.json', trace)
    if actual.metadata_json is None: actual.metadata_json={}
    actual.metadata_json['apprentice_agent']='custom'
    if contract_diagnostics:
        actual.metadata_json['apprentice_agent_contract_diagnostics']=contract_diagnostics
    if run_error:
        actual.metadata_json['apprentice_agent_operational_error']=str(run_error)
    write_json(d/'actual_outputs.json', actual)
    return AttemptResult(
        attempt_dir=str(d),
        trace_valid=trace_valid,
        trace=trace,
        actual_outputs=actual,
        # cp stays None when subprocess.run raised before returning.
        custom_worker_returncode=(cp.returncode if cp else None),
    )
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Executable names to probe for each known agent id, in priority order.
# resolve_agent_command tries these after any explicitly configured command.
AGENT_COMMAND_CANDIDATES: dict[str, list[str]] = {
    "codex": ["codex"],
    "cursor": ["cursor-agent", "cursor"],
    "claude-code": ["claude"],
    "openclaw": ["openclaw"],
    "opencode": ["opencode"],
    "hermes-agent": ["hermes"],
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def common_command_dirs() -> list[Path]:
    """Return the directories searched for agent executables, without duplicates.

    Directories from ``AA_AGENT_COMMAND_DIRS`` (split on ``os.pathsep``) come
    first, followed by a fixed list of well-known install locations. Every
    entry is user-expanded and the first occurrence of a path wins. When
    ``AA_DISABLE_LOCAL_ENV`` is ``"1"`` the result is an empty list.
    """
    if os.getenv("AA_DISABLE_LOCAL_ENV") == "1":
        return []

    user_home = Path.home()
    well_known = [Path("/opt/homebrew/bin"), Path("/usr/local/bin")]
    for relative in (
        ".local/bin",
        ".npm-global/bin",
        ".bun/bin",
        ".cargo/bin",
        ".volta/bin",
        ".yarn/bin",
        "Library/pnpm",
        ".opencode/bin",
        ".hermes/hermes-agent",
    ):
        well_known.append(user_home / relative)

    configured = []
    for entry in os.getenv("AA_AGENT_COMMAND_DIRS", "").split(os.pathsep):
        if entry:
            configured.append(Path(entry))

    # A dict keeps insertion order, so setdefault gives "first one wins"
    # de-duplication keyed on the expanded path string.
    unique: dict[str, Path] = {}
    for location in configured + well_known:
        resolved = location.expanduser()
        unique.setdefault(str(resolved), resolved)
    return list(unique.values())
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def explicit_command_dirs() -> list[Path]:
    """Return only the directories the user listed in ``AA_AGENT_COMMAND_DIRS``.

    The variable is split on ``os.pathsep``; empty segments are dropped and
    each remaining entry is user-expanded. Order and duplicates are kept as
    given. When ``AA_DISABLE_LOCAL_ENV`` is ``"1"`` the result is empty.
    """
    local_env_disabled = os.getenv("AA_DISABLE_LOCAL_ENV") == "1"
    if local_env_disabled:
        return []

    listed: list[Path] = []
    for entry in os.getenv("AA_AGENT_COMMAND_DIRS", "").split(os.pathsep):
        if not entry:
            continue
        listed.append(Path(entry).expanduser())
    return listed
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _is_executable_file(path: Path) -> bool:
    """Return True when *path* is a regular file the current user may execute."""
    # exists() + X_OK alone also accepts directories, because the search bit
    # on a directory satisfies os.X_OK.
    return path.is_file() and os.access(path, os.X_OK)


def resolve_command(command: str | None) -> str | None:
    """Resolve an agent command name or path to something that can be run.

    Lookup order:
      1. A command containing a path separator is treated as a path and is
         never searched for elsewhere.
      2. Directories from ``explicit_command_dirs()``.
      3. The regular ``PATH`` via ``shutil.which`` (the bare name is returned).
      4. Directories from ``common_command_dirs()``.

    Args:
        command: Executable name or path; a leading ``~`` is expanded.

    Returns:
        The path or name to execute, or ``None`` when *command* is empty or
        no executable regular file was found.
    """
    if not command:
        return None

    # os.path.expanduser leaves a leading "./" intact. Round-tripping through
    # pathlib would collapse "./tool" to "tool" and silently turn an explicit
    # relative path into a PATH lookup.
    expanded = os.path.expanduser(command)

    if os.path.sep in expanded:
        path = Path(expanded)
        if not _is_executable_file(path):
            return None
        normalized = str(path)
        if os.path.sep not in normalized:
            # pathlib dropped the only separator ("./tool", "tool/"); put an
            # explicit "./" back so the result is still run as a path.
            normalized = os.path.join(os.curdir, normalized)
        return normalized

    for directory in explicit_command_dirs():
        candidate = directory / expanded
        if _is_executable_file(candidate):
            return str(candidate)

    # When the command is on PATH, the bare name is returned, not the full path.
    if shutil.which(expanded):
        return expanded

    for directory in common_command_dirs():
        candidate = directory / expanded
        if _is_executable_file(candidate):
            return str(candidate)
    return None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def resolve_agent_command(agent_id: str, configured_command: str | None = None) -> tuple[str | None, str | None]:
    """Find the first usable command for *agent_id*.

    The configured command (if any) is tried first, then the default names
    registered for the agent in ``AGENT_COMMAND_CANDIDATES``.

    Returns:
        ``(name, resolved)`` for the first name that ``resolve_command``
        resolves. When nothing resolves, ``(first_name_tried, None)``, or
        ``(None, None)`` if there was nothing to try.
    """
    names: list[str] = []
    if configured_command:
        names.append(configured_command)
    names += AGENT_COMMAND_CANDIDATES.get(agent_id, [])

    for name in names:
        if name:
            location = resolve_command(name)
            if location:
                return name, location

    fallback = names[0] if names else None
    return fallback, None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def gui_app_hint(agent_id: str) -> str | None:
    """Return a hint when an agent's GUI app bundle exists at its known path.

    Only ``"cursor"`` and ``"claude-code"`` have a known bundle location under
    ``/Applications``; any other id, or a missing bundle, yields ``None``.
    """
    known_bundles = {
        "cursor": Path("/Applications/Cursor.app"),
        "claude-code": Path("/Applications/Claude.app"),
    }
    bundle = known_bundles.get(agent_id)
    if bundle is None or not bundle.exists():
        return None
    return f"{bundle} is installed, but the headless CLI command was not found."
|