agent-apprenticeship 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +217 -0
- package/bin/agent-apprenticeship.js +131 -0
- package/package.json +30 -0
- package/pyproject.toml +23 -0
- package/src/agent_apprenticeship_trace/__init__.py +2 -0
- package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
- package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
- package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
- package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
- package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
- package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
- package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
- package/src/agent_apprenticeship_trace/certification.py +580 -0
- package/src/agent_apprenticeship_trace/cli.py +2979 -0
- package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
- package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
- package/src/agent_apprenticeship_trace/config.py +609 -0
- package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
- package/src/agent_apprenticeship_trace/env.py +46 -0
- package/src/agent_apprenticeship_trace/evaluator.py +64 -0
- package/src/agent_apprenticeship_trace/grader.py +194 -0
- package/src/agent_apprenticeship_trace/integration_status.py +193 -0
- package/src/agent_apprenticeship_trace/io.py +20 -0
- package/src/agent_apprenticeship_trace/learning.py +627 -0
- package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
- package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
- package/src/agent_apprenticeship_trace/loop.py +111 -0
- package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
- package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
- package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
- package/src/agent_apprenticeship_trace/progress.py +223 -0
- package/src/agent_apprenticeship_trace/public_run.py +1109 -0
- package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
- package/src/agent_apprenticeship_trace/recipes.py +129 -0
- package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
- package/src/agent_apprenticeship_trace/revision.py +21 -0
- package/src/agent_apprenticeship_trace/role_runners.py +7 -0
- package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
- package/src/agent_apprenticeship_trace/schemas.py +273 -0
- package/src/agent_apprenticeship_trace/session_events.py +99 -0
- package/src/agent_apprenticeship_trace/task_intake.py +112 -0
- package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
- package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
- package/src/agent_apprenticeship_trace/training_signals.py +30 -0
- package/src/agent_apprenticeship_trace/validation.py +210 -0
- package/src/agent_apprenticeship_trace/verifier.py +55 -0
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Iterable
|
|
8
|
+
|
|
9
|
+
from .config import Settings
|
|
10
|
+
from .io import read_json, write_json
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _now() -> str:
|
|
14
|
+
return datetime.now(timezone.utc).isoformat()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _latest_package(run_root: Path) -> Path | None:
|
|
18
|
+
packages = run_root / "packages"
|
|
19
|
+
if not packages.exists():
|
|
20
|
+
return None
|
|
21
|
+
candidates = [p for p in packages.iterdir() if p.is_dir()]
|
|
22
|
+
return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _attempt_summary(pkg: Path | None) -> dict[str, Any]:
|
|
26
|
+
if not pkg:
|
|
27
|
+
return {"attempts": 0, "traced_steps": 0, "artifacts": []}
|
|
28
|
+
attempts = sorted((pkg / "attempts").glob("*"))
|
|
29
|
+
traced_steps = 0
|
|
30
|
+
artifacts: list[str] = []
|
|
31
|
+
for attempt in attempts:
|
|
32
|
+
trace = attempt / "agent_trace.json"
|
|
33
|
+
if trace.exists():
|
|
34
|
+
try:
|
|
35
|
+
traced_steps += len(read_json(trace).get("steps") or [])
|
|
36
|
+
except Exception:
|
|
37
|
+
pass
|
|
38
|
+
art_dir = attempt / "artifacts"
|
|
39
|
+
if art_dir.exists():
|
|
40
|
+
for path in art_dir.rglob("*"):
|
|
41
|
+
if path.is_file():
|
|
42
|
+
artifacts.append(str(path.relative_to(pkg)))
|
|
43
|
+
return {"attempts": len(attempts), "traced_steps": traced_steps, "artifacts": artifacts[:50]}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _truthy_env(name: str) -> bool:
|
|
47
|
+
return os.getenv(name, "").strip().lower() in {"1", "true", "yes", "on"}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _load_checkpoint_inputs() -> dict[str, Any]:
|
|
51
|
+
raw = os.getenv("AA_MENTOR_CHECKPOINT_INPUTS_JSON")
|
|
52
|
+
if not raw:
|
|
53
|
+
return {}
|
|
54
|
+
try:
|
|
55
|
+
parsed = json.loads(raw)
|
|
56
|
+
except json.JSONDecodeError:
|
|
57
|
+
return {}
|
|
58
|
+
return parsed if isinstance(parsed, dict) else {}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _prompt(label: str, default: str = "") -> str:
|
|
62
|
+
suffix = f" [{default}]" if default else ""
|
|
63
|
+
try:
|
|
64
|
+
value = input(f"{label}{suffix}: ").strip()
|
|
65
|
+
except EOFError:
|
|
66
|
+
return default
|
|
67
|
+
return value if value else default
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _split_csv(value: str) -> list[str]:
|
|
71
|
+
return [part.strip() for part in value.split(",") if part.strip()]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _parse_score(value: str, default: float) -> float:
|
|
75
|
+
try:
|
|
76
|
+
score = float(value)
|
|
77
|
+
except ValueError:
|
|
78
|
+
return default
|
|
79
|
+
if score > 1:
|
|
80
|
+
score = score / 100
|
|
81
|
+
return max(0.0, min(1.0, score))
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
APPROVE_WORDS = {"accept", "approve", "approved", "yes", "y", "ok", "okay"}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _is_approve(value: str) -> bool:
|
|
88
|
+
return value.strip().lower() in APPROVE_WORDS
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _is_edit(value: str) -> bool:
|
|
92
|
+
return value.strip().lower().startswith("edit")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _stage_set(stages: Iterable[str] | None) -> set[str]:
|
|
96
|
+
return set(stages or {"task_intake", "rubric", "evaluation", "revision", "final_approval"})
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _collect_interactive_inputs(
    mode: str,
    session: dict[str, Any],
    status: dict[str, Any],
    summary: dict[str, Any],
    stages: set[str],
) -> dict[str, Any]:
    """Prompt on stdin for each enabled mentor checkpoint and collect the answers.

    Args:
        mode: Mentor mode; "expert_led" selects the "Expert-led" banner label,
            any other value the "Hybrid" label.
        session: Session metadata. Accepted so the signature matches
            ``_checkpoint_inputs``; it is not read here.
        status: Run status. ``task_status`` seeds the pass/fail default and
            ``latest_message`` seeds the default feedback text.
        summary: Attempt summary, stored unchanged in the evaluation section.
        stages: Names of the checkpoints to prompt for.

    Returns:
        A dict with one entry per prompted stage ("task_intake", "rubric",
        "evaluation", "revision", "final_approval"); stages not in *stages*
        are absent.
    """
    label = "Expert-led" if mode == "expert_led" else "Hybrid"
    default_passed = "pass" if status.get("task_status") == "completed" else "fail"
    collected: dict[str, Any] = {}

    if "task_intake" in stages:
        print(f"{label} checkpoint: task intake")
        intake_decision = _prompt("Approve or edit task intake", "approve")
        if _is_edit(intake_decision):
            edited_title = _prompt("Edited title, optional", "")
            intake_notes = _prompt("Task intake notes, optional", "")
            decision = "edited"
        else:
            # Anything that is not an edit counts as approval; an unrecognised
            # answer is preserved in the notes so it is not lost.
            edited_title = ""
            intake_notes = "Approved as-is." if _is_approve(intake_decision) else f"Approved as-is. Input: {intake_decision}"
            decision = "approved"
        collected["task_intake"] = {
            "decision": decision,
            "edited_title": edited_title or None,
            "notes": intake_notes or "Approved as-is.",
        }

    if "rubric" in stages:
        print(f"{label} checkpoint: rubric")
        rubric_decision = _prompt("Approve or edit rubric", "approve")
        if _is_edit(rubric_decision):
            additional_criterion = _prompt("Add rubric criterion, optional", "")
            rubric_notes = _prompt("Rubric notes, optional", "")
            rubric_choice = "edited"
        else:
            additional_criterion = ""
            rubric_notes = "Rubric approved." if _is_approve(rubric_decision) else f"Rubric approved. Input: {rubric_decision}"
            rubric_choice = "approved"
        collected["rubric"] = {
            "decision": rubric_choice,
            "additional_criteria": [additional_criterion] if additional_criterion else [],
            "notes": rubric_notes or "Rubric approved.",
        }

    if "evaluation" in stages:
        print(f"{label} checkpoint: evaluation/verifier")
        passed_raw = _prompt("Pass or fail", default_passed).strip().lower()
        # Accept the same approval vocabulary as the other checkpoints (so
        # "accept" and "okay" pass too), plus the literal "true".
        passed = passed_raw.startswith("pass") or passed_raw == "true" or _is_approve(passed_raw)
        # The score default follows the expert's verdict, not the run status,
        # so overriding to "fail" and pressing Enter cannot record a 1.0.
        fallback_score = 1.0 if passed else 0.0
        score = _parse_score(_prompt("Score 0-1 or 0-100", str(fallback_score)), fallback_score)
        feedback = _prompt("Expert feedback", status.get("latest_message") or "Reviewed artifacts and trace summary.")
        failed_criteria = [] if passed else _split_csv(_prompt("Failed criteria, comma-separated optional", ""))
        collected["evaluation"] = {
            "passed": passed,
            "score": score,
            "failed_criteria": failed_criteria,
            "feedback": feedback,
            "attempt_summary": summary,
        }

    if "revision" in stages:
        print(f"{label} checkpoint: revision")
        revision_decision = _prompt("Revise or finish", "finish").strip().lower()
        revision_notes = _prompt("Revision notes, optional", "")
        wants_revision = revision_decision.startswith("revise")
        collected["revision"] = {
            "revision_should_run": wants_revision,
            "decision": "revise" if wants_revision else "finish",
            "notes": revision_notes or ("Revision requested by expert." if wants_revision else "Expert chose to finish."),
        }

    if "final_approval" in stages:
        print(f"{label} checkpoint: final bundle approval")
        final_decision = _prompt("Approve final Contribution Bundle or hold", "approve").strip().lower()
        final_notes = _prompt("Final approval notes, optional", "")
        approved = _is_approve(final_decision) or final_decision.startswith("approve")
        collected["final_approval"] = {
            "approved_for_local_bundle": approved,
            "decision": "approved" if approved else "held",
            "notes": final_notes or "Final Contribution Bundle checkpoint completed.",
        }

    return collected
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _checkpoint_inputs(
    mode: str,
    session: dict[str, Any],
    status: dict[str, Any],
    summary: dict[str, Any],
    stages: set[str],
) -> dict[str, Any]:
    """Resolve checkpoint answers from the environment or from interactive prompts.

    Pre-supplied answers from ``_load_checkpoint_inputs`` win when non-empty.
    Otherwise prompts are shown only if ``AA_MENTOR_INTERACTIVE_CHECKPOINTS``
    is truthy. With neither source available an empty dict is returned.
    """
    preset = _load_checkpoint_inputs()
    if preset:
        return preset
    if not _truthy_env("AA_MENTOR_INTERACTIVE_CHECKPOINTS"):
        return {}
    return _collect_interactive_inputs(mode, session, status, summary, stages)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _section(inputs: dict[str, Any], name: str) -> dict[str, Any]:
|
|
198
|
+
value = inputs.get(name)
|
|
199
|
+
return value if isinstance(value, dict) else {}
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _write_checkpoint_history(path: Path, payload: dict[str, Any]) -> None:
    """Store a timestamped copy of a checkpoint payload under ``history/``.

    Nothing is written unless *path* is named ``*_checkpoint.json``. The copy
    goes to ``<path.parent>/history/<timestamp>_<stage>.json``, where the
    timestamp is the payload's ``created_at`` (or the current time) with every
    non-alphanumeric character replaced by "-". If that name is taken, a
    numeric suffix starting at ``_2`` is appended until a free name is found.
    """
    if not path.name.endswith("_checkpoint.json"):
        return

    history_dir = path.parent / "history"
    history_dir.mkdir(parents=True, exist_ok=True)

    stamp = str(payload.get("created_at") or _now())
    slug = "".join(ch if ch.isalnum() else "-" for ch in stamp).strip("-")
    stage = path.stem.removesuffix("_checkpoint")
    base = f"{slug}_{stage}"

    target = history_dir / f"{base}.json"
    suffix = 2
    while target.exists():
        target = history_dir / f"{base}_{suffix}.json"
        suffix += 1
    write_json(target, payload)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _write_checkpoint(path: Path, payload: dict[str, Any], *, preserve_interactive: bool = True) -> None:
    """Write a checkpoint file without clobbering an interactive one.

    A history copy of *payload* is always recorded first. The main file is
    then written unless all of the following hold: *preserve_interactive* is
    true, *payload* itself is not marked ``"interactive"``, and the existing
    file at *path* is marked ``"interactive"``.

    Args:
        path: Destination checkpoint file.
        payload: Checkpoint content; ``checkpoint_input_source`` is inspected.
        preserve_interactive: When true, an existing interactive checkpoint is
            kept in preference to a non-interactive payload.
    """
    _write_checkpoint_history(path, payload)
    incoming_is_interactive = payload.get("checkpoint_input_source") == "interactive"
    if preserve_interactive and not incoming_is_interactive and path.exists():
        try:
            existing = read_json(path)
        except Exception:
            # Unreadable existing file: nothing worth preserving, overwrite it.
            existing = None
        # The isinstance guard covers a file holding valid JSON that is not an
        # object (e.g. a list), which would otherwise fail on .get().
        if isinstance(existing, dict) and existing.get("checkpoint_input_source") == "interactive":
            return
    write_json(path, payload)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def write_mentor_checkpoints(
    run_root: Path,
    settings: Settings,
    *,
    auto_approve: bool = False,
    human_approved: bool = True,
    stages: Iterable[str] | None = None,
    preserve_interactive: bool = True,
) -> Path | None:
    """Write the mentor checkpoint files for a run.

    One ``<stage>_checkpoint.json`` file is written under
    ``run_root / "mentor_checkpoints"`` for every enabled stage, plus a
    ``README.md``. Answers come from ``_checkpoint_inputs``; any field it does
    not supply falls back to a value derived from ``run_status.json`` /
    ``session.json`` or to a fixed default.

    Args:
        run_root: Run directory containing ``session.json``,
            ``run_status.json`` and ``packages/``.
        settings: Settings object; only ``mentor_mode`` is read.
        auto_approve: Recorded as ``auto_approved`` and used to pick default
            notes and the ``"auto_approve"`` input-source label.
        human_approved: Recorded as ``human_approved``.
        stages: Stages to write; ``None`` or empty means all stages.
        preserve_interactive: Passed through to ``_write_checkpoint``.

    Returns:
        The ``mentor_checkpoints`` directory, or ``None`` when
        ``settings.mentor_mode`` is neither "expert_led" nor "hybrid" (in which
        case nothing is written).
    """
    mode = settings.mentor_mode
    if mode not in {"expert_led", "hybrid"}:
        return None
    enabled_stages = _stage_set(stages)
    pkg = _latest_package(run_root)

    def _load_dict(filename: str) -> dict[str, Any]:
        # A missing file or non-object JSON yields {} so the .get() calls
        # below cannot fail on an unexpected document shape.
        target = run_root / filename
        loaded = read_json(target) if target.exists() else {}
        return loaded if isinstance(loaded, dict) else {}

    session = _load_dict("session.json")
    status = _load_dict("run_status.json")

    root = run_root / "mentor_checkpoints"
    root.mkdir(parents=True, exist_ok=True)
    common = {
        "mentor_mode": mode,
        "source": "human_expert" if mode == "expert_led" else "hybrid_human_approval",
        "auto_approved": bool(auto_approve),
        "human_approved": bool(human_approved),
        "model_draft_source": "mentor_model_provider" if mode == "hybrid" else None,
        "run_id": session.get("run_id") or status.get("run_id"),
        "task_id": session.get("task_id"),
    }
    summary = _attempt_summary(pkg)
    inputs = _checkpoint_inputs(mode, session, status, summary, enabled_stages)
    intake_input = _section(inputs, "task_intake")
    rubric_input = _section(inputs, "rubric")
    evaluation_input = _section(inputs, "evaluation")
    revision_input = _section(inputs, "revision")
    final_input = _section(inputs, "final_approval")
    # Any non-empty result from _checkpoint_inputs is labelled "interactive".
    input_source = "interactive" if inputs else ("auto_approve" if auto_approve else "recorded")

    def _emit(stage: str, checkpoint_type: str, fields: dict[str, Any]) -> None:
        # Shared envelope for every checkpoint; key order matches the order
        # used when each payload was spelled out by hand.
        if stage not in enabled_stages:
            return
        _write_checkpoint(
            root / f"{stage}_checkpoint.json",
            {
                **common,
                "created_at": _now(),
                "checkpoint_type": checkpoint_type,
                **fields,
                "checkpoint_input_source": input_source,
            },
            preserve_interactive=preserve_interactive,
        )

    _emit(
        "task_intake",
        "task_intake",
        {
            "title": intake_input.get("edited_title") or status.get("task_title"),
            "instruction": session.get("task_instruction"),
            "decision": intake_input.get("decision") or "approved",
            "notes": intake_input.get("notes") or ("Approved as-is." if auto_approve else "Human expert checkpoint recorded."),
        },
    )

    _emit(
        "rubric",
        "rubric",
        {
            "rubric_refs": [
                "rubric/worker_visible_rubric.md",
                "rubric/rubric.json",
            ],
            "decision": rubric_input.get("decision") or "approved",
            "additional_criteria": rubric_input.get("additional_criteria") or [],
            "notes": rubric_input.get("notes") or ("Default/model-drafted rubric approved." if auto_approve else "Human expert rubric checkpoint recorded."),
        },
    )

    default_passed = status.get("task_status") == "completed"
    passed = evaluation_input.get("passed", default_passed)
    _emit(
        "evaluation",
        "evaluation_verifier",
        {
            "task_status": status.get("task_status"),
            "run_status": status.get("run_status"),
            "score": evaluation_input.get("score", 1.0 if passed else 0.0),
            "passed": passed,
            "failed_criteria": evaluation_input.get("failed_criteria") or ([] if passed else ["task_status_not_completed"]),
            "feedback": evaluation_input.get("feedback") or status.get("last_operational_error") or status.get("latest_message") or "Human checkpoint recorded.",
            "attempt_summary": evaluation_input.get("attempt_summary") or summary,
        },
    )

    # When only the flag is supplied, derive the decision from it so the two
    # fields can never contradict each other.
    revision_should_run = bool(revision_input.get("revision_should_run", False))
    _emit(
        "revision",
        "revision_decision",
        {
            "revision_should_run": revision_should_run,
            "decision": revision_input.get("decision") or ("revise" if revision_should_run else "finish"),
            "notes": revision_input.get("notes") or "No additional human-requested revision recorded by this checkpoint.",
        },
    )

    approved_for_local_bundle = final_input.get("approved_for_local_bundle", True)
    _emit(
        "final_approval",
        "final_approval",
        {
            "approved_for_local_bundle": approved_for_local_bundle,
            "contribution_bundle_path": status.get("contribution_bundle_path"),
            "decision": final_input.get("decision") or ("approved" if approved_for_local_bundle else "held"),
            "notes": final_input.get("notes") or "Final Contribution Bundle checkpoint recorded.",
        },
    )

    (root / "README.md").write_text(
        "# Mentor Checkpoints\n\n"
        "These files record expert-led or hybrid human approval checkpoints for the apprenticeship session. "
        "They are local review artifacts and do not contain raw provider secrets.\n",
        encoding="utf-8",
    )
    return root
|