agent-apprenticeship 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +217 -0
  3. package/bin/agent-apprenticeship.js +131 -0
  4. package/package.json +30 -0
  5. package/pyproject.toml +23 -0
  6. package/src/agent_apprenticeship_trace/__init__.py +2 -0
  7. package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
  8. package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
  9. package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
  10. package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
  11. package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
  12. package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
  13. package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
  14. package/src/agent_apprenticeship_trace/certification.py +580 -0
  15. package/src/agent_apprenticeship_trace/cli.py +2979 -0
  16. package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
  17. package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
  18. package/src/agent_apprenticeship_trace/config.py +609 -0
  19. package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
  20. package/src/agent_apprenticeship_trace/env.py +46 -0
  21. package/src/agent_apprenticeship_trace/evaluator.py +64 -0
  22. package/src/agent_apprenticeship_trace/grader.py +194 -0
  23. package/src/agent_apprenticeship_trace/integration_status.py +193 -0
  24. package/src/agent_apprenticeship_trace/io.py +20 -0
  25. package/src/agent_apprenticeship_trace/learning.py +627 -0
  26. package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
  27. package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
  28. package/src/agent_apprenticeship_trace/loop.py +111 -0
  29. package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
  30. package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
  31. package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
  32. package/src/agent_apprenticeship_trace/progress.py +223 -0
  33. package/src/agent_apprenticeship_trace/public_run.py +1109 -0
  34. package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
  35. package/src/agent_apprenticeship_trace/recipes.py +129 -0
  36. package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
  37. package/src/agent_apprenticeship_trace/revision.py +21 -0
  38. package/src/agent_apprenticeship_trace/role_runners.py +7 -0
  39. package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
  40. package/src/agent_apprenticeship_trace/schemas.py +273 -0
  41. package/src/agent_apprenticeship_trace/session_events.py +99 -0
  42. package/src/agent_apprenticeship_trace/task_intake.py +112 -0
  43. package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
  44. package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
  45. package/src/agent_apprenticeship_trace/training_signals.py +30 -0
  46. package/src/agent_apprenticeship_trace/validation.py +210 -0
  47. package/src/agent_apprenticeship_trace/verifier.py +55 -0
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+ import hashlib, json, re, shutil
3
+ from pathlib import Path
4
+ from typing import Any
5
+ from .env import redact_secrets, contains_secret
6
+ from .io import read_jsonl, append_jsonl, write_json
7
+
8
# Identifier, version and publication status strings for the worker prompt
# template. None of the functions visible in this module read them.
PROMPT_TEMPLATE_ID='agent_apprenticeship_trace_worker_v0'
PROMPT_TEMPLATE_VERSION='0.1'
PROMPT_PUBLICATION_STATUS='template_available_in_codebase'
# Empty placeholder set; not referenced by the functions visible in this module.
RAW_LOG_NAMES=set()
# Controller-loop setting names. _sanitize_obj drops dict entries with these
# keys, and sanitize_public_text / has_prompt_leak treat the same names as
# substrings to replace or detect in free text.
CONTROLLER_TELEMETRY_KEYS={
    'max_iterations','actual_iterations','max_revision_iterations','actual_revision_iterations',
    'stop_on_verifier_pass','stop_on_score_threshold','stop_on_no_improvement',
    'stop_on_provider_limit','stop_on_timeout'
}
# List copy of the set above; iteration order therefore follows set ordering.
CONTROLLER_TELEMETRY_PATTERNS=list(CONTROLLER_TELEMETRY_KEYS)
# Dict keys that _sanitize_obj removes outright from public rows.
PUBLIC_OMIT_KEYS={
    'source_url_or_ref','source_kind','source_url','source_ref','source_license',
    'expected_pay','expected_apprentice_pay',
    'evaluation_mode','data_sharing_level','sensitive_info_masking',
}
# Matches absolute paths rooted at /Users, /home, /private or /tmp, up to the
# next whitespace or quote character.
_ABS_LOCAL_PATH_RE=re.compile(r"(/Users/[^\s\"\']+|/home/[^\s\"\']+|/private/[^\s\"\']+|/tmp/[^\s\"\']+)")
# Case-insensitive match for provider usage-limit / rate-limit / quota wording.
_USAGE_RE=re.compile(r"(you['’]?ve hit your usage limit|usage limit|rate limit|quota)", re.I)
# (internal name, public name) substring pairs applied in list order by
# sanitize_public_text. Longer names come before the shorter names they
# contain (e.g. "source_url_or_ref" before "source_url"), so the longer match
# is rewritten first.
# NOTE(review): "data_sharing_level" is rewritten to "sensitive_info_masking",
# which is itself listed in PUBLIC_OMIT_KEYS — confirm this is intended.
PUBLIC_TEXT_REPLACEMENTS=[
    ("source_url_or_ref", "reference_or_context"),
    ("public_source_urls", "public_reference_links"),
    ("public_source_url", "public_reference_link"),
    ("source_urls", "reference_links"),
    ("source_url", "reference_link"),
    ("source_ref", "reference_id"),
    ("source_kind", "reference_kind"),
    ("source_license", "reference_license"),
    ("expected_apprentice_pay", "expected_economic_value_for_agent_apprentice"),
    ("expected_pay", "expected_economic_value"),
    ("data_sharing_level", "sensitive_info_masking"),
    ("evaluation_mode", "mentor_mode"),
    ("worker_attempt", "apprentice_attempt"),
    ("worker_agent", "apprentice_agent"),
]
41
+
42
def sha256_text(text: str | None) -> str | None:
    """Return the SHA-256 hex digest of *text*, prefixed with ``sha256:``.

    Falsy input (``None`` or the empty string) yields ``None``. The text is
    encoded with ``str.encode()``'s default encoding before hashing.
    """
    if text:
        digest = hashlib.sha256(text.encode()).hexdigest()
        return f'sha256:{digest}'
    return None
46
+
47
def classify_provider_failure(text: str | None) -> dict[str, Any]:
    """Classify provider error text as a usage-limit failure, if it is one.

    Returns a fixed descriptor dict when ``_USAGE_RE`` matches anywhere in
    *text* (``None`` is treated as the empty string); otherwise returns an
    empty dict.
    """
    if _USAGE_RE.search(text or '') is None:
        return {}
    return {
        'provider_failure_type': 'usage_limit',
        'fallback_reason': 'provider_usage_limit',
        'error_type': 'ProviderUsageLimit',
        'retryable': True,
        'provider': 'openai',
        'runner_backend': 'codex_cli',
        'failure_owner': 'provider_or_quota',
        'should_retry_after': None,
    }
52
+
53
def redact_internal_prompt_blocks(text: str | None) -> str | None:
    """Redact secrets from prompt text while leaving the prompt itself intact.

    Public releases keep prompts as research context, so the only processing
    is ``redact_secrets``. ``None`` passes through unchanged.
    """
    return None if text is None else redact_secrets(text)
58
+
59
def public_error_summary(error_text: str | None) -> str | None:
    """Return a publishable version of *error_text*.

    Falsy input (``None`` or ``''``) yields ``None``; anything else is passed
    through ``redact_internal_prompt_blocks``.
    """
    if error_text:
        return redact_internal_prompt_blocks(error_text)
    return None
63
+
64
def sanitize_public_text(text: str | None, prompt_text: str | None=None) -> str | None:
    """Prepare a free-text value for the public release.

    Steps, in order: redact secrets, rewrite internal field names to their
    public names (both as written and upper-cased), replace controller
    setting names with a placeholder, then shorten or mask absolute local
    paths. ``None`` passes through unchanged. *prompt_text* is accepted but
    not used by this function.
    """
    if text is None:
        return None

    def _shorten_local_path(match):
        # Keep the tail after '/attempts/' (re-prefixed with 'attempts/') or
        # after '/packages/' (no prefix); '/attempts/' wins when both occur.
        found = match.group(0)
        if '/attempts/' in found:
            return 'attempts/' + found.split('/attempts/', 1)[1]
        if '/packages/' in found:
            return found.split('/packages/', 1)[1]
        return '[local path omitted]'

    cleaned = redact_secrets(text)
    for internal_name, public_name in PUBLIC_TEXT_REPLACEMENTS:
        cleaned = cleaned.replace(internal_name, public_name)
        cleaned = cleaned.replace(internal_name.upper(), public_name.upper())
    for setting_name in CONTROLLER_TELEMETRY_PATTERNS:
        cleaned = cleaned.replace(setting_name, '[internal controller setting omitted]')
    return _ABS_LOCAL_PATH_RE.sub(_shorten_local_path, cleaned)
81
+
82
def _sanitize_obj(obj: Any, prompt_text: str | None=None) -> Any:
    """Recursively sanitize a JSON-like value for public export.

    Lists are sanitized element-wise, strings go through
    ``sanitize_public_text``, and dicts lose every entry whose key is a
    controller-telemetry key, a public-omit key, or starts with ``stop_on_``.
    Any other value is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            key: _sanitize_obj(value, prompt_text)
            for key, value in obj.items()
            if not (
                key in CONTROLLER_TELEMETRY_KEYS
                or key in PUBLIC_OMIT_KEYS
                or key.startswith('stop_on_')
            )
        }
    if isinstance(obj, list):
        return [_sanitize_obj(item, prompt_text) for item in obj]
    if isinstance(obj, str):
        return sanitize_public_text(obj, prompt_text)
    return obj
93
+
94
def sanitize_public_obj(obj: dict[str, Any], prompt_text: str | None=None) -> dict[str, Any]:
    """Public entry point for sanitizing one record; delegates to ``_sanitize_obj``."""
    sanitized = _sanitize_obj(obj, prompt_text)
    return sanitized
96
+
97
def has_prompt_leak(root: Path) -> bool:
    """Scan files under *root* for secrets or controller-setting names.

    The name is kept for compatibility; the check covers only secrets
    (``contains_secret``) and the controller telemetry patterns. Files of
    5,000,000 bytes or more are skipped. Returns ``True`` on the first hit,
    ``False`` when nothing matches.
    """
    size_cap = 5_000_000
    for candidate in root.rglob('*'):
        if not candidate.is_file() or candidate.stat().st_size >= size_cap:
            continue
        content = candidate.read_text(errors='ignore')
        if contains_secret(content) or any(pat in content for pat in CONTROLLER_TELEMETRY_PATTERNS):
            return True
    return False
107
+
108
def create_public_release(release_root: Path) -> Path:
    """Build the sanitized ``public/`` tree inside *release_root*.

    Any existing ``public/`` directory is removed first. Each known JSONL file
    is rewritten row by row through ``sanitize_public_obj``; the manifest and
    quality report are sanitized and tagged ``public_sanitized``; the dataset
    card has secrets redacted; the artifacts index is sanitized entry by
    entry. A ``private_debug_manifest.json`` listing per-attempt log files is
    written to *release_root* itself, not to ``public/``.

    Returns the path of the ``public/`` directory.
    """
    public_dir = release_root/'public'
    if public_dir.exists():
        shutil.rmtree(public_dir)
    public_dir.mkdir(parents=True, exist_ok=True)

    jsonl_names = ['full_task_records.jsonl','tasks.jsonl','task_intake_specs.jsonl','rubrics.jsonl','rubric_items.jsonl','raw_agent_traces.jsonl','agent_traces.jsonl','trace_normalization_reports.jsonl','actual_outputs_normalization_reports.jsonl','actual_outputs.jsonl','grader_results.jsonl','verifier_results.jsonl','evaluator_feedback.jsonl','revision_plans.jsonl','hillclimb_results.jsonl','lessons.jsonl','training_signals.jsonl','process_supervision.jsonl','reward_modeling.jsonl','verifier_training.jsonl','revision_preference_pairs.jsonl','role_results_index.jsonl','packages_index.jsonl','forsy_like_collections.jsonl']
    for jsonl_name in jsonl_names:
        origin = release_root/jsonl_name
        target = public_dir/jsonl_name
        # Create the target even when the source yields no rows.
        target.write_text('')
        for record in read_jsonl(origin):
            append_jsonl(target, sanitize_public_obj(record))

    for json_name in ['dataset_manifest.json','quality_report.json']:
        origin = release_root/json_name
        if origin.exists():
            payload = json.loads(origin.read_text() or '{}')
        else:
            payload = {}
        payload['public_sanitized'] = True
        write_json(public_dir/json_name, sanitize_public_obj(payload))

    card = release_root/'dataset_card.md'
    if card.exists():
        (public_dir/'dataset_card.md').write_text(redact_internal_prompt_blocks(card.read_text()) or '')

    public_artifacts = []
    index_path = release_root/'artifacts_index.json'
    if index_path.exists():
        for entry in json.loads(index_path.read_text() or '[]'):
            public_artifacts.append(sanitize_public_obj(entry))
    write_json(public_dir/'artifacts_index.json', public_artifacts)

    def _rel_if_present(path):
        # Release-relative path string, or None when the file is absent.
        return str(path.relative_to(release_root)) if path.exists() else None

    debug_entries = []
    packages_dir = release_root/'packages'
    if packages_dir.exists():
        for attempt_dir in packages_dir.glob('*/attempts/*'):
            debug_entries.append({
                'attempt_dir_internal': str(attempt_dir.relative_to(release_root)),
                'prompt_ref_internal': _rel_if_present(attempt_dir/'prompt.md'),
                'stdout_ref_internal': _rel_if_present(attempt_dir/'stdout.txt'),
                'stderr_ref_internal': _rel_if_present(attempt_dir/'stderr.txt'),
                'final_message_ref_internal': _rel_if_present(attempt_dir/'final_message.txt'),
            })
    write_json(release_root/'private_debug_manifest.json', {'private_debug_artifacts': debug_entries})
    return public_dir
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+
3
+ from pydantic import BaseModel, ConfigDict, Field
4
+
5
+
6
# Pydantic model describing how one worker-agent CLI is driven. Apart from
# agent_id / command_name, the fields are free-text descriptions, as the
# recipe tables in this module show.
class RunnerRecipe(BaseModel):
    # Reject any field that is not declared below.
    model_config = ConfigDict(extra="forbid")

    # Recipe key; matches the dict key in the recipe tables of this module.
    agent_id: str
    # Human-readable agent name.
    display_name: str
    # Executable name of the agent CLI (e.g. "codex", "claude").
    command_name: str
    # Description of how the prompt is handed to the CLI.
    prompt_mode: str
    # Description of how the attempt workspace is passed or selected.
    workspace_mode: str
    # Description of where/how the agent is allowed to write.
    writes_mode: str
    # Descriptive defaults shared by every recipe unless overridden.
    stdout_stderr_capture: str = "capture subprocess stdout/stderr into attempt logs"
    success_detection: str = "exit code 0 plus valid agent_trace.json and actual_outputs.json"
    # default_factory gives each instance its own list.
    expected_output_contract: list[str] = Field(
        default_factory=lambda: ["agent_trace.json", "actual_outputs.json", "artifacts/"]
    )
    # Optional free-text remark about the adapter.
    notes: str | None = None
21
+
22
+
23
# Pydantic model describing one model provider: which environment variable
# holds its API key, its default model name and its endpoint kind.
class ModelProviderRecipe(BaseModel):
    # Reject any field that is not declared below.
    model_config = ConfigDict(extra="forbid")

    # Recipe key; matches the dict key in the provider tables of this module.
    provider_id: str
    # Human-readable provider name.
    display_name: str
    # Name of the environment variable expected to hold the API key.
    api_key_env_var: str
    # Model name used when none is configured — presumably; confirm at call sites.
    default_model: str
    # Endpoint style tag, e.g. "native", "openai_compatible", "planned".
    endpoint_kind: str
    # Optional free-text remark.
    notes: str | None = None
32
+
33
+
34
# Worker-agent recipes keyed by agent id. worker_agent_ids() returns these
# keys in insertion order, so the order of entries here is observable.
WORKER_AGENT_RECIPES: dict[str, RunnerRecipe] = {
    "codex": RunnerRecipe(
        agent_id="codex",
        display_name="Codex",
        command_name="codex",
        prompt_mode="codex exec --cd <workspace> --sandbox <sandbox> [--skip-git-repo-check] <prompt>",
        workspace_mode="pass the prepared attempt directory with --cd",
        writes_mode="workspace-write sandbox; write deliverables under ./artifacts/",
        notes="Implemented runner.",
    ),
    "cursor": RunnerRecipe(
        agent_id="cursor",
        display_name="Cursor",
        command_name="cursor-agent",
        prompt_mode="cursor-agent headless mode with --prompt-file, --prompt, -p, or run when supported",
        workspace_mode="pass workspace flags when supported, otherwise run from the prepared attempt directory",
        writes_mode="write deliverables under ./artifacts/",
        notes="Headless adapter with CLI capability detection.",
    ),
    "claude-code": RunnerRecipe(
        agent_id="claude-code",
        display_name="Claude Code",
        command_name="claude",
        prompt_mode="claude -p <prompt> or claude --print <prompt>",
        workspace_mode="run from the prepared attempt directory",
        writes_mode="write deliverables under ./artifacts/",
        notes="Headless adapter with auth/setup failure classification.",
    ),
    "openclaw": RunnerRecipe(
        agent_id="openclaw",
        display_name="OpenClaw",
        command_name="openclaw",
        prompt_mode="openclaw run/exec/session with prompt-file or prompt when supported",
        workspace_mode="pass workspace flags when supported, otherwise run from the prepared attempt directory",
        writes_mode="write deliverables under ./artifacts/",
        notes="Diagnosable headless adapter; reports setup-required or headless-unavailable when needed.",
    ),
    "opencode": RunnerRecipe(
        agent_id="opencode",
        display_name="OpenCode",
        command_name="opencode",
        prompt_mode="opencode run <prompt> or opencode run --prompt-file <prompt_file>",
        workspace_mode="pass workspace flags when supported, otherwise run from the prepared attempt directory",
        writes_mode="write deliverables under ./artifacts/",
        notes="Headless adapter with provider setup failure classification.",
    ),
    "hermes-agent": RunnerRecipe(
        agent_id="hermes-agent",
        display_name="Hermes Agent",
        command_name="hermes",
        prompt_mode="hermes run/chat with prompt-file or prompt when supported",
        workspace_mode="pass workspace flags when supported, otherwise run from the prepared attempt directory",
        writes_mode="write deliverables under ./artifacts/",
        notes="Diagnosable headless adapter; reports setup-required or headless-unavailable when needed.",
    ),
    # Generic fallback: the command comes from a user-configured template.
    "custom": RunnerRecipe(
        agent_id="custom",
        display_name="Custom",
        command_name="custom-agent",
        prompt_mode="run the configured command template with {workspace} and {prompt_file}",
        workspace_mode="run command from the prepared attempt directory",
        writes_mode="allow writes inside the prepared workspace when configured",
        notes="Generic command-template runner.",
    ),
}
99
+
100
+
101
# Model-provider recipes keyed by provider id. model_provider_ids() returns
# these keys in insertion order.
MODEL_PROVIDER_RECIPES: dict[str, ModelProviderRecipe] = {
    "openai": ModelProviderRecipe(provider_id="openai", display_name="OpenAI", api_key_env_var="OPENAI_API_KEY", default_model="gpt-5-mini", endpoint_kind="native"),
    "anthropic": ModelProviderRecipe(provider_id="anthropic", display_name="Anthropic", api_key_env_var="ANTHROPIC_API_KEY", default_model="claude-sonnet-4-6", endpoint_kind="anthropic_messages"),
    "google": ModelProviderRecipe(provider_id="google", display_name="Google Gemini", api_key_env_var="GEMINI_API_KEY", default_model="gemini-2.5-flash", endpoint_kind="gemini_generate_content"),
    "openrouter": ModelProviderRecipe(provider_id="openrouter", display_name="OpenRouter", api_key_env_var="OPENROUTER_API_KEY", default_model="~openai/gpt-latest", endpoint_kind="openai_compatible"),
}


# Provider ids that are no longer offered; how callers use this tuple is not
# visible in this module.
REMOVED_V0_MODEL_PROVIDER_IDS: tuple[str, ...] = ("deepseek",)


# Placeholder recipes for agents without an adapter yet. They are kept out of
# WORKER_AGENT_RECIPES, so worker_agent_ids() does not list them.
PLANNED_WORKER_AGENT_RECIPES: dict[str, RunnerRecipe] = {
    "gemini": RunnerRecipe(agent_id="gemini", display_name="Gemini CLI", command_name="gemini", prompt_mode="planned headless adapter", workspace_mode="planned", writes_mode="planned"),
    "cline": RunnerRecipe(agent_id="cline", display_name="Cline", command_name="cline", prompt_mode="planned headless adapter", workspace_mode="planned", writes_mode="planned"),
}


# Placeholder recipes for providers marked endpoint_kind="planned". They are
# kept out of MODEL_PROVIDER_RECIPES, so model_provider_ids() does not list them.
PLANNED_MODEL_PROVIDER_RECIPES: dict[str, ModelProviderRecipe] = {
    "xai": ModelProviderRecipe(provider_id="xai", display_name="xAI", api_key_env_var="XAI_API_KEY", default_model="grok-3", endpoint_kind="planned"),
    "kimi-moonshot": ModelProviderRecipe(provider_id="kimi-moonshot", display_name="Kimi / Moonshot AI", api_key_env_var="MOONSHOT_API_KEY", default_model="moonshot-v1-auto", endpoint_kind="planned"),
}
122
+
123
+
124
def worker_agent_ids() -> list[str]:
    """Return the ids of all implemented worker agents, in table order."""
    return [agent_id for agent_id in WORKER_AGENT_RECIPES]
126
+
127
+
128
def model_provider_ids() -> list[str]:
    """Return the ids of all implemented model providers, in table order."""
    return [provider_id for provider_id in MODEL_PROVIDER_RECIPES]
@@ -0,0 +1,259 @@
1
+ from __future__ import annotations
2
+ import shutil
3
+ from pathlib import Path
4
+ from .io import read_json, write_json, append_jsonl, read_jsonl
5
+ from .public_sanitizer import create_public_release
6
+ from .package_exporter import IGNORED_RELEASE_DIR_NAMES, is_ignored_release_path, public_task_record
7
+ from .trace_normalizer import normalize_trace_for_export
8
+
9
# Top-level files of a release. create_release pre-creates each one (empty for
# .jsonl names, '[]' for .json names) and records this list in the manifest.
RELEASE_FILES=['full_task_records.jsonl','tasks.jsonl','task_intake_specs.jsonl','rubrics.jsonl','rubric_items.jsonl','raw_agent_traces.jsonl','agent_traces.jsonl','trace_normalization_reports.jsonl','actual_outputs_normalization_reports.jsonl','actual_outputs.jsonl','grader_results.jsonl','verifier_results.jsonl','evaluator_feedback.jsonl','revision_plans.jsonl','hillclimb_results.jsonl','lessons.jsonl','training_signals.jsonl','process_supervision.jsonl','reward_modeling.jsonl','verifier_training.jsonl','revision_preference_pairs.jsonl','role_results_index.jsonl','artifacts_index.json','packages_index.jsonl','forsy_like_collections.jsonl']
10
+
11
+
12
def _copy_ignore(dir_path: str, names: list[str]) -> set[str]:
    """``shutil.copytree`` ignore callback for package copies.

    Returns the subset of *names* that is either listed in
    ``IGNORED_RELEASE_DIR_NAMES`` or contains ``pycache`` (case-insensitive).
    *dir_path* is part of the callback signature and is not used.
    """
    return {
        entry
        for entry in names
        if entry in IGNORED_RELEASE_DIR_NAMES or 'pycache' in entry.lower()
    }
18
+
19
def _copytree_ignore_errors(src: Path, dest: Path) -> None:
    """Copy the tree at *src* to *dest*, skipping names chosen by ``_copy_ignore``.

    Symlinks are followed rather than copied as links, and dangling symlinks
    are tolerated. Despite the name, there is no error handling here: other
    failures raised by ``shutil.copytree`` propagate to the caller.
    """
    shutil.copytree(
        src,
        dest,
        symlinks=False,
        ignore=_copy_ignore,
        ignore_dangling_symlinks=True,
    )
21
+
22
def _safe_read(path: Path):
    """Best-effort ``read_json``: return the parsed value, or ``None`` on any error.

    Every ``Exception`` is swallowed, so a missing file and a malformed file
    are indistinguishable to the caller.
    """
    try:
        loaded = read_json(path)
    except Exception:
        loaded = None
    return loaded
27
+
28
def _append_if_exists(release_root: Path, src: Path, dst: str) -> bool:
    """Append the JSON document at *src* as one row of ``release_root/dst``.

    Returns ``True`` when a row was appended, ``False`` when ``_safe_read``
    returned ``None`` for *src* (nothing is written in that case).
    """
    record = _safe_read(src)
    if record is not None:
        append_jsonl(release_root/dst, record)
        return True
    return False
32
+
33
+
34
def _public_task_row(raw: dict, tid: str) -> dict:
    """Build the public ``tasks.jsonl`` row for one raw task record.

    Starts from ``public_task_record(raw)`` and fills in task id, domain,
    subdomain, role, economic-value fields and a few descriptive fields,
    looking first at *raw* and then at its ``raw_payload`` dict. Legacy names
    (``apprenticeship_role``, ``expected_pay``, ``expected_apprentice_pay``)
    serve as fallbacks for the newer ones. *tid* is used as the task id when
    neither the public record nor *raw* provides one.
    """
    payload = raw.get('raw_payload') or {}
    row = public_task_record(raw)

    def _pick(*candidates):
        # Same result as ``a or b or ...``: first truthy value, else the last one.
        for candidate in candidates:
            if candidate:
                return candidate
        return candidates[-1]

    row.setdefault('task_id', _pick(raw.get('task_id'), tid))
    row['domain'] = _pick(raw.get('normalized_domain'), payload.get('normalized_domain'), payload.get('domain'))
    row['subdomain'] = _pick(raw.get('normalized_subdomain'), payload.get('normalized_subdomain'), payload.get('subdomain'))

    # Optional fields: only written when a non-None value was found.
    optional_fields = [
        ('agent_apprentice_role', _pick(
            raw.get('agent_apprentice_role'), payload.get('agent_apprentice_role'),
            raw.get('apprenticeship_role'), payload.get('apprenticeship_role'),
        )),
        ('expected_economic_value', _pick(
            raw.get('expected_economic_value'), payload.get('expected_economic_value'),
            raw.get('expected_pay'), payload.get('expected_pay'),
        )),
        ('expected_economic_value_for_agent_apprentice', _pick(
            raw.get('expected_economic_value_for_agent_apprentice'),
            payload.get('expected_economic_value_for_agent_apprentice'),
            raw.get('expected_apprentice_pay'), payload.get('expected_apprentice_pay'),
        )),
    ]
    for field_name, field_value in optional_fields:
        if field_value is not None:
            row[field_name] = field_value

    # Pass-through fields: the value on raw wins over the payload, and
    # falsy-but-not-None values are kept.
    for key in ('apprenticeship_role', 'task_family', 'difficulty_tier', 'needs_expert_review', 'expected_deliverable'):
        from_raw = raw.get(key)
        if from_raw is not None:
            row[key] = from_raw
            continue
        from_payload = payload.get(key)
        if from_payload is not None:
            row[key] = from_payload
    return row
59
+
60
def _step_count(obj) -> int:
    """Count the steps in a raw trace of unknown shape.

    A list counts as its own length. For a dict, the keys ``steps``,
    ``trace_steps``, ``trace``, ``records``, ``events`` and ``actions`` are
    tried in that order: a list value is counted directly, and a dict value
    is searched one level down under ``steps`` / ``events`` / ``actions``.
    The first list found decides the result. Anything else counts as 0.
    """
    if isinstance(obj, list):
        return len(obj)
    if not isinstance(obj, dict):
        return 0
    for key in ('steps', 'trace_steps', 'trace', 'records', 'events', 'actions'):
        candidate = obj.get(key)
        if isinstance(candidate, dict):
            # Look one level down for the step list.
            candidate = candidate.get('steps') or candidate.get('events') or candidate.get('actions')
        if isinstance(candidate, list):
            return len(candidate)
    return 0
71
+
72
def _task_status(statuses: list[str]) -> str:
    """Collapse per-attempt statuses into one task-level status.

    Falsy entries are ignored. The result is ``'completed'`` or ``'failed'``
    when every remaining entry has that value, ``'partial'`` for any other
    mix, and ``'failed'`` when no entries remain.
    """
    present = [status for status in statuses if status]
    if not present:
        return 'failed'
    for uniform in ('completed', 'failed'):
        if all(status == uniform for status in present):
            return uniform
    return 'partial'
79
+
80
+
81
+
82
def _propagate_release_status_fields(release_root):
    """Propagate lightweight task status into exported JSONL files.

    packages_index.jsonl is currently the source of truth for task_status.
    This keeps tasks.jsonl and public/tasks.jsonl aligned with it without
    adding a new metadata block or mutating raw outputs/runs data.

    Args:
        release_root: Release directory (str or Path) holding
            ``packages_index.jsonl`` / ``tasks.jsonl`` and, optionally,
            their ``public/`` counterparts.

    Behaviour notes:
        * Rows from ``public/packages_index.jsonl`` are read after the
          private index, so they win when both name the same task.
        * A tasks file is rewritten only when at least one row changed.
          On rewrite, keys are sorted and unparseable lines are dropped.
        * Nothing is written when no task status is found.
    """
    import json
    from pathlib import Path

    root = Path(release_root)

    def _read_jsonl(path):
        # Parse one JSON value per line; an unparseable line becomes None.
        if not path.exists():
            return []
        rows = []
        text = path.read_text(encoding="utf-8", errors="replace")
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which json.dumps(ensure_ascii=False) (used by
        # _write_jsonl below) leaves unescaped inside string values; such a
        # row would be cut into fragments and lost.
        for line in text.split("\n"):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except (ValueError, RecursionError):
                rows.append(None)
        return rows

    def _write_jsonl(path, rows):
        # Write rows back one per line, skipping the None placeholders.
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as f:
            for row in rows:
                if row is not None:
                    f.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")

    status_by_task = {}
    for rel in ("packages_index.jsonl", "public/packages_index.jsonl"):
        for row in _read_jsonl(root / rel):
            if not isinstance(row, dict):
                continue
            task_id = row.get("task_id")
            task_status = row.get("task_status")
            if task_id and task_status:
                status_by_task[task_id] = task_status

    if not status_by_task:
        return

    # Propagate task_status into private/public task rows.
    for rel in ("tasks.jsonl", "public/tasks.jsonl"):
        path = root / rel
        rows = _read_jsonl(path)
        changed = False
        for row in rows:
            if not isinstance(row, dict):
                continue
            task_status = status_by_task.get(row.get("task_id"))
            if task_status and row.get("task_status") != task_status:
                row["task_status"] = task_status
                changed = True
        if changed:
            _write_jsonl(path, rows)

    # Do not inject task_status into agent_traces.jsonl here.
    # Trace rows are governed by the AgentTrace schema; attempt_status belongs
    # there, but task_status is task/package-level metadata and is exported via
    # tasks.jsonl and packages_index.jsonl.
147
+
148
+
149
+ def create_release(run_root: Path, release_root: Path) -> Path:
150
+ release_root.mkdir(parents=True, exist_ok=True)
151
+ for f in RELEASE_FILES:
152
+ p=release_root/f; p.parent.mkdir(parents=True, exist_ok=True); p.write_text('[]\n' if f.endswith('.json') else '')
153
+ pkgs=list((run_root/'packages').glob('*')) if (run_root/'packages').exists() else []
154
+ artifacts=[]; incomplete=0; missing_traces=0; raw_trace_count=0; normalized_trace_count=0; fallback_trace_count=0; fallback_trace_step_count=0; discarded_step_count=0; raw_trace_step_count=0; normalized_trace_step_count=0; partial_count=0; lossless_count=0; lossless_failure_count=0; parse_error_count=0; norm_error_count=0; actual_outputs_normalized_count=0; actual_outputs_schema_valid_count=0
155
+ for pkg in pkgs:
156
+ tid=pkg.name; blockers=[]
157
+ raw=_safe_read(pkg/'task/raw_task_record.json')
158
+ manifest=_safe_read(pkg/'package_manifest.json') or {}
159
+ iteration_public={'completion_reason': manifest.get('loop_stop_reason'), 'initial_attempt_id': manifest.get('baseline_attempt_id'), 'revision_attempt_ids': manifest.get('revised_attempt_ids') or [], 'final_attempt_id': manifest.get('selected_attempt_id'), 'preferred_attempt_id': manifest.get('selected_attempt_id')}
160
+ if raw is not None:
161
+ pub_task=_public_task_row(raw, tid)
162
+ append_jsonl(release_root/'full_task_records.jsonl', {**pub_task, 'task_id':tid,'package_path':f'packages/{tid}','raw_task_record':public_task_record(raw),'publishable_task_metadata':pub_task, **iteration_public, 'trace_refs':{'baseline':{'raw':'packages/'+tid+'/attempts/baseline/agent_trace.raw.json','normalized':'packages/'+tid+'/attempts/baseline/agent_trace.normalized.json','canonical':'packages/'+tid+'/attempts/baseline/agent_trace.json'},'revised':{'raw':'packages/'+tid+'/attempts/revised/agent_trace.raw.json','normalized':'packages/'+tid+'/attempts/revised/agent_trace.normalized.json','canonical':'packages/'+tid+'/attempts/revised/agent_trace.json'}}})
163
+ append_jsonl(release_root/'tasks.jsonl', _public_task_row(raw, tid))
164
+ else: blockers.append('missing task/raw_task_record.json')
165
+ if not _append_if_exists(release_root, pkg/'task/task_intake_spec.json', 'task_intake_specs.jsonl'): blockers.append('missing task_intake_spec')
166
+ rub=_safe_read(pkg/'rubric/rubric.json')
167
+ if rub is not None: append_jsonl(release_root/'rubrics.jsonl', rub)
168
+ else: blockers.append('missing rubric')
169
+ for row in read_jsonl(pkg/'rubric/rubric_items.jsonl'): append_jsonl(release_root/'rubric_items.jsonl', row)
170
+ attempt_trace_refs={}
171
+ attempt_statuses=[]
172
+ for a in ['baseline','revised']:
173
+ raw_tr=_safe_read(pkg/f'attempts/{a}/agent_trace.raw.json')
174
+ if raw_tr is not None:
175
+ raw_trace_count += 1; raw_trace_step_count += _step_count(raw_tr)
176
+ append_jsonl(release_root/'raw_agent_traces.jsonl', raw_tr)
177
+ tr=_safe_read(pkg/f'attempts/{a}/agent_trace.json')
178
+ norm=_safe_read(pkg/f'attempts/{a}/agent_trace.normalized.json')
179
+ report=_safe_read(pkg/f'attempts/{a}/trace_normalization_report.json')
180
+ if report is not None:
181
+ append_jsonl(release_root/'trace_normalization_reports.jsonl', report)
182
+ fallback_trace_count += 1 if report.get('fallback_trace') else 0
183
+ fallback_trace_step_count += int(report.get('normalized_step_count') or 0) if report.get('fallback_trace') else 0
184
+ discarded_step_count += int(report.get('discarded_step_count') or 0)
185
+ partial_count += 1 if report.get('trace_normalization_partial') else 0
186
+ parse_error_count += 1 if report.get('raw_trace_parse_error') else 0
187
+ norm_error_count += 1 if report.get('trace_normalization_error') else 0
188
+ if report.get('trace_lossless'): lossless_count += 1
189
+ else: lossless_failure_count += 1
190
+ if norm is not None:
191
+ normalized_trace_count += 1; normalized_trace_step_count += len(norm.get('steps') or [])
192
+ if tr is not None:
193
+ tr_row=normalize_trace_for_export(dict(tr), report)
194
+ attempt_statuses.append(tr_row.get('attempt_status'))
195
+ tr_row['iteration_index']=0 if a == 'baseline' else 1
196
+ tr_row['previous_attempt_id']=None if a == 'baseline' else manifest.get('baseline_attempt_id')
197
+ tr_row['revision_group_id']=tid
198
+ tr_row['completion_reason']=manifest.get('loop_stop_reason')
199
+ append_jsonl(release_root/'agent_traces.jsonl', tr_row)
200
+ append_jsonl(release_root/'forsy_like_collections.jsonl', {'collection_id': tr.get('collection_id') or tid, 'trace_id': tr.get('trace_id'), 'attempt_kind': a, 'iteration_index': tr_row['iteration_index'], 'previous_attempt_id': tr_row['previous_attempt_id'], 'steps': tr.get('steps', []), 'trace_ref': f'packages/{tid}/attempts/{a}/agent_trace.json'})
201
+ else:
202
+ missing_traces += 1; blockers.append(f'missing attempts/{a}/agent_trace.json')
203
+ attempt_trace_refs[a]={'raw': f'packages/{tid}/attempts/{a}/agent_trace.raw.json' if (pkg/f'attempts/{a}/agent_trace.raw.json').exists() else None, 'normalized': f'packages/{tid}/attempts/{a}/agent_trace.normalized.json' if (pkg/f'attempts/{a}/agent_trace.normalized.json').exists() else None, 'canonical': f'packages/{tid}/attempts/{a}/agent_trace.json' if (pkg/f'attempts/{a}/agent_trace.json').exists() else None, 'normalization_report': f'packages/{tid}/attempts/{a}/trace_normalization_report.json' if (pkg/f'attempts/{a}/trace_normalization_report.json').exists() else None}
204
+ ao_report=_safe_read(pkg/f'attempts/{a}/actual_outputs_normalization_report.json')
205
+ if ao_report is not None:
206
+ append_jsonl(release_root/'actual_outputs_normalization_reports.jsonl', ao_report)
207
+ actual_outputs_normalized_count += 1 if ao_report.get('actual_outputs_normalized') else 0
208
+ actual_outputs_schema_valid_count += 1 if ao_report.get('actual_outputs_schema_valid') else 0
209
+ if not _append_if_exists(release_root, pkg/f'attempts/{a}/actual_outputs.json', 'actual_outputs.jsonl'):
210
+ blockers.append(f'missing attempts/{a}/actual_outputs.json')
211
+ for name,out in [('baseline_grader_result.json','grader_results.jsonl'),('revised_grader_result.json','grader_results.jsonl'),('baseline_verifier_result.json','verifier_results.jsonl'),('revised_verifier_result.json','verifier_results.jsonl')]:
212
+ if not _append_if_exists(release_root, pkg/'grading'/name, out): blockers.append(f'missing grading/{name}')
213
+ for src,dst in [('feedback/baseline_evaluator_feedback.json','evaluator_feedback.jsonl'),('feedback/revision_plan.json','revision_plans.jsonl'),('signals/hillclimb_result.json','hillclimb_results.jsonl'),('signals/lesson_pack.json','lessons.jsonl')]:
214
+ if not _append_if_exists(release_root, pkg/src, dst): blockers.append(f'missing {src}')
215
+ for src,dst in [('training_signals.jsonl','training_signals.jsonl'),('process_supervision.jsonl','process_supervision.jsonl'),('reward_modeling.jsonl','reward_modeling.jsonl'),('revision_preference_pairs.jsonl','revision_preference_pairs.jsonl')]:
216
+ for row in read_jsonl(pkg/'signals'/src): append_jsonl(release_root/dst, row)
217
+ idx=_safe_read(pkg/'artifacts_index.json') or []
218
+ for row in idx:
219
+ if not is_ignored_release_path(row.get('package_relative_path','')):
220
+ artifacts.append({'task_id': tid, **row})
221
+ export_ready=not blockers
222
+ if not export_ready: incomplete += 1
223
+ task_status=_task_status(attempt_statuses)
224
+ append_jsonl(release_root/'packages_index.jsonl', {'task_id':tid,'package_path':f'packages/{tid}','task_status':task_status,'export_ready':export_ready,'export_blocker':'; '.join(blockers) if blockers else None,'trace_refs':attempt_trace_refs, **iteration_public})
225
+ dest=release_root/'packages'/tid
226
+ if dest.exists(): shutil.rmtree(dest)
227
+ _copytree_ignore_errors(pkg,dest)
228
+ roles_dir=run_root/'roles'
229
+ if roles_dir.exists():
230
+ for rr_path in roles_dir.rglob('role_result.json'):
231
+ rr=_safe_read(rr_path)
232
+ if rr is None: continue
233
+ parts=rr_path.relative_to(roles_dir).parts
234
+ task_id=parts[0] if len(parts)>0 else None
235
+ role=rr.get('role') or (parts[1] if len(parts)>1 else rr_path.parent.name)
236
+ attempt_kind=parts[2] if len(parts)>2 else rr_path.parent.name
237
+ if role in {'intake_agent','rubric_agent'} or attempt_kind == 'role_result.json':
238
+ attempt_kind='task_level'
239
+ append_jsonl(release_root/'role_results_index.jsonl', {'role': role, 'task_id': task_id, 'attempt_kind': attempt_kind, 'provider': rr.get('provider'), 'model': rr.get('model'), 'live_call_ok': rr.get('live_call_ok'), 'structured_output_validation_ok': rr.get('structured_output_validation_ok'), 'fallback_used': bool((rr.get('metadata_json') or {}).get('fallback_used') or not rr.get('live_call_ok')), 'prompt_hash': (rr.get('metadata_json') or {}).get('prompt_hash'), 'public_summary': (rr.get('metadata_json') or {}).get('public_summary') or rr.get('error_type'), 'role_result_ref_internal': str(rr_path.relative_to(run_root)), 'prompt_ref_internal': rr.get('prompt_ref'), 'output_ref_internal': rr.get('output_ref'), 'parsed_output_ref_internal': rr.get('parsed_output_ref')})
240
+ write_json(release_root/'artifacts_index.json', artifacts)
241
+ aggregate_counts={'tasks':len(pkgs),'attempts':len(read_jsonl(release_root/'actual_outputs.jsonl')),'traces':len(read_jsonl(release_root/'agent_traces.jsonl')),'traced_steps':normalized_trace_step_count,'process_supervision_rows':len(read_jsonl(release_root/'process_supervision.jsonl')),'reward_modeling_rows':len(read_jsonl(release_root/'reward_modeling.jsonl')),'revision_preference_pairs':len(read_jsonl(release_root/'revision_preference_pairs.jsonl'))}
242
+ write_json(release_root/'dataset_manifest.json', {'schema_version':'aa-release-v0.1','task_count':len(pkgs),**aggregate_counts,'files':RELEASE_FILES,'incomplete_package_count':incomplete,'trace_missing_count':missing_traces,'raw_trace_count':raw_trace_count,'raw_trace_step_count':raw_trace_step_count,'normalized_trace_count':normalized_trace_count,'normalized_trace_step_count':normalized_trace_step_count,'fallback_trace_count':fallback_trace_count,'fallback_trace_step_count':fallback_trace_step_count,'discarded_step_count':discarded_step_count,'raw_trace_parse_error_count':parse_error_count,'trace_normalization_error_count':norm_error_count,'trace_normalization_partial_count':partial_count,'trace_lossless_count':lossless_count,'trace_lossless_failure_count':lossless_failure_count,'actual_outputs_normalized_count':actual_outputs_normalized_count,'actual_outputs_schema_valid_count':actual_outputs_schema_valid_count})
243
+ (release_root/'dataset_card.md').write_text('# Agent Apprenticeship Dataset Release\n\nThis release captures reusable agent work experience across task execution, artifact creation, evaluation, verifier-backed reliability checks, evaluator feedback, revision trajectories, process-supervision rows, reward-modeling examples, and revision preference pairs.\n')
244
+ write_json(release_root/'quality_report.json', {'task_count':len(pkgs),'secret_scan_ok':True,'incomplete_package_count':incomplete,'trace_missing_count':missing_traces,'raw_trace_count':raw_trace_count,'raw_trace_step_count':raw_trace_step_count,'normalized_trace_count':normalized_trace_count,'normalized_trace_step_count':normalized_trace_step_count,'fallback_trace_count':fallback_trace_count,'fallback_trace_step_count':fallback_trace_step_count,'discarded_step_count':discarded_step_count,'raw_trace_parse_error_count':parse_error_count,'trace_normalization_error_count':norm_error_count,'trace_normalization_partial_count':partial_count,'trace_lossless_count':lossless_count,'trace_lossless_failure_count':lossless_failure_count,'actual_outputs_normalized_count':actual_outputs_normalized_count,'actual_outputs_schema_valid_count':actual_outputs_schema_valid_count})
245
+ create_public_release(release_root)
246
+ try:
247
+ from .validation import validate_release
248
+ counters=validate_release(release_root)
249
+ manifest=_safe_read(release_root/'dataset_manifest.json') or {}
250
+ quality=_safe_read(release_root/'quality_report.json') or {}
251
+ for key,value in counters.items():
252
+ if key in {'release_valid','public_release_valid','scale_ready','scale_blockers','fallback_only_task_count','rich_trace_task_count','workflow_trace_rich_count','raw_trace_count','raw_trace_step_count','normalized_trace_count','normalized_trace_step_count','fallback_trace_count','discarded_step_count','process_supervision_count','actual_outputs_raw_count','actual_outputs_normalized_count','actual_outputs_schema_valid_count','artifact_contract_consistency_ok','model_role_completeness_ok','model_task_intake_count','model_rubric_generation_count','model_evaluator_result_count','model_grader_result_count','model_verifier_result_count','model_score_count','artifact_contract_score_count','model_grading_grounded_count','model_grading_logs_only_count','model_grading_unavailable_count','public_prompt_leak_ok','public_secret_scan_ok','public_trace_count','public_system_prompt_redacted_count','public_prompt_metadata_count','dependency_shadow_ok','verifier_verified_count','verifier_failed_count','model_score_verified_count','model_score_needs_review_count','score_reliability_counts','scale_warnings','operation_other_count','operation_mapped_count'}:
253
+ manifest[key]=value; quality[key]=value
254
+ write_json(release_root/'dataset_manifest.json', manifest)
255
+ write_json(release_root/'quality_report.json', quality)
256
+ except Exception:
257
+ pass
258
+ _propagate_release_status_fields(release_root)
259
+ return release_root
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+ from .schemas import RevisionPlan, EvaluatorFeedback, HillclimbResult, GraderResult, RevisionPreferencePair
3
+
4
def deterministic_revision_plan(feedback: EvaluatorFeedback, target_attempt_id: str) -> RevisionPlan:
    """Build a ``local_fix`` revision plan directly from evaluator feedback.

    Every field is either copied from ``feedback`` or set to a fixed constant,
    so the same feedback and target attempt id always produce the same plan.

    Args:
        feedback: Evaluator feedback for the attempt being revised. Its
            ``task_id``, ``attempt_id``, ``feedback_summary``,
            ``failed_rubric_items`` and ``actionable_feedback`` are read.
        target_attempt_id: Identifier of the attempt the revision will produce.

    Returns:
        A ``RevisionPlan`` whose source attempt is ``feedback.attempt_id``.
    """
    task_id = feedback.task_id
    plan_fields = {
        'revision_plan_id': f'revision_plan_{task_id}',
        'task_id': task_id,
        'source_attempt_id': feedback.attempt_id,
        'target_attempt_id': target_attempt_id,
        'revision_kind': 'local_fix',
        'revision_reason': feedback.feedback_summary,
        'failed_rubric_items': feedback.failed_rubric_items,
        'planned_changes': feedback.actionable_feedback,
        # Fixed estimates: nothing is computed from the feedback content.
        'expected_score_improvement': 0.1,
        'risk_of_regression': 'low',
        'uses_evaluator_feedback': True,
        'metadata_json': {},
    }
    return RevisionPlan(**plan_fields)
6
+
7
def compute_hillclimb(base: GraderResult, rev: GraderResult) -> HillclimbResult:
    """Compare a baseline grade with a revised grade for the same task.

    The comparison score of each grade is ``final_score`` when it is set,
    otherwise ``score``. The outcome is classified as:

    * ``score_delta`` - revised score is strictly higher (success);
    * ``regression`` - revised score is strictly lower (failure);
    * ``pass_delta`` - scores are equal but the revision flipped from
      not-passed to passed (success);
    * ``none`` - anything else (failure).

    Artifact completeness uses ``artifact_contract_score`` when it is not
    ``None`` and falls back to ``score`` otherwise. The delta is derived from
    those same two values, so a contract score of ``0.0`` is kept as-is
    rather than being treated as missing.

    Args:
        base: Grade of the baseline attempt.
        rev: Grade of the revised attempt.

    Returns:
        A ``HillclimbResult`` keyed by ``base.task_id`` holding both scores,
        the per-criterion improvements/regressions and comparison metadata.
    """
    base_score = base.final_score if base.final_score is not None else base.score
    rev_score = rev.final_score if rev.final_score is not None else rev.score
    delta = rev_score - base_score

    if delta > 0:
        kind, strength, success = 'score_delta', 'observed_improvement', True
    elif delta < 0:
        kind, strength, success = 'regression', 'regression_observed', False
    elif (not base.passed) and rev.passed:
        # Same score, but the pass flag flipped in the revision's favour.
        kind, strength, success = 'pass_delta', 'observed_improvement', True
    else:
        kind, strength, success = 'none', 'no_observed_improvement', False

    # Criteria that changed side between the two grades.
    improved = [c for c in base.failed_criteria if c in rev.passed_criteria]
    regressed = [c for c in base.passed_criteria if c in rev.failed_criteria]

    # Use explicit ``is not None`` checks: a contract score of 0.0 is a real
    # value and must not fall through to ``score`` the way ``x or y`` would.
    completeness_before = (
        base.artifact_contract_score if base.artifact_contract_score is not None else base.score
    )
    completeness_after = (
        rev.artifact_contract_score if rev.artifact_contract_score is not None else rev.score
    )

    metadata = {
        'comparison_basis': 'final_score',
        'baseline_artifact_contract_score': base.artifact_contract_score,
        'revised_artifact_contract_score': rev.artifact_contract_score,
        # The model score falls back to the semantic score when it is unset.
        'baseline_model_score': base.model_score if base.model_score is not None else base.semantic_score,
        'revised_model_score': rev.model_score if rev.model_score is not None else rev.semantic_score,
        'baseline_semantic_score': base.semantic_score,
        'revised_semantic_score': rev.semantic_score,
        'baseline_score_reliability': base.score_reliability,
        'revised_score_reliability': rev.score_reliability,
    }

    return HillclimbResult(
        hillclimb_id=f'hill_{base.task_id}',
        task_id=base.task_id,
        baseline_attempt_id=base.attempt_id,
        revised_attempt_id=rev.attempt_id,
        baseline_score=base_score,
        revised_score=rev_score,
        revision_score_delta=delta,
        baseline_passed=base.passed,
        revised_passed=rev.passed,
        failed_criteria_before=base.failed_criteria,
        failed_criteria_after=rev.failed_criteria,
        criteria_improved=improved,
        criteria_regressed=regressed,
        artifact_completeness_before=completeness_before,
        artifact_completeness_after=completeness_after,
        artifact_completeness_delta=completeness_after - completeness_before,
        regression_count=len(regressed),
        improvement_kind=kind,
        hillclimb_evidence_strength=strength,
        revision_success=success,
        metadata_json=metadata,
    )
18
+
19
def preference_pair(hill: HillclimbResult, rubric_ref: str, confidence=0.8) -> RevisionPreferencePair:
    """Turn a hillclimb comparison into a chosen/rejected preference pair.

    The revised attempt is chosen when its score is greater than or equal to
    the baseline score (so ties go to the revision); otherwise the baseline
    attempt is chosen.

    Args:
        hill: Comparison result supplying the scores, attempt ids, criteria
            lists and the metadata that is copied into the pair.
        rubric_ref: Reference to the rubric, stored on the pair unchanged.
        confidence: Confidence value stored on the pair.

    Returns:
        A ``RevisionPreferencePair`` with id ``pref_<task_id>``.
    """
    if hill.revised_score >= hill.baseline_score:
        chosen_id, rejected_id = hill.revised_attempt_id, hill.baseline_attempt_id
    else:
        chosen_id, rejected_id = hill.baseline_attempt_id, hill.revised_attempt_id

    # Start with the scores owned by the hillclimb result itself, then copy
    # the remaining entries from its metadata (missing keys become None).
    pair_metadata = {
        'comparison_basis': 'final_score',
        'baseline_final_score': hill.baseline_score,
        'revised_final_score': hill.revised_score,
    }
    source_metadata = hill.metadata_json
    for key in (
        'baseline_artifact_contract_score',
        'revised_artifact_contract_score',
        'baseline_model_score',
        'revised_model_score',
        'baseline_semantic_score',
        'revised_semantic_score',
        'baseline_score_reliability',
        'revised_score_reliability',
    ):
        pair_metadata[key] = source_metadata.get(key)

    return RevisionPreferencePair(
        pair_id=f'pref_{hill.task_id}',
        task_id=hill.task_id,
        rubric_ref=rubric_ref,
        baseline_attempt_ref='attempts/baseline',
        revised_attempt_ref='attempts/revised',
        chosen_attempt_id=chosen_id,
        rejected_attempt_id=rejected_id,
        baseline_score=hill.baseline_score,
        revised_score=hill.revised_score,
        score_delta=hill.revision_score_delta,
        criteria_improved=hill.criteria_improved,
        criteria_regressed=hill.criteria_regressed,
        preference_reason='Choose higher final_score attempt; baseline wins on regression.',
        score_source='final_score',
        grader_kind='hybrid_or_configured',
        confidence=confidence,
        metadata_json=pair_metadata,
    )
@@ -0,0 +1,7 @@
1
+ from __future__ import annotations
2
+ from pydantic import BaseModel, Field, ConfigDict
3
class RoleResult(BaseModel):
    # Structured record of a single role invocation. Documented with comments
    # rather than a docstring so the generated model schema stays unchanged.

    # Fields not declared below are rejected instead of silently accepted.
    model_config = ConfigDict(extra='forbid')

    # Required fields.
    role: str
    provider: str
    model: str
    live_call_ok: bool
    structured_output_validation_ok: bool
    prompt_ref: str
    output_ref: str
    parsed_output_ref: str

    # Optional fields; each defaults to None when not supplied.
    error_type: str | None = None
    error_message: str | None = None
    duration_seconds: float | None = None

    # default_factory gives every instance its own dict.
    metadata_json: dict = Field(default_factory=dict)
6
class RoleRunner:
    """Base class for role runners; ``run`` must be overridden."""

    def run(self, *args, **kwargs):
        """Execute the role.

        Accepts any positional and keyword arguments. This base
        implementation ignores them and always raises.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError