agent-apprenticeship 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +217 -0
  3. package/bin/agent-apprenticeship.js +131 -0
  4. package/package.json +30 -0
  5. package/pyproject.toml +23 -0
  6. package/src/agent_apprenticeship_trace/__init__.py +2 -0
  7. package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
  8. package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
  9. package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
  10. package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
  11. package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
  12. package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
  13. package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
  14. package/src/agent_apprenticeship_trace/certification.py +580 -0
  15. package/src/agent_apprenticeship_trace/cli.py +2979 -0
  16. package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
  17. package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
  18. package/src/agent_apprenticeship_trace/config.py +609 -0
  19. package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
  20. package/src/agent_apprenticeship_trace/env.py +46 -0
  21. package/src/agent_apprenticeship_trace/evaluator.py +64 -0
  22. package/src/agent_apprenticeship_trace/grader.py +194 -0
  23. package/src/agent_apprenticeship_trace/integration_status.py +193 -0
  24. package/src/agent_apprenticeship_trace/io.py +20 -0
  25. package/src/agent_apprenticeship_trace/learning.py +627 -0
  26. package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
  27. package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
  28. package/src/agent_apprenticeship_trace/loop.py +111 -0
  29. package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
  30. package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
  31. package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
  32. package/src/agent_apprenticeship_trace/progress.py +223 -0
  33. package/src/agent_apprenticeship_trace/public_run.py +1109 -0
  34. package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
  35. package/src/agent_apprenticeship_trace/recipes.py +129 -0
  36. package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
  37. package/src/agent_apprenticeship_trace/revision.py +21 -0
  38. package/src/agent_apprenticeship_trace/role_runners.py +7 -0
  39. package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
  40. package/src/agent_apprenticeship_trace/schemas.py +273 -0
  41. package/src/agent_apprenticeship_trace/session_events.py +99 -0
  42. package/src/agent_apprenticeship_trace/task_intake.py +112 -0
  43. package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
  44. package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
  45. package/src/agent_apprenticeship_trace/training_signals.py +30 -0
  46. package/src/agent_apprenticeship_trace/validation.py +210 -0
  47. package/src/agent_apprenticeship_trace/verifier.py +55 -0
@@ -0,0 +1,580 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import shutil
5
+ import sys
6
+ import tempfile
7
+ from contextlib import contextmanager
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from .config import get_settings, init_settings, update_settings
13
+ from .command_discovery import resolve_command, resolve_agent_command, gui_app_hint
14
+ from .env import redact_secrets
15
+ from .integration_status import record_certification_result
16
+ from .io import read_json
17
+ from .openai_structured import run_llm_smoke
18
+ from .public_run import run_prompt_task
19
+ from .recipes import MODEL_PROVIDER_RECIPES, WORKER_AGENT_RECIPES
20
+
21
+
22
# Outcome labels stored in the "result" field of certification rows.
# Report summaries and exit codes in this module classify rows by the
# "failed" / "skipped" prefix, so new labels should keep that convention.

# Success.
RESULT_PASSED = "passed"

# Skipped: a prerequisite was absent, so nothing was run.
RESULT_SKIPPED_MISSING_COMMAND = "skipped_missing_command"
RESULT_SKIPPED_MISSING_KEY = "skipped_missing_key"
RESULT_SKIPPED_NOT_CONFIGURED = "skipped_not_configured"

# Failed: something was run (or attempted) and did not succeed.
RESULT_FAILED = "failed"
RESULT_FAILED_OUTPUT_CONTRACT = "failed_output_contract"
RESULT_FAILED_AUTH = "failed_auth"
RESULT_FAILED_QUOTA = "failed_quota"
RESULT_FAILED_TIMEOUT = "failed_timeout"
RESULT_FAILED_PROVIDER_ERROR = "failed_provider_error"
RESULT_FAILED_INSUFFICIENT_BALANCE = "failed_insufficient_balance"
33
+
34
+
35
def selected_agent_ids() -> list[str]:
    """Return the ids of every worker-agent recipe, in recipe iteration order."""
    return [*WORKER_AGENT_RECIPES]
37
+
38
+
39
def selected_model_provider_ids() -> list[str]:
    """Return the ids of every model-provider recipe, in recipe iteration order."""
    return [*MODEL_PROVIDER_RECIPES]
41
+
42
+
43
+ def _repo_root() -> Path:
44
+ return Path(__file__).resolve().parents[2]
45
+
46
+
47
def custom_fixture_command_template() -> str:
    """Build the command template that runs the custom-apprentice fixture script.

    The template invokes the current Python interpreter on
    ``scripts/fixtures/successful_custom_apprentice.py`` under the repo root
    and leaves literal ``{workspace}`` and ``{prompt_file}`` placeholders in
    the returned string for later substitution.
    """
    script = _repo_root().joinpath("scripts", "fixtures", "successful_custom_apprentice.py")
    placeholders = "--workspace {workspace} --prompt-file {prompt_file}"
    return f"{sys.executable} {script} {placeholders}"
50
+
51
+
52
+ def _utc_now() -> str:
53
+ return datetime.now(timezone.utc).isoformat()
54
+
55
+
56
+ def _smoke_timeout_seconds() -> int:
57
+ try:
58
+ return int(os.getenv("AA_TASK_TIMEOUT_SECONDS") or "240")
59
+ except ValueError:
60
+ return 240
61
+
62
+
63
+ def _google_key_visible() -> tuple[str, bool]:
64
+ if os.getenv("GEMINI_API_KEY"):
65
+ return "GEMINI_API_KEY", True
66
+ if os.getenv("GOOGLE_API_KEY"):
67
+ return "GOOGLE_API_KEY", True
68
+ return "GEMINI_API_KEY", False
69
+
70
+
71
def model_key_status(provider_id: str) -> tuple[str, bool]:
    """Return ``(env_var_name, is_set)`` for a model provider's API key.

    The provider must exist in ``MODEL_PROVIDER_RECIPES`` (``KeyError``
    otherwise). The ``"google"`` provider is resolved through
    ``_google_key_visible``; every other provider uses the environment
    variable named by its recipe.
    """
    recipe = MODEL_PROVIDER_RECIPES[provider_id]
    if provider_id != "google":
        env_var = recipe.api_key_env_var
        return env_var, bool(os.getenv(env_var))
    return _google_key_visible()
76
+
77
+
78
def _agent_command_for(agent_id: str, *, user_custom: bool = False) -> tuple[str | None, str | None, bool]:
    """Resolve the command used to launch an apprentice agent.

    Args:
        agent_id: Key into ``WORKER_AGENT_RECIPES`` (``KeyError`` if unknown).
        user_custom: Only meaningful for ``agent_id == "custom"``. When true,
            the user's configured command template is used; otherwise the
            bundled fixture template run by the current interpreter is used.

    Returns:
        ``(command, template, found)`` where ``command`` is the resolved
        executable (or the unresolved name when resolution failed),
        ``template`` is the custom command template (``None`` for non-custom
        agents) and ``found`` says whether the command could be resolved.
        ``(None, None, False)`` means no user template is configured.
    """
    settings = get_settings()
    recipe = WORKER_AGENT_RECIPES[agent_id]
    if agent_id == "custom":
        if user_custom:
            template = settings.custom_worker_command_template
            # A whitespace-only template has no tokens; treat it like a
            # missing template instead of raising IndexError on tokens[0].
            tokens = template.split() if template else []
            if not tokens:
                return None, None, False
            executable = tokens[0]
            resolved = resolve_command(executable)
            return (resolved or executable), template, bool(resolved)
        command = sys.executable
        return command, custom_fixture_command_template(), bool(resolve_command(command))
    # Honour the user's configured command only when it belongs to this agent.
    if settings.worker_agent == agent_id and settings.worker_agent_command:
        configured = settings.worker_agent_command
    else:
        configured = recipe.command_name
    _candidate, resolved = resolve_agent_command(agent_id, configured)
    return (resolved or configured), None, bool(resolved)
93
+
94
+
95
@contextmanager
def _isolated_app_home(prefix: str, cache_home: Path):
    """Run the enclosed block against a fresh, temporary app home.

    A new directory is created under ``cache_home / "certification_runs"``
    (falling back to the system temp location on ``OSError``), ``AA_HOME`` is
    pointed at it, ``AA_DISABLE_LOCAL_ENV`` is removed from the environment,
    and settings are initialised there with ``init_settings``. On exit, both
    environment variables are restored to their previous values (or removed
    if they were unset). The directory itself is left on disk.

    Args:
        prefix: Prefix for the temporary directory name.
        cache_home: App home under which certification run directories live.

    Yields:
        Path of the temporary app home.
    """
    previous_home = os.environ.get("AA_HOME")
    previous_disable = os.environ.get("AA_DISABLE_LOCAL_ENV")
    try:
        parent = cache_home / "certification_runs"
        parent.mkdir(parents=True, exist_ok=True)
        path = Path(tempfile.mkdtemp(prefix=f"{prefix}-", dir=parent))
    except OSError:
        path = Path(tempfile.mkdtemp(prefix=f"aa-{prefix}-"))
    # Everything that mutates the environment happens inside the try so that
    # a failure in init_settings cannot leave AA_HOME pointing at the
    # temporary directory for the rest of the process.
    try:
        os.environ["AA_HOME"] = str(path)
        # Certification runs should not accidentally read active app-home
        # settings unless they are explicitly copied in by the caller.
        # NOTE(review): the exact effect of AA_DISABLE_LOCAL_ENV is defined
        # elsewhere; this only guarantees it is unset during the run.
        os.environ.pop("AA_DISABLE_LOCAL_ENV", None)
        init_settings(path, overwrite=True)
        yield path
    finally:
        if previous_home is None:
            os.environ.pop("AA_HOME", None)
        else:
            os.environ["AA_HOME"] = previous_home
        if previous_disable is None:
            os.environ.pop("AA_DISABLE_LOCAL_ENV", None)
        else:
            os.environ["AA_DISABLE_LOCAL_ENV"] = previous_disable
121
+
122
+
123
def _base_row(kind: str, result: str, **extra: Any) -> dict[str, Any]:
    """Create a certification row dict.

    The row starts with ``certification_kind``, ``result`` and a UTC
    ``timestamp``; ``extra`` keyword arguments are then merged in and may
    override those three keys.
    """
    row: dict[str, Any] = {
        "certification_kind": kind,
        "result": result,
        "timestamp": _utc_now(),
    }
    row.update(extra)
    return row
130
+
131
+
132
def _classify_error(text: str | None, *, output_contract_default: bool = False) -> tuple[str, str | None, str | None]:
    """Map a free-form error message to ``(result, error_type, summary)``.

    The message is passed through ``redact_secrets`` and stripped, then
    matched case-insensitively against keyword groups in this order:
    auth, insufficient balance, quota, timeout, output contract. The first
    matching group wins. When nothing matches, the outcome is an
    output-contract failure if ``output_contract_default`` is true and a
    provider error otherwise.

    Args:
        text: Raw error text; ``None`` is treated as an empty message.
        output_contract_default: Fallback classification when no keyword hits.

    Returns:
        Result label, error type and the redacted message (or a generic
        summary when the message is empty for the last two categories).
    """
    message = redact_secrets(text or "").strip()
    lower = message.lower()

    def mentions(*needles: str) -> bool:
        # True when any of the given substrings occurs in the lowered message.
        return any(needle in lower for needle in needles)

    if mentions(
        "auth",
        "not logged in",
        "login",
        "unauthorized",
        "api key",
        "setup required",
        "provider not configured",
        "model not configured",
    ):
        return RESULT_FAILED_AUTH, "auth", message
    # Checked before the quota group, which also matches plain "insufficient".
    if mentions("insufficient balance"):
        return RESULT_FAILED_INSUFFICIENT_BALANCE, "provider_account_balance", message
    if mentions("quota", "credit", "rate limit", "usage limit", "billing", "insufficient"):
        return RESULT_FAILED_QUOTA, "quota", message
    if mentions("timeout", "timed out"):
        return RESULT_FAILED_TIMEOUT, "timeout", message
    if output_contract_default or mentions("output-contract", "did not produce required", "agent_trace.json"):
        return RESULT_FAILED_OUTPUT_CONTRACT, "output_contract", message or "Apprentice Agent output-contract failure."
    return RESULT_FAILED_PROVIDER_ERROR, "provider_error", message or "Provider failed."
157
+
158
+
159
def _result_from_run_status(run_root: Path) -> tuple[str, str | None, str | None]:
    """Derive ``(result, error_type, summary)`` from ``run_status.json``.

    A missing status file is reported as a provider error of type
    ``missing_status``. A ``task_status`` of ``"completed"`` is a pass.
    Otherwise the first non-empty of ``last_operational_error`` and
    ``latest_message`` (or a generic message) is classified with
    ``_classify_error``, defaulting to an output-contract failure.
    """
    status_path = run_root / "run_status.json"
    if status_path.exists():
        status = read_json(status_path)
        if status.get("task_status") == "completed":
            return RESULT_PASSED, None, None
        reason = next(
            (value for key in ("last_operational_error", "latest_message") if (value := status.get(key))),
            "Task did not complete.",
        )
        return _classify_error(str(reason), output_contract_default=True)
    return RESULT_FAILED_PROVIDER_ERROR, "missing_status", f"run_status.json was not created at {status_path}"
168
+
169
+
170
+ def _model_smoke_ok(counters: dict[str, Any]) -> bool:
171
+ roles = ["intake", "rubric", "grader", "verifier", "evaluator"]
172
+ return bool(counters.get("mentor_model_provider_available")) and all(
173
+ bool(counters.get(f"{role}_live_call_ok")) and bool(counters.get(f"{role}_structured_output_validation_ok"))
174
+ for role in roles
175
+ ) and bool(counters.get("secret_scan_ok"))
176
+
177
+
178
def _model_error(counters: dict[str, Any]) -> tuple[str, str | None, str | None]:
    """Classify the first recorded error message in model smoke counters.

    Keys are scanned in sorted order; the first key ending in
    ``_error_message`` with a truthy value is classified with
    ``_classify_error``. An output-contract result is reported as a provider
    error instead (only the result label is rewritten; the error type and
    summary are returned as classified). With no error message present, a
    generic provider error is returned.
    """
    first_key = next(
        (key for key in sorted(counters) if key.endswith("_error_message") and counters.get(key)),
        None,
    )
    if first_key is None:
        return RESULT_FAILED_PROVIDER_ERROR, "provider_error", "Mentor Model Provider live smoke failed."
    result, error_type, summary = _classify_error(str(counters[first_key]))
    if result == RESULT_FAILED_OUTPUT_CONTRACT:
        result = RESULT_FAILED_PROVIDER_ERROR
    return result, error_type, summary
186
+
187
+
188
def _record_agent_row(row: dict[str, Any], cache_home: Path) -> None:
    """Persist an apprentice-agent certification row.

    The row's ``agent_id`` is used as both provider id and agent id, its
    ``command`` as ``command_or_model``, and every field except ``result``,
    ``error_type`` and ``error_summary`` is forwarded as metadata.
    """
    agent = row["agent_id"]
    promoted = {"result", "error_type", "error_summary"}
    metadata = {key: value for key, value in row.items() if key not in promoted}
    record_certification_result(
        provider_type="apprentice_agent",
        provider_id=agent,
        result=row["result"],
        certification_kind=row["certification_kind"],
        agent_id=agent,
        error_type=row.get("error_type"),
        error_summary=row.get("error_summary"),
        command_or_model=row.get("command"),
        app_home=cache_home,
        metadata_json=metadata,
    )
202
+
203
+
204
def _record_model_row(row: dict[str, Any], cache_home: Path) -> None:
    """Persist a mentor-model-provider certification row.

    The row's ``provider_id`` is used as both provider id and model provider
    id, its ``model`` as ``command_or_model``, and every field except
    ``result``, ``error_type`` and ``error_summary`` is forwarded as metadata.
    """
    provider = row["provider_id"]
    promoted = {"result", "error_type", "error_summary"}
    metadata = {key: value for key, value in row.items() if key not in promoted}
    record_certification_result(
        provider_type="mentor_model_provider",
        provider_id=provider,
        result=row["result"],
        certification_kind=row["certification_kind"],
        model_provider_id=provider,
        error_type=row.get("error_type"),
        error_summary=row.get("error_summary"),
        command_or_model=row.get("model"),
        app_home=cache_home,
        metadata_json=metadata,
    )
218
+
219
+
220
def _record_full_row(row: dict[str, Any], cache_home: Path) -> None:
    """Persist a full end-to-end (agent + provider) certification row.

    The provider id is ``"<agent_id>+<provider_id>"``. ``command_or_model``
    joins the row's command and model, each falling back to the matching id
    when missing or empty. Every field except ``result``, ``error_type`` and
    ``error_summary`` is forwarded as metadata.
    """
    agent = row["agent_id"]
    provider = row["provider_id"]
    command_label = row.get("command") or agent
    model_label = row.get("model") or provider
    promoted = {"result", "error_type", "error_summary"}
    metadata = {key: value for key, value in row.items() if key not in promoted}
    record_certification_result(
        provider_type="full_e2e",
        provider_id=f"{agent}+{provider}",
        result=row["result"],
        certification_kind="full_e2e",
        agent_id=agent,
        model_provider_id=provider,
        error_type=row.get("error_type"),
        error_summary=row.get("error_summary"),
        command_or_model=f"{command_label} + {model_label}",
        app_home=cache_home,
        metadata_json=metadata,
    )
234
+
235
+
236
def certify_agent(agent_id: str, *, strict: bool = False, user_custom: bool = False, cache_home: Path | None = None) -> dict[str, Any]:
    """Run a live smoke task against one apprentice agent and record the result.

    The agent's command is resolved first. If no user template is configured
    or the command cannot be found, a ``skipped_*`` row is recorded and
    returned without running anything. Otherwise a short prompt task is run
    inside an isolated app home in ``expert_led`` mentor mode, and the row's
    result is derived from the run's ``run_status.json``. Exceptions raised
    while running are classified into a ``failed_*`` row; they are not
    propagated.

    Args:
        agent_id: Key into ``WORKER_AGENT_RECIPES`` (``KeyError`` if unknown).
        strict: Accepted for signature compatibility; not used here.
        user_custom: For the ``"custom"`` agent, certify the user-configured
            command template instead of the bundled fixture.
        cache_home: App home used for recording results and for creating the
            isolated run directory; defaults to the active settings' app home.

    Returns:
        The certification row that was recorded.
    """
    cache_home = cache_home or get_settings().app_home
    recipe = WORKER_AGENT_RECIPES[agent_id]
    is_custom = agent_id == "custom"
    if is_custom and user_custom:
        mode, kind = "user_configured", "agent_live_smoke_user"
    elif is_custom:
        mode, kind = "fixture", "agent_live_smoke_fixture"
    else:
        mode, kind = "live", "agent_live_smoke"
    command, template, found = _agent_command_for(agent_id, user_custom=user_custom)

    # Fields shared by every row this function produces (order is preserved
    # in the row dict).
    identity = {"agent_id": agent_id, "display_name": recipe.display_name, "mode": mode}

    if command is None:
        row = _base_row(
            kind,
            RESULT_SKIPPED_NOT_CONFIGURED,
            **identity,
            command=None,
            command_found=False,
            error_type="not_configured",
            error_summary="User-configured Custom Apprentice Agent command template is not configured.",
        )
        _record_agent_row(row, cache_home)
        return row

    if not found:
        # Look the hint up once and reuse it for both the check and the text.
        hint = gui_app_hint(agent_id)
        summary = f"Apprentice Agent command not found: {command}"
        if hint:
            summary += f". {hint} Install or expose the headless CLI on PATH."
        row = _base_row(
            kind,
            RESULT_SKIPPED_MISSING_COMMAND,
            **identity,
            command=command,
            command_found=False,
            error_type="missing_command",
            error_summary=summary,
        )
        _record_agent_row(row, cache_home)
        return row

    try:
        with _isolated_app_home(f"agent-{agent_id}", cache_home):
            overrides: dict[str, Any] = {
                "worker_agent": agent_id,
                "worker_agent_command": command,
            }
            if is_custom:
                overrides["custom_worker_display_name"] = "Custom Fixture" if mode == "fixture" else "Custom"
                overrides["custom_worker_command_template"] = template
            else:
                # Worker and reviser runners are named after the agent itself.
                overrides["worker_runner"] = agent_id
                overrides["reviser_runner"] = agent_id
            overrides.update(
                mentor_mode="expert_led",
                max_improvement_loops=1,
                task_timeout_seconds=_smoke_timeout_seconds(),
            )
            update_settings(**overrides)
            run_root, bundle = run_prompt_task(
                "Create a one-paragraph readiness note and save it under artifacts/readiness.md.",
                run_id=f"cert-agent-{agent_id}-{mode}",
                create_bundle=True,
            )
            result, error_type, error_summary = _result_from_run_status(run_root)
            row = _base_row(
                kind,
                result,
                **identity,
                command=command,
                command_found=True,
                run_path=str(run_root),
                bundle_path=str(bundle) if bundle else None,
                error_type=error_type,
                error_summary=error_summary,
            )
    except Exception as exc:
        # Best-effort boundary: any failure while running becomes a failed row.
        result, error_type, error_summary = _classify_error(str(exc), output_contract_default=False)
        if result == RESULT_FAILED_OUTPUT_CONTRACT:
            # An exception is reported as a provider error rather than an
            # output-contract failure; error_type is left as classified.
            result = RESULT_FAILED_PROVIDER_ERROR
        row = _base_row(
            kind,
            result,
            **identity,
            command=command,
            command_found=True,
            error_type=error_type,
            error_summary=error_summary,
        )
    _record_agent_row(row, cache_home)
    return row
329
+
330
+
331
def certify_model_provider(provider_id: str, *, strict: bool = False, cache_home: Path | None = None) -> dict[str, Any]:
    """Run a live LLM smoke against one model provider and record the result.

    If the provider's API key environment variable is not set, a
    ``skipped_missing_key`` row is recorded and returned. Otherwise
    ``run_llm_smoke`` is executed with output under
    ``cache_home / "certification_model_smokes" / provider_id``; the row
    passes when ``_model_smoke_ok`` accepts the counters and is otherwise
    classified via ``_model_error``. Exceptions are classified into a
    ``failed_*`` row rather than propagated.

    Args:
        provider_id: Key into ``MODEL_PROVIDER_RECIPES`` (``KeyError`` if unknown).
        strict: Accepted for signature compatibility; not used here.
        cache_home: App home for recording and smoke output; defaults to the
            active settings' app home.

    Returns:
        The certification row that was recorded.
    """
    kind = "model_live_smoke"
    cache_home = cache_home or get_settings().app_home
    recipe = MODEL_PROVIDER_RECIPES[provider_id]
    key_env, visible = model_key_status(provider_id)

    # Fields common to every row for this provider, in row order.
    shared = {
        "provider_id": provider_id,
        "display_name": recipe.display_name,
        "model": recipe.default_model,
        "api_key_env_var": key_env,
    }

    if not visible:
        row = _base_row(
            kind,
            RESULT_SKIPPED_MISSING_KEY,
            **shared,
            api_key_visible=False,
            error_type="missing_key",
            error_summary=f"{key_env} is not visible.",
        )
        _record_model_row(row, cache_home)
        return row

    out_dir = cache_home / "certification_model_smokes" / provider_id
    attempted = {**shared, "api_key_visible": True, "output_dir": str(out_dir)}
    try:
        counters = run_llm_smoke(out_dir, provider_id=provider_id)
        if _model_smoke_ok(counters):
            row = _base_row(kind, RESULT_PASSED, **attempted, counters=counters)
        else:
            result, error_type, error_summary = _model_error(counters)
            row = _base_row(
                kind,
                result,
                **attempted,
                counters=counters,
                error_type=error_type,
                error_summary=error_summary,
            )
    except Exception as exc:
        # Best-effort boundary: a raised smoke failure becomes a failed row.
        result, error_type, error_summary = _classify_error(str(exc))
        row = _base_row(
            kind,
            result,
            **attempted,
            error_type=error_type,
            error_summary=error_summary,
        )
    _record_model_row(row, cache_home)
    return row
395
+
396
+
397
def bounded_full_e2e_pairs(
    *,
    all_combinations: bool = False,
    agent_ids: list[str] | None = None,
    provider_ids: list[str] | None = None,
) -> list[tuple[str, str]]:
    """Choose the (agent, provider) pairs for full end-to-end certification.

    ``agent_ids`` / ``provider_ids`` default to every known recipe id when
    ``None`` or empty. With ``all_combinations`` the full cross product is
    returned. Otherwise the candidates are, in order: codex and openclaw with
    openai, the custom agent with every provider, and every non-custom agent
    whose command resolves, paired with openai. Candidates outside the
    selected agents/providers are dropped and duplicates are removed, keeping
    first occurrences.
    """
    agents = agent_ids or selected_agent_ids()
    providers = provider_ids or selected_model_provider_ids()
    if all_combinations:
        return [(agent, provider) for agent in agents for provider in providers]

    candidates: list[tuple[str, str]] = [("codex", "openai"), ("openclaw", "openai")]
    candidates += [("custom", provider) for provider in providers]
    for agent in agents:
        # The third element of the resolution tuple is the "found" flag.
        if agent != "custom" and _agent_command_for(agent)[2]:
            candidates.append((agent, "openai"))

    eligible = (
        (agent, provider)
        for agent, provider in candidates
        if agent in agents and provider in providers
    )
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(eligible))
424
+
425
+
426
def certify_full_e2e_pair(agent_id: str, provider_id: str, *, cache_home: Path | None = None) -> dict[str, Any]:
    """Run a full end-to-end task for one agent/provider pair and record it.

    A ``skipped_missing_key`` row is recorded when the provider's API key is
    not set, and a ``skipped_missing_command`` row when the agent command
    cannot be resolved. Otherwise a short prompt task is run inside an
    isolated app home in ``model_assisted`` mentor mode using the provider's
    default model, and the result is derived from ``run_status.json``.
    Exceptions raised while running are classified into a ``failed_*`` row;
    they are not propagated.

    Args:
        agent_id: Key into ``WORKER_AGENT_RECIPES`` (``KeyError`` if unknown).
            The ``"custom"`` agent always uses the bundled fixture here.
        provider_id: Key into ``MODEL_PROVIDER_RECIPES`` (``KeyError`` if unknown).
        cache_home: App home for recording and for the isolated run
            directory; defaults to the active settings' app home.

    Returns:
        The certification row that was recorded.
    """
    cache_home = cache_home or get_settings().app_home
    agent_recipe = WORKER_AGENT_RECIPES[agent_id]
    provider_recipe = MODEL_PROVIDER_RECIPES[provider_id]
    key_env, key_visible = model_key_status(provider_id)
    model = provider_recipe.default_model

    if not key_visible:
        row = _base_row(
            "full_e2e",
            RESULT_SKIPPED_MISSING_KEY,
            agent_id=agent_id,
            provider_id=provider_id,
            command=None,
            model=model,
            api_key_env_var=key_env,
            error_type="missing_key",
            error_summary=f"{key_env} is not visible.",
        )
        _record_full_row(row, cache_home)
        return row

    command, template, found = _agent_command_for(agent_id)
    if not found:
        # Look the hint up once and reuse it for both the check and the text.
        hint = gui_app_hint(agent_id)
        summary = f"Apprentice Agent command not found: {command}"
        if hint:
            summary += f". {hint} Install or expose the headless CLI on PATH."
        row = _base_row(
            "full_e2e",
            RESULT_SKIPPED_MISSING_COMMAND,
            agent_id=agent_id,
            provider_id=provider_id,
            command=command,
            model=model,
            api_key_env_var=key_env,
            error_type="missing_command",
            error_summary=summary,
        )
        _record_full_row(row, cache_home)
        return row

    # Fields shared by the rows produced after an actual run attempt, in
    # row order.
    attempted = {
        "agent_id": agent_id,
        "provider_id": provider_id,
        "display_name": f"{agent_recipe.display_name} + {provider_recipe.display_name}",
        "command": command,
        "model": model,
        "api_key_env_var": key_env,
    }
    try:
        with _isolated_app_home(f"e2e-{agent_id}-{provider_id}", cache_home):
            task_timeout = _smoke_timeout_seconds()
            if agent_id == "openclaw":
                # openclaw runs never get less than 240 seconds.
                task_timeout = max(task_timeout, 240)
            overrides: dict[str, Any] = {
                "worker_agent": agent_id,
                "worker_agent_command": command,
            }
            if agent_id == "custom":
                overrides["custom_worker_display_name"] = "Custom Fixture"
                overrides["custom_worker_command_template"] = template
            else:
                # Worker and reviser runners are named after the agent itself.
                overrides["worker_runner"] = agent_id
                overrides["reviser_runner"] = agent_id
            overrides.update(
                mentor_mode="model_assisted",
                model_provider=provider_id,
                model_provider_api_key_env=key_env,
                model_provider_model=model,
                max_improvement_loops=1,
                task_timeout_seconds=task_timeout,
            )
            update_settings(**overrides)
            run_root, bundle = run_prompt_task(
                "Create one sentence under artifacts/market_note.md.",
                run_id=f"cert-e2e-{agent_id}-{provider_id}",
                create_bundle=True,
            )
            result, error_type, error_summary = _result_from_run_status(run_root)
            row = _base_row(
                "full_e2e",
                result,
                **attempted,
                run_path=str(run_root),
                bundle_path=str(bundle) if bundle else None,
                error_type=error_type,
                error_summary=error_summary,
            )
    except Exception as exc:
        # Best-effort boundary: any failure while running becomes a failed row.
        result, error_type, error_summary = _classify_error(str(exc))
        row = _base_row(
            "full_e2e",
            result,
            **attempted,
            error_type=error_type,
            error_summary=error_summary,
        )
    _record_full_row(row, cache_home)
    return row
528
+
529
+
530
def run_certification_matrix(
    *,
    include_agents: bool = False,
    include_models: bool = False,
    include_full_e2e: bool = False,
    all_combinations: bool = False,
    strict: bool = False,
    agent_ids: list[str] | None = None,
    provider_ids: list[str] | None = None,
    full_e2e_pairs: list[tuple[str, str]] | None = None,
) -> dict[str, Any]:
    """Run the selected certification sections and return a report.

    When none of the ``include_*`` flags is set, all three sections run.
    The ``"custom"`` agent is certified twice: once with the bundled fixture
    and once with the user-configured command. Full end-to-end pairs come
    from ``full_e2e_pairs`` when given (and non-empty), otherwise from
    ``bounded_full_e2e_pairs``.

    Returns:
        A dict with ``agents``, ``models`` and ``full_e2e`` row lists plus a
        ``summary`` holding passed/failed/skipped counts and the ``strict``
        and ``all_combinations`` flags.
    """
    if not any((include_agents, include_models, include_full_e2e)):
        include_agents = include_models = include_full_e2e = True
    cache_home = get_settings().app_home

    agent_rows: list[dict[str, Any]] = []
    model_rows: list[dict[str, Any]] = []
    e2e_rows: list[dict[str, Any]] = []

    if include_agents:
        for agent_id in agent_ids or selected_agent_ids():
            # Custom runs fixture first, then the user-configured command.
            variants = (False, True) if agent_id == "custom" else (False,)
            for user_custom in variants:
                agent_rows.append(
                    certify_agent(agent_id, strict=strict, user_custom=user_custom, cache_home=cache_home)
                )

    if include_models:
        for provider_id in provider_ids or selected_model_provider_ids():
            model_rows.append(certify_model_provider(provider_id, strict=strict, cache_home=cache_home))

    if include_full_e2e:
        pairs = full_e2e_pairs or bounded_full_e2e_pairs(
            all_combinations=all_combinations,
            agent_ids=agent_ids,
            provider_ids=provider_ids,
        )
        for agent_id, provider_id in pairs:
            e2e_rows.append(certify_full_e2e_pair(agent_id, provider_id, cache_home=cache_home))

    all_rows = agent_rows + model_rows + e2e_rows

    def count_prefixed(prefix: str) -> int:
        # Number of rows whose result label starts with the given prefix.
        return sum(1 for row in all_rows if str(row["result"]).startswith(prefix))

    summary = {
        "passed": sum(1 for row in all_rows if row["result"] == RESULT_PASSED),
        "failed": count_prefixed("failed"),
        "skipped": count_prefixed("skipped"),
        "strict": strict,
        "all_combinations": all_combinations,
    }
    return {"agents": agent_rows, "models": model_rows, "full_e2e": e2e_rows, "summary": summary}
572
+
573
+
574
def certification_exit_code(report: dict[str, Any], *, strict: bool = False) -> int:
    """Return the process exit code for a certification report.

    Returns 1 when any row in the ``agents``, ``models`` or ``full_e2e``
    sections has a result starting with ``"failed"``, or, in ``strict`` mode,
    starting with ``"skipped"``. Returns 0 otherwise. Missing or ``None``
    sections are treated as empty.
    """
    rows = [
        row
        for section in ("agents", "models", "full_e2e")
        for row in (report.get(section) or [])
    ]
    blocking = ("failed", "skipped") if strict else ("failed",)
    hit = any(str(row.get("result")).startswith(blocking) for row in rows)
    return 1 if hit else 0