agent-apprenticeship 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +217 -0
- package/bin/agent-apprenticeship.js +131 -0
- package/package.json +30 -0
- package/pyproject.toml +23 -0
- package/src/agent_apprenticeship_trace/__init__.py +2 -0
- package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
- package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
- package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
- package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
- package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
- package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
- package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
- package/src/agent_apprenticeship_trace/certification.py +580 -0
- package/src/agent_apprenticeship_trace/cli.py +2979 -0
- package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
- package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
- package/src/agent_apprenticeship_trace/config.py +609 -0
- package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
- package/src/agent_apprenticeship_trace/env.py +46 -0
- package/src/agent_apprenticeship_trace/evaluator.py +64 -0
- package/src/agent_apprenticeship_trace/grader.py +194 -0
- package/src/agent_apprenticeship_trace/integration_status.py +193 -0
- package/src/agent_apprenticeship_trace/io.py +20 -0
- package/src/agent_apprenticeship_trace/learning.py +627 -0
- package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
- package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
- package/src/agent_apprenticeship_trace/loop.py +111 -0
- package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
- package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
- package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
- package/src/agent_apprenticeship_trace/progress.py +223 -0
- package/src/agent_apprenticeship_trace/public_run.py +1109 -0
- package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
- package/src/agent_apprenticeship_trace/recipes.py +129 -0
- package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
- package/src/agent_apprenticeship_trace/revision.py +21 -0
- package/src/agent_apprenticeship_trace/role_runners.py +7 -0
- package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
- package/src/agent_apprenticeship_trace/schemas.py +273 -0
- package/src/agent_apprenticeship_trace/session_events.py +99 -0
- package/src/agent_apprenticeship_trace/task_intake.py +112 -0
- package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
- package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
- package/src/agent_apprenticeship_trace/training_signals.py +30 -0
- package/src/agent_apprenticeship_trace/validation.py +210 -0
- package/src/agent_apprenticeship_trace/verifier.py +55 -0
|
@@ -0,0 +1,580 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import sys
|
|
6
|
+
import tempfile
|
|
7
|
+
from contextlib import contextmanager
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from .config import get_settings, init_settings, update_settings
|
|
13
|
+
from .command_discovery import resolve_command, resolve_agent_command, gui_app_hint
|
|
14
|
+
from .env import redact_secrets
|
|
15
|
+
from .integration_status import record_certification_result
|
|
16
|
+
from .io import read_json
|
|
17
|
+
from .openai_structured import run_llm_smoke
|
|
18
|
+
from .public_run import run_prompt_task
|
|
19
|
+
from .recipes import MODEL_PROVIDER_RECIPES, WORKER_AGENT_RECIPES
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Outcome labels stored in the "result" field of certification rows.
# run_certification_matrix and certification_exit_code group outcomes by the
# "failed" / "skipped" string prefix, so the prefixes below are significant.
RESULT_PASSED = "passed"
RESULT_FAILED = "failed"
# Skipped outcomes: a prerequisite (command, API key, configuration) is absent.
RESULT_SKIPPED_MISSING_COMMAND = "skipped_missing_command"
RESULT_SKIPPED_MISSING_KEY = "skipped_missing_key"
RESULT_SKIPPED_NOT_CONFIGURED = "skipped_not_configured"
# Failure categories; most are produced by _classify_error from error text.
RESULT_FAILED_OUTPUT_CONTRACT = "failed_output_contract"
RESULT_FAILED_AUTH = "failed_auth"
RESULT_FAILED_QUOTA = "failed_quota"
RESULT_FAILED_TIMEOUT = "failed_timeout"
RESULT_FAILED_PROVIDER_ERROR = "failed_provider_error"
RESULT_FAILED_INSUFFICIENT_BALANCE = "failed_insufficient_balance"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def selected_agent_ids() -> list[str]:
    """Return the keys of WORKER_AGENT_RECIPES as a new list, in iteration order."""
    return [*WORKER_AGENT_RECIPES]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def selected_model_provider_ids() -> list[str]:
    """Return the keys of MODEL_PROVIDER_RECIPES as a new list, in iteration order."""
    return [*MODEL_PROVIDER_RECIPES]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _repo_root() -> Path:
    """Return the directory three levels above this module's resolved file path."""
    ancestors = Path(__file__).resolve().parents
    # parents[0] is the containing directory, so index 2 is two levels above it.
    return ancestors[2]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def custom_fixture_command_template() -> str:
    """Build the command template that runs the bundled fixture apprentice.

    The result is the current interpreter path, the fixture script path under
    ``scripts/fixtures`` of the repo root, and literal ``{workspace}`` /
    ``{prompt_file}`` placeholders left unexpanded for the caller to fill in.
    """
    script = _repo_root().joinpath("scripts", "fixtures", "successful_custom_apprentice.py")
    placeholders = "--workspace {workspace} --prompt-file {prompt_file}"
    return f"{sys.executable} {script} {placeholders}"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _utc_now() -> str:
    """Return the current time as a timezone-aware UTC ISO-8601 string."""
    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _smoke_timeout_seconds() -> int:
    """Read the smoke-run timeout from ``AA_TASK_TIMEOUT_SECONDS``.

    Returns 240 when the variable is unset, empty, or not parseable as an int.
    """
    raw_value = os.getenv("AA_TASK_TIMEOUT_SECONDS") or "240"
    try:
        seconds = int(raw_value)
    except ValueError:
        seconds = 240
    return seconds
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _google_key_visible() -> tuple[str, bool]:
    """Report which Google API key environment variable is set, if any.

    ``GEMINI_API_KEY`` is checked before ``GOOGLE_API_KEY``. Returns the name of
    the first non-empty variable with ``True``, or ``("GEMINI_API_KEY", False)``
    when neither is set.
    """
    for env_name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
        if os.getenv(env_name):
            return env_name, True
    return "GEMINI_API_KEY", False
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def model_key_status(provider_id: str) -> tuple[str, bool]:
    """Return ``(env_var_name, is_set)`` for a model provider's API key.

    The provider is looked up in MODEL_PROVIDER_RECIPES first, so an unknown
    ``provider_id`` raises ``KeyError``. The ``"google"`` provider is delegated
    to ``_google_key_visible``; every other provider uses the recipe's
    ``api_key_env_var``.
    """
    recipe = MODEL_PROVIDER_RECIPES[provider_id]
    if provider_id == "google":
        return _google_key_visible()
    env_name = recipe.api_key_env_var
    is_set = bool(os.getenv(env_name))
    return env_name, is_set
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _agent_command_for(agent_id: str, *, user_custom: bool = False) -> tuple[str | None, str | None, bool]:
    """Work out which command would run the given apprentice agent.

    Args:
        agent_id: Key into WORKER_AGENT_RECIPES; an unknown id raises ``KeyError``.
        user_custom: Only meaningful for ``agent_id == "custom"``. When true the
            user's ``custom_worker_command_template`` setting is used; otherwise
            the bundled fixture template run by the current interpreter.

    Returns:
        ``(command, template, found)``. ``command`` is the resolved executable,
        or the unresolved name when resolution fails; it is ``None`` only when
        the user template is missing or blank. ``template`` is the custom
        command template (``None`` for non-custom agents). ``found`` tells
        whether the command could be resolved.
    """
    settings = get_settings()
    recipe = WORKER_AGENT_RECIPES[agent_id]
    if agent_id == "custom":
        if user_custom:
            template = settings.custom_worker_command_template
            # A whitespace-only template is truthy but has no executable token;
            # treat it like a missing template instead of indexing an empty list.
            tokens = template.split() if template else []
            if not tokens:
                return None, None, False
            command = tokens[0]
            resolved = resolve_command(command)  # resolve once, reuse for both outputs
            return (resolved or command), template, bool(resolved)
        command = sys.executable
        return command, custom_fixture_command_template(), bool(resolve_command(command))
    # Prefer the configured command only when the settings target this very agent.
    if settings.worker_agent == agent_id and settings.worker_agent_command:
        configured = settings.worker_agent_command
    else:
        configured = recipe.command_name
    _candidate, resolved = resolve_agent_command(agent_id, configured)
    return (resolved or configured), None, bool(resolved)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@contextmanager
def _isolated_app_home(prefix: str, cache_home: Path):
    """Run the ``with`` body against a freshly initialised, throwaway app home.

    A new directory is created under ``cache_home / "certification_runs"``
    (falling back to the system temp location if that fails with ``OSError``),
    ``AA_HOME`` is pointed at it, ``AA_DISABLE_LOCAL_ENV`` is removed, and
    ``init_settings`` is called on it. On exit both environment variables are
    restored to their previous values. The directory itself is not deleted.

    Args:
        prefix: Prefix for the generated directory name.
        cache_home: Base directory under which the run directory is created.

    Yields:
        Path of the isolated app home.
    """
    previous_home = os.environ.get("AA_HOME")
    previous_disable = os.environ.get("AA_DISABLE_LOCAL_ENV")
    try:
        parent = cache_home / "certification_runs"
        parent.mkdir(parents=True, exist_ok=True)
        path = Path(tempfile.mkdtemp(prefix=f"{prefix}-", dir=parent))
    except OSError:
        path = Path(tempfile.mkdtemp(prefix=f"aa-{prefix}-"))
    # Everything that mutates the process environment lives inside the
    # try/finally so that a failing init_settings() cannot leave AA_HOME
    # pointing at the temporary directory.
    try:
        os.environ["AA_HOME"] = str(path)
        # Certification runs should not accidentally read active app-home
        # settings unless they are explicitly copied.
        # NOTE(review): the effect of clearing AA_DISABLE_LOCAL_ENV is defined
        # elsewhere — confirm it matches the intent above.
        os.environ.pop("AA_DISABLE_LOCAL_ENV", None)
        init_settings(path, overwrite=True)
        yield path
    finally:
        if previous_home is None:
            os.environ.pop("AA_HOME", None)
        else:
            os.environ["AA_HOME"] = previous_home
        if previous_disable is None:
            os.environ.pop("AA_DISABLE_LOCAL_ENV", None)
        else:
            os.environ["AA_DISABLE_LOCAL_ENV"] = previous_disable
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _base_row(kind: str, result: str, **extra: Any) -> dict[str, Any]:
    """Create a certification row dict.

    The row starts with ``certification_kind``, ``result`` and a ``timestamp``
    from ``_utc_now``; ``extra`` keyword arguments are merged in afterwards and
    therefore win on key collisions.
    """
    row: dict[str, Any] = {
        "certification_kind": kind,
        "result": result,
        "timestamp": _utc_now(),
    }
    row.update(extra)
    return row
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _classify_error(text: str | None, *, output_contract_default: bool = False) -> tuple[str, str | None, str | None]:
    """Map free-form error text onto ``(result, error_type, summary)``.

    The text is passed through ``redact_secrets`` and stripped, then matched
    case-insensitively against keyword groups in a fixed order: auth,
    insufficient balance, quota, timeout, output contract. The first matching
    group wins.

    Args:
        text: Raw error text; ``None`` is treated as an empty string.
        output_contract_default: When no keyword group matches, classify as an
            output-contract failure instead of a generic provider error.

    Returns:
        One of the ``RESULT_FAILED_*`` labels, a short error type, and the
        redacted message (or a default sentence when the message is empty for
        the output-contract / provider-error outcomes).
    """
    message = redact_secrets(text or "").strip()
    lowered = message.lower()

    def mentions(*needles: str) -> bool:
        return any(needle in lowered for needle in needles)

    if mentions(
        "auth",
        "not logged in",
        "login",
        "unauthorized",
        "api key",
        "setup required",
        "provider not configured",
        "model not configured",
    ):
        return RESULT_FAILED_AUTH, "auth", message
    # Checked before the quota group, which also matches plain "insufficient".
    if mentions("insufficient balance"):
        return RESULT_FAILED_INSUFFICIENT_BALANCE, "provider_account_balance", message
    if mentions("quota", "credit", "rate limit", "usage limit", "billing", "insufficient"):
        return RESULT_FAILED_QUOTA, "quota", message
    if mentions("timeout", "timed out"):
        return RESULT_FAILED_TIMEOUT, "timeout", message
    contract_hit = mentions("output-contract", "did not produce required", "agent_trace.json")
    if contract_hit or output_contract_default:
        return RESULT_FAILED_OUTPUT_CONTRACT, "output_contract", message or "Apprentice Agent output-contract failure."
    return RESULT_FAILED_PROVIDER_ERROR, "provider_error", message or "Provider failed."
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _result_from_run_status(run_root: Path) -> tuple[str, str | None, str | None]:
    """Derive ``(result, error_type, summary)`` from ``run_status.json`` in a run directory.

    A missing status file yields a provider error of type ``missing_status``.
    A ``task_status`` of ``"completed"`` yields a pass. Anything else is
    classified from ``last_operational_error`` or ``latest_message`` (or a
    default sentence), with output-contract failure as the fallback category.
    """
    status_file = run_root.joinpath("run_status.json")
    if status_file.exists():
        payload = read_json(status_file)
        if payload.get("task_status") == "completed":
            return RESULT_PASSED, None, None
        detail = payload.get("last_operational_error") or payload.get("latest_message") or "Task did not complete."
        return _classify_error(str(detail), output_contract_default=True)
    return RESULT_FAILED_PROVIDER_ERROR, "missing_status", f"run_status.json was not created at {status_file}"
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _model_smoke_ok(counters: dict[str, Any]) -> bool:
    """Tell whether a model smoke run's counters indicate full success.

    Requires ``mentor_model_provider_available``, then for each of the five
    roles both ``<role>_live_call_ok`` and
    ``<role>_structured_output_validation_ok``, and finally ``secret_scan_ok``
    to be truthy.
    """
    if not counters.get("mentor_model_provider_available"):
        return False
    for role in ("intake", "rubric", "grader", "verifier", "evaluator"):
        if not counters.get(f"{role}_live_call_ok"):
            return False
        if not counters.get(f"{role}_structured_output_validation_ok"):
            return False
    return bool(counters.get("secret_scan_ok"))
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _model_error(counters: dict[str, Any]) -> tuple[str, str | None, str | None]:
    """Classify a failed model smoke run from its counters.

    The first key in sorted order that ends with ``_error_message`` and has a
    truthy value is classified via ``_classify_error``. An output-contract
    result is reported as a provider error; the error type and summary returned
    by the classifier are passed through unchanged. Without any such key a
    generic provider error is returned.
    """
    first_error_key = next(
        (name for name in sorted(counters) if name.endswith("_error_message") and counters.get(name)),
        None,
    )
    if first_error_key is None:
        return RESULT_FAILED_PROVIDER_ERROR, "provider_error", "Mentor Model Provider live smoke failed."
    result, error_type, summary = _classify_error(str(counters[first_error_key]))
    if result == RESULT_FAILED_OUTPUT_CONTRACT:
        result = RESULT_FAILED_PROVIDER_ERROR
    return result, error_type, summary
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _record_agent_row(row: dict[str, Any], cache_home: Path) -> None:
    """Forward an apprentice-agent certification row to ``record_certification_result``.

    The row's ``agent_id`` doubles as the provider id. All row fields except
    ``result``, ``error_type`` and ``error_summary`` are passed as metadata.
    """
    excluded = {"result", "error_type", "error_summary"}
    metadata = {key: value for key, value in row.items() if key not in excluded}
    agent_id = row["agent_id"]
    record_certification_result(
        provider_type="apprentice_agent",
        provider_id=agent_id,
        result=row["result"],
        certification_kind=row["certification_kind"],
        agent_id=agent_id,
        error_type=row.get("error_type"),
        error_summary=row.get("error_summary"),
        command_or_model=row.get("command"),
        app_home=cache_home,
        metadata_json=metadata,
    )
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _record_model_row(row: dict[str, Any], cache_home: Path) -> None:
    """Forward a model-provider certification row to ``record_certification_result``.

    The row's ``provider_id`` is used as both provider id and model provider id.
    All row fields except ``result``, ``error_type`` and ``error_summary`` are
    passed as metadata.
    """
    excluded = {"result", "error_type", "error_summary"}
    metadata = {key: value for key, value in row.items() if key not in excluded}
    model_provider = row["provider_id"]
    record_certification_result(
        provider_type="mentor_model_provider",
        provider_id=model_provider,
        result=row["result"],
        certification_kind=row["certification_kind"],
        model_provider_id=model_provider,
        error_type=row.get("error_type"),
        error_summary=row.get("error_summary"),
        command_or_model=row.get("model"),
        app_home=cache_home,
        metadata_json=metadata,
    )
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _record_full_row(row: dict[str, Any], cache_home: Path) -> None:
    """Forward a full end-to-end certification row to ``record_certification_result``.

    The provider id is ``"<agent_id>+<provider_id>"``. The command/model label
    falls back to the agent id and provider id when the row has no ``command``
    or ``model``. All row fields except ``result``, ``error_type`` and
    ``error_summary`` are passed as metadata.
    """
    agent_id = row["agent_id"]
    model_provider = row["provider_id"]
    agent_label = row.get("command") or agent_id
    model_label = row.get("model") or model_provider
    excluded = {"result", "error_type", "error_summary"}
    metadata = {key: value for key, value in row.items() if key not in excluded}
    record_certification_result(
        provider_type="full_e2e",
        provider_id=f"{agent_id}+{model_provider}",
        result=row["result"],
        certification_kind="full_e2e",
        agent_id=agent_id,
        model_provider_id=model_provider,
        error_type=row.get("error_type"),
        error_summary=row.get("error_summary"),
        command_or_model=f"{agent_label} + {model_label}",
        app_home=cache_home,
        metadata_json=metadata,
    )
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def certify_agent(agent_id: str, *, strict: bool = False, user_custom: bool = False, cache_home: Path | None = None) -> dict[str, Any]:
    """Run a smoke certification for one apprentice agent and record the result.

    Args:
        agent_id: Key into WORKER_AGENT_RECIPES.
        strict: Accepted for interface symmetry; not read by this function.
        user_custom: For the ``"custom"`` agent, certify the user-configured
            command template instead of the bundled fixture.
        cache_home: App home used for recording and for the isolated run
            directory; defaults to the ``app_home`` of the current settings.

    Returns:
        The certification row. It is a skipped row when the command is not
        configured or cannot be found; otherwise the outcome of a prompt task
        run inside an isolated app home. Exceptions raised during the run are
        converted into a failed row rather than propagated.
    """
    cache_home = cache_home or get_settings().app_home
    recipe = WORKER_AGENT_RECIPES[agent_id]
    if agent_id != "custom":
        mode, kind = "live", "agent_live_smoke"
    elif user_custom:
        mode, kind = "user_configured", "agent_live_smoke_user"
    else:
        mode, kind = "fixture", "agent_live_smoke_fixture"
    command, template, found = _agent_command_for(agent_id, user_custom=user_custom)
    # Fields shared by every row this function can produce (kept first, in order).
    identity = {"agent_id": agent_id, "display_name": recipe.display_name, "mode": mode}

    if command is None:
        row = _base_row(
            kind,
            RESULT_SKIPPED_NOT_CONFIGURED,
            **identity,
            command=None,
            command_found=False,
            error_type="not_configured",
            error_summary="User-configured Custom Apprentice Agent command template is not configured.",
        )
        _record_agent_row(row, cache_home)
        return row

    if not found:
        missing_summary = f"Apprentice Agent command not found: {command}"
        if gui_app_hint(agent_id):
            missing_summary += f". {gui_app_hint(agent_id)} Install or expose the headless CLI on PATH."
        row = _base_row(
            kind,
            RESULT_SKIPPED_MISSING_COMMAND,
            **identity,
            command=command,
            command_found=False,
            error_type="missing_command",
            error_summary=missing_summary,
        )
        _record_agent_row(row, cache_home)
        return row

    try:
        with _isolated_app_home(f"agent-{agent_id}", cache_home):
            if agent_id == "custom":
                agent_settings = {
                    "worker_agent": "custom",
                    "worker_agent_command": command,
                    "custom_worker_display_name": "Custom Fixture" if mode == "fixture" else "Custom",
                    "custom_worker_command_template": template,
                }
            else:
                # The runner name is the agent id itself (also for "codex").
                agent_settings = {
                    "worker_agent": agent_id,
                    "worker_agent_command": command,
                    "worker_runner": agent_id,
                    "reviser_runner": agent_id,
                }
            update_settings(
                **agent_settings,
                mentor_mode="expert_led",
                max_improvement_loops=1,
                task_timeout_seconds=_smoke_timeout_seconds(),
            )
            run_root, bundle = run_prompt_task(
                "Create a one-paragraph readiness note and save it under artifacts/readiness.md.",
                run_id=f"cert-agent-{agent_id}-{mode}",
                create_bundle=True,
            )
            # NOTE(review): indentation was lost in the extracted source; the
            # status is read here while the isolated home is still active —
            # confirm against the upstream file.
            result, error_type, error_summary = _result_from_run_status(run_root)
            row = _base_row(
                kind,
                result,
                **identity,
                command=command,
                command_found=True,
                run_path=str(run_root),
                bundle_path=str(bundle) if bundle else None,
                error_type=error_type,
                error_summary=error_summary,
            )
    except Exception as exc:
        result, error_type, error_summary = _classify_error(str(exc), output_contract_default=False)
        # An exception is never reported as an output-contract failure.
        if result == RESULT_FAILED_OUTPUT_CONTRACT:
            result = RESULT_FAILED_PROVIDER_ERROR
        row = _base_row(
            kind,
            result,
            **identity,
            command=command,
            command_found=True,
            error_type=error_type,
            error_summary=error_summary,
        )
    _record_agent_row(row, cache_home)
    return row
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def certify_model_provider(provider_id: str, *, strict: bool = False, cache_home: Path | None = None) -> dict[str, Any]:
    """Run a live smoke certification for one mentor model provider and record it.

    Args:
        provider_id: Key into MODEL_PROVIDER_RECIPES.
        strict: Accepted for interface symmetry; not read by this function.
        cache_home: App home used for recording and smoke output; defaults to
            the ``app_home`` of the current settings.

    Returns:
        The certification row: skipped when the API key variable is not set,
        passed when ``_model_smoke_ok`` accepts the counters, otherwise a failed
        row classified from the counters or from the raised exception.
    """
    cache_home = cache_home or get_settings().app_home
    recipe = MODEL_PROVIDER_RECIPES[provider_id]
    key_env, visible = model_key_status(provider_id)
    kind = "model_live_smoke"
    # Leading fields shared by every row, in the order rows carry them.
    shared = {
        "provider_id": provider_id,
        "display_name": recipe.display_name,
        "model": recipe.default_model,
        "api_key_env_var": key_env,
    }

    if not visible:
        row = _base_row(
            kind,
            RESULT_SKIPPED_MISSING_KEY,
            **shared,
            api_key_visible=False,
            error_type="missing_key",
            error_summary=f"{key_env} is not visible.",
        )
        _record_model_row(row, cache_home)
        return row

    out_dir = cache_home / "certification_model_smokes" / provider_id
    shared["api_key_visible"] = True
    shared["output_dir"] = str(out_dir)
    try:
        counters = run_llm_smoke(out_dir, provider_id=provider_id)
        if _model_smoke_ok(counters):
            row = _base_row(kind, RESULT_PASSED, **shared, counters=counters)
        else:
            result, error_type, error_summary = _model_error(counters)
            row = _base_row(
                kind,
                result,
                **shared,
                counters=counters,
                error_type=error_type,
                error_summary=error_summary,
            )
    except Exception as exc:
        result, error_type, error_summary = _classify_error(str(exc))
        row = _base_row(
            kind,
            result,
            **shared,
            error_type=error_type,
            error_summary=error_summary,
        )
    _record_model_row(row, cache_home)
    return row
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def bounded_full_e2e_pairs(
    *,
    all_combinations: bool = False,
    agent_ids: list[str] | None = None,
    provider_ids: list[str] | None = None,
) -> list[tuple[str, str]]:
    """Choose the ``(agent_id, provider_id)`` pairs for full end-to-end runs.

    Args:
        all_combinations: Return the full cross product of agents and providers.
        agent_ids: Agents to consider; an empty or missing list means all
            selected agents.
        provider_ids: Providers to consider; an empty or missing list means all
            selected providers.

    Returns:
        With ``all_combinations`` the cross product. Otherwise a bounded list:
        codex and openclaw with openai, the custom agent with every provider,
        and every other agent whose command is found with openai — restricted
        to the considered agents and providers, de-duplicated, in first-seen
        order.
    """
    agents = agent_ids or selected_agent_ids()
    providers = provider_ids or selected_model_provider_ids()
    if all_combinations:
        return [(agent, provider) for agent in agents for provider in providers]

    candidates: list[tuple[str, str]] = [("codex", "openai"), ("openclaw", "openai")]
    candidates += [("custom", provider) for provider in providers]
    # Index 2 of the command lookup is the "found" flag.
    candidates += [
        (agent, "openai")
        for agent in agents
        if agent != "custom" and _agent_command_for(agent)[2]
    ]

    # A dict keeps insertion order, giving order-preserving de-duplication.
    unique: dict[tuple[str, str], None] = {}
    for agent, provider in candidates:
        if agent in agents and provider in providers:
            unique.setdefault((agent, provider), None)
    return list(unique)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def certify_full_e2e_pair(agent_id: str, provider_id: str, *, cache_home: Path | None = None) -> dict[str, Any]:
    """Run a full end-to-end certification for one agent/provider pair and record it.

    Args:
        agent_id: Key into WORKER_AGENT_RECIPES.
        provider_id: Key into MODEL_PROVIDER_RECIPES.
        cache_home: App home used for recording and for the isolated run
            directory; defaults to the ``app_home`` of the current settings.

    Returns:
        The certification row: skipped when the provider key is not set or the
        agent command is not found; otherwise the outcome of a prompt task run
        in an isolated app home with ``mentor_mode="model_assisted"``.
        Exceptions raised during the run become a failed row. Every row,
        including skipped ones, carries ``display_name``.
    """
    cache_home = cache_home or get_settings().app_home
    agent_recipe = WORKER_AGENT_RECIPES[agent_id]
    provider_recipe = MODEL_PROVIDER_RECIPES[provider_id]
    key_env, key_visible = model_key_status(provider_id)
    display_name = f"{agent_recipe.display_name} + {provider_recipe.display_name}"
    model = provider_recipe.default_model

    if not key_visible:
        row = _base_row(
            "full_e2e",
            RESULT_SKIPPED_MISSING_KEY,
            agent_id=agent_id,
            provider_id=provider_id,
            # Added so skipped rows have the same identifying fields as run rows.
            display_name=display_name,
            command=None,
            model=model,
            api_key_env_var=key_env,
            error_type="missing_key",
            error_summary=f"{key_env} is not visible.",
        )
        _record_full_row(row, cache_home)
        return row

    command, template, found = _agent_command_for(agent_id)
    if not found:
        hint = gui_app_hint(agent_id)
        missing_summary = f"Apprentice Agent command not found: {command}"
        if hint:
            missing_summary += f". {hint} Install or expose the headless CLI on PATH."
        row = _base_row(
            "full_e2e",
            RESULT_SKIPPED_MISSING_COMMAND,
            agent_id=agent_id,
            provider_id=provider_id,
            display_name=display_name,
            command=command,
            model=model,
            api_key_env_var=key_env,
            error_type="missing_command",
            error_summary=missing_summary,
        )
        _record_full_row(row, cache_home)
        return row

    try:
        with _isolated_app_home(f"e2e-{agent_id}-{provider_id}", cache_home):
            # openclaw never runs with less than 240 seconds.
            if agent_id == "openclaw":
                task_timeout = max(_smoke_timeout_seconds(), 240)
            else:
                task_timeout = _smoke_timeout_seconds()
            if agent_id == "custom":
                agent_settings = {
                    "worker_agent": "custom",
                    "worker_agent_command": command,
                    "custom_worker_display_name": "Custom Fixture",
                    "custom_worker_command_template": template,
                }
            else:
                # The runner name is the agent id itself (also for "codex").
                agent_settings = {
                    "worker_agent": agent_id,
                    "worker_agent_command": command,
                    "worker_runner": agent_id,
                    "reviser_runner": agent_id,
                }
            update_settings(
                **agent_settings,
                mentor_mode="model_assisted",
                model_provider=provider_id,
                model_provider_api_key_env=key_env,
                model_provider_model=model,
                max_improvement_loops=1,
                task_timeout_seconds=task_timeout,
            )
            run_root, bundle = run_prompt_task(
                "Create one sentence under artifacts/market_note.md.",
                run_id=f"cert-e2e-{agent_id}-{provider_id}",
                create_bundle=True,
            )
            # NOTE(review): indentation was lost in the extracted source; the
            # status is read here while the isolated home is still active —
            # confirm against the upstream file.
            result, error_type, error_summary = _result_from_run_status(run_root)
            row = _base_row(
                "full_e2e",
                result,
                agent_id=agent_id,
                provider_id=provider_id,
                display_name=display_name,
                command=command,
                model=model,
                api_key_env_var=key_env,
                run_path=str(run_root),
                bundle_path=str(bundle) if bundle else None,
                error_type=error_type,
                error_summary=error_summary,
            )
    except Exception as exc:
        result, error_type, error_summary = _classify_error(str(exc))
        row = _base_row(
            "full_e2e",
            result,
            agent_id=agent_id,
            provider_id=provider_id,
            display_name=display_name,
            command=command,
            model=model,
            api_key_env_var=key_env,
            error_type=error_type,
            error_summary=error_summary,
        )
    _record_full_row(row, cache_home)
    return row
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
def run_certification_matrix(
    *,
    include_agents: bool = False,
    include_models: bool = False,
    include_full_e2e: bool = False,
    all_combinations: bool = False,
    strict: bool = False,
    agent_ids: list[str] | None = None,
    provider_ids: list[str] | None = None,
    full_e2e_pairs: list[tuple[str, str]] | None = None,
) -> dict[str, Any]:
    """Run the requested certification sections and return a combined report.

    When none of the ``include_*`` flags is set, all three sections run. The
    ``"custom"`` agent is certified twice: once with the fixture and once with
    the user-configured command. Explicit ``full_e2e_pairs`` take precedence
    over the pairs computed by ``bounded_full_e2e_pairs``.

    Returns:
        A dict with ``agents``, ``models`` and ``full_e2e`` row lists and a
        ``summary`` holding passed / failed / skipped counts plus the ``strict``
        and ``all_combinations`` flags.
    """
    if not any((include_agents, include_models, include_full_e2e)):
        include_agents = include_models = include_full_e2e = True
    cache_home = get_settings().app_home

    agent_rows: list[dict[str, Any]] = []
    model_rows: list[dict[str, Any]] = []
    e2e_rows: list[dict[str, Any]] = []

    if include_agents:
        for agent_id in agent_ids or selected_agent_ids():
            # Custom gets a fixture pass followed by a user-configured pass.
            custom_modes = (False, True) if agent_id == "custom" else (False,)
            for user_custom in custom_modes:
                agent_rows.append(
                    certify_agent(agent_id, strict=strict, user_custom=user_custom, cache_home=cache_home)
                )

    if include_models:
        for provider_id in provider_ids or selected_model_provider_ids():
            model_rows.append(certify_model_provider(provider_id, strict=strict, cache_home=cache_home))

    if include_full_e2e:
        pairs = full_e2e_pairs or bounded_full_e2e_pairs(
            all_combinations=all_combinations,
            agent_ids=agent_ids,
            provider_ids=provider_ids,
        )
        for agent_id, provider_id in pairs:
            e2e_rows.append(certify_full_e2e_pair(agent_id, provider_id, cache_home=cache_home))

    outcomes = [row["result"] for row in (*agent_rows, *model_rows, *e2e_rows)]
    summary = {
        "passed": sum(1 for outcome in outcomes if outcome == RESULT_PASSED),
        "failed": sum(1 for outcome in outcomes if str(outcome).startswith("failed")),
        "skipped": sum(1 for outcome in outcomes if str(outcome).startswith("skipped")),
        "strict": strict,
        "all_combinations": all_combinations,
    }
    return {"agents": agent_rows, "models": model_rows, "full_e2e": e2e_rows, "summary": summary}
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def certification_exit_code(report: dict[str, Any], *, strict: bool = False) -> int:
    """Translate a certification report into a process exit code.

    Returns 1 when any row's result starts with ``"failed"``, or when ``strict``
    is set and any row's result starts with ``"skipped"``; otherwise 0. Missing
    or empty report sections are treated as having no rows.
    """
    rows = []
    for section in ("agents", "models", "full_e2e"):
        rows.extend(report.get(section) or [])

    def has_outcome(prefix: str) -> bool:
        return any(str(row.get("result")).startswith(prefix) for row in rows)

    if has_outcome("failed"):
        return 1
    if strict and has_outcome("skipped"):
        return 1
    return 0
|