agent-apprenticeship 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +217 -0
  3. package/bin/agent-apprenticeship.js +131 -0
  4. package/package.json +30 -0
  5. package/pyproject.toml +23 -0
  6. package/src/agent_apprenticeship_trace/__init__.py +2 -0
  7. package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
  8. package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
  9. package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
  10. package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
  11. package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
  12. package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
  13. package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
  14. package/src/agent_apprenticeship_trace/certification.py +580 -0
  15. package/src/agent_apprenticeship_trace/cli.py +2979 -0
  16. package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
  17. package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
  18. package/src/agent_apprenticeship_trace/config.py +609 -0
  19. package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
  20. package/src/agent_apprenticeship_trace/env.py +46 -0
  21. package/src/agent_apprenticeship_trace/evaluator.py +64 -0
  22. package/src/agent_apprenticeship_trace/grader.py +194 -0
  23. package/src/agent_apprenticeship_trace/integration_status.py +193 -0
  24. package/src/agent_apprenticeship_trace/io.py +20 -0
  25. package/src/agent_apprenticeship_trace/learning.py +627 -0
  26. package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
  27. package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
  28. package/src/agent_apprenticeship_trace/loop.py +111 -0
  29. package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
  30. package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
  31. package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
  32. package/src/agent_apprenticeship_trace/progress.py +223 -0
  33. package/src/agent_apprenticeship_trace/public_run.py +1109 -0
  34. package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
  35. package/src/agent_apprenticeship_trace/recipes.py +129 -0
  36. package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
  37. package/src/agent_apprenticeship_trace/revision.py +21 -0
  38. package/src/agent_apprenticeship_trace/role_runners.py +7 -0
  39. package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
  40. package/src/agent_apprenticeship_trace/schemas.py +273 -0
  41. package/src/agent_apprenticeship_trace/session_events.py +99 -0
  42. package/src/agent_apprenticeship_trace/task_intake.py +112 -0
  43. package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
  44. package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
  45. package/src/agent_apprenticeship_trace/training_signals.py +30 -0
  46. package/src/agent_apprenticeship_trace/validation.py +210 -0
  47. package/src/agent_apprenticeship_trace/verifier.py +55 -0
@@ -0,0 +1,783 @@
1
+ from __future__ import annotations
2
+ import importlib, json, os, re, sys, time, urllib.error, urllib.parse, urllib.request
3
+ from pathlib import Path
4
+ from typing import Any
5
+ from pydantic import BaseModel
6
+ from .config import get_settings
7
+ from .io import write_json
8
+ from .env import redact_secrets, contains_secret
9
+ from .recipes import MODEL_PROVIDER_RECIPES
10
+ from .role_runners import RoleResult
11
+ from .llm_output_normalizer import normalize_role_output
12
+
13
class JsonExtractionError(ValueError):
    """Raised when no JSON object can be recovered from model output text."""
15
+
16
+
17
+ def _bool(v: bool) -> str:
18
+ return str(bool(v)).lower()
19
+
20
+ def _safe_error_message(exc: BaseException | str | None) -> str | None:
21
+ if exc is None:
22
+ return None
23
+ return redact_secrets(str(exc))[:1000]
24
+
25
+ def _repo_src_path() -> str:
26
+ return str(Path(__file__).resolve().parents[1])
27
+
28
def _drop_local_pydantic_if_needed() -> dict[str, Any]:
    """Stop the repo's ``src/pydantic`` shim from shadowing real pydantic.

    The repository carries a tiny local pydantic shim so deterministic tests can
    run without third-party dependencies, but the real OpenAI SDK needs real
    pydantic. Before ``openai`` is imported this removes the repo ``src``
    directory from ``sys.path`` and evicts any already-imported shim modules
    from ``sys.modules``.

    Returns:
        A dict with ``removed_paths`` (the ``sys.path`` entries taken out) and
        ``removed_modules`` (name -> module for each evicted shim module), in
        the shape ``_restore_import_state`` expects.
    """
    src_posix = Path(_repo_src_path()).resolve().as_posix()

    removed_paths = []
    # Iterate over a copy because matching entries are removed from sys.path.
    for entry in list(sys.path):
        # An empty entry stands for the current directory.
        if Path(entry or '.').resolve().as_posix() != src_posix:
            continue
        sys.path.remove(entry)
        removed_paths.append(entry)

    removed_modules = {}
    for mod_name, module in list(sys.modules.items()):
        if mod_name != 'pydantic' and not mod_name.startswith('pydantic.'):
            continue
        origin = getattr(module, '__file__', '') or ''
        # Only modules whose file lives under a src/pydantic/ directory are evicted.
        is_local_shim = '/src/pydantic/' in origin or origin.endswith('/src/pydantic/__init__.py')
        if is_local_shim:
            removed_modules[mod_name] = module
            del sys.modules[mod_name]

    return {'removed_paths': removed_paths, 'removed_modules': removed_modules}
49
+
50
+ def _restore_import_state(state: dict[str, Any]) -> None:
51
+ for p in reversed(state.get('removed_paths', [])):
52
+ if p not in sys.path:
53
+ sys.path.insert(0, p)
54
+ # Do not restore local pydantic if real pydantic was imported successfully.
55
+ if 'pydantic' not in sys.modules:
56
+ sys.modules.update(state.get('removed_modules', {}))
57
+
58
def import_openai_sdk():
    """Return the ``openai`` module, importing it on first use.

    An already-imported module is returned straight from ``sys.modules``.
    Otherwise the local pydantic shim is dropped for the duration of the import
    and the import state is restored afterwards, whether or not the import
    succeeded.

    Raises:
        Whatever ``importlib.import_module('openai')`` raises when the SDK
        cannot be imported.
    """
    if 'openai' not in sys.modules:
        saved_state = _drop_local_pydantic_if_needed()
        try:
            return importlib.import_module('openai')
        finally:
            _restore_import_state(saved_state)
    return sys.modules['openai']
66
+
67
def get_openai_status() -> dict[str, Any]:
    """Report whether the OpenAI SDK, API key and client are usable.

    Returns:
        A dict with the boolean flags ``openai_sdk_import_ok``,
        ``openai_api_key_visible``, ``openai_client_constructed_ok`` and
        ``openai_available``, plus ``error_type`` / ``error_message`` for the
        problem that was hit (both ``None`` when nothing failed).
        ``openai_available`` only requires a successful SDK import and a
        visible key; client construction is reported separately.
    """
    settings = get_settings()
    api_key = settings.openai_api_key
    status = {
        'openai_sdk_import_ok': False,
        'openai_api_key_visible': bool(api_key),
        'openai_client_constructed_ok': False,
        'openai_available': False,
        'error_type': None,
        'error_message': None,
    }

    def record_failure(exc):
        # Store the exception class name and a redacted, truncated message.
        status['error_type'] = type(exc).__name__
        status['error_message'] = _safe_error_message(exc)

    try:
        sdk = import_openai_sdk()
    except Exception as exc:
        record_failure(exc)
        return status
    status['openai_sdk_import_ok'] = True

    if not api_key:
        status['error_type'] = 'OpenAIKeyMissing'
        status['error_message'] = 'OPENAI_API_KEY is not visible.'
        return status

    # Availability means SDK import + visible key. Client construction is reported separately.
    status['openai_available'] = True
    try:
        sdk.OpenAI(api_key=api_key)
    except Exception as exc:
        record_failure(exc)
    else:
        status['openai_client_constructed_ok'] = True
    return status
97
+
98
def openai_available() -> bool:
    """Return True when ``get_openai_status`` reports ``openai_available``."""
    status = get_openai_status()
    return bool(status.get('openai_available'))
100
+
101
# Base URL passed to the OpenAI SDK client for providers that are reached
# through that SDK at a non-default endpoint, keyed by provider id.
OPENAI_COMPATIBLE_BASE_URLS = {"openrouter": "https://openrouter.ai/api/v1"}
104
+
105
def _configured_provider(provider_id: str | None = None) -> str:
    """Pick the provider id: the explicit argument, else the configured one, else ``openai``."""
    settings = get_settings()
    for candidate in (provider_id, settings.model_provider):
        if candidate:
            return candidate
    return "openai"
108
+
109
def _provider_key_env(provider_id: str, settings=None) -> str | None:
    """Return the name of the environment variable holding *provider_id*'s API key.

    A ``model_provider_api_key_env`` setting wins when *provider_id* is the
    configured provider; otherwise the provider recipe's ``api_key_env_var`` is
    used. Returns ``None`` when neither is available.
    """
    if not settings:
        settings = get_settings()
    recipe = MODEL_PROVIDER_RECIPES.get(provider_id)
    configured_env = None
    if settings.model_provider == provider_id:
        configured_env = settings.model_provider_api_key_env
    if configured_env:
        return configured_env
    if recipe:
        return recipe.api_key_env_var
    return None
115
+
116
def _provider_key(provider_id: str, settings=None) -> tuple[str | None, str | None]:
    """Resolve the API key for *provider_id*.

    Returns:
        ``(env_var_name, key_value)``. For ``openai`` the settings value is
        preferred over the environment and the variable name defaults to
        ``OPENAI_API_KEY``. For ``google``, ``GOOGLE_API_KEY`` is consulted when
        the primary variable yields nothing. Either element may be ``None``.
    """
    if not settings:
        settings = get_settings()
    env_var = _provider_key_env(provider_id, settings)

    if provider_id == "openai":
        openai_env = env_var or "OPENAI_API_KEY"
        return openai_env, settings.openai_api_key or os.getenv(openai_env)

    value = os.getenv(env_var) if env_var else None
    if provider_id == "google" and not value:
        fallback = "GOOGLE_API_KEY"
        value = os.getenv(fallback)
        # Report the fallback variable only when it actually supplied the key.
        if value:
            env_var = fallback
    return env_var, value
128
+
129
def _provider_model(provider_id: str, model_override: str | None = None) -> str:
    """Choose the model name to use with *provider_id*.

    Precedence: *model_override*; the ``model_provider_model`` setting when
    *provider_id* is the configured provider; for non-OpenAI providers the
    recipe's ``default_model``; finally the ``openai_model`` setting.
    """
    settings = get_settings()
    recipe = MODEL_PROVIDER_RECIPES.get(provider_id)
    if model_override:
        return model_override
    is_configured_provider = settings.model_provider == provider_id
    if is_configured_provider and settings.model_provider_model:
        return settings.model_provider_model
    if provider_id != "openai" and recipe and recipe.default_model:
        return recipe.default_model
    return settings.openai_model
139
+
140
+ def _provider_max_output_tokens() -> int:
141
+ try:
142
+ value = int(os.getenv("AA_MODEL_SMOKE_MAX_TOKENS") or "512")
143
+ except ValueError:
144
+ value = 2048
145
+ return max(256, min(value, 4096))
146
+
147
def get_model_provider_status(provider_id: str | None = None) -> dict[str, Any]:
    """Describe whether the selected Mentor Model Provider can be used.

    Args:
        provider_id: Provider to inspect; resolved through
            ``_configured_provider`` when omitted.

    Returns:
        A dict with the provider id, display name, model, API-key env var and
        visibility, ``adapter_available``, ``provider_available``,
        ``client_constructed_ok`` and ``error_type`` / ``error_message``.
        For ``openai`` / ``openrouter`` an SDK client is constructed as a check
        and ``sdk_import_ok`` is added once the SDK import succeeds; other
        providers are reported available as soon as a recipe and key exist.
    """
    provider_id = _configured_provider(provider_id)
    settings = get_settings()
    recipe = MODEL_PROVIDER_RECIPES.get(provider_id)
    env_var, key = _provider_key(provider_id, settings)
    display_name = recipe.display_name if recipe else provider_id
    status = {
        "provider_id": provider_id,
        "provider_display": display_name,
        "model": _provider_model(provider_id),
        "api_key_env_var": env_var,
        "api_key_visible": bool(key),
        "adapter_available": provider_id in MODEL_PROVIDER_RECIPES,
        "provider_available": False,
        "client_constructed_ok": False,
        "error_type": None,
        "error_message": None,
    }

    if not recipe:
        status["error_type"] = "UnsupportedProvider"
        status["error_message"] = f"Unsupported Mentor Model Provider: {provider_id}"
        return status
    if not key:
        status["error_type"] = "APIKeyMissing"
        status["error_message"] = f"{env_var or recipe.api_key_env_var} is not visible."
        return status

    if provider_id not in {"openai", "openrouter"}:
        # No SDK client is needed for these providers, so nothing more is checked.
        status["client_constructed_ok"] = True
        status["provider_available"] = True
        return status

    try:
        sdk = import_openai_sdk()
        status["sdk_import_ok"] = True
        client_kwargs = {"api_key": key}
        if provider_id in OPENAI_COMPATIBLE_BASE_URLS:
            client_kwargs["base_url"] = OPENAI_COMPATIBLE_BASE_URLS[provider_id]
        sdk.OpenAI(**client_kwargs)
    except Exception as exc:
        status["error_type"] = type(exc).__name__
        status["error_message"] = _safe_error_message(exc)
    else:
        status["client_constructed_ok"] = True
        status["provider_available"] = True
    return status
189
+
190
def classify_model_provider_error(exc: BaseException | str | None, status_code: int | None = None) -> tuple[str, str]:
    """Map a provider failure onto a coarse error kind.

    Args:
        exc: Exception or raw error text; its string form is redacted and
            truncated to 1000 characters before matching.
        status_code: Optional HTTP status code of the failed request.

    Returns:
        ``(kind, message)`` where *kind* is one of ``auth_error``,
        ``insufficient_balance``, ``quota_or_credit_error``, ``timeout``,
        ``bad_model``, ``network_error`` or ``provider_error`` and *message* is
        the redacted text, or a generic sentence when that text is empty.
        Rules are tried in that order; the first match wins.
    """
    text = redact_secrets(str(exc or ""))[:1000]
    lowered = text.lower()

    def mentions(*needles):
        return any(needle in lowered for needle in needles)

    if status_code in {401, 403} or mentions("unauthorized", "forbidden", "invalid api key", "authentication", "permission denied"):
        kind, generic = "auth_error", "Mentor Model Provider authentication failed."
    elif "insufficient balance" in lowered:
        # Checked before the quota rule, whose "insufficient" needle would also match.
        kind, generic = "insufficient_balance", "Mentor Model Provider account balance is insufficient."
    elif status_code in {402, 429} or mentions("quota", "credit", "billing", "rate limit", "insufficient"):
        kind, generic = "quota_or_credit_error", "Mentor Model Provider quota or credit limit reached."
    elif mentions("timeout", "timed out"):
        kind, generic = "timeout", "Mentor Model Provider request timed out."
    elif status_code == 404 or mentions("model_not_found", "model not found", "unknown model", "invalid model"):
        kind, generic = "bad_model", "Mentor Model Provider model was not found."
    elif mentions("network", "name or service not known", "temporary failure", "connection refused", "connection reset"):
        kind, generic = "network_error", "Mentor Model Provider network error."
    else:
        kind, generic = "provider_error", "Mentor Model Provider request failed."
    return kind, text or generic
206
+
207
+ def _http_json(url: str, headers: dict[str, str], payload: dict[str, Any], timeout: int = 60) -> dict[str, Any]:
208
+ req = urllib.request.Request(
209
+ url,
210
+ data=json.dumps(payload).encode("utf-8"),
211
+ headers={**headers, "Content-Type": "application/json"},
212
+ method="POST",
213
+ )
214
+ try:
215
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
216
+ return json.loads(resp.read().decode("utf-8"))
217
+ except urllib.error.HTTPError as exc:
218
+ body = exc.read().decode("utf-8", errors="replace")
219
+ kind, msg = classify_model_provider_error(body or exc, exc.code)
220
+ raise RuntimeError(f"{kind}: {msg}") from exc
221
+ except TimeoutError as exc:
222
+ kind, msg = classify_model_provider_error(exc)
223
+ raise RuntimeError(f"{kind}: {msg}") from exc
224
+
225
def _openai_compatible_response(provider_id: str, model_name: str, prompt: str) -> str:
    """Send *prompt* through the OpenAI SDK client and return the reply text.

    ``openai`` goes through ``_responses_create_with_retry``; any other
    provider handled here uses ``chat.completions.create`` with a JSON-object
    response format and the configured output-token cap. A missing message
    content is returned as an empty string.
    """
    _unused_env_var, api_key = _provider_key(provider_id, get_settings())
    sdk = import_openai_sdk()

    client_kwargs = {"api_key": api_key}
    if provider_id in OPENAI_COMPATIBLE_BASE_URLS:
        client_kwargs["base_url"] = OPENAI_COMPATIBLE_BASE_URLS[provider_id]
    if provider_id == "openrouter":
        client_kwargs["default_headers"] = {
            "HTTP-Referer": "https://github.com/Forsy-AI/agent-apprenticeship",
            "X-Title": "Agent Apprenticeship",
        }
    client = sdk.OpenAI(**client_kwargs)

    if provider_id == "openai":
        response = _responses_create_with_retry(client, model_name, prompt)
        return _response_text(response)

    completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        max_tokens=_provider_max_output_tokens(),
    )
    first_choice = completion.choices[0]
    return str(first_choice.message.content or "")
251
+
252
def _anthropic_response(model_name: str, prompt: str) -> str:
    """POST *prompt* to the Anthropic messages endpoint and return the reply text.

    The text of every dict entry in the response's ``content`` list is
    concatenated. The request timeout is the task timeout setting, capped at
    120 seconds.

    Raises:
        RuntimeError: When the response carries no text content, or as raised
            by ``_http_json`` for transport failures.
    """
    settings = get_settings()
    _unused_env_var, api_key = _provider_key("anthropic", settings)
    request_headers = {"x-api-key": api_key or "", "anthropic-version": "2023-06-01"}
    request_body = {
        "model": model_name,
        "max_tokens": _provider_max_output_tokens(),
        "messages": [{"role": "user", "content": prompt}],
    }
    data = _http_json(
        "https://api.anthropic.com/v1/messages",
        request_headers,
        request_body,
        timeout=min(120, settings.task_timeout_seconds),
    )
    pieces = []
    for part in data.get("content") or []:
        if isinstance(part, dict):
            pieces.append(str(part.get("text") or ""))
    text = "".join(pieces)
    if text:
        return text
    raise RuntimeError("provider_response_format: Anthropic response did not contain text content.")
271
+
272
def _google_response(model_name: str, prompt: str) -> str:
    """POST *prompt* to the Google ``generateContent`` endpoint and return the reply text.

    The API key travels as the ``key`` query parameter and the model name is
    percent-encoded into the path. The request timeout is the task timeout
    setting, capped at 120 seconds.

    Raises:
        RuntimeError: When the response has no first-candidate text, or as
            raised by ``_http_json`` for transport failures.
    """
    settings = get_settings()
    _unused_env_var, api_key = _provider_key("google", settings)
    query = urllib.parse.urlencode({"key": api_key or ""})
    quoted_model = urllib.parse.quote(model_name, safe='')
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{quoted_model}:generateContent?{query}"
    generation_config = {
        "responseMimeType": "application/json",
        "maxOutputTokens": _provider_max_output_tokens(),
    }
    request_body = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": generation_config,
    }
    data = _http_json(url, {}, request_body, timeout=min(120, settings.task_timeout_seconds))
    try:
        first_candidate = data["candidates"][0]
        return str(first_candidate["content"]["parts"][0]["text"])
    except Exception as exc:
        raise RuntimeError("provider_response_format: Google response did not contain candidate text.") from exc
286
+
287
+ def _provider_completion_text(provider_id: str, model_name: str, prompt: str) -> str:
288
+ if provider_id in {"openai", "openrouter"}:
289
+ return _openai_compatible_response(provider_id, model_name, prompt)
290
+ if provider_id == "anthropic":
291
+ return _anthropic_response(model_name, prompt)
292
+ if provider_id == "google":
293
+ return _google_response(model_name, prompt)
294
+ raise RuntimeError(f"Unsupported Mentor Model Provider: {provider_id}")
295
+
296
def _provider_completion_text_with_retry(provider_id: str, model_name: str, prompt: str) -> str:
    """Call ``_provider_completion_text``, retrying once after a short pause.

    A failure is re-raised immediately, without a retry, when it is classified
    as an authentication, quota/credit, insufficient-balance or bad-model
    error: repeating the identical request cannot fix those.
    ``insufficient_balance`` is included because
    ``classify_model_provider_error`` reports it as its own kind, separate
    from ``quota_or_credit_error``.

    Raises:
        Exception: The provider error from the last attempt made.
    """
    non_retryable = {"auth_error", "quota_or_credit_error", "insufficient_balance", "bad_model"}
    last: BaseException | None = None
    for attempt in range(2):
        try:
            return _provider_completion_text(provider_id, model_name, prompt)
        except Exception as exc:
            last = exc
            kind, _msg = classify_model_provider_error(exc)
            if attempt > 0 or kind in non_retryable:
                raise
            time.sleep(0.75)
    # Defensive: the loop above always returns or raises.
    raise last or RuntimeError("Mentor Model Provider request failed.")
308
+
309
+ def _json_repair_candidates(cand: str) -> list[tuple[str, str]]:
310
+ repaired=[]
311
+ try:
312
+ from json_repair import repair_json # type: ignore
313
+ repaired_text=repair_json(cand)
314
+ if repaired_text and repaired_text != cand:
315
+ repaired.append((repaired_text, 'json_repair'))
316
+ except Exception:
317
+ pass
318
+ local=re.sub(r',\s*([}\]])', r'\1', cand)
319
+ if local != cand:
320
+ repaired.append((local, 'removed_trailing_commas'))
321
+ return repaired
322
+
323
def extract_json_object(text: str, return_metadata: bool=False):
    """Recover the first parseable JSON object from model output text.

    Candidates are tried in this order: the stripped text itself, the body of
    each Markdown code fence, then every balanced ``{...}`` / ``[...]`` span
    found by a string-aware bracket scan. Each candidate is followed by its
    repaired variants from ``_json_repair_candidates``. The first candidate
    that parses to a ``dict`` wins.

    Args:
        text: Raw model output.
        return_metadata: When true, return ``(obj, meta)`` where *meta* holds
            ``repaired_json``, ``repair_reason`` and ``json_extraction_reason``.

    Raises:
        JsonExtractionError: When *text* is empty/blank or no candidate parses
            to a JSON object; the message lists up to five error type names.
    """
    if not text or not text.strip():
        raise JsonExtractionError('No text to parse as JSON.')

    def balanced_end(start):
        # Index of the bracket closing the span opened at `start`, or None when
        # the brackets are mismatched or never close. Brackets inside
        # double-quoted strings (with backslash escapes) are ignored.
        open_brackets = []
        in_string = False
        escaped = False
        for pos in range(start, len(text)):
            char = text[pos]
            if in_string:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    in_string = False
                continue
            if char == '"':
                in_string = True
            elif char in '{[':
                open_brackets.append(char)
            elif char in '}]':
                if not open_brackets:
                    return None
                opener = open_brackets.pop()
                if (opener, char) not in [('{', '}'), ('[', ']')]:
                    return None
                if not open_brackets:
                    return pos
        return None

    candidates: list[tuple[str, str]] = [(text.strip(), 'direct')]
    candidates.extend(
        (fence.group(1).strip(), 'markdown_fence')
        for fence in re.finditer(r"```(?:json)?\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)
    )
    for start, char in enumerate(text):
        if char not in '{[':
            continue
        end = balanced_end(start)
        if end is not None:
            candidates.append((text[start:end + 1], 'balanced_brace_extraction'))

    # Each candidate is followed directly by its repaired variants.
    attempts = []
    for cand, reason in candidates:
        attempts.append((cand, reason, False))
        attempts.extend(
            (fixed, f'{reason}+{repair_reason}', True)
            for fixed, repair_reason in _json_repair_candidates(cand)
        )

    errors = []
    for cand, reason, was_repaired in attempts:
        try:
            obj = json.loads(cand)
        except Exception as exc:
            errors.append(type(exc).__name__)
            continue
        if not isinstance(obj, dict):
            # Valid JSON that is not an object (list, number, ...) is rejected.
            errors.append(JsonExtractionError.__name__)
            continue
        meta = {
            'repaired_json': was_repaired,
            'repair_reason': reason if was_repaired else None,
            'json_extraction_reason': reason,
        }
        return (obj, meta) if return_metadata else obj
    raise JsonExtractionError('Could not extract a valid JSON object from model output: ' + ','.join(errors[:5]))
364
+
365
def _validate_role_output(
    role: str,
    raw: str,
    output_model: type[BaseModel],
    normalizer_context: dict[str, Any] | None,
    out_dir: Path,
) -> tuple[dict[str, Any], dict[str, Any], bool]:
    """Parse, normalize and validate one role's raw model output.

    The parsed JSON is written to ``raw_parsed_output.json``. It is validated
    once as-is (a failure is only recorded), then normalized with
    ``normalize_role_output`` and validated again; that second validation may
    raise. When normalization changed anything or the first validation failed,
    ``normalization_report.json`` is written as well.

    Returns:
        ``(normalized, json_meta, normalized_used)``.

    Raises:
        Whatever ``extract_json_object`` or the final
        ``output_model.model_validate`` raises.
    """
    parsed, json_meta = extract_json_object(raw, return_metadata=True)
    write_json(out_dir / 'raw_parsed_output.json', parsed)

    try:
        output_model.model_validate(parsed)
    except Exception as first_exc:
        initial_validation_error = _safe_error_message(first_exc)
    else:
        initial_validation_error = None

    normalized = normalize_role_output(role, parsed, normalizer_context)
    normalized_used = normalized != parsed or initial_validation_error is not None
    # This validation is the one that counts: failures propagate to the caller.
    output_model.model_validate(normalized)

    if normalized_used:
        normalized_keys = list(normalized.keys()) if isinstance(normalized, dict) else []
        report = {
            'role': role,
            'normalization_applied': True,
            'initial_validation_error': initial_validation_error,
            'raw_keys': list(parsed.keys()),
            'normalized_keys': normalized_keys,
            'extras_preserved_in_metadata': True,
        }
        write_json(out_dir / 'normalization_report.json', report)
    return normalized, json_meta, normalized_used
385
+
386
+ def _strict_json_retry_prompt(role: str, original_prompt: str, error_message: str | None) -> str:
387
+ return (
388
+ "Your previous response could not be parsed or validated. "
389
+ "Return ONLY one compact valid JSON object, with double-quoted JSON keys and no markdown/prose. "
390
+ f"Role: {role}. Error to fix: {error_message or 'invalid JSON'}.\n\n"
391
+ f"{original_prompt}"
392
+ )
393
+
394
+ def _transient_provider_error(exc: BaseException) -> bool:
395
+ name=type(exc).__name__.lower(); msg=str(exc).lower()
396
+ needles=['timeout','rate','temporar','connection reset','disconnect','websocket','service unavailable','gateway','overloaded']
397
+ return any(n in name or n in msg for n in needles)
398
+
399
+ def _responses_create_with_retry(client: Any, model_name: str, prompt: str, attempts: int = 3):
400
+ last=None
401
+ for i in range(attempts):
402
+ try:
403
+ return client.responses.create(model=model_name, input=prompt)
404
+ except Exception as exc:
405
+ last=exc
406
+ if i >= attempts-1 or not _transient_provider_error(exc):
407
+ raise
408
+ time.sleep(min(4.0, 0.5 * (2 ** i)))
409
+ raise last # type: ignore[misc]
410
+
411
+ def _response_text(response: Any) -> str:
412
+ text = getattr(response, 'output_text', None)
413
+ if text is not None:
414
+ return str(text)
415
+ if isinstance(response, dict):
416
+ return str(response.get('output_text') or response.get('text') or json.dumps(response))
417
+ try:
418
+ return response.model_dump_json()
419
+ except Exception:
420
+ return str(response)
421
+
422
def run_structured_role(role: str, prompt: str, output_model: type[BaseModel], out_dir: Path, allow_fallback=False, require_validation=False, model_override: str | None=None, normalizer_context: dict[str, Any] | None=None, provider_override: str | None=None) -> RoleResult:
    """Run one role prompt against the Mentor Model Provider and validate the JSON reply.

    Files written under *out_dir*: ``prompt.md`` (redacted prompt),
    ``raw_output.txt``, optionally ``raw_output.retry.txt``,
    ``parsed_output.json`` and ``role_result.json``; ``_validate_role_output``
    may add ``raw_parsed_output.json`` and ``normalization_report.json``.

    Flow:
      1. Resolve provider and model and collect provider status metadata.
      2. If the provider is unavailable, record a failed result and either
         raise (``allow_fallback`` false) or return it.
      3. Otherwise call the provider, then parse/validate the reply. On a
         parse or validation failure the provider is asked once more with a
         strict-JSON retry prompt.

    Args:
        role: Role name, stored in the result and passed to the normalizer.
        prompt: Prompt text sent to the provider.
        output_model: Model class whose ``model_validate`` checks the output.
        out_dir: Directory for the files listed above (created if missing).
        allow_fallback: When true, failures are returned as a result instead
            of raised (see the individual branches below).
        require_validation: When true, a reply that still fails validation
            after the retry raises instead of being returned.
        model_override: Model name that takes precedence over configuration.
        normalizer_context: Passed through to ``_validate_role_output``.
        provider_override: Provider id that takes precedence over configuration.

    Returns:
        The ``RoleResult`` describing the call; it is also written to
        ``role_result.json``.

    Raises:
        RuntimeError: When the provider is unavailable and ``allow_fallback``
            is false.
        Exception: Re-raised from the outer handler when ``require_validation``
            is true and ``allow_fallback`` is false.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir/'prompt.md').write_text(redact_secrets(prompt))
    # NOTE(review): `settings` is not referenced again in this function.
    settings=get_settings()
    provider_id=_configured_provider(provider_override)
    model_name=_provider_model(provider_id, model_override)
    start=time.time()
    status=get_model_provider_status(provider_id)
    # Provider status flags copied into every result's metadata_json.
    meta={
        'mentor_model_provider': provider_id,
        'mentor_model_provider_display': status.get('provider_display'),
        'mentor_model_provider_api_key_visible': bool(status.get('api_key_visible')),
        'mentor_model_provider_adapter_available': bool(status.get('adapter_available')),
        'mentor_model_provider_client_constructed_ok': bool(status.get('client_constructed_ok')),
        'mentor_model_provider_available': bool(status.get('provider_available')),
    }
    if provider_id == 'openai':
        # For OpenAI the openai_* status flags are reported as well.
        openai_status=get_openai_status()
        meta.update({k: openai_status.get(k) for k in ['openai_sdk_import_ok','openai_api_key_visible','openai_client_constructed_ok','openai_available']})
    if not status['provider_available']:
        # No live call is made: persist the failure, then raise or return it.
        result=RoleResult(role=role, provider=provider_id, model=model_name, live_call_ok=False, structured_output_validation_ok=False, prompt_ref='prompt.md', output_ref='raw_output.txt', parsed_output_ref='parsed_output.json', error_type=status.get('error_type') or 'MentorModelProviderUnavailable', error_message=status.get('error_message') or 'Mentor Model Provider adapter or API key unavailable; no live call made.', duration_seconds=time.time()-start, metadata_json={**meta, 'allow_fallback': allow_fallback})
        (out_dir/'raw_output.txt').write_text(result.error_message or '')
        write_json(out_dir/'parsed_output.json', {'error': result.error_message})
        write_json(out_dir/'role_result.json', result)
        if not allow_fallback:
            raise RuntimeError(result.error_message)
        return result
    try:
        raw=_provider_completion_text_with_retry(provider_id, model_name, prompt)
        (out_dir/'raw_output.txt').write_text(redact_secrets(raw))
        try:
            # First attempt: validate the reply to the original prompt.
            normalized,json_meta,normalized_used=_validate_role_output(role, raw, output_model, normalizer_context, out_dir)
            write_json(out_dir/'parsed_output.json', normalized)
            result=RoleResult(role=role, provider=provider_id, model=model_name, live_call_ok=True, structured_output_validation_ok=True, prompt_ref='prompt.md', output_ref='raw_output.txt', parsed_output_ref='parsed_output.json', error_type=None, error_message=None, duration_seconds=time.time()-start, metadata_json={**meta, **json_meta, 'mentor_model_provider_live_call_ok': True, f'{provider_id}_live_call_ok': True, 'normalization_applied': normalized_used})
        except Exception as exc:
            # Parse/validation failed: ask once more with the strict-JSON prompt.
            first_error=_safe_error_message(exc)
            retry_raw=None
            try:
                retry_prompt=_strict_json_retry_prompt(role, prompt, first_error)
                retry_raw=_provider_completion_text_with_retry(provider_id, model_name, retry_prompt)
                (out_dir/'raw_output.retry.txt').write_text(redact_secrets(retry_raw))
                normalized,json_meta,normalized_used=_validate_role_output(role, retry_raw, output_model, normalizer_context, out_dir)
                write_json(out_dir/'parsed_output.json', normalized)
                result=RoleResult(role=role, provider=provider_id, model=model_name, live_call_ok=True, structured_output_validation_ok=True, prompt_ref='prompt.md', output_ref='raw_output.retry.txt', parsed_output_ref='parsed_output.json', error_type=None, error_message=None, duration_seconds=time.time()-start, metadata_json={**meta, **json_meta, 'mentor_model_provider_live_call_ok': True, f'{provider_id}_live_call_ok': True, 'normalization_applied': normalized_used, 'structured_retry_used': True, 'initial_parse_or_validation_error': first_error})
            except Exception as retry_exc:
                # The retry failed too (provider call or validation): record both
                # errors and point at the raw output files that were kept.
                write_json(out_dir/'parsed_output.json', {'parse_or_validation_error': _safe_error_message(retry_exc), 'initial_parse_or_validation_error': first_error, 'raw_output_preserved_ref': 'raw_output.txt', 'retry_output_preserved_ref': 'raw_output.retry.txt' if retry_raw is not None else None, 'raw_parsed_output_ref': 'raw_parsed_output.json' if (out_dir/'raw_parsed_output.json').exists() else None})
                result=RoleResult(role=role, provider=provider_id, model=model_name, live_call_ok=True, structured_output_validation_ok=False, prompt_ref='prompt.md', output_ref='raw_output.txt', parsed_output_ref='parsed_output.json', error_type=type(retry_exc).__name__, error_message=_safe_error_message(retry_exc), duration_seconds=time.time()-start, metadata_json={**meta, 'mentor_model_provider_live_call_ok': True, f'{provider_id}_live_call_ok': True, 'fallback_used': allow_fallback, 'structured_retry_used': retry_raw is not None, 'initial_parse_or_validation_error': first_error})
                if require_validation:
                    write_json(out_dir/'role_result.json', result)
                    # NOTE(review): with the nesting reconstructed from the flattened
                    # source, this raise sits inside the outer `try`, so the handler
                    # below catches it, rewrites raw_output.txt and parsed_output.json
                    # and records live_call_ok=False. Confirm that is intended.
                    raise RuntimeError(result.error_message)
    except Exception as exc:
        # Any failure from the first provider call, plus the RuntimeError raised above.
        err_type, err_msg = classify_model_provider_error(exc)
        (out_dir/'raw_output.txt').write_text(_safe_error_message(exc) or '')
        write_json(out_dir/'parsed_output.json', {'error': err_msg})
        result=RoleResult(role=role, provider=provider_id, model=model_name, live_call_ok=False, structured_output_validation_ok=False, prompt_ref='prompt.md', output_ref='raw_output.txt', parsed_output_ref='parsed_output.json', error_type=err_type, error_message=err_msg, duration_seconds=time.time()-start, metadata_json={**meta, 'mentor_model_provider_live_call_ok': False, f'{provider_id}_live_call_ok': False, 'fallback_used': allow_fallback})
        if require_validation and not allow_fallback:
            write_json(out_dir/'role_result.json', result)
            raise
    write_json(out_dir/'role_result.json', result)
    return result
482
+
483
+ def _json_smoke_prompt(label: str, skeleton: dict[str, Any]) -> str:
484
+ return (
485
+ f"Return ONLY this compact valid JSON object for {label}. "
486
+ "Keep every key and JSON type. Use double quotes. No markdown, no prose.\n"
487
+ + json.dumps(skeleton, separators=(",", ":"))
488
+ )
489
+
490
+ # Compact schema prompts for llm-smoke. They use literal JSON skeletons so
491
+ # providers do not need to infer local Pydantic class definitions.
492
+ def smoke_prompts() -> dict[str, tuple[str, type[BaseModel]]]:
493
+ from .schemas import TaskIntakeSpec, TaskIntakeQualityReport, RubricSpec, RubricQualityReport, GraderResult, VerifierResult, EvaluatorFeedback, RevisionPlan
494
+
495
+ class _IntakeSmokeOutput(BaseModel):
496
+ task_intake_spec: TaskIntakeSpec
497
+ task_intake_quality_report: TaskIntakeQualityReport
498
+
499
+ class _RubricSmokeOutput(BaseModel):
500
+ rubric_spec: RubricSpec
501
+ rubric_quality_report: RubricQualityReport
502
+
503
+ class _EvaluatorSmokeOutput(BaseModel):
504
+ evaluator_feedback: EvaluatorFeedback
505
+ revision_plan: RevisionPlan
506
+
507
+ rubric_item = {
508
+ "rubric_item_id": "ri_1",
509
+ "criterion_name": "artifact",
510
+ "criterion_description": "hello.txt exists and contains hello",
511
+ "weight": 1.0,
512
+ "score_min": 0.0,
513
+ "score_max": 1.0,
514
+ "pass_threshold": 0.7,
515
+ "observable_evidence": ["artifacts/hello.txt"],
516
+ "required_artifacts": ["hello.txt"],
517
+ "scoring_method": "deterministic",
518
+ "worker_visible": True,
519
+ "verifier_only": False,
520
+ "hidden_reference_required": False,
521
+ "failure_modes": [],
522
+ "partial_credit_rules": [],
523
+ "edge_cases": [],
524
+ "anti_cheat_notes": [],
525
+ "metadata_json": {},
526
+ }
527
+ rubric_score = {
528
+ "rubric_item_id": "ri_1",
529
+ "criterion_name": "artifact",
530
+ "score": 1.0,
531
+ "max_score": 1.0,
532
+ "passed": True,
533
+ "evidence_refs": ["artifacts/hello.txt"],
534
+ "failure_mode": None,
535
+ "notes": "present",
536
+ "confidence": 0.9,
537
+ "artifact_presence_ok": True,
538
+ "semantic_correctness_score": 1.0,
539
+ "reasoning_summary": "The artifact is present.",
540
+ "improvement_suggestion": None,
541
+ }
542
+ return {
543
+ 'intake_agent': (_json_smoke_prompt("task intake smoke", {
544
+ "task_intake_spec": {
545
+ "task_id": "smoke_task",
546
+ "normalized_title": "Hello artifact smoke",
547
+ "normalized_instruction": "Create hello.txt containing hello.",
548
+ "domain": "software",
549
+ "subdomain": "certification",
550
+ "professional_role": "QA engineer",
551
+ "apprenticeship_role": "QA engineer",
552
+ "task_family": "file_creation",
553
+ "expected_economic_value": "$50-$100",
554
+ "expected_economic_value_for_agent_apprentice": "$5-$15",
555
+ "workflow_type": "file_creation",
556
+ "skill_targets": ["artifact_creation"],
557
+ "difficulty_tier": "easy",
558
+ "expected_human_deliverable": "hello.txt",
559
+ "expected_agent_deliverable": "artifacts/hello.txt",
560
+ "input_requirements": [],
561
+ "output_requirements": ["hello.txt contains hello"],
562
+ "required_context": [],
563
+ "assumptions": [],
564
+ "constraints": [],
565
+ "allowed_tools": ["filesystem"],
566
+ "disallowed_tools": [],
567
+ "privacy_classification": "synthetic",
568
+ "license": None,
569
+ "allowed_use": "open_research",
570
+ "rubricability_score": 0.9,
571
+ "verifiability_score": 0.9,
572
+ "artifactability_score": 1.0,
573
+ "needs_expert_review": False,
574
+ "metadata_json": {},
575
+ },
576
+ "task_intake_quality_report": {
577
+ "task_id": "smoke_task",
578
+ "instruction_clarity_score": 0.9,
579
+ "input_completeness_score": 0.9,
580
+ "output_contract_score": 1.0,
581
+ "rubricability_score": 0.9,
582
+ "verifiability_score": 0.9,
583
+ "artifactability_score": 1.0,
584
+ "privacy_risk_score": 0.0,
585
+ "license_risk_score": 0.0,
586
+ "ambiguity_score": 0.1,
587
+ "overall_intake_quality_score": 0.9,
588
+ "quality_flags": [],
589
+ "blockers": [],
590
+ "recommended_fix": None,
591
+ "metadata_json": {},
592
+ },
593
+ }), _IntakeSmokeOutput),
594
+ 'rubric_agent': (_json_smoke_prompt("rubric smoke", {
595
+ "rubric_spec": {
596
+ "rubric_id": "smoke_rubric",
597
+ "task_id": "smoke_task",
598
+ "task_family_id": None,
599
+ "rubric_version": "v1",
600
+ "rubric_items": [rubric_item],
601
+ "total_weight": 1.0,
602
+ "pass_threshold": 0.7,
603
+ "worker_visible_rubric_ref": "rubric/worker_visible_rubric.md",
604
+ "verifier_private_rubric_ref": "rubric/rubric.json",
605
+ "hidden_reference_policy": "none",
606
+ "scoring_aggregation": "weighted_sum",
607
+ "required_artifacts": ["hello.txt"],
608
+ "disqualifying_errors": [],
609
+ "partial_credit_allowed": True,
610
+ "grader_kind": "deterministic",
611
+ "rubric_generation_source": "deterministic_seed",
612
+ "rubric_generation_agent_provider": None,
613
+ "rubric_generation_agent_model": None,
614
+ "rubric_generation_confidence": 0.9,
615
+ "metadata_json": {},
616
+ },
617
+ "rubric_quality_report": {
618
+ "rubric_id": "smoke_rubric",
619
+ "task_id": "smoke_task",
620
+ "criteria_count": 1,
621
+ "total_weight": 1.0,
622
+ "weights_sum_valid": True,
623
+ "has_observable_evidence": True,
624
+ "has_required_artifacts": True,
625
+ "has_partial_credit_rules": False,
626
+ "has_disqualifying_errors": False,
627
+ "has_hidden_reference_policy": True,
628
+ "has_worker_visible_view": True,
629
+ "has_verifier_private_view": True,
630
+ "ambiguous_criteria_count": 0,
631
+ "unverifiable_criteria_count": 0,
632
+ "rubric_quality_score": 0.9,
633
+ "quality_flags": [],
634
+ "blockers": [],
635
+ "metadata_json": {},
636
+ },
637
+ }), _RubricSmokeOutput),
638
+ 'grader_agent': (_json_smoke_prompt("grader smoke", {
639
+ "grader_result_id": "smoke_grader",
640
+ "task_id": "smoke_task",
641
+ "attempt_id": "smoke_attempt",
642
+ "attempt_kind": "baseline",
643
+ "rubric_id": "smoke_rubric",
644
+ "grader_kind": "model",
645
+ "score_source": "model_judged",
646
+ "score": 1.0,
647
+ "max_score": 1.0,
648
+ "passed": True,
649
+ "rubric_item_scores": [rubric_score],
650
+ "failed_criteria": [],
651
+ "passed_criteria": ["artifact"],
652
+ "evidence_refs": ["artifacts/hello.txt"],
653
+ "confidence": 0.9,
654
+ "reasoning_summary": "The expected artifact is present.",
655
+ "limitations": [],
656
+ "hidden_reference_used": False,
657
+ "hidden_reference_leaked": False,
658
+ "artifact_contract_score": 1.0,
659
+ "semantic_score": 1.0,
660
+ "model_score": 1.0,
661
+ "legacy_semantic_score": None,
662
+ "legacy_score_source": None,
663
+ "final_score": 1.0,
664
+ "model": None,
665
+ "provider": None,
666
+ "deterministic_precheck_ref": None,
667
+ "llm_prompt_ref_internal": None,
668
+ "llm_response_ref_internal": None,
669
+ "public_prompt_hash": None,
670
+ "public_response_summary": "passed",
671
+ "score_reliability": "verified",
672
+ "verifier_status": "verified",
673
+ "verifier_confidence": 0.9,
674
+ "verifier_issue_count": 0,
675
+ "verifier_issues_summary": None,
676
+ "metadata_json": {},
677
+ }), GraderResult),
678
+ 'verifier_agent': (_json_smoke_prompt("verifier smoke", {
679
+ "verifier_result_id": "smoke_verifier",
680
+ "task_id": "smoke_task",
681
+ "attempt_id": "smoke_attempt",
682
+ "attempt_kind": "baseline",
683
+ "grader_result_id": "smoke_grader",
684
+ "verification_status": "verified",
685
+ "artifact_contract_ok": True,
686
+ "evidence_grounding_ok": True,
687
+ "score_consistency_ok": True,
688
+ "hidden_reference_leaked": False,
689
+ "issues": [],
690
+ "confidence": 0.9,
691
+ "verifier_notes": "verified",
692
+ "semantic_evidence_grounding_ok": True,
693
+ "unsupported_claims": [],
694
+ "leakage_check_ok": True,
695
+ "model": None,
696
+ "provider": None,
697
+ "metadata_json": {},
698
+ }), VerifierResult),
699
+ 'evaluator_agent': (_json_smoke_prompt("evaluator smoke", {
700
+ "evaluator_feedback": {
701
+ "feedback_id": "smoke_feedback",
702
+ "task_id": "smoke_task",
703
+ "attempt_id": "smoke_attempt",
704
+ "target_actor": "worker",
705
+ "feedback_type": "other",
706
+ "failed_rubric_items": [],
707
+ "evidence_refs": ["artifacts/hello.txt"],
708
+ "artifact_refs": ["artifacts/hello.txt"],
709
+ "feedback_summary": "The attempt passes.",
710
+ "actionable_feedback": [],
711
+ "suggested_revision": "No revision needed.",
712
+ "revision_priority": "low",
713
+ "confidence": 0.9,
714
+ "hidden_reference_used": False,
715
+ "hidden_reference_leaked": False,
716
+ "failed_or_weak_rubric_items": [],
717
+ "artifact_specific_comments": [],
718
+ "trace_specific_comments": [],
719
+ "revision_plan": None,
720
+ "model": None,
721
+ "provider": None,
722
+ "metadata_json": {},
723
+ },
724
+ "revision_plan": {
725
+ "revision_plan_id": "smoke_revision",
726
+ "task_id": "smoke_task",
727
+ "source_attempt_id": "smoke_attempt",
728
+ "target_attempt_id": "smoke_attempt_revised",
729
+ "revision_kind": "local_fix",
730
+ "revision_reason": "No revision needed for smoke.",
731
+ "failed_rubric_items": [],
732
+ "planned_changes": [],
733
+ "expected_score_improvement": 0.0,
734
+ "risk_of_regression": "low",
735
+ "uses_evaluator_feedback": True,
736
+ "metadata_json": {},
737
+ },
738
+ }), _EvaluatorSmokeOutput),
739
+ }
740
+
741
def run_llm_smoke(out_dir: Path, provider_id: str | None = None) -> dict[str, Any]:
    """Run one structured smoke call per mentor role and collect flat counters.

    The provider id is first resolved through ``_configured_provider``. The
    returned dict holds, in insertion order:

    * the provider id plus the provider status flags coerced to ``bool``;
    * for the ``"openai"`` provider only, the OpenAI status flags;
    * per role (role name with ``_agent`` removed): ``*_live_call_ok`` and
      ``*_structured_output_validation_ok``, plus ``*_error_type`` and
      ``*_error_message`` when the role result reports an error type;
    * ``secret_scan_ok``, which is ``False`` if ``contains_secret`` matched
      the text of any file found under a role's output directory.

    Any exception raised by ``run_structured_role`` is converted into a
    failed ``RoleResult`` so the remaining roles still run.
    """
    provider_id = _configured_provider(provider_id)
    provider_status = get_model_provider_status(provider_id)

    counters = {
        'mentor_model_provider': provider_id,
        'mentor_model_provider_api_key_visible': bool(provider_status['api_key_visible']),
        'mentor_model_provider_client_constructed_ok': bool(provider_status['client_constructed_ok']),
        'mentor_model_provider_available': bool(provider_status['provider_available']),
    }

    if provider_id == "openai":
        openai_status = get_openai_status()
        # Counter names are identical to the status keys they mirror.
        for flag in (
            'openai_sdk_import_ok',
            'openai_api_key_visible',
            'openai_client_constructed_ok',
            'openai_available',
        ):
            counters[flag] = bool(openai_status[flag])

    secrets_clean = True
    for role, (prompt, schema) in smoke_prompts().items():
        prefix = role.replace('_agent', '')
        try:
            result = run_structured_role(
                role,
                prompt,
                schema,
                out_dir / role,
                allow_fallback=True,
                provider_override=provider_id,
            )
        except Exception as exc:
            # A failing role must not abort the smoke run: record it as a
            # failed result and continue with the next role.
            result = RoleResult(
                role=role,
                provider=provider_id,
                model=_provider_model(provider_id),
                live_call_ok=False,
                structured_output_validation_ok=False,
                prompt_ref='',
                output_ref='',
                parsed_output_ref='',
                error_type=type(exc).__name__,
                error_message=_safe_error_message(exc),
                metadata_json={},
            )

        counters[f'{prefix}_live_call_ok'] = bool(result.live_call_ok)
        counters[f'{prefix}_structured_output_validation_ok'] = bool(result.structured_output_validation_ok)
        if result.error_type:
            counters[f'{prefix}_error_type'] = result.error_type
            counters[f'{prefix}_error_message'] = result.error_message

        # ensure generated role files do not contain secrets
        role_dir = out_dir / role
        if role_dir.exists():
            for candidate in role_dir.rglob('*'):
                if not candidate.is_file():
                    continue
                if contains_secret(candidate.read_text(errors='ignore')):
                    secrets_clean = False

    counters['secret_scan_ok'] = secrets_clean
    return counters
777
+
778
def format_smoke_counters(counters: dict[str, Any]) -> str:
    """Render *counters* as newline-separated ``key=value`` lines.

    Boolean values are passed through ``_bool`` before rendering; every
    other value is interpolated unchanged. Lines follow the dict's iteration
    order, and an empty dict yields an empty string.
    """
    rendered = (
        f'{name}={_bool(value) if isinstance(value, bool) else value}'
        for name, value in counters.items()
    )
    return '\n'.join(rendered)