agent-apprenticeship 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +217 -0
- package/bin/agent-apprenticeship.js +131 -0
- package/package.json +30 -0
- package/pyproject.toml +23 -0
- package/src/agent_apprenticeship_trace/__init__.py +2 -0
- package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
- package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
- package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
- package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
- package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
- package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
- package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
- package/src/agent_apprenticeship_trace/certification.py +580 -0
- package/src/agent_apprenticeship_trace/cli.py +2979 -0
- package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
- package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
- package/src/agent_apprenticeship_trace/config.py +609 -0
- package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
- package/src/agent_apprenticeship_trace/env.py +46 -0
- package/src/agent_apprenticeship_trace/evaluator.py +64 -0
- package/src/agent_apprenticeship_trace/grader.py +194 -0
- package/src/agent_apprenticeship_trace/integration_status.py +193 -0
- package/src/agent_apprenticeship_trace/io.py +20 -0
- package/src/agent_apprenticeship_trace/learning.py +627 -0
- package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
- package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
- package/src/agent_apprenticeship_trace/loop.py +111 -0
- package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
- package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
- package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
- package/src/agent_apprenticeship_trace/progress.py +223 -0
- package/src/agent_apprenticeship_trace/public_run.py +1109 -0
- package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
- package/src/agent_apprenticeship_trace/recipes.py +129 -0
- package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
- package/src/agent_apprenticeship_trace/revision.py +21 -0
- package/src/agent_apprenticeship_trace/role_runners.py +7 -0
- package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
- package/src/agent_apprenticeship_trace/schemas.py +273 -0
- package/src/agent_apprenticeship_trace/session_events.py +99 -0
- package/src/agent_apprenticeship_trace/task_intake.py +112 -0
- package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
- package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
- package/src/agent_apprenticeship_trace/training_signals.py +30 -0
- package/src/agent_apprenticeship_trace/validation.py +210 -0
- package/src/agent_apprenticeship_trace/verifier.py +55 -0
|
@@ -0,0 +1,783 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import importlib, json, os, re, sys, time, urllib.error, urllib.parse, urllib.request
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from .config import get_settings
|
|
7
|
+
from .io import write_json
|
|
8
|
+
from .env import redact_secrets, contains_secret
|
|
9
|
+
from .recipes import MODEL_PROVIDER_RECIPES
|
|
10
|
+
from .role_runners import RoleResult
|
|
11
|
+
from .llm_output_normalizer import normalize_role_output
|
|
12
|
+
|
|
13
|
+
class JsonExtractionError(ValueError):
    """Signal that no JSON object could be recovered from a piece of text."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _bool(v: bool) -> str:
    """Render the truthiness of ``v`` as the lowercase text ``true`` or ``false``."""
    return "true" if v else "false"
|
|
19
|
+
|
|
20
|
+
def _safe_error_message(exc: BaseException | str | None) -> str | None:
    """Return a redacted, length-capped description of ``exc``.

    ``None`` is passed through untouched. Any other value is stringified, run
    through ``redact_secrets`` and cut to its first 1000 characters.
    """
    if exc is not None:
        redacted = redact_secrets(str(exc))
        return redacted[:1000]
    return None
|
|
24
|
+
|
|
25
|
+
def _repo_src_path() -> str:
    """Return, as a string, the parent of the directory that holds this module."""
    module_file = Path(__file__).resolve()
    # parents[0] is the package directory; parents[1] is the directory above it.
    containing_root = module_file.parents[1]
    return str(containing_root)
|
|
27
|
+
|
|
28
|
+
def _drop_local_pydantic_if_needed() -> dict[str, Any]:
    """Avoid src/pydantic shim shadowing real pydantic for the OpenAI SDK.

    Deterministic tests in this kata run without third-party deps, so the repo has a tiny
    local pydantic shim. Real OpenAI SDK imports require real pydantic. When available,
    temporarily remove repo src from import lookup and evict the shim before importing
    `openai`.

    Returns:
        A state dict with two keys: ``removed_paths`` (the ``sys.path`` entries
        that were removed, in their original order) and ``removed_modules``
        (the evicted ``pydantic`` / ``pydantic.*`` modules keyed by module
        name). Pass it to ``_restore_import_state`` to undo the change.
    """
    # Resolve the repo src directory once instead of on every sys.path entry.
    src = Path(_repo_src_path()).resolve().as_posix()
    removed_paths = []
    for entry in list(sys.path):
        # An empty sys.path entry means the current working directory.
        if Path(entry or '.').resolve().as_posix() == src:
            sys.path.remove(entry)
            removed_paths.append(entry)
    removed_modules = {}
    for name, mod in list(sys.modules.items()):
        if name != 'pydantic' and not name.startswith('pydantic.'):
            continue
        # Normalise separators so the shim is also recognised when __file__
        # uses backslashes (Windows); the marker below is written with '/'.
        origin = (getattr(mod, '__file__', '') or '').replace('\\', '/')
        if '/src/pydantic/' in origin:
            removed_modules[name] = mod
            del sys.modules[name]
    return {'removed_paths': removed_paths, 'removed_modules': removed_modules}
|
|
49
|
+
|
|
50
|
+
def _restore_import_state(state: dict[str, Any]) -> None:
    """Put back what ``_drop_local_pydantic_if_needed`` removed.

    Every path listed under ``removed_paths`` that is currently missing from
    ``sys.path`` is re-inserted at the front, keeping the relative order of the
    removed entries. The modules under ``removed_modules`` are re-registered
    only when no ``pydantic`` module is loaded at this point.
    """
    removed_paths = state.get('removed_paths', [])
    # Walk backwards so repeated insert(0, ...) ends up in the original order.
    for entry in removed_paths[::-1]:
        if entry in sys.path:
            continue
        sys.path.insert(0, entry)
    # Do not restore local pydantic if real pydantic was imported successfully.
    if 'pydantic' in sys.modules:
        return
    sys.modules.update(state.get('removed_modules', {}))
|
|
57
|
+
|
|
58
|
+
def import_openai_sdk():
    """Return the ``openai`` module, importing it on first use.

    An entry already present in ``sys.modules`` is returned as-is. Otherwise
    the local pydantic shim is moved out of the way for the duration of the
    import, and the import state is restored afterwards whether or not the
    import succeeded.
    """
    try:
        return sys.modules['openai']
    except KeyError:
        pass
    saved_state = _drop_local_pydantic_if_needed()
    try:
        module = importlib.import_module('openai')
    finally:
        _restore_import_state(saved_state)
    return module
|
|
66
|
+
|
|
67
|
+
def get_openai_status() -> dict[str, Any]:
    """Report whether the OpenAI SDK and API key are usable.

    Returns:
        A dict with the boolean flags ``openai_sdk_import_ok``,
        ``openai_api_key_visible``, ``openai_client_constructed_ok`` and
        ``openai_available``, plus ``error_type`` / ``error_message`` describing
        the first problem encountered (both ``None`` when nothing failed).
        ``openai_available`` only requires a successful SDK import and a visible
        key; client construction is reported through its own flag.
    """
    settings = get_settings()
    api_key = settings.openai_api_key
    status: dict[str, Any] = {
        'openai_sdk_import_ok': False,
        'openai_api_key_visible': bool(api_key),
        'openai_client_constructed_ok': False,
        'openai_available': False,
        'error_type': None,
        'error_message': None,
    }

    def _record_failure(failure: Exception) -> None:
        status['error_type'] = type(failure).__name__
        status['error_message'] = _safe_error_message(failure)

    try:
        sdk = import_openai_sdk()
    except Exception as exc:
        _record_failure(exc)
        return status
    status['openai_sdk_import_ok'] = True

    if not api_key:
        status.update(
            error_type='OpenAIKeyMissing',
            error_message='OPENAI_API_KEY is not visible.',
        )
        return status

    # Availability means SDK import + visible key. Client construction is reported separately.
    status['openai_available'] = True
    try:
        sdk.OpenAI(api_key=api_key)
    except Exception as exc:
        _record_failure(exc)
    else:
        status['openai_client_constructed_ok'] = True
    return status
|
|
97
|
+
|
|
98
|
+
def openai_available() -> bool:
    """Return the ``openai_available`` flag from ``get_openai_status`` as a bool."""
    status = get_openai_status()
    return bool(status.get('openai_available'))
|
|
100
|
+
|
|
101
|
+
# Base URL handed to the OpenAI client for providers that are reached through
# an OpenAI-compatible endpoint, keyed by provider id.
OPENAI_COMPATIBLE_BASE_URLS: dict[str, str] = {"openrouter": "https://openrouter.ai/api/v1"}
|
|
104
|
+
|
|
105
|
+
def _configured_provider(provider_id: str | None = None) -> str:
    """Pick the provider id to use.

    Precedence: the explicit ``provider_id`` argument, then the
    ``model_provider`` setting, then the literal ``"openai"``.
    """
    settings = get_settings()
    if provider_id:
        return provider_id
    configured = settings.model_provider
    if configured:
        return configured
    return "openai"
|
|
108
|
+
|
|
109
|
+
def _provider_key_env(provider_id: str, settings=None) -> str | None:
    """Return the name of the environment variable holding the provider's API key.

    The ``model_provider_api_key_env`` setting wins when the configured
    provider equals ``provider_id``; otherwise the provider recipe's
    ``api_key_env_var`` is used. ``None`` is returned when neither is known.
    """
    active = settings if settings else get_settings()
    recipe = MODEL_PROVIDER_RECIPES.get(provider_id)
    override = None
    if active.model_provider == provider_id:
        override = active.model_provider_api_key_env
    if override:
        return override
    if not recipe:
        return None
    return recipe.api_key_env_var
|
|
115
|
+
|
|
116
|
+
def _provider_key(provider_id: str, settings=None) -> tuple[str | None, str | None]:
    """Resolve ``(env_var_name, api_key)`` for a provider.

    * ``openai``: the variable name defaults to ``OPENAI_API_KEY`` and the
      ``openai_api_key`` setting takes precedence over the environment.
    * ``google``: when the primary variable yields nothing, ``GOOGLE_API_KEY``
      is consulted and reported as the variable name if it holds a value.
    * any other provider: the value of the resolved variable, or ``None`` when
      no variable name is known.
    """
    active = settings if settings else get_settings()
    env_name = _provider_key_env(provider_id, active)

    if provider_id == "openai":
        openai_env = env_name or "OPENAI_API_KEY"
        return openai_env, active.openai_api_key or os.getenv(openai_env)

    secret = os.getenv(env_name) if env_name else None
    if provider_id != "google" or secret:
        return env_name, secret

    google_fallback = "GOOGLE_API_KEY"
    secret = os.getenv(google_fallback)
    if secret:
        return google_fallback, secret
    return env_name, secret
|
|
128
|
+
|
|
129
|
+
def _provider_model(provider_id: str, model_override: str | None = None) -> str:
    """Choose the model name for ``provider_id``.

    Precedence: ``model_override``; the ``model_provider_model`` setting when
    the configured provider matches; the ``openai_model`` setting for the
    ``openai`` provider; the recipe's ``default_model``; and finally the
    ``openai_model`` setting.
    """
    settings = get_settings()
    recipe = MODEL_PROVIDER_RECIPES.get(provider_id)
    if model_override:
        return model_override
    is_configured_provider = settings.model_provider == provider_id
    if is_configured_provider and settings.model_provider_model:
        return settings.model_provider_model
    if provider_id == "openai":
        return settings.openai_model
    recipe_default = recipe.default_model if recipe else None
    if recipe_default:
        return recipe_default
    return settings.openai_model
|
|
139
|
+
|
|
140
|
+
def _provider_max_output_tokens() -> int:
    """Return the output-token budget used for provider calls.

    The value is read from the ``AA_MODEL_SMOKE_MAX_TOKENS`` environment
    variable. An unset, empty or non-integer value falls back to the same
    default of 512 (previously an unparsable value silently quadrupled the
    budget to 2048). The result is always clamped to the range 256..4096.
    """
    default = 512
    raw = os.getenv("AA_MODEL_SMOKE_MAX_TOKENS")
    try:
        value = int(raw) if raw else default
    except ValueError:
        value = default
    return max(256, min(value, 4096))
|
|
146
|
+
|
|
147
|
+
def get_model_provider_status(provider_id: str | None = None) -> dict[str, Any]:
    """Describe whether a Mentor Model Provider can be used right now.

    Returns:
        A dict with the resolved ``provider_id``, ``provider_display``,
        ``model``, ``api_key_env_var``, the flags ``api_key_visible``,
        ``adapter_available``, ``provider_available`` and
        ``client_constructed_ok``, and ``error_type`` / ``error_message`` for
        the first problem found. For ``openai`` and ``openrouter`` an OpenAI
        client is actually constructed (adding ``sdk_import_ok`` once the SDK
        import succeeded); other providers with a recipe and a visible key are
        reported as available without constructing anything.
    """
    provider_id = _configured_provider(provider_id)
    settings = get_settings()
    recipe = MODEL_PROVIDER_RECIPES.get(provider_id)
    env_var, key = _provider_key(provider_id, settings)
    status: dict[str, Any] = {
        "provider_id": provider_id,
        "provider_display": recipe.display_name if recipe else provider_id,
        "model": _provider_model(provider_id),
        "api_key_env_var": env_var,
        "api_key_visible": bool(key),
        "adapter_available": provider_id in MODEL_PROVIDER_RECIPES,
        "provider_available": False,
        "client_constructed_ok": False,
        "error_type": None,
        "error_message": None,
    }

    if not recipe:
        status.update(
            error_type="UnsupportedProvider",
            error_message=f"Unsupported Mentor Model Provider: {provider_id}",
        )
        return status
    if not key:
        status.update(
            error_type="APIKeyMissing",
            error_message=f"{env_var or recipe.api_key_env_var} is not visible.",
        )
        return status

    if provider_id not in {"openai", "openrouter"}:
        # Providers without an SDK client have nothing further to check here.
        status["client_constructed_ok"] = True
        status["provider_available"] = True
        return status

    try:
        sdk = import_openai_sdk()
        status["sdk_import_ok"] = True
        client_kwargs = {"api_key": key}
        if provider_id in OPENAI_COMPATIBLE_BASE_URLS:
            client_kwargs["base_url"] = OPENAI_COMPATIBLE_BASE_URLS[provider_id]
        sdk.OpenAI(**client_kwargs)
        status["client_constructed_ok"] = True
        status["provider_available"] = True
    except Exception as exc:
        status["error_type"] = type(exc).__name__
        status["error_message"] = _safe_error_message(exc)
    return status
|
|
189
|
+
|
|
190
|
+
def classify_model_provider_error(exc: BaseException | str | None, status_code: int | None = None) -> tuple[str, str]:
    """Map a provider failure onto a coarse error kind and a redacted message.

    Args:
        exc: The exception, response body or message to classify.
        status_code: HTTP status code of the failed response, when known.

    Returns:
        ``(kind, message)`` where ``kind`` is one of ``auth_error``,
        ``insufficient_balance``, ``quota_or_credit_error``, ``timeout``,
        ``bad_model``, ``network_error`` or ``provider_error``. ``message`` is
        the redacted text capped at 1000 characters, or a generic sentence for
        the kind when that text is empty.
    """
    text = redact_secrets(str(exc or ""))[:1000]
    low = text.lower()

    def mentions(*needles: str) -> bool:
        return any(needle in low for needle in needles)

    if status_code is None:
        # _http_json re-raises failures as "<kind>: <message>". When such an
        # error is classified again the status code is no longer available, so
        # a kind that was derived from the status alone (401/403/402/429/404)
        # would otherwise degrade to "provider_error". Trust the prefix.
        known_kinds = (
            "auth_error",
            "insufficient_balance",
            "quota_or_credit_error",
            "timeout",
            "bad_model",
            "network_error",
            "provider_error",
        )
        prefix, sep, _rest = low.partition(":")
        if sep and prefix in known_kinds:
            return prefix, text

    if status_code in {401, 403} or mentions("unauthorized", "forbidden", "invalid api key", "authentication", "permission denied"):
        return "auth_error", text or "Mentor Model Provider authentication failed."
    # Checked before the generic quota test, which also matches "insufficient".
    if mentions("insufficient balance"):
        return "insufficient_balance", text or "Mentor Model Provider account balance is insufficient."
    if status_code in {402, 429} or mentions("quota", "credit", "billing", "rate limit", "insufficient"):
        return "quota_or_credit_error", text or "Mentor Model Provider quota or credit limit reached."
    if mentions("timeout", "timed out"):
        return "timeout", text or "Mentor Model Provider request timed out."
    if status_code == 404 or mentions("model_not_found", "model not found", "unknown model", "invalid model"):
        return "bad_model", text or "Mentor Model Provider model was not found."
    if mentions("network", "name or service not known", "temporary failure", "connection refused", "connection reset"):
        return "network_error", text or "Mentor Model Provider network error."
    return "provider_error", text or "Mentor Model Provider request failed."
|
|
206
|
+
|
|
207
|
+
def _http_json(url: str, headers: dict[str, str], payload: dict[str, Any], timeout: int = 60) -> dict[str, Any]:
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON response.

    Args:
        url: Target URL.
        headers: Extra request headers; ``Content-Type`` is always set to
            ``application/json``.
        payload: Object serialised with ``json.dumps`` as the request body.
        timeout: Timeout in seconds handed to ``urlopen``.

    Raises:
        RuntimeError: For an HTTP error response or a ``TimeoutError``; the
            message has the form ``"<kind>: <message>"`` as produced by
            ``classify_model_provider_error``.
    """
    body = json.dumps(payload).encode("utf-8")
    request_headers = dict(headers)
    request_headers["Content-Type"] = "application/json"
    request = urllib.request.Request(url, data=body, headers=request_headers, method="POST")
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            raw = response.read()
            return json.loads(raw.decode("utf-8"))
    except urllib.error.HTTPError as exc:
        # Prefer the response body for classification; fall back to the
        # exception itself when the body is empty.
        detail = exc.read().decode("utf-8", errors="replace")
        kind, msg = classify_model_provider_error(detail or exc, exc.code)
        raise RuntimeError(f"{kind}: {msg}") from exc
    except TimeoutError as exc:
        kind, msg = classify_model_provider_error(exc)
        raise RuntimeError(f"{kind}: {msg}") from exc
|
|
224
|
+
|
|
225
|
+
def _openai_compatible_response(provider_id: str, model_name: str, prompt: str) -> str:
    """Send ``prompt`` through the OpenAI SDK and return the model's text.

    The ``openai`` provider uses the Responses API (with retry); every other
    provider handled here goes through Chat Completions with a JSON-object
    response format and the configured output-token budget.

    Args:
        provider_id: ``"openai"`` or a key of ``OPENAI_COMPATIBLE_BASE_URLS``.
        model_name: Model identifier passed to the API.
        prompt: Full prompt text, sent as a single user message / input.

    Returns:
        The response text; an empty string when the chat message has no content.

    Raises:
        RuntimeError: ``provider_response_format: ...`` when a chat completion
            comes back without any choices.
    """
    settings = get_settings()
    _env_var, key = _provider_key(provider_id, settings)
    mod = import_openai_sdk()
    kwargs: dict[str, Any] = {"api_key": key}
    if provider_id in OPENAI_COMPATIBLE_BASE_URLS:
        kwargs["base_url"] = OPENAI_COMPATIBLE_BASE_URLS[provider_id]
    if provider_id == "openrouter":
        # Extra headers sent only to OpenRouter. (The original wrapped this
        # plain dict assignment in a try/except that could never trigger.)
        kwargs["default_headers"] = {
            "HTTP-Referer": "https://github.com/Forsy-AI/agent-apprenticeship",
            "X-Title": "Agent Apprenticeship",
        }
    client = mod.OpenAI(**kwargs)
    if provider_id == "openai":
        return _response_text(_responses_create_with_retry(client, model_name, prompt))
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        max_tokens=_provider_max_output_tokens(),
    )
    choices = getattr(response, "choices", None) or []
    if not choices:
        # Same error shape as the Anthropic and Google adapters instead of a
        # bare IndexError.
        raise RuntimeError(f"provider_response_format: {provider_id} response did not contain any choices.")
    return str(choices[0].message.content or "")
|
|
251
|
+
|
|
252
|
+
def _anthropic_response(model_name: str, prompt: str) -> str:
    """Call the Anthropic Messages endpoint and return the concatenated text.

    The request carries the prompt as a single user message and is limited to
    ``min(120, settings.task_timeout_seconds)`` seconds.

    Raises:
        RuntimeError: ``provider_response_format: ...`` when the response holds
            no text content; also whatever ``_http_json`` raises.
    """
    settings = get_settings()
    _env_var, key = _provider_key("anthropic", settings)
    request_body = {
        "model": model_name,
        "max_tokens": _provider_max_output_tokens(),
        "messages": [{"role": "user", "content": prompt}],
    }
    request_headers = {"x-api-key": key or "", "anthropic-version": "2023-06-01"}
    data = _http_json(
        "https://api.anthropic.com/v1/messages",
        request_headers,
        request_body,
        timeout=min(120, settings.task_timeout_seconds),
    )
    pieces = []
    for part in data.get("content") or []:
        # Only dict-shaped content parts are considered; a missing or empty
        # "text" contributes nothing.
        if isinstance(part, dict):
            pieces.append(str(part.get("text") or ""))
    text = "".join(pieces)
    if text:
        return text
    raise RuntimeError("provider_response_format: Anthropic response did not contain text content.")
|
|
271
|
+
|
|
272
|
+
def _google_response(model_name: str, prompt: str) -> str:
    """Call the Google Generative Language ``generateContent`` endpoint.

    The API key is sent in the ``x-goog-api-key`` request header rather than as
    a ``?key=`` query parameter, so it does not end up in URLs that proxies,
    logs or error messages may record.

    Args:
        model_name: Model identifier; URL-quoted into the request path.
        prompt: Prompt text, sent as a single text part.

    Returns:
        The text of the first part of the first candidate.

    Raises:
        RuntimeError: ``provider_response_format: ...`` when the response does
            not have that shape; also whatever ``_http_json`` raises.
    """
    settings = get_settings()
    _env_var, key = _provider_key("google", settings)
    quoted_model = urllib.parse.quote(model_name, safe='')
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{quoted_model}:generateContent"
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {"responseMimeType": "application/json", "maxOutputTokens": _provider_max_output_tokens()},
    }
    data = _http_json(
        url,
        {"x-goog-api-key": key or ""},
        payload,
        timeout=min(120, settings.task_timeout_seconds),
    )
    try:
        return str(data["candidates"][0]["content"]["parts"][0]["text"])
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError("provider_response_format: Google response did not contain candidate text.") from exc
|
|
286
|
+
|
|
287
|
+
def _provider_completion_text(provider_id: str, model_name: str, prompt: str) -> str:
    """Dispatch one completion request to the adapter for ``provider_id``.

    Raises:
        RuntimeError: When ``provider_id`` is not ``openai``, ``openrouter``,
            ``anthropic`` or ``google``.
    """
    if provider_id in {"openai", "openrouter"}:
        completion = _openai_compatible_response(provider_id, model_name, prompt)
    elif provider_id == "anthropic":
        completion = _anthropic_response(model_name, prompt)
    elif provider_id == "google":
        completion = _google_response(model_name, prompt)
    else:
        raise RuntimeError(f"Unsupported Mentor Model Provider: {provider_id}")
    return completion
|
|
295
|
+
|
|
296
|
+
def _provider_completion_text_with_retry(provider_id: str, model_name: str, prompt: str) -> str:
    """Request a completion, retrying once after a short pause on failure.

    Failures classified as ``auth_error``, ``insufficient_balance``,
    ``quota_or_credit_error`` or ``bad_model`` are raised immediately: none of
    them can be cured by waiting 0.75 seconds. Any other failure is retried
    once and the second failure is raised unchanged.
    """
    non_retryable = {"auth_error", "insufficient_balance", "quota_or_credit_error", "bad_model"}
    max_attempts = 2
    for attempt in range(max_attempts):
        try:
            return _provider_completion_text(provider_id, model_name, prompt)
        except Exception as exc:
            kind, _msg = classify_model_provider_error(exc)
            if attempt == max_attempts - 1 or kind in non_retryable:
                raise
            time.sleep(0.75)
    # Not reachable: the final attempt either returns or re-raises above.
    raise RuntimeError("Mentor Model Provider request failed.")
|
|
308
|
+
|
|
309
|
+
def _json_repair_candidates(cand: str) -> list[tuple[str, str]]:
    """Produce repaired variants of ``cand`` as ``(text, reason)`` pairs.

    Up to two variants are returned, in this order: the output of the optional
    ``json_repair`` package (reason ``json_repair``) when it is importable and
    changes the text, and a copy with commas directly before ``}`` or ``]``
    removed (reason ``removed_trailing_commas``) when that changes the text.
    """
    variants: list[tuple[str, str]] = []
    try:
        # Optional dependency: any failure here just means no such variant.
        from json_repair import repair_json  # type: ignore
        fixed = repair_json(cand)
        if fixed and fixed != cand:
            variants.append((fixed, 'json_repair'))
    except Exception:
        pass
    trailing_comma = re.compile(r',\s*([}\]])')
    without_commas = trailing_comma.sub(r'\1', cand)
    if without_commas != cand:
        variants.append((without_commas, 'removed_trailing_commas'))
    return variants
|
|
322
|
+
|
|
323
|
+
def _iter_json_candidates(text: str):
    """Yield ``(candidate, reason)`` pairs in the order they should be tried.

    Order: the stripped text itself (``direct``), the contents of each
    triple-backtick fence (``markdown_fence``), then every bracket-balanced
    span that starts at a ``{`` or ``[`` (``balanced_brace_extraction``).
    """
    yield text.strip(), 'direct'
    for match in re.finditer(r"```(?:json)?\s*(.*?)```", text, re.DOTALL | re.IGNORECASE):
        yield match.group(1).strip(), 'markdown_fence'
    closers = {'{': '}', '[': ']'}
    for start, ch in enumerate(text):
        if ch not in closers:
            continue
        stack = []
        in_str = False
        esc = False
        for idx in range(start, len(text)):
            c = text[idx]
            if in_str:
                # Inside a string only escapes and the closing quote matter.
                if esc:
                    esc = False
                elif c == '\\':
                    esc = True
                elif c == '"':
                    in_str = False
                continue
            if c == '"':
                in_str = True
            elif c in closers:
                stack.append(c)
            elif c == '}' or c == ']':
                # An unmatched or mismatched closer ends the scan from this start.
                if not stack or closers[stack.pop()] != c:
                    break
                if not stack:
                    yield text[start:idx + 1], 'balanced_brace_extraction'
                    break


def extract_json_object(text: str, return_metadata: bool=False):
    """Extract the first JSON object (a ``dict``) that can be parsed out of ``text``.

    Candidates are tried lazily in a fixed order: each candidate from
    ``_iter_json_candidates`` followed by its repaired variants from
    ``_json_repair_candidates``. The first one that parses to a ``dict`` wins,
    so a well-formed response never pays for brace scanning or repair passes.

    Args:
        text: Raw model output.
        return_metadata: When true, return ``(obj, meta)`` where ``meta`` holds
            ``repaired_json``, ``repair_reason`` and ``json_extraction_reason``.

    Returns:
        The parsed ``dict``, or ``(dict, meta)`` when ``return_metadata`` is set.

    Raises:
        JsonExtractionError: When ``text`` is empty or blank, or when no
            candidate parses to a JSON object. The message lists the types of
            the first five failures.
    """
    if not text or not text.strip():
        raise JsonExtractionError('No text to parse as JSON.')

    def attempts():
        for cand, reason in _iter_json_candidates(text):
            yield cand, reason, False
            for repaired_text, repair_reason in _json_repair_candidates(cand):
                yield repaired_text, f'{reason}+{repair_reason}', True

    errors = []
    for cand, reason, repaired in attempts():
        try:
            obj = json.loads(cand)
        except Exception as exc:
            errors.append(type(exc).__name__)
            continue
        if isinstance(obj, dict):
            meta = {'repaired_json': repaired, 'repair_reason': reason if repaired else None, 'json_extraction_reason': reason}
            return (obj, meta) if return_metadata else obj
        # Valid JSON that is not an object (list, number, ...) counts as a failure.
        errors.append(JsonExtractionError.__name__)
    raise JsonExtractionError('Could not extract a valid JSON object from model output: ' + ','.join(errors[:5]))
|
|
364
|
+
|
|
365
|
+
def _validate_role_output(
    role: str,
    raw: str,
    output_model: type[BaseModel],
    normalizer_context: dict[str, Any] | None,
    out_dir: Path,
) -> tuple[dict[str, Any], dict[str, Any], bool]:
    """Parse, normalise and validate one raw role response.

    The parsed object is written to ``raw_parsed_output.json``. It is validated
    once as-is (a failure there is only recorded), then passed through
    ``normalize_role_output`` and validated again; this second validation is
    the one that may raise. When normalisation changed the object or the first
    validation failed, ``normalization_report.json`` is written as well.

    Returns:
        ``(normalized, json_meta, normalized_used)``: the normalised object,
        the extraction metadata from ``extract_json_object``, and whether
        normalisation was needed.
    """
    parsed, json_meta = extract_json_object(raw, return_metadata=True)
    write_json(out_dir / 'raw_parsed_output.json', parsed)

    try:
        output_model.model_validate(parsed)
        initial_validation_error = None
    except Exception as first_exc:
        initial_validation_error = _safe_error_message(first_exc)

    normalized = normalize_role_output(role, parsed, normalizer_context)
    normalized_used = normalized != parsed or initial_validation_error is not None
    output_model.model_validate(normalized)

    if normalized_used:
        report = {
            'role': role,
            'normalization_applied': True,
            'initial_validation_error': initial_validation_error,
            'raw_keys': list(parsed.keys()),
            'normalized_keys': list(normalized.keys()) if isinstance(normalized, dict) else [],
            'extras_preserved_in_metadata': True,
        }
        write_json(out_dir / 'normalization_report.json', report)
    return normalized, json_meta, normalized_used
|
|
385
|
+
|
|
386
|
+
def _strict_json_retry_prompt(role: str, original_prompt: str, error_message: str | None) -> str:
    """Build the follow-up prompt used after a response failed to parse or validate.

    The text restates the strict-JSON requirement, names the role and the error
    to fix (``invalid JSON`` when no message is given), and ends with the
    original prompt after a blank line.
    """
    problem = error_message or 'invalid JSON'
    preamble = (
        "Your previous response could not be parsed or validated. "
        "Return ONLY one compact valid JSON object, with double-quoted JSON keys and no markdown/prose. "
    )
    context = f"Role: {role}. Error to fix: {problem}.\n\n"
    return preamble + context + f"{original_prompt}"
|
|
393
|
+
|
|
394
|
+
def _transient_provider_error(exc: BaseException) -> bool:
    """Tell whether ``exc`` looks like a temporary provider problem.

    The lowercased exception class name and message are searched for a fixed
    list of substrings (timeouts, rate limiting, dropped connections, gateway
    and overload errors, ...).
    """
    haystacks = (type(exc).__name__.lower(), str(exc).lower())
    markers = (
        'timeout',
        'rate',
        'temporar',
        'connection reset',
        'disconnect',
        'websocket',
        'service unavailable',
        'gateway',
        'overloaded',
    )
    for marker in markers:
        for haystack in haystacks:
            if marker in haystack:
                return True
    return False
|
|
398
|
+
|
|
399
|
+
def _responses_create_with_retry(client: Any, model_name: str, prompt: str, attempts: int = 3):
    """Call ``client.responses.create`` with retries for transient failures.

    Args:
        client: Object exposing ``responses.create(model=..., input=...)``.
        model_name: Model identifier passed as ``model``.
        prompt: Prompt text passed as ``input``.
        attempts: Maximum number of calls. Values below 1 are treated as 1, so
            the request is always made at least once (previously
            ``attempts <= 0`` ended in ``raise None`` and a ``TypeError``).

    Returns:
        Whatever ``client.responses.create`` returns.

    Raises:
        Exception: The provider's own exception, as soon as it is not
            transient or the last attempt has failed.
    """
    total = max(1, attempts)
    for i in range(total):
        try:
            return client.responses.create(model=model_name, input=prompt)
        except Exception as exc:
            if i >= total - 1 or not _transient_provider_error(exc):
                raise
            # Exponential backoff: 0.5s, 1s, 2s, ... capped at 4s.
            time.sleep(min(4.0, 0.5 * (2 ** i)))
    # Not reachable: the final attempt either returns or re-raises above.
    raise RuntimeError("OpenAI Responses request failed.")
|
|
410
|
+
|
|
411
|
+
def _response_text(response: Any) -> str:
    """Get the text out of a provider response object.

    Lookup order: a non-``None`` ``output_text`` attribute; for dicts the
    ``output_text`` key, then the ``text`` key, then the dict dumped as JSON;
    for anything else ``model_dump_json()``, falling back to ``str(response)``
    when that call fails.
    """
    output_text = getattr(response, 'output_text', None)
    if output_text is not None:
        return str(output_text)
    if not isinstance(response, dict):
        try:
            return response.model_dump_json()
        except Exception:
            return str(response)
    for key in ('output_text', 'text'):
        value = response.get(key)
        if value:
            return str(value)
    return str(json.dumps(response))
|
|
421
|
+
|
|
422
|
+
def run_structured_role(role: str, prompt: str, output_model: type[BaseModel], out_dir: Path, allow_fallback=False, require_validation=False, model_override: str | None=None, normalizer_context: dict[str, Any] | None=None, provider_override: str | None=None) -> RoleResult:
    """Run one structured-output role against the Mentor Model Provider.

    Files written into ``out_dir``: ``prompt.md`` (redacted prompt),
    ``raw_output.txt`` (redacted model output, or the error text when no output
    was obtained), ``raw_output.retry.txt`` (output of the strict-JSON retry,
    if one was made), ``parsed_output.json`` and ``role_result.json``. Text
    files are always written as UTF-8 so non-ASCII prompts and outputs do not
    fail on platforms whose default encoding is narrower.

    Flow: check provider availability; request a completion; parse and
    validate it; on a parse or validation failure retry once with a stricter
    prompt; record the outcome.

    A validation failure is reported as such (``live_call_ok=True``,
    ``structured_output_validation_ok=False``) and leaves ``raw_output.txt``
    untouched. Previously the error raised for ``require_validation`` was
    swallowed by the provider-failure handler, which overwrote the raw output
    and reported the live call as failed.

    Args:
        role: Role name, used for normalisation and in the retry prompt.
        prompt: Prompt sent to the model.
        output_model: Pydantic model the parsed output must validate against.
        out_dir: Directory for all artifacts; created when missing.
        allow_fallback: When true, failures are returned as a ``RoleResult``
            instead of being raised.
        require_validation: When true (and ``allow_fallback`` is false), a
            provider failure or a validation failure is raised.
        model_override: Model name to use instead of the configured one.
        normalizer_context: Extra context handed to ``normalize_role_output``.
        provider_override: Provider id to use instead of the configured one.

    Returns:
        The ``RoleResult`` describing the call; also written to
        ``role_result.json``.

    Raises:
        RuntimeError: Provider unavailable and ``allow_fallback`` is false; or
            validation failed after the retry, ``require_validation`` is true
            and ``allow_fallback`` is false.
        Exception: The provider's own exception when the live call fails,
            ``require_validation`` is true and ``allow_fallback`` is false.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    def _write_text(name: str, content: str) -> None:
        (out_dir / name).write_text(content, encoding='utf-8')

    _write_text('prompt.md', redact_secrets(prompt))
    provider_id = _configured_provider(provider_override)
    model_name = _provider_model(provider_id, model_override)
    start = time.time()
    status = get_model_provider_status(provider_id)
    meta = {
        'mentor_model_provider': provider_id,
        'mentor_model_provider_display': status.get('provider_display'),
        'mentor_model_provider_api_key_visible': bool(status.get('api_key_visible')),
        'mentor_model_provider_adapter_available': bool(status.get('adapter_available')),
        'mentor_model_provider_client_constructed_ok': bool(status.get('client_constructed_ok')),
        'mentor_model_provider_available': bool(status.get('provider_available')),
    }
    if provider_id == 'openai':
        openai_status = get_openai_status()
        meta.update({k: openai_status.get(k) for k in ['openai_sdk_import_ok', 'openai_api_key_visible', 'openai_client_constructed_ok', 'openai_available']})

    # Fields shared by every RoleResult built below.
    common = {
        'role': role,
        'provider': provider_id,
        'model': model_name,
        'prompt_ref': 'prompt.md',
        'parsed_output_ref': 'parsed_output.json',
    }

    if not status['provider_available']:
        result = RoleResult(
            **common,
            live_call_ok=False,
            structured_output_validation_ok=False,
            output_ref='raw_output.txt',
            error_type=status.get('error_type') or 'MentorModelProviderUnavailable',
            error_message=status.get('error_message') or 'Mentor Model Provider adapter or API key unavailable; no live call made.',
            duration_seconds=time.time() - start,
            metadata_json={**meta, 'allow_fallback': allow_fallback},
        )
        _write_text('raw_output.txt', result.error_message or '')
        write_json(out_dir / 'parsed_output.json', {'error': result.error_message})
        write_json(out_dir / 'role_result.json', result)
        if not allow_fallback:
            raise RuntimeError(result.error_message)
        return result

    live_ok_meta = {'mentor_model_provider_live_call_ok': True, f'{provider_id}_live_call_ok': True}
    validation_failed = False
    try:
        raw = _provider_completion_text_with_retry(provider_id, model_name, prompt)
        _write_text('raw_output.txt', redact_secrets(raw))
        try:
            normalized, json_meta, normalized_used = _validate_role_output(role, raw, output_model, normalizer_context, out_dir)
            write_json(out_dir / 'parsed_output.json', normalized)
            result = RoleResult(
                **common,
                live_call_ok=True,
                structured_output_validation_ok=True,
                output_ref='raw_output.txt',
                error_type=None,
                error_message=None,
                duration_seconds=time.time() - start,
                metadata_json={**meta, **json_meta, **live_ok_meta, 'normalization_applied': normalized_used},
            )
        except Exception as exc:
            # First response unusable: ask once more with a stricter prompt.
            first_error = _safe_error_message(exc)
            retry_raw = None
            try:
                retry_prompt = _strict_json_retry_prompt(role, prompt, first_error)
                retry_raw = _provider_completion_text_with_retry(provider_id, model_name, retry_prompt)
                _write_text('raw_output.retry.txt', redact_secrets(retry_raw))
                normalized, json_meta, normalized_used = _validate_role_output(role, retry_raw, output_model, normalizer_context, out_dir)
                write_json(out_dir / 'parsed_output.json', normalized)
                result = RoleResult(
                    **common,
                    live_call_ok=True,
                    structured_output_validation_ok=True,
                    output_ref='raw_output.retry.txt',
                    error_type=None,
                    error_message=None,
                    duration_seconds=time.time() - start,
                    metadata_json={**meta, **json_meta, **live_ok_meta, 'normalization_applied': normalized_used, 'structured_retry_used': True, 'initial_parse_or_validation_error': first_error},
                )
            except Exception as retry_exc:
                write_json(out_dir / 'parsed_output.json', {
                    'parse_or_validation_error': _safe_error_message(retry_exc),
                    'initial_parse_or_validation_error': first_error,
                    'raw_output_preserved_ref': 'raw_output.txt',
                    'retry_output_preserved_ref': 'raw_output.retry.txt' if retry_raw is not None else None,
                    'raw_parsed_output_ref': 'raw_parsed_output.json' if (out_dir / 'raw_parsed_output.json').exists() else None,
                })
                result = RoleResult(
                    **common,
                    live_call_ok=True,
                    structured_output_validation_ok=False,
                    output_ref='raw_output.txt',
                    error_type=type(retry_exc).__name__,
                    error_message=_safe_error_message(retry_exc),
                    duration_seconds=time.time() - start,
                    metadata_json={**meta, **live_ok_meta, 'fallback_used': allow_fallback, 'structured_retry_used': retry_raw is not None, 'initial_parse_or_validation_error': first_error},
                )
                # Raised after the provider-failure handler below, so that the
                # handler cannot mistake it for a failed live call.
                validation_failed = True
    except Exception as exc:
        err_type, err_msg = classify_model_provider_error(exc)
        _write_text('raw_output.txt', _safe_error_message(exc) or '')
        write_json(out_dir / 'parsed_output.json', {'error': err_msg})
        result = RoleResult(
            **common,
            live_call_ok=False,
            structured_output_validation_ok=False,
            output_ref='raw_output.txt',
            error_type=err_type,
            error_message=err_msg,
            duration_seconds=time.time() - start,
            metadata_json={**meta, 'mentor_model_provider_live_call_ok': False, f'{provider_id}_live_call_ok': False, 'fallback_used': allow_fallback},
        )
        if require_validation and not allow_fallback:
            write_json(out_dir / 'role_result.json', result)
            raise
    write_json(out_dir / 'role_result.json', result)
    if validation_failed and require_validation and not allow_fallback:
        raise RuntimeError(result.error_message)
    return result
|
|
482
|
+
|
|
483
|
+
def _json_smoke_prompt(label: str, skeleton: dict[str, Any]) -> str:
    """Build a prompt asking the model to echo ``skeleton`` as compact JSON.

    The instruction names ``label`` and is followed, on a new line, by the
    skeleton serialised without any whitespace between tokens.
    """
    compact_skeleton = json.dumps(skeleton, separators=(",", ":"))
    instruction = (
        f"Return ONLY this compact valid JSON object for {label}. "
        "Keep every key and JSON type. Use double quotes. No markdown, no prose.\n"
    )
    return instruction + compact_skeleton
|
|
489
|
+
|
|
490
|
+
# Compact schema prompts for llm-smoke. They use literal JSON skeletons so
|
|
491
|
+
# providers do not need to infer local Pydantic class definitions.
|
|
492
|
+
def smoke_prompts() -> dict[str, tuple[str, type[BaseModel]]]:
    """Return the llm-smoke prompt and paired model class for each role.

    Keys are role names (``intake_agent``, ``rubric_agent``, ``grader_agent``,
    ``verifier_agent``, ``evaluator_agent``). Each value is a tuple of the
    prompt text built by ``_json_smoke_prompt`` from a literal JSON skeleton
    and the Pydantic model class associated with that skeleton.
    """
    # Imported locally, as in the original; the schema classes are only
    # needed while building the wrapper models below.
    from .schemas import (
        TaskIntakeSpec,
        TaskIntakeQualityReport,
        RubricSpec,
        RubricQualityReport,
        GraderResult,
        VerifierResult,
        EvaluatorFeedback,
        RevisionPlan,
    )

    # Wrapper models for the roles whose skeleton bundles two schema objects.
    class _IntakeSmokeOutput(BaseModel):
        task_intake_spec: TaskIntakeSpec
        task_intake_quality_report: TaskIntakeQualityReport

    class _RubricSmokeOutput(BaseModel):
        rubric_spec: RubricSpec
        rubric_quality_report: RubricQualityReport

    class _EvaluatorSmokeOutput(BaseModel):
        evaluator_feedback: EvaluatorFeedback
        revision_plan: RevisionPlan

    # Shared fragments embedded in the rubric and grader skeletons.
    rubric_item_skeleton = {
        "rubric_item_id": "ri_1",
        "criterion_name": "artifact",
        "criterion_description": "hello.txt exists and contains hello",
        "weight": 1.0,
        "score_min": 0.0,
        "score_max": 1.0,
        "pass_threshold": 0.7,
        "observable_evidence": ["artifacts/hello.txt"],
        "required_artifacts": ["hello.txt"],
        "scoring_method": "deterministic",
        "worker_visible": True,
        "verifier_only": False,
        "hidden_reference_required": False,
        "failure_modes": [],
        "partial_credit_rules": [],
        "edge_cases": [],
        "anti_cheat_notes": [],
        "metadata_json": {},
    }
    rubric_score_skeleton = {
        "rubric_item_id": "ri_1",
        "criterion_name": "artifact",
        "score": 1.0,
        "max_score": 1.0,
        "passed": True,
        "evidence_refs": ["artifacts/hello.txt"],
        "failure_mode": None,
        "notes": "present",
        "confidence": 0.9,
        "artifact_presence_ok": True,
        "semantic_correctness_score": 1.0,
        "reasoning_summary": "The artifact is present.",
        "improvement_suggestion": None,
    }

    # Key order inside every skeleton is significant: json.dumps preserves it
    # and the serialized text becomes part of the prompt.
    intake_skeleton = {
        "task_intake_spec": {
            "task_id": "smoke_task",
            "normalized_title": "Hello artifact smoke",
            "normalized_instruction": "Create hello.txt containing hello.",
            "domain": "software",
            "subdomain": "certification",
            "professional_role": "QA engineer",
            "apprenticeship_role": "QA engineer",
            "task_family": "file_creation",
            "expected_economic_value": "$50-$100",
            "expected_economic_value_for_agent_apprentice": "$5-$15",
            "workflow_type": "file_creation",
            "skill_targets": ["artifact_creation"],
            "difficulty_tier": "easy",
            "expected_human_deliverable": "hello.txt",
            "expected_agent_deliverable": "artifacts/hello.txt",
            "input_requirements": [],
            "output_requirements": ["hello.txt contains hello"],
            "required_context": [],
            "assumptions": [],
            "constraints": [],
            "allowed_tools": ["filesystem"],
            "disallowed_tools": [],
            "privacy_classification": "synthetic",
            "license": None,
            "allowed_use": "open_research",
            "rubricability_score": 0.9,
            "verifiability_score": 0.9,
            "artifactability_score": 1.0,
            "needs_expert_review": False,
            "metadata_json": {},
        },
        "task_intake_quality_report": {
            "task_id": "smoke_task",
            "instruction_clarity_score": 0.9,
            "input_completeness_score": 0.9,
            "output_contract_score": 1.0,
            "rubricability_score": 0.9,
            "verifiability_score": 0.9,
            "artifactability_score": 1.0,
            "privacy_risk_score": 0.0,
            "license_risk_score": 0.0,
            "ambiguity_score": 0.1,
            "overall_intake_quality_score": 0.9,
            "quality_flags": [],
            "blockers": [],
            "recommended_fix": None,
            "metadata_json": {},
        },
    }

    rubric_skeleton = {
        "rubric_spec": {
            "rubric_id": "smoke_rubric",
            "task_id": "smoke_task",
            "task_family_id": None,
            "rubric_version": "v1",
            "rubric_items": [rubric_item_skeleton],
            "total_weight": 1.0,
            "pass_threshold": 0.7,
            "worker_visible_rubric_ref": "rubric/worker_visible_rubric.md",
            "verifier_private_rubric_ref": "rubric/rubric.json",
            "hidden_reference_policy": "none",
            "scoring_aggregation": "weighted_sum",
            "required_artifacts": ["hello.txt"],
            "disqualifying_errors": [],
            "partial_credit_allowed": True,
            "grader_kind": "deterministic",
            "rubric_generation_source": "deterministic_seed",
            "rubric_generation_agent_provider": None,
            "rubric_generation_agent_model": None,
            "rubric_generation_confidence": 0.9,
            "metadata_json": {},
        },
        "rubric_quality_report": {
            "rubric_id": "smoke_rubric",
            "task_id": "smoke_task",
            "criteria_count": 1,
            "total_weight": 1.0,
            "weights_sum_valid": True,
            "has_observable_evidence": True,
            "has_required_artifacts": True,
            "has_partial_credit_rules": False,
            "has_disqualifying_errors": False,
            "has_hidden_reference_policy": True,
            "has_worker_visible_view": True,
            "has_verifier_private_view": True,
            "ambiguous_criteria_count": 0,
            "unverifiable_criteria_count": 0,
            "rubric_quality_score": 0.9,
            "quality_flags": [],
            "blockers": [],
            "metadata_json": {},
        },
    }

    grader_skeleton = {
        "grader_result_id": "smoke_grader",
        "task_id": "smoke_task",
        "attempt_id": "smoke_attempt",
        "attempt_kind": "baseline",
        "rubric_id": "smoke_rubric",
        "grader_kind": "model",
        "score_source": "model_judged",
        "score": 1.0,
        "max_score": 1.0,
        "passed": True,
        "rubric_item_scores": [rubric_score_skeleton],
        "failed_criteria": [],
        "passed_criteria": ["artifact"],
        "evidence_refs": ["artifacts/hello.txt"],
        "confidence": 0.9,
        "reasoning_summary": "The expected artifact is present.",
        "limitations": [],
        "hidden_reference_used": False,
        "hidden_reference_leaked": False,
        "artifact_contract_score": 1.0,
        "semantic_score": 1.0,
        "model_score": 1.0,
        "legacy_semantic_score": None,
        "legacy_score_source": None,
        "final_score": 1.0,
        "model": None,
        "provider": None,
        "deterministic_precheck_ref": None,
        "llm_prompt_ref_internal": None,
        "llm_response_ref_internal": None,
        "public_prompt_hash": None,
        "public_response_summary": "passed",
        "score_reliability": "verified",
        "verifier_status": "verified",
        "verifier_confidence": 0.9,
        "verifier_issue_count": 0,
        "verifier_issues_summary": None,
        "metadata_json": {},
    }

    verifier_skeleton = {
        "verifier_result_id": "smoke_verifier",
        "task_id": "smoke_task",
        "attempt_id": "smoke_attempt",
        "attempt_kind": "baseline",
        "grader_result_id": "smoke_grader",
        "verification_status": "verified",
        "artifact_contract_ok": True,
        "evidence_grounding_ok": True,
        "score_consistency_ok": True,
        "hidden_reference_leaked": False,
        "issues": [],
        "confidence": 0.9,
        "verifier_notes": "verified",
        "semantic_evidence_grounding_ok": True,
        "unsupported_claims": [],
        "leakage_check_ok": True,
        "model": None,
        "provider": None,
        "metadata_json": {},
    }

    evaluator_skeleton = {
        "evaluator_feedback": {
            "feedback_id": "smoke_feedback",
            "task_id": "smoke_task",
            "attempt_id": "smoke_attempt",
            "target_actor": "worker",
            "feedback_type": "other",
            "failed_rubric_items": [],
            "evidence_refs": ["artifacts/hello.txt"],
            "artifact_refs": ["artifacts/hello.txt"],
            "feedback_summary": "The attempt passes.",
            "actionable_feedback": [],
            "suggested_revision": "No revision needed.",
            "revision_priority": "low",
            "confidence": 0.9,
            "hidden_reference_used": False,
            "hidden_reference_leaked": False,
            "failed_or_weak_rubric_items": [],
            "artifact_specific_comments": [],
            "trace_specific_comments": [],
            "revision_plan": None,
            "model": None,
            "provider": None,
            "metadata_json": {},
        },
        "revision_plan": {
            "revision_plan_id": "smoke_revision",
            "task_id": "smoke_task",
            "source_attempt_id": "smoke_attempt",
            "target_attempt_id": "smoke_attempt_revised",
            "revision_kind": "local_fix",
            "revision_reason": "No revision needed for smoke.",
            "failed_rubric_items": [],
            "planned_changes": [],
            "expected_score_improvement": 0.0,
            "risk_of_regression": "low",
            "uses_evaluator_feedback": True,
            "metadata_json": {},
        },
    }

    # Role order is preserved; callers iterate this mapping in insertion order.
    return {
        "intake_agent": (
            _json_smoke_prompt("task intake smoke", intake_skeleton),
            _IntakeSmokeOutput,
        ),
        "rubric_agent": (
            _json_smoke_prompt("rubric smoke", rubric_skeleton),
            _RubricSmokeOutput,
        ),
        "grader_agent": (
            _json_smoke_prompt("grader smoke", grader_skeleton),
            GraderResult,
        ),
        "verifier_agent": (
            _json_smoke_prompt("verifier smoke", verifier_skeleton),
            VerifierResult,
        ),
        "evaluator_agent": (
            _json_smoke_prompt("evaluator smoke", evaluator_skeleton),
            _EvaluatorSmokeOutput,
        ),
    }
|
|
740
|
+
|
|
741
|
+
def run_llm_smoke(out_dir: Path, provider_id: str | None = None) -> dict[str, Any]:
    """Run every smoke prompt against the configured provider and tally results.

    Args:
        out_dir: Directory under which one sub-directory per role is used for
            that role's output files.
        provider_id: Provider to use; resolved through ``_configured_provider``.

    Returns:
        A flat counters dict holding the provider status flags, per-role
        ``*_live_call_ok`` / ``*_structured_output_validation_ok`` flags,
        per-role error type and message when a role reports an error, and a
        final ``secret_scan_ok`` flag.
    """
    active_provider = _configured_provider(provider_id)
    provider_status = get_model_provider_status(active_provider)

    counters = {'mentor_model_provider': active_provider}
    counters['mentor_model_provider_api_key_visible'] = bool(provider_status['api_key_visible'])
    counters['mentor_model_provider_client_constructed_ok'] = bool(provider_status['client_constructed_ok'])
    counters['mentor_model_provider_available'] = bool(provider_status['provider_available'])

    if active_provider == "openai":
        # The OpenAI status keys are copied through under the same names.
        openai_status = get_openai_status()
        for flag in (
            'openai_sdk_import_ok',
            'openai_api_key_visible',
            'openai_client_constructed_ok',
            'openai_available',
        ):
            counters[flag] = bool(openai_status[flag])

    secrets_clean = True
    for role, (prompt, schema) in smoke_prompts().items():
        prefix = role.replace('_agent', '')
        try:
            outcome = run_structured_role(
                role,
                prompt,
                schema,
                out_dir / role,
                allow_fallback=True,
                provider_override=active_provider,
            )
        except Exception as exc:
            # A role that raises is recorded as a failed result instead of
            # aborting the remaining roles.
            outcome = RoleResult(
                role=role,
                provider=active_provider,
                model=_provider_model(active_provider),
                live_call_ok=False,
                structured_output_validation_ok=False,
                prompt_ref='',
                output_ref='',
                parsed_output_ref='',
                error_type=type(exc).__name__,
                error_message=_safe_error_message(exc),
                metadata_json={},
            )

        counters[f'{prefix}_live_call_ok'] = bool(outcome.live_call_ok)
        counters[f'{prefix}_structured_output_validation_ok'] = bool(outcome.structured_output_validation_ok)
        if outcome.error_type:
            counters[f'{prefix}_error_type'] = outcome.error_type
            counters[f'{prefix}_error_message'] = outcome.error_message

        # ensure generated role files do not contain secrets
        role_dir = out_dir / role
        if role_dir.exists():
            for candidate in role_dir.rglob('*'):
                if not candidate.is_file():
                    continue
                if contains_secret(candidate.read_text(errors='ignore')):
                    secrets_clean = False

    counters['secret_scan_ok'] = secrets_clean
    return counters
|
|
777
|
+
|
|
778
|
+
def format_smoke_counters(counters: dict[str, Any]) -> str:
    """Render ``counters`` as newline-separated ``key=value`` lines.

    Boolean values are passed through ``_bool`` before formatting; all other
    values are interpolated unchanged. Lines follow the mapping's iteration
    order, and an empty mapping yields an empty string.
    """
    def _render(value: Any) -> Any:
        # Only genuine bools are converted; other values are left alone.
        return _bool(value) if isinstance(value, bool) else value

    return '\n'.join(f'{key}={_render(value)}' for key, value in counters.items())
|