agent-apprenticeship 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +217 -0
- package/bin/agent-apprenticeship.js +131 -0
- package/package.json +30 -0
- package/pyproject.toml +23 -0
- package/src/agent_apprenticeship_trace/__init__.py +2 -0
- package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
- package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
- package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
- package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
- package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
- package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
- package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
- package/src/agent_apprenticeship_trace/certification.py +580 -0
- package/src/agent_apprenticeship_trace/cli.py +2979 -0
- package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
- package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
- package/src/agent_apprenticeship_trace/config.py +609 -0
- package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
- package/src/agent_apprenticeship_trace/env.py +46 -0
- package/src/agent_apprenticeship_trace/evaluator.py +64 -0
- package/src/agent_apprenticeship_trace/grader.py +194 -0
- package/src/agent_apprenticeship_trace/integration_status.py +193 -0
- package/src/agent_apprenticeship_trace/io.py +20 -0
- package/src/agent_apprenticeship_trace/learning.py +627 -0
- package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
- package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
- package/src/agent_apprenticeship_trace/loop.py +111 -0
- package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
- package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
- package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
- package/src/agent_apprenticeship_trace/progress.py +223 -0
- package/src/agent_apprenticeship_trace/public_run.py +1109 -0
- package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
- package/src/agent_apprenticeship_trace/recipes.py +129 -0
- package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
- package/src/agent_apprenticeship_trace/revision.py +21 -0
- package/src/agent_apprenticeship_trace/role_runners.py +7 -0
- package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
- package/src/agent_apprenticeship_trace/schemas.py +273 -0
- package/src/agent_apprenticeship_trace/session_events.py +99 -0
- package/src/agent_apprenticeship_trace/task_intake.py +112 -0
- package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
- package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
- package/src/agent_apprenticeship_trace/training_signals.py +30 -0
- package/src/agent_apprenticeship_trace/validation.py +210 -0
- package/src/agent_apprenticeship_trace/verifier.py +55 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import os, re
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
def parse_env_file(path: Path) -> dict[str,str]:
    """Parse a dotenv-style file into a ``{name: value}`` mapping.

    Blank lines, ``#`` comment lines, lines without ``=`` and lines whose key
    is empty are skipped. A leading ``export `` keyword is dropped so that
    shell-sourceable files parse to the bare variable name. Surrounding
    double quotes and then single quotes are stripped from each value.

    Args:
        path: Location of the env file.

    Returns:
        The parsed pairs; an empty dict when ``path`` is not a regular file.
    """
    vals: dict[str, str] = {}
    # is_file() rather than exists(): a directory with this name would make
    # read_text() raise instead of being treated as "no env file".
    if not path.is_file():
        return vals
    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise become part of the first key.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith('#') or '=' not in line:
            continue
        if line.startswith('export '):
            line = line[len('export '):].lstrip()
        key, value = line.split('=', 1)
        key = key.strip()
        if not key:
            # "=value" has no usable name; an empty key cannot be exported.
            continue
        vals[key] = value.strip().strip('"').strip("'")
    return vals
|
|
13
|
+
|
|
14
|
+
def load_local_env(root: Path | None=None) -> dict[str,str]:
    """Export variables from ``.env.local`` and ``.env`` into ``os.environ``.

    Nothing is loaded when ``AA_DISABLE_LOCAL_ENV`` equals ``"1"``. Files are
    read from ``root`` (the current working directory when omitted), with
    ``.env.local`` first. A name already present in the environment is never
    overwritten, so earlier sources win.

    Returns:
        The pairs that this call actually added to ``os.environ``.
    """
    if os.getenv("AA_DISABLE_LOCAL_ENV") == "1":
        return {}
    base_dir = root or Path.cwd()
    applied: dict[str, str] = {}
    for filename in ('.env.local', '.env'):
        parsed = parse_env_file(base_dir / filename)
        for key, value in parsed.items():
            if key in os.environ:
                continue
            os.environ[key] = value
            applied[key] = value
    return applied
|
|
24
|
+
|
|
25
|
+
# Compiled detectors for credential-like tokens and user identifiers.
# redact_secrets() applies them in list order, so "dsk-" must come before the
# generic "sk-" pattern: otherwise the generic one consumes the "sk-..." tail
# of a "dsk-" key and leaves a stray "d" in front of the placeholder.
SECRET_PATTERNS=[
    re.compile(r"sk-proj-[A-Za-z0-9_-]{20,}"),
    re.compile(r"sk-ant-[A-Za-z0-9_-]{20,}"),
    re.compile(r"sk-or-v1-[A-Za-z0-9_-]{20,}"),
    re.compile(r"dsk-[A-Za-z0-9_-]{20,}"),
    re.compile(r"sk-[A-Za-z0-9_-]{20,}"),
    re.compile(r"AIza[A-Za-z0-9_-]{20,}"),
    re.compile(r"user_[A-Za-z0-9_-]{12,}"),
    re.compile(r"(['\"]user_id['\"]\s*:\s*)['\"][^'\"]+['\"]"),
]


def contains_secret(text: str) -> bool:
    """Return True when any pattern in ``SECRET_PATTERNS`` matches ``text``.

    A falsy ``text`` (``None`` or ``""``) is treated as the empty string.
    """
    haystack = text or ''
    return any(pattern.search(haystack) for pattern in SECRET_PATTERNS)


def _redact_user_id_field(match: re.Match) -> str:
    """Replace the value of a matched ``user_id`` field, keeping its quote style."""
    prefix = match.group(1)
    # The character right after the captured `"user_id": ` prefix is the
    # opening quote of the value. Reusing it keeps JSON text valid JSON.
    quote = match.group(0)[len(prefix)]
    return f"{prefix}{quote}[REDACTED_USER_ID]{quote}"


def redact_secrets(text: str) -> str:
    """Return ``text`` with every ``SECRET_PATTERNS`` match replaced.

    Key-like tokens become ``[REDACTED_SECRET]``; ``user_...`` identifiers and
    the value of a quoted ``user_id`` field become ``[REDACTED_USER_ID]``.
    A falsy ``text`` yields ``''``.
    """
    text = text or ''
    for pattern in SECRET_PATTERNS:
        if 'user_id' in pattern.pattern:
            text = pattern.sub(_redact_user_id_field, text)
        elif pattern.pattern.startswith('user_'):
            text = pattern.sub('[REDACTED_USER_ID]', text)
        else:
            text = pattern.sub('[REDACTED_SECRET]', text)
    return text
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
from .schemas import EvaluatorFeedback, GraderResult, VerifierResult, AgentTrace, ActualOutputs, RevisionPlan
|
|
6
|
+
from .config import get_settings
|
|
7
|
+
from .io import read_json
|
|
8
|
+
from .openai_structured import get_model_provider_status, run_structured_role
|
|
9
|
+
from .public_sanitizer import sha256_text
|
|
10
|
+
from .artifact_previews import build_artifact_previews
|
|
11
|
+
|
|
12
|
+
def _mentor_provider_can_attempt() -> bool:
    """Report whether the provider status has a truthy ``provider_available``."""
    provider_status = get_model_provider_status()
    return bool(provider_status.get('provider_available'))
|
|
14
|
+
|
|
15
|
+
def _mentor_provider_id() -> str:
    """Return the configured model provider, or ``'openai'`` when it is unset."""
    configured = get_settings().model_provider
    return configured or 'openai'
|
|
18
|
+
|
|
19
|
+
def _model_evaluator_active(settings) -> bool:
    """Tell whether the settings select a model-backed evaluator.

    True when ``mentor_mode`` is ``model_assisted``/``hybrid`` or when
    ``evaluation_mode`` is ``hybrid``/``llm_required``.
    """
    if settings.mentor_mode in {'model_assisted', 'hybrid'}:
        return True
    return settings.evaluation_mode in {'hybrid', 'llm_required'}
|
|
21
|
+
|
|
22
|
+
# Structured output requested from the model evaluator role: the feedback for
# the evaluated attempt together with the plan for the revised attempt.
# (Documented with comments rather than a docstring so the model's generated
# schema is left exactly as it was.)
class LLMEvaluatorOutput(BaseModel):
    # Feedback on the attempt that was evaluated.
    evaluator_feedback: EvaluatorFeedback
    # Revision plan accompanying that feedback.
    revision_plan: RevisionPlan
|
|
25
|
+
|
|
26
|
+
def deterministic_feedback(grader: GraderResult) -> EvaluatorFeedback:
    """Build canned, model-free feedback from a grader result.

    The feedback type and revision priority depend only on whether the grader
    reported any failed criteria; the evidence refs are reused as artifact refs.
    """
    failed = grader.failed_criteria
    repair_hint = 'Create or repair missing/weak deliverables.'
    return EvaluatorFeedback(
        feedback_id=f'feedback_{grader.attempt_id}',
        task_id=grader.task_id,
        attempt_id=grader.attempt_id,
        target_actor='worker',
        feedback_type='criteria_failure' if failed else 'other',
        failed_rubric_items=failed,
        evidence_refs=grader.evidence_refs,
        artifact_refs=grader.evidence_refs,
        feedback_summary='Improve any failed rubric items and preserve valid artifacts.',
        actionable_feedback=[
            'Check every required artifact exists.',
            'Address failed criteria explicitly.',
        ],
        suggested_revision=repair_hint,
        revision_priority='high' if failed else 'low',
        confidence=0.75,
        hidden_reference_used=False,
        hidden_reference_leaked=False,
        failed_or_weak_rubric_items=failed,
        artifact_specific_comments=[],
        trace_specific_comments=[],
        revision_plan=repair_hint,
        model=None,
        provider=None,
        metadata_json={},
    )
|
|
28
|
+
|
|
29
|
+
def _revision_plan_from_feedback(fb: EvaluatorFeedback, target_attempt_id: str) -> RevisionPlan:
    """Derive a ``local_fix`` revision plan from evaluator feedback.

    The plan copies the feedback's summary, failed rubric items and actionable
    feedback, and points from the feedback's attempt to ``target_attempt_id``.
    """
    return RevisionPlan(
        revision_plan_id=f'revision_plan_{fb.task_id}',
        task_id=fb.task_id,
        source_attempt_id=fb.attempt_id,
        target_attempt_id=target_attempt_id,
        revision_kind='local_fix',
        revision_reason=fb.feedback_summary,
        failed_rubric_items=fb.failed_rubric_items,
        planned_changes=fb.actionable_feedback,
        expected_score_improvement=0.1,
        risk_of_regression='low',
        uses_evaluator_feedback=True,
        metadata_json={},
    )
|
|
31
|
+
|
|
32
|
+
def _evaluator_prompt(grader: GraderResult, verifier: VerifierResult, outputs: ActualOutputs, trace: AgentTrace | None, target_attempt_id: str, artifact_preview_bundle: dict | None=None) -> str:
    """Assemble the evaluator prompt text.

    The fixed instructions and JSON skeleton come first, followed by the
    target attempt id and the JSON dumps of the grader result, verifier
    result, actual outputs, artifact previews (cut to 16000 characters) and
    trace (cut to 12000 characters; ``{}`` when no trace is given).
    """
    instructions = (
        'Return only valid JSON. Do not include markdown. Do not add extra top-level fields; place extras under metadata_json.extra_model_fields.\n'
        'Required skeleton: {"evaluator_feedback":{"feedback_id":"feedback_<task_id>_<attempt_kind>","task_id":"...","attempt_id":"...","target_actor":"worker","feedback_type":"other","failed_rubric_items":[],"evidence_refs":[],"artifact_refs":[],"feedback_summary":"...","actionable_feedback":[],"suggested_revision":"...","revision_priority":"medium","confidence":0.7,"hidden_reference_used":false,"hidden_reference_leaked":false,"failed_or_weak_rubric_items":[],"artifact_specific_comments":[],"trace_specific_comments":[],"metadata_json":{}},"revision_plan":{"revision_plan_id":"revision_plan_<task_id>","task_id":"...","source_attempt_id":"...","target_attempt_id":"...","revision_kind":"local_fix","revision_reason":"...","failed_rubric_items":[],"planned_changes":[],"risk_of_regression":"medium","uses_evaluator_feedback":true,"metadata_json":{}}}.\n'
        'Evaluate the attempt using grader/verifier outputs, actual outputs, artifact content previews, artifacts, and trace. Produce actionable feedback and a concrete revision plan.\n'
        'Target revised attempt id: '
    )
    # Each section is a heading line followed by its JSON payload; joining
    # with newlines reproduces the "\nHeading:\n" separators.
    sections = [
        instructions + target_attempt_id,
        "GraderResult JSON:",
        json.dumps(grader.model_dump(), sort_keys=True),
        "VerifierResult JSON:",
        json.dumps(verifier.model_dump(), sort_keys=True),
        "ActualOutputs JSON:",
        json.dumps(outputs.model_dump(), sort_keys=True),
        "Artifact content previews JSON:",
        json.dumps(artifact_preview_bundle or {}, sort_keys=True)[:16000],
        "Trace summary JSON:",
        json.dumps((trace.model_dump() if trace else {}), sort_keys=True)[:12000],
    ]
    return "\n".join(sections)
|
|
37
|
+
|
|
38
|
+
def evaluate_attempt(grader: GraderResult, verifier: VerifierResult, outputs: ActualOutputs, trace: AgentTrace | None, target_attempt_id: str, role_root: Path | None=None, package_root: Path | None=None) -> tuple[EvaluatorFeedback, RevisionPlan]:
    """Produce evaluator feedback and a revision plan for one attempt.

    When the settings select a model evaluator, a provider is available and
    ``llm_evaluator_enabled`` is set, the ``evaluator_agent`` role is run and
    its parsed output is returned, annotated with provider/model and prompt
    references. Otherwise, or when that path fails and the configuration is
    not fail-closed, deterministic feedback derived from ``grader`` is
    returned instead.

    Raises:
        RuntimeError: the model call did not succeed and ``evaluation_mode``
            is ``llm_required`` or ``llm_fail_closed`` is set. Any other
            exception from the model path is re-raised under the same
            condition.
    """
    settings = get_settings()
    role_root = role_root or Path('outputs/roles')
    preview_refs = grader.evidence_refs or outputs.deliverable_refs or outputs.artifact_refs or outputs.files_created
    artifact_preview_bundle = build_artifact_previews(package_root, preview_refs)

    if _model_evaluator_active(settings) and _mentor_provider_can_attempt() and settings.llm_evaluator_enabled:
        prompt = _evaluator_prompt(grader, verifier, outputs, trace, target_attempt_id, artifact_preview_bundle)
        try:
            provider = _mentor_provider_id()
            # The evaluator-specific model override is only applied for 'openai'.
            model_override = settings.llm_evaluator_model if provider == 'openai' else None
            role_dir = role_root/'evaluator_agent'/outputs.attempt_kind
            preview_keys = (
                'artifact_content_refs',
                'artifact_content_previews',
                'artifact_content_hashes',
                'artifact_content_preview_truncated',
                'model_grading_basis',
            )
            normalizer_context = {
                'task_id': outputs.task_id,
                'attempt_id': outputs.attempt_id,
                'attempt_kind': outputs.attempt_kind,
                'target_attempt_id': target_attempt_id,
                'grader_result_id': grader.grader_result_id,
                'verifier_result_id': verifier.verifier_result_id,
                **{key: artifact_preview_bundle.get(key) for key in preview_keys},
                'model': model_override or settings.model_provider_model,
                'provider': provider,
            }
            rr = run_structured_role(
                'evaluator_agent',
                prompt,
                LLMEvaluatorOutput,
                role_dir,
                allow_fallback=settings.allow_deterministic_eval_fallback,
                model_override=model_override,
                normalizer_context=normalizer_context,
            )
            if rr.live_call_ok and rr.structured_output_validation_ok:
                parsed = read_json(role_dir/'parsed_output.json')
                fb = EvaluatorFeedback.model_validate(parsed['evaluator_feedback'])
                rp = RevisionPlan.model_validate(parsed['revision_plan'])
                fb.provider = rr.provider
                fb.model = rr.model
                fb.revision_plan = fb.revision_plan or rp.revision_reason
                prompt_ref = str(role_dir/'prompt.md')
                response_ref = str(role_dir/'raw_output.txt')
                fb.metadata_json.update({
                    'llm_prompt_ref_internal': prompt_ref,
                    'llm_response_ref_internal': response_ref,
                    'prompt_hash': sha256_text(prompt),
                    'public_response_summary': 'Model evaluator generated actionable feedback and revision plan.',
                    **artifact_preview_bundle,
                })
                rp.metadata_json.update({
                    'provider': rr.provider,
                    'model': rr.model,
                    'llm_prompt_ref_internal': prompt_ref,
                    'llm_response_ref_internal': response_ref,
                })
                return fb, rp
            if settings.evaluation_mode == 'llm_required' or settings.llm_fail_closed:
                raise RuntimeError(rr.error_message or 'Model evaluator failed')
        except Exception:
            # Best-effort by design: errors only propagate when the
            # configuration is fail-closed; otherwise fall through to the
            # deterministic feedback below.
            if settings.evaluation_mode == 'llm_required' or settings.llm_fail_closed:
                raise

    fb = deterministic_feedback(grader)
    rp = _revision_plan_from_feedback(fb, target_attempt_id)
    if _mentor_provider_can_attempt() and settings.llm_evaluator_enabled and settings.evaluation_mode != 'deterministic_only':
        fb.metadata_json.update({
            'model_evaluator_enabled': True,
            'llm_evaluator_enabled': True,
            'llm_unavailable': True,
            'deterministic_fallback': True,
            **artifact_preview_bundle,
        })
    return fb, rp
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import json, re
|
|
4
|
+
from .schemas import GraderResult, RubricSpec, ActualOutputs, RubricItemScore
|
|
5
|
+
from .config import get_settings
|
|
6
|
+
from .schemas import AgentTrace
|
|
7
|
+
from .io import read_json, write_json
|
|
8
|
+
from .openai_structured import get_model_provider_status, run_structured_role
|
|
9
|
+
from .public_sanitizer import sha256_text
|
|
10
|
+
from .artifact_previews import build_artifact_previews
|
|
11
|
+
|
|
12
|
+
# File basenames that _is_output_artifact_name() never accepts as outputs.
INPUT_ARTIFACT_NAMES = {
    'payments.csv',
    'invoices.csv',
    'fx_rates.csv',
    'vendor_aliases.csv',
    'reconciliation_policy.md',
}
|
|
15
|
+
|
|
16
|
+
def _mentor_provider_can_attempt() -> bool:
    """Report whether the provider status has a truthy ``provider_available``."""
    provider_status = get_model_provider_status()
    return bool(provider_status.get('provider_available'))
|
|
18
|
+
|
|
19
|
+
def _mentor_provider_id() -> str:
    """Return the configured model provider, or ``'openai'`` when it is unset."""
    configured = get_settings().model_provider
    return configured or 'openai'
|
|
22
|
+
|
|
23
|
+
def _model_grading_active(settings) -> bool:
    """Tell whether the settings select a model-backed grader.

    True when ``mentor_mode`` is ``model_assisted``/``hybrid`` or when
    ``evaluation_mode`` is ``hybrid``/``llm_required``.
    """
    if settings.mentor_mode in {'model_assisted', 'hybrid'}:
        return True
    return settings.evaluation_mode in {'hybrid', 'llm_required'}
|
|
25
|
+
|
|
26
|
+
def _is_output_artifact_name(name: str, output_names: set[str]) -> bool:
    """Decide whether ``name`` should count as an output artifact.

    Only the basename is considered. Names listed in ``INPUT_ARTIFACT_NAMES``
    are rejected; anything else is accepted when ``output_names`` is empty or
    contains the basename.
    """
    filename = Path(str(name)).name
    if filename in INPUT_ARTIFACT_NAMES:
        return False
    if not output_names:
        return True
    return filename in output_names
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _expected_items_from_outputs(outputs: ActualOutputs) -> list[str]:
    """Return basenames of ``metadata_json['expected_deliverable_items']``.

    Falsy entries are dropped; missing metadata or a missing key yields ``[]``.
    """
    metadata = outputs.metadata_json or {}
    declared = metadata.get('expected_deliverable_items') or []
    names = []
    for entry in declared:
        if entry:
            names.append(Path(str(entry)).name)
    return names
|
|
37
|
+
|
|
38
|
+
def _produced_items(outputs: ActualOutputs) -> list[str]:
    """Return de-duplicated basenames of everything the attempt reports producing.

    Sources, in order: ``deliverable_refs``, ``artifact_refs``,
    ``files_created`` and ``metadata_json['produced_deliverable_items']``
    (stringified). First-seen order is kept.
    """
    collected = []
    for group in (outputs.deliverable_refs, outputs.artifact_refs, outputs.files_created):
        collected.extend(group or [])
    metadata = outputs.metadata_json or {}
    collected.extend(str(item) for item in (metadata.get('produced_deliverable_items') or []))
    # A dict is used as an ordered set.
    unique = {}
    for ref in collected:
        if ref:
            unique.setdefault(Path(str(ref)).name, None)
    return list(unique)
|
|
43
|
+
|
|
44
|
+
def _deliverable_match(outputs: ActualOutputs, evidence: list[str], missing: list[str]) -> dict[str, object]:
    """Compare expected deliverable names with what was produced.

    Without declared expectations the status is ``exact`` unless ``missing``
    is non-empty (then ``partial``). With expectations it is ``exact`` when
    every expected name was produced, ``partial`` when some are absent but
    there is evidence or an unexpected extra item, and ``mismatch`` otherwise.

    Returns:
        A dict with ``expected_deliverable_items``,
        ``produced_deliverable_items``, ``deliverable_match_status`` and
        ``deliverable_mismatches``.
    """
    expected = _expected_items_from_outputs(outputs)
    produced = _produced_items(outputs)
    mismatches = []
    if expected:
        expected_names = set(expected)
        produced_names = set(produced)
        absent = sorted(expected_names - produced_names)
        unexpected = sorted(produced_names - expected_names)
        mismatches = [{'expected': name, 'status':'missing_or_substituted'} for name in absent]
        if not absent:
            status = 'exact'
        elif unexpected or evidence:
            status = 'partial'
        else:
            status = 'mismatch'
    else:
        status = 'partial' if missing else 'exact'
    return {
        'expected_deliverable_items': expected,
        'produced_deliverable_items': produced,
        'deliverable_match_status': status,
        'deliverable_mismatches': mismatches,
    }
|
|
60
|
+
|
|
61
|
+
def _candidate_refs(outputs: ActualOutputs, artifact_name: str, attempt_kind: str) -> list[str]:
    """List the refs that could point at ``artifact_name`` for this attempt.

    Candidates are the output refs plus two conventional locations
    (``attempts/<kind>/artifacts/<name>`` and ``artifacts/<name>``), kept
    only when their basename is ``artifact_name``. Refs starting with
    ``artifacts/`` are rewritten under ``attempts/<kind>/``. The result is
    de-duplicated in first-seen order.
    """
    pool = [
        *(outputs.deliverable_refs or []),
        *(outputs.artifact_refs or []),
        *(outputs.files_created or []),
        f'attempts/{attempt_kind}/artifacts/{artifact_name}',
        f'artifacts/{artifact_name}',
    ]
    matches = {}
    for raw in pool:
        if not raw:
            continue
        ref = str(raw)
        if not (ref.endswith('/'+artifact_name) or Path(ref).name == artifact_name):
            continue
        if ref.startswith('artifacts/'):
            ref = f'attempts/{attempt_kind}/{ref}'
        matches.setdefault(ref, None)
    return list(matches)
|
|
77
|
+
|
|
78
|
+
def _resolve_required_artifacts(rubric: RubricSpec, outputs: ActualOutputs, attempt_kind: str, package_root: Path | None=None) -> tuple[list[str], list[str]]:
    """Resolve which required artifacts have evidence and which are missing.

    The required names come from the first non-empty source among: expected
    deliverable items in the outputs metadata, per-item
    ``required_artifacts`` of the rubric, and the rubric-level
    ``required_artifacts``. They are filtered by ``_is_output_artifact_name``;
    if nothing remains, every output ref containing ``artifacts/`` is used.

    A name is resolved when one of its candidate refs is listed in the
    outputs or exists under ``package_root``. Without a ``package_root``, a
    candidate is also accepted when ``outputs.status == 'success'``.

    Returns:
        ``(evidence, missing)``: resolved refs, and the names left unresolved.
    """
    output_refs = list(outputs.deliverable_refs or []) + list(outputs.artifact_refs or []) + list(outputs.files_created or [])
    output_names = {
        Path(str(x)).name
        for x in output_refs
        if x and ('artifacts/' in str(x) or str(x).startswith(f'attempts/{attempt_kind}/'))
    }

    required = list(_expected_items_from_outputs(outputs))
    if not required:
        for item in rubric.rubric_items:
            required.extend(item.required_artifacts)
    if not required:
        required.extend(rubric.required_artifacts)
    required = [Path(x).name for x in required if x and _is_output_artifact_name(str(x), output_names)]
    if not required:
        required = [Path(x).name for x in output_refs if x and 'artifacts/' in str(x)]
    required = list(dict.fromkeys(required))

    evidence = []
    missing = []
    # Built from the None-safe list above; concatenating the raw attributes
    # raises TypeError when any of them is None.
    refs = set(output_refs)
    for name in required:
        cands = _candidate_refs(outputs, name, attempt_kind)
        found = None
        for c in cands:
            path_ok = bool(package_root and (package_root/c).exists())
            if c in refs or path_ok:
                found = c
                break
        if not found and package_root:
            c = f'attempts/{attempt_kind}/artifacts/{name}'
            if (package_root/c).exists():
                found = c
        if not found and not package_root:
            # No package root to check files against: trust the reported
            # refs, or the attempt's own success status.
            for c in cands:
                if Path(c).name == name and (c in refs or outputs.status == 'success'):
                    found = c
                    break
        if found:
            evidence.append(found)
        else:
            missing.append(name)
    return evidence, missing
|
|
113
|
+
|
|
114
|
+
def deterministic_grade(rubric: RubricSpec, outputs: ActualOutputs, attempt_kind: str, package_root: Path | None=None) -> GraderResult:
    """Grade an attempt purely on the presence of its required artifacts.

    The artifact-contract score is 1.0 when the attempt succeeded, nothing is
    missing, there is evidence and the deliverable match is ``exact``.
    Otherwise it is the fraction of required artifacts that were resolved.
    Each rubric item passes when the attempt succeeded, none of its own
    required artifacts is missing and some evidence exists. Semantic
    correctness is not evaluated.

    Returns:
        A ``GraderResult`` with ``grader_kind='deterministic'``.
    """
    evidence, missing = _resolve_required_artifacts(rubric, outputs, attempt_kind, package_root)
    match = _deliverable_match(outputs, evidence, missing)

    # Denominator: the declared expected items when there are any, otherwise
    # everything that was required (found + missing). Using len(missing)
    # alone let e.g. 3 found / 1 missing clamp to a perfect 1.0.
    expected_count = len(match.get('expected_deliverable_items') or [])
    required_count = max(1, expected_count or (len(evidence) + len(missing)))

    succeeded = outputs.status == 'success'
    exact_match = match['deliverable_match_status'] == 'exact'
    if succeeded and not missing and bool(evidence) and exact_match:
        artifact_contract_score = 1.0
    else:
        artifact_contract_score = max(0.0, min(1.0, len(evidence)/required_count))
    artifact_contract_passed = succeeded and artifact_contract_score >= 1.0 and not missing and exact_match

    # Loop-invariant: basenames of refs that look like attempt outputs.
    all_refs = (outputs.deliverable_refs or []) + (outputs.artifact_refs or []) + (outputs.files_created or [])
    output_names = {
        Path(str(x)).name
        for x in all_refs
        if x and ('artifacts/' in str(x) or str(x).startswith(f'attempts/{attempt_kind}/'))
    }

    item_scores = []
    for item in rubric.rubric_items:
        req = [Path(x).name for x in item.required_artifacts if _is_output_artifact_name(str(x), output_names)]
        # Fall back to all evidence when none matches this item's artifacts.
        item_evidence = [e for e in evidence if Path(e).name in req] or evidence
        item_missing = [m for m in missing if m in req]
        ok = succeeded and not item_missing and bool(item_evidence)
        item_scores.append(RubricItemScore(
            rubric_item_id=item.rubric_item_id,
            criterion_name=item.criterion_name,
            score=1.0 if ok else 0.0,
            max_score=1.0,
            passed=ok,
            evidence_refs=item_evidence,
            failure_mode=None if ok else 'missing_artifact',
            notes='Deterministic artifact-contract precheck only; semantic correctness is not evaluated.',
            confidence=0.8,
            artifact_presence_ok=ok,
            semantic_correctness_score=None,
            reasoning_summary='Required artifact paths were resolved.' if ok else 'One or more required artifacts were not resolved.',
            improvement_suggestion=None if ok else 'Create the missing required artifact files.',
        ))

    score = artifact_contract_score
    return GraderResult(
        grader_result_id=f'grader_{outputs.attempt_id}',
        task_id=outputs.task_id,
        attempt_id=outputs.attempt_id,
        attempt_kind=attempt_kind,
        rubric_id=rubric.rubric_id,
        grader_kind='deterministic',
        score_source='deterministic_artifact_contract',
        score=score,
        max_score=1.0,
        passed=score>=rubric.pass_threshold,
        rubric_item_scores=item_scores,
        failed_criteria=[s.rubric_item_id for s in item_scores if not s.passed],
        passed_criteria=[s.rubric_item_id for s in item_scores if s.passed],
        evidence_refs=evidence,
        confidence=0.8,
        reasoning_summary='Deterministic precheck verifies artifact presence/readiness only; semantic correctness is not evaluated.',
        limitations=['semantic_correctness_not_evaluated'],
        hidden_reference_used=False,
        hidden_reference_leaked=False,
        artifact_contract_score=artifact_contract_score,
        semantic_score=None,
        final_score=score,
        model=None,
        provider=None,
        deterministic_precheck_ref=None,
        public_response_summary='Artifact-contract precheck completed.',
        metadata_json={
            'artifact_contract_passed': artifact_contract_passed,
            'semantic_correctness_not_evaluated': True,
            'missing_artifacts': missing,
            **match,
        },
    )
|
|
130
|
+
|
|
131
|
+
def _grader_prompt(rubric: RubricSpec, outputs: ActualOutputs, trace: AgentTrace | None, precheck: GraderResult, artifact_preview_bundle: dict | None=None) -> str:
    """Assemble the model-grader prompt text.

    The fixed instructions are followed by the JSON dumps of the rubric,
    actual outputs, artifact previews (cut to 16000 characters), trace (cut
    to 12000 characters; ``{}`` when no trace is given) and the deterministic
    precheck.
    """
    # Heading lines alternate with JSON payloads; joining with newlines
    # reproduces the "\nHeading:\n" separators.
    sections = [
        "Return only valid JSON matching GraderResult. Do not include markdown.",
        "You are the model-based grader. Score each rubric item using the task rubric, attempt trace, actual outputs, and artifact content previews when available. Return a JSON object with exactly these required top-level fields: grader_result_id, task_id, attempt_id, attempt_kind, rubric_id, grader_kind, score_source, artifact_contract_score, model_score, semantic_score, final_score, score, max_score, passed, confidence, evidence_refs, rubric_item_scores, limitations, hidden_reference_used, hidden_reference_leaked, metadata_json. Each rubric_item_scores entry must include artifact_presence_ok, semantic_correctness_score, evidence_refs, reasoning_summary, confidence, failure_mode, and improvement_suggestion. Cite evidence_refs, distinguish artifact presence from model-judged correctness, and set score_source=model_judged and grader_kind=model. Deterministic artifact-contract precheck is auxiliary only.",
        "Rubric JSON:",
        json.dumps(rubric.model_dump(), sort_keys=True),
        "ActualOutputs JSON:",
        json.dumps(outputs.model_dump(), sort_keys=True),
        "Artifact content previews JSON:",
        json.dumps(artifact_preview_bundle or {}, sort_keys=True)[:16000],
        "Trace summary JSON:",
        json.dumps((trace.model_dump() if trace else {}), sort_keys=True)[:12000],
        "Deterministic precheck JSON:",
        json.dumps(precheck.model_dump(), sort_keys=True),
    ]
    return "\n".join(sections)
|
|
136
|
+
|
|
137
|
+
def grade_attempt(rubric: RubricSpec, outputs: ActualOutputs, attempt_kind: str, trace: AgentTrace | None=None, role_root: Path | None=None, package_root: Path | None=None) -> GraderResult:
    """Grade an attempt, using the model grader when it is enabled.

    A deterministic artifact precheck always runs first. If the settings
    select model grading, a provider is available and ``llm_grader_enabled``
    is set, the ``grader_agent`` role is run and its validated result is
    returned with the precheck attached. Otherwise, or when the model path
    fails and the configuration is not fail-closed, the precheck itself is
    returned, marked as a fallback where applicable.

    Raises:
        RuntimeError: the model call did not succeed and ``evaluation_mode``
            is ``llm_required`` or ``llm_fail_closed`` is set. Any other
            exception from the model path is re-raised under the same
            condition.
    """
    settings = get_settings()
    pre = deterministic_grade(rubric, outputs, attempt_kind, package_root)
    role_root = role_root or Path('outputs/roles')
    preview_refs = pre.evidence_refs or outputs.deliverable_refs or outputs.artifact_refs or outputs.files_created
    artifact_preview_bundle = build_artifact_previews(package_root, preview_refs)
    pre.metadata_json.update(artifact_preview_bundle)

    if _model_grading_active(settings) and _mentor_provider_can_attempt() and settings.llm_grader_enabled:
        prompt = _grader_prompt(rubric, outputs, trace, pre, artifact_preview_bundle)
        role_dir = role_root/'grader_agent'/attempt_kind
        role_dir.mkdir(parents=True, exist_ok=True)
        precheck_path = role_dir/'deterministic_precheck.json'
        write_json(precheck_path, pre)
        try:
            provider = _mentor_provider_id()
            # The grader-specific model override is only applied for 'openai'.
            model_override = settings.llm_grader_model if provider == 'openai' else None
            preview_keys = (
                'artifact_content_refs',
                'artifact_content_previews',
                'artifact_content_hashes',
                'artifact_content_preview_truncated',
                'model_grading_basis',
            )
            normalizer_context = {
                'task_id': outputs.task_id,
                'attempt_id': outputs.attempt_id,
                'attempt_kind': attempt_kind,
                'rubric_id': rubric.rubric_id,
                'artifact_contract_score': pre.artifact_contract_score,
                'evidence_refs': pre.evidence_refs,
                **{key: artifact_preview_bundle.get(key) for key in preview_keys},
                'model': model_override or settings.model_provider_model,
                'provider': provider,
            }
            rr = run_structured_role(
                'grader_agent',
                prompt,
                GraderResult,
                role_dir,
                allow_fallback=settings.allow_deterministic_eval_fallback,
                model_override=model_override,
                normalizer_context=normalizer_context,
            )
            if rr.live_call_ok and rr.structured_output_validation_ok:
                parsed = read_json(role_dir/'parsed_output.json')
                g = GraderResult.model_validate(parsed)
                g.grader_kind = 'model'
                g.score_source = 'model_judged'
                g.legacy_score_source = 'llm_semantic'
                g.artifact_contract_score = pre.artifact_contract_score
                # Fill score fields in dependency order: semantic first, then
                # the fields that default to it, then the headline score.
                g.semantic_score = g.semantic_score if g.semantic_score is not None else g.score
                g.model_score = g.model_score if g.model_score is not None else g.semantic_score
                g.legacy_semantic_score = g.legacy_semantic_score if g.legacy_semantic_score is not None else g.semantic_score
                g.final_score = g.final_score if g.final_score is not None else g.semantic_score
                g.score = g.final_score
                g.model = rr.model
                g.provider = rr.provider
                g.deterministic_precheck_ref = str(precheck_path)
                g.llm_prompt_ref_internal = str(role_dir/'prompt.md')
                g.llm_response_ref_internal = str(role_dir/'raw_output.txt')
                g.public_prompt_hash = sha256_text(prompt)
                g.public_response_summary = g.public_response_summary or 'Model grader scored rubric items with evidence refs and artifact previews when available.'
                g.metadata_json.update({
                    'deterministic_precheck': pre.model_dump(),
                    **artifact_preview_bundle,
                    'llm_role_result_ref_internal': str(role_dir/'role_result.json'),
                })
                return g
            if settings.evaluation_mode == 'llm_required' or settings.llm_fail_closed:
                raise RuntimeError(rr.error_message or 'Model grader failed')
        except Exception:
            # Best-effort by design: errors only propagate when the
            # configuration is fail-closed; otherwise the precheck is used.
            if settings.evaluation_mode == 'llm_required' or settings.llm_fail_closed:
                raise

    if settings.evaluation_mode == 'deterministic_only':
        pre.semantic_score = None
        pre.final_score = pre.score
    elif _mentor_provider_can_attempt() and settings.llm_grader_enabled:
        pre.grader_kind = 'hybrid'
        pre.score_source = 'deterministic_fallback'
        pre.semantic_score = None
        pre.final_score = pre.score
        provider = _mentor_provider_id()
        pre.model = settings.model_provider_model or settings.llm_grader_model
        pre.provider = provider
        pre.limitations.append('Model grader unavailable; deterministic fallback explicitly marked.')
        pre.metadata_json.update({
            'llm_grader_enabled': True,
            'llm_unavailable': True,
            'deterministic_fallback': True,
        })
    return pre
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def apply_score_reliability(grader: GraderResult, verifier) -> GraderResult:
    """Stamp verifier-derived reliability fields onto ``grader`` and return it.

    ``verification_status`` ``'verified'`` maps to reliability ``'verified'``,
    ``'failed'`` to ``'failed_verification'`` and anything else (including a
    missing status, recorded as ``'not_run'``) to ``'needs_review'``. The
    issue count and a summary of the first five issues are stored both as
    attributes and in ``grader.metadata_json``.
    """
    status = getattr(verifier, 'verification_status', None) or 'not_run'
    issues = list(getattr(verifier, 'issues', None) or [])

    reliability = 'needs_review'
    if status == 'verified':
        reliability = 'verified'
    elif status == 'failed':
        reliability = 'failed_verification'

    issue_count = len(issues)
    summary = '; '.join(str(issue) for issue in issues[:5]) if issues else None

    grader.score_reliability = reliability
    grader.verifier_status = status
    grader.verifier_confidence = getattr(verifier, 'confidence', None)
    grader.verifier_issue_count = issue_count
    grader.verifier_issues_summary = summary

    grader.metadata_json.update({
        'score_reliability': reliability,
        'verifier_status': status,
        'verifier_confidence': getattr(verifier, 'confidence', None),
        'verifier_issue_count': issue_count,
        'verifier_issues_summary': grader.verifier_issues_summary,
    })
    return grader
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from .config import get_settings, mentor_model_provider_readiness, apprentice_agent_readiness_status
|
|
8
|
+
from .io import append_jsonl, read_jsonl
|
|
9
|
+
from .recipes import MODEL_PROVIDER_RECIPES, WORKER_AGENT_RECIPES
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def smoke_cache_path(app_home: Path | None = None):
    """Return the path of ``integration_smoke_results.jsonl``.

    The file is placed under ``app_home`` when given, otherwise under the
    ``app_home`` of the current settings.
    """
    home = app_home if app_home else get_settings().app_home
    return home / "integration_smoke_results.jsonl"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def record_certification_result(
    *,
    provider_type: str,
    provider_id: str,
    result: str,
    certification_kind: str,
    agent_id: str | None = None,
    model_provider_id: str | None = None,
    error_type: str | None = None,
    error_summary: str | None = None,
    command_or_model: str | None = None,
    app_home: Path | None = None,
    metadata_json: dict[str, Any] | None = None,
) -> None:
    """Append one certification result row to the smoke-results JSONL file.

    The row carries the given fields plus a UTC ISO-8601 ``timestamp``;
    ``metadata_json`` defaults to ``{}``. An ``OSError`` while resolving or
    writing the file is ignored.
    """
    row = dict(
        provider_type=provider_type,
        provider_id=provider_id,
        result=result,
        certification_kind=certification_kind,
        timestamp=datetime.now(timezone.utc).isoformat(),
        agent_id=agent_id,
        model_provider_id=model_provider_id,
        error_type=error_type,
        error_summary=error_summary,
        command_or_model=command_or_model,
        metadata_json=metadata_json or {},
    )
    try:
        append_jsonl(smoke_cache_path(app_home), row)
    except OSError:
        # Live smokes are often run from restricted CI/sandbox contexts. A
        # non-writable local app home should not hide the actual provider result.
        pass
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def record_live_smoke_result(
    *,
    provider_type: str,
    provider_id: str,
    result: str,
    error_type: str | None = None,
    error_summary: str | None = None,
    command_or_model: str | None = None,
) -> None:
    """Record a live-smoke outcome via ``record_certification_result``.

    The certification kind is derived from ``provider_type``:
    ``"apprentice_agent"`` maps to ``"agent_live_smoke"``,
    ``"mentor_model_provider"`` maps to ``"model_live_smoke"``, and any other
    value falls back to ``"live_smoke"``. The remaining arguments are passed
    through unchanged.
    """
    kind_by_provider_type = {
        "apprentice_agent": "agent_live_smoke",
        "mentor_model_provider": "model_live_smoke",
    }
    certification_kind = kind_by_provider_type.get(provider_type, "live_smoke")
    record_certification_result(
        provider_type=provider_type,
        provider_id=provider_id,
        result=result,
        certification_kind=certification_kind,
        error_type=error_type,
        error_summary=error_summary,
        command_or_model=command_or_model,
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def latest_smoke_results(app_home: Path | None = None) -> dict[tuple[str, str], dict[str, Any]]:
    """Index the cached rows by ``(provider_type, provider_id)``, last row wins.

    Rows are read in file order and every row participates regardless of its
    ``certification_kind``; a later row with the same key replaces an earlier
    one. Both key components are passed through ``str()``, so a missing field
    becomes the string ``"None"``.

    Args:
        app_home: Optional directory forwarded to ``smoke_cache_path``.

    Returns:
        Mapping from ``(provider_type, provider_id)`` to the most recently
        appended row for that pair; empty when the cache file does not exist.
    """
    cache_file = smoke_cache_path(app_home)
    if cache_file.exists():
        return {
            (str(entry.get("provider_type")), str(entry.get("provider_id"))): entry
            for entry in read_jsonl(cache_file)
        }
    return {}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def latest_certification_results(app_home: Path | None = None) -> list[dict[str, Any]]:
    """Return every cached row as a list, in file order.

    Despite the name, no filtering or de-duplication happens here: the whole
    cache file is returned.

    Args:
        app_home: Optional directory forwarded to ``smoke_cache_path``.

    Returns:
        All rows of the cache file, or an empty list when it does not exist.
    """
    cache_file = smoke_cache_path(app_home)
    return list(read_jsonl(cache_file)) if cache_file.exists() else []
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _latest_for(rows: list[dict[str, Any]], **filters: str) -> dict[str, Any]:
    """Return the last row whose fields equal every given filter.

    Each row value is compared after ``str()`` conversion, so a stored ``1``
    matches the filter ``"1"`` and a missing field matches ``"None"``.

    Args:
        rows: Rows to scan, oldest first.
        **filters: Field name to required (string) value.

    Returns:
        The last matching row, or ``{}`` when nothing matches.
    """
    hits = [
        candidate
        for candidate in rows
        if all(str(candidate.get(field)) == wanted for field, wanted in filters.items())
    ]
    if not hits:
        return {}
    return hits[-1] or {}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _latest_full_e2e_for(rows: list[dict[str, Any]], *, agent_id: str | None = None, provider_id: str | None = None) -> dict[str, Any]:
    """Return the last ``full_e2e`` row matching the optional agent/provider.

    Args:
        rows: Rows to scan, oldest first.
        agent_id: When given, the row's ``agent_id`` must equal it.
        provider_id: When given, the row's ``model_provider_id`` must equal it.

    Returns:
        The last row whose ``certification_kind`` is ``"full_e2e"`` and that
        satisfies the supplied filters, or ``{}`` when there is none.
    """

    def _qualifies(candidate: dict[str, Any]) -> bool:
        return (
            candidate.get("certification_kind") == "full_e2e"
            and (agent_id is None or candidate.get("agent_id") == agent_id)
            and (provider_id is None or candidate.get("model_provider_id") == provider_id)
        )

    found = None
    for candidate in rows:
        if _qualifies(candidate):
            found = candidate
    return found or {}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _last_tested(*rows: dict[str, Any]) -> str | None:
    """Return the greatest ``timestamp`` among the given rows.

    Rows with a missing or falsy ``timestamp`` are ignored. Timestamps are
    converted with ``str()`` and compared as strings.

    Returns:
        The largest timestamp string, or ``None`` when no row carries one.
    """
    newest = None
    for row in rows:
        stamp = row.get("timestamp")
        if not stamp:
            continue
        text = str(stamp)
        if newest is None or text > newest:
            newest = text
    return newest
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def integrations_report() -> dict[str, list[dict[str, Any]]]:
    """Build a status report for every known worker agent and model provider.

    For each entry of ``WORKER_AGENT_RECIPES`` and ``MODEL_PROVIDER_RECIPES``
    the current settings are copied with that agent/provider selected, the
    matching readiness helper is queried, and the result is combined with the
    cached smoke and ``full_e2e`` rows.

    Returns:
        A dict with two lists:

        * ``"apprentice_agents"``: one dict per worker agent recipe.
        * ``"mentor_model_providers"``: one dict per model provider recipe.

        Results that have no cached row are reported as ``"not_run"``. The
        ``adapter_implemented``, ``fake_*_test_covered`` and
        ``live_smoke_script_available`` flags are constant ``True``.
    """
    latest = latest_smoke_results()
    rows = latest_certification_results()
    settings = get_settings()

    agents = []
    for agent_id, recipe in WORKER_AGENT_RECIPES.items():
        is_custom = agent_id == "custom"
        s = settings.model_copy(
            update={
                "worker_agent": agent_id,
                # Only the currently configured agent keeps its command override.
                "worker_agent_command": settings.worker_agent_command if settings.worker_agent == agent_id else None,
                # NOTE(review): these two values were previously written as
                # `X if agent_id == "custom" else X`, i.e. both branches were
                # identical. They are passed through unchanged here; confirm
                # whether non-custom agents were meant to get a different value.
                "custom_worker_command_template": settings.custom_worker_command_template,
                "custom_worker_display_name": settings.custom_worker_display_name,
            }
        )
        readiness = apprentice_agent_readiness_status(s)
        smoke = latest.get(("apprentice_agent", agent_id), {})
        if is_custom:
            # The custom agent distinguishes fixture smokes from user smokes;
            # the fixture result is displayed when available.
            fixture_smoke = _latest_for(rows, provider_type="apprentice_agent", provider_id="custom", certification_kind="agent_live_smoke_fixture")
            user_smoke = _latest_for(rows, provider_type="apprentice_agent", provider_id="custom", certification_kind="agent_live_smoke_user")
            display_smoke = fixture_smoke or smoke
        else:
            fixture_smoke = {}
            user_smoke = {}
            display_smoke = smoke
        full = _latest_full_e2e_for(rows, agent_id=agent_id)
        agents.append(
            {
                "id": agent_id,
                "display_name": recipe.display_name,
                "adapter_implemented": True,
                "command_expected": readiness.get("command") or recipe.command_name,
                "command_found": bool(readiness.get("command_found")),
                "fake_adapter_test_covered": True,
                "live_smoke_script_available": True,
                "latest_local_live_smoke_result": display_smoke.get("result") or "not_run",
                "latest_full_e2e_result": full.get("result") or "not_run",
                "last_tested_at": _last_tested(smoke, full, fixture_smoke, user_smoke),
                "latest_error_summary": display_smoke.get("error_summary"),
                "latest_full_e2e_error_summary": full.get("error_summary"),
                "custom_fixture_live_smoke_result": fixture_smoke.get("result") if is_custom else None,
                "custom_user_live_smoke_result": user_smoke.get("result") if is_custom else None,
            }
        )

    providers = []
    for provider_id, recipe in MODEL_PROVIDER_RECIPES.items():
        is_configured = settings.model_provider == provider_id
        s = settings.model_copy(
            update={
                "model_provider": provider_id,
                # The configured provider keeps the user's key variable and
                # model; every other provider is probed with its recipe defaults.
                "model_provider_api_key_env": settings.model_provider_api_key_env if is_configured else recipe.api_key_env_var,
                "model_provider_model": settings.model_provider_model if is_configured else recipe.default_model,
            }
        )
        readiness = mentor_model_provider_readiness(s)
        smoke = latest.get(("mentor_model_provider", provider_id), {})
        full = _latest_full_e2e_for(rows, provider_id=provider_id)
        full_result = full.get("result") or "not_run"
        # A passing full e2e run is not reported as passed while the latest
        # cached row for this provider records a failure.
        if full_result == "passed" and str(smoke.get("result") or "").startswith("failed"):
            full_result = "not_certified_due_to_provider_failure"
        providers.append(
            {
                "id": provider_id,
                "display_name": recipe.display_name,
                "adapter_implemented": True,
                "key_env_var": readiness.get("api_key_env_var") or recipe.api_key_env_var,
                "key_visible": bool(readiness.get("api_key_visible")),
                "fake_provider_test_covered": True,
                "live_smoke_script_available": True,
                "latest_local_live_smoke_result": smoke.get("result") or "not_run",
                "latest_full_e2e_result": full_result,
                "last_tested_at": _last_tested(smoke, full),
                "latest_error_summary": smoke.get("error_summary"),
                "latest_full_e2e_error_summary": full.get("error_summary"),
            }
        )

    return {"apprentice_agents": agents, "mentor_model_providers": providers}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json, hashlib
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
def write_json(path: Path, obj):
    """Write ``obj`` to ``path`` as pretty-printed JSON with sorted keys.

    Missing parent directories are created first. A pydantic ``BaseModel``
    is converted with ``model_dump(mode='json')``; any other object is handed
    to ``json.dumps`` as is. The output ends with a newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    if isinstance(obj, BaseModel):
        payload = obj.model_dump(mode='json')
    else:
        payload = obj
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(rendered + "\n")
|
|
10
|
+
|
|
11
|
+
def read_json(path: Path):
    """Parse the JSON document stored at ``path`` and return the result.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    locale encoding, so non-ASCII content reads the same on every system.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))
|
|
12
|
+
def append_jsonl(path: Path, obj):
    """Append ``obj`` to ``path`` as one JSON line with sorted keys.

    Missing parent directories are created first. A pydantic ``BaseModel``
    is converted with ``model_dump(mode='json')``; any other object is handed
    to ``json.dumps`` as is.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    if isinstance(obj, BaseModel):
        record = obj.model_dump(mode='json')
    else:
        record = obj
    with path.open('a') as handle:
        # Serialise inside the `with` so the file is opened (and created)
        # before any serialisation error can occur, as before.
        handle.write(json.dumps(record, sort_keys=True) + "\n")
|
|
16
|
+
def read_jsonl(path: Path):
    """Read a JSON Lines file and return its records as a list.

    Blank (whitespace-only) lines are skipped. Records are separated only by
    real line endings: the file is iterated in text mode rather than split
    with ``str.splitlines()``, which would also break a record at characters
    such as U+2028, U+2029 or U+0085 that may legally appear unescaped inside
    a JSON string. The file is decoded as UTF-8.

    Returns:
        The parsed records in file order, or ``[]`` if ``path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not path.exists():
        return []
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
|
|
19
|
+
def sha256_file(path: Path):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is hashed in fixed-size chunks so that large files are not
    loaded into memory in one piece; the digest is identical to hashing the
    whole content at once.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
|