agent-apprenticeship 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +217 -0
- package/bin/agent-apprenticeship.js +131 -0
- package/package.json +30 -0
- package/pyproject.toml +23 -0
- package/src/agent_apprenticeship_trace/__init__.py +2 -0
- package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
- package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
- package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
- package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
- package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
- package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
- package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
- package/src/agent_apprenticeship_trace/certification.py +580 -0
- package/src/agent_apprenticeship_trace/cli.py +2979 -0
- package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
- package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
- package/src/agent_apprenticeship_trace/config.py +609 -0
- package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
- package/src/agent_apprenticeship_trace/env.py +46 -0
- package/src/agent_apprenticeship_trace/evaluator.py +64 -0
- package/src/agent_apprenticeship_trace/grader.py +194 -0
- package/src/agent_apprenticeship_trace/integration_status.py +193 -0
- package/src/agent_apprenticeship_trace/io.py +20 -0
- package/src/agent_apprenticeship_trace/learning.py +627 -0
- package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
- package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
- package/src/agent_apprenticeship_trace/loop.py +111 -0
- package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
- package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
- package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
- package/src/agent_apprenticeship_trace/progress.py +223 -0
- package/src/agent_apprenticeship_trace/public_run.py +1109 -0
- package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
- package/src/agent_apprenticeship_trace/recipes.py +129 -0
- package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
- package/src/agent_apprenticeship_trace/revision.py +21 -0
- package/src/agent_apprenticeship_trace/role_runners.py +7 -0
- package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
- package/src/agent_apprenticeship_trace/schemas.py +273 -0
- package/src/agent_apprenticeship_trace/session_events.py +99 -0
- package/src/agent_apprenticeship_trace/task_intake.py +112 -0
- package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
- package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
- package/src/agent_apprenticeship_trace/training_signals.py +30 -0
- package/src/agent_apprenticeship_trace/validation.py +210 -0
- package/src/agent_apprenticeship_trace/verifier.py +55 -0
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
4
|
+
from .schemas import GraderResult, VerifierResult, RubricItemScore, TaskIntakeSpec, TaskIntakeQualityReport, RubricSpec, RubricQualityReport, RubricItem, EvaluatorFeedback, RevisionPlan
|
|
5
|
+
|
|
6
|
+
# Field-name whitelists taken from each schema class's annotations.
# `_metadata` treats any key outside the relevant whitelist as an extra
# model-supplied field and records it under 'extra_model_fields'.
# NOTE(review): `__annotations__` is read from the class object itself; if any
# schema class inherits fields from a base class, confirm those fields are
# meant to be covered here.
GRADER_ALLOWED = set(GraderResult.__annotations__)
VERIFIER_ALLOWED = set(VerifierResult.__annotations__)
ITEM_ALLOWED = set(RubricItemScore.__annotations__)
INTAKE_ALLOWED = set(TaskIntakeSpec.__annotations__)
QUALITY_ALLOWED = set(TaskIntakeQualityReport.__annotations__)
RUBRIC_ALLOWED = set(RubricSpec.__annotations__)
RUBRIC_ITEM_ALLOWED = set(RubricItem.__annotations__)
RUBRIC_QUALITY_ALLOWED = set(RubricQualityReport.__annotations__)
EVALUATOR_ALLOWED = set(EvaluatorFeedback.__annotations__)
REVISION_ALLOWED = set(RevisionPlan.__annotations__)
|
|
16
|
+
|
|
17
|
+
@dataclass
class RoleContext:
    """Caller-supplied context used as a fallback source by the normalizers.

    The ``normalize_*`` functions in this module read these attributes when
    the raw model output omits the corresponding value. Field order and
    defaults are part of the constructor signature and must stay stable.
    """

    # Identifiers used when the raw output carries none of its own.
    task_id: str = 'smoke_task'
    attempt_id: str = 'smoke_attempt'
    attempt_kind: str = 'baseline'
    rubric_id: str = 'smoke_rubric'
    grader_result_id: str | None = None
    # When set, takes precedence over the model-reported artifact_contract_score
    # in normalize_grader_result.
    artifact_contract_score: float | None = None
    # Fallback evidence references for grader results and rubric item scores.
    evidence_refs: list[str] | None = None
    # Artifact content details copied into the grader result metadata.
    artifact_content_refs: list[str] | None = None
    artifact_content_previews: list[dict[str, Any]] | None = None
    artifact_content_hashes: dict[str, str] | None = None
    artifact_content_preview_truncated: bool | None = None
    # Explicit grading basis; when None, normalize_grader_result derives one.
    model_grading_basis: str | None = None
    # Model/provider recorded when the raw output does not name them.
    model: str | None = None
    provider: str = 'openai'
    # Task text used as fallback title/instruction during intake normalization.
    task_title: str | None = None
    task_instruction: str | None = None
    # Fallback target attempt for revision plans.
    target_attempt_id: str | None = None
    verifier_result_id: str | None = None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _as_list(v: Any) -> list[Any]:
    """Coerce *v* to a list.

    ``None`` becomes ``[]``, a list is returned unchanged (same object, not a
    copy), a tuple is converted element-wise, and any other value is wrapped
    in a single-element list.
    """
    if v is None:
        return []
    if isinstance(v, list):
        return v
    if isinstance(v, tuple):
        # A tuple is a sequence of values, not one value; wrapping it whole
        # would make callers stringify the entire tuple as a single entry.
        return list(v)
    return [v]
|
|
43
|
+
|
|
44
|
+
def _float(v: Any, default: float = 0.0) -> float:
    """Convert *v* to a finite float, returning *default* when that fails.

    *default* is returned when *v* is ``None``, cannot be converted by
    ``float()``, or converts to NaN or an infinity (for example the strings
    ``'nan'`` and ``'inf'``, which ``float()`` accepts).
    """
    import math

    if v is None:
        return default
    try:
        result = float(v)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (dict, list, ...); ValueError: bad
        # string; OverflowError: int too large to represent as a float.
        return default
    return result if math.isfinite(result) else default
|
|
50
|
+
|
|
51
|
+
def _bool(v: Any, default: bool = False) -> bool:
    """Interpret *v* as a boolean, returning *default* when it is unclear.

    Accepted inputs:
      * ``bool`` values, returned as-is;
      * the numbers 0 and 1;
      * the strings 'true', 'yes', 'passed', 'pass', 'ok' (True) and
        'false', 'no', 'failed', 'fail' (False), compared case-insensitively
        after stripping surrounding whitespace.

    Anything else yields *default*.
    """
    if isinstance(v, bool):
        return v
    if isinstance(v, (int, float)) and v in (0, 1):
        # Only exact 0/1 are treated as flags; other numbers stay ambiguous.
        return bool(v)
    if isinstance(v, str):
        token = v.strip().lower()
        if token in {'true', 'yes', 'passed', 'pass', 'ok'}:
            return True
        if token in {'false', 'no', 'failed', 'fail'}:
            return False
    return default
|
|
57
|
+
|
|
58
|
+
def _metadata(raw: dict[str, Any], allowed: set[str]) -> dict[str, Any]:
    """Build a metadata dict for a normalized record from raw model output.

    The result starts as a shallow copy of ``raw['metadata_json']`` when that
    value is a dict, otherwise as an empty dict. Keys of *raw* that are not in
    *allowed* are collected under ``'extra_model_fields'`` (only when there
    are any). *raw* itself is stored, by reference, under both
    ``'raw_model_output'`` and ``'raw_llm_output'``.
    """
    existing = raw.get('metadata_json')
    md = dict(existing) if isinstance(existing, dict) else {}
    extra = {k: v for k, v in raw.items() if k not in allowed}
    if extra:
        md['extra_model_fields'] = extra
    md['raw_model_output'] = raw
    md['raw_llm_output'] = raw
    return md
|
|
66
|
+
|
|
67
|
+
def _normalize_item(raw: Any, idx: int, fallback_refs: list[str]) -> dict[str, Any]:
    """Normalize one rubric item score reported by the model.

    Args:
        raw: The model's item entry. A non-dict value is treated as free-text
            notes.
        idx: 1-based position, used to synthesize an id and criterion name
            when the entry has none.
        fallback_refs: Evidence references used when the entry lists none.

    Returns:
        A dict with the normalized item fields. ``passed`` defaults to
        ``score >= 0.7 * max_score`` when the entry does not state it.
    """
    if not isinstance(raw, dict):
        raw = {'notes': str(raw)}

    # Take the first key that is present AND non-null. A nested
    # dict.get(key, fallback) chain would stop at a key whose value is null.
    score = _float(_first_present(raw, 'score', 'semantic_correctness_score', 'final_score'), 0.0)
    max_score = _float(raw.get('max_score'), 1.0) or 1.0
    refs = [str(x) for x in _as_list(raw.get('evidence_refs'))] or fallback_refs
    passed = _bool(raw.get('passed'), score >= 0.7 * max_score)
    presence = raw.get('artifact_presence_ok')

    return {
        'rubric_item_id': str(raw.get('rubric_item_id') or raw.get('id') or f'ri_{idx}'),
        'criterion_name': str(raw.get('criterion_name') or raw.get('name') or raw.get('criterion') or f'criterion_{idx}'),
        'score': score,
        'max_score': max_score,
        'passed': passed,
        'evidence_refs': refs,
        'failure_mode': raw.get('failure_mode'),
        'notes': raw.get('notes') or raw.get('reasoning_summary'),
        'confidence': _float(raw.get('confidence'), 0.7),
        # Only a real boolean is trusted here; anything else means "unknown".
        'artifact_presence_ok': presence if isinstance(presence, bool) else None,
        # Falls back to the overall score when absent or not numeric.
        'semantic_correctness_score': _float(raw.get('semantic_correctness_score'), score),
        'reasoning_summary': raw.get('reasoning_summary') or raw.get('reasoning') or raw.get('notes'),
        'improvement_suggestion': raw.get('improvement_suggestion') or raw.get('suggestion'),
    }
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _first_dict(raw: dict[str, Any], *keys: str) -> dict[str, Any]:
    """Return the first ``raw[key]`` that is a dict, trying *keys* in order.

    When none of the keys holds a dict, *raw* itself is returned, so callers
    can handle both wrapped and flat model output with the same lookups.
    """
    for key in keys:
        candidate = raw.get(key)
        if isinstance(candidate, dict):
            return candidate
    return raw
|
|
98
|
+
|
|
99
|
+
def _summary(v: Any, default: str) -> str:
    """Render *v* as a short one-line string.

    ``None`` yields *default*; a string is returned unchanged (including the
    empty string); a list is comma-joined; a dict is rendered as
    ``'key: value'`` pairs for its first five keys. An empty list or dict
    yields *default*. Any other value is passed through ``str()``.
    """
    if v is None:
        return default
    if isinstance(v, str):
        return v
    if isinstance(v, list):
        return ', '.join(str(x) for x in v) or default
    if isinstance(v, dict):
        # Cap at five entries to keep the summary short.
        pairs = list(v.items())[:5]
        return ', '.join(f'{key}: {value}' for key, value in pairs) or default
    return str(v)
|
|
109
|
+
|
|
110
|
+
def _score(v: Any, default: float = 0.8) -> float:
    """Convert *v* to a float clamped to the range [0, 1].

    *default* is used when *v* cannot be converted or converts to NaN; the
    default is clamped the same way.
    """
    x = _float(v, default)
    if x != x:
        # NaN compares False against both bounds and would escape the clamp.
        x = default
    if x < 0:
        return 0.0
    if x > 1:
        return 1.0
    return x
|
|
115
|
+
|
|
116
|
+
def normalize_task_intake_result(raw: dict[str, Any], context: RoleContext | None = None) -> dict[str, Any]:
    """Normalize raw task-intake model output into a spec and quality report.

    The spec is read from ``raw['task_intake_spec']``, ``raw['intake_spec']``
    or ``raw['task']`` (first one that is a dict), else from *raw* itself.
    The quality report is read from ``raw['task_intake_quality_report']`` or
    ``raw['quality_report']`` when one of them is a dict. Missing values fall
    back to *context* and then to fixed defaults.

    Args:
        raw: Parsed model output; ``None`` or empty is treated as ``{}``.
        context: Fallback identifiers and task text; defaults to
            ``RoleContext()``.

    Returns:
        ``{'task_intake_spec': ..., 'task_intake_quality_report': ...}``.
    """
    def pick(value: Any, allowed: set[str], default: str) -> str:
        # The isinstance check comes first because an unhashable value
        # (list/dict from the model) would raise inside a set lookup.
        return value if isinstance(value, str) and value in allowed else default

    context = context or RoleContext()
    raw = dict(raw or {})
    spec_raw = _first_dict(raw, 'task_intake_spec', 'intake_spec', 'task')
    quality_raw: dict[str, Any] = {}
    for key in ('task_intake_quality_report', 'quality_report'):
        if isinstance(raw.get(key), dict):
            quality_raw = raw[key]
            break

    task_id = str(spec_raw.get('task_id') or raw.get('task_id') or context.task_id)
    title = (
        spec_raw.get('normalized_title')
        or spec_raw.get('task_title')
        or spec_raw.get('title')
        or raw.get('task_title')
        or context.task_title
        or task_id
    )
    instruction = (
        spec_raw.get('normalized_instruction')
        or spec_raw.get('instruction')
        or spec_raw.get('task_instruction')
        or spec_raw.get('description')
        or raw.get('description')
        or context.task_instruction
        or title
    )
    expected_outputs = (
        spec_raw.get('expected_outputs')
        or spec_raw.get('output_requirements')
        or spec_raw.get('required_artifacts')
        or []
    )
    required_artifacts = (
        spec_raw.get('required_artifacts')
        or spec_raw.get('artifacts')
        or expected_outputs
        or []
    )

    metadata = _metadata(spec_raw, INTAKE_ALLOWED)
    metadata.update({
        'task_family_guess': spec_raw.get('task_family_guess') or spec_raw.get('task_family_id') or spec_raw.get('domain') or 'unknown',
        'task_type': spec_raw.get('task_type') or spec_raw.get('workflow_type') or 'artifact_generation',
        'required_artifacts': _as_list(required_artifacts),
        'hidden_reference_policy': spec_raw.get('hidden_reference_policy') or 'no_hidden_reference_available',
        'risk_notes': _as_list(spec_raw.get('risk_notes') or spec_raw.get('risk_safety_notes')),
        'subjectivity_level': spec_raw.get('subjectivity_level') or 'medium',
        'evaluation_difficulty': spec_raw.get('evaluation_difficulty') or 'medium',
        'suggested_evaluator_type': spec_raw.get('suggested_evaluator_type') or 'model_judged',
    })

    spec = {
        'task_id': task_id,
        'normalized_title': str(title),
        'normalized_instruction': str(instruction),
        'domain': str(spec_raw.get('domain') or raw.get('domain') or 'general'),
        'subdomain': spec_raw.get('subdomain'),
        'professional_role': spec_raw.get('professional_role'),
        'workflow_type': str(spec_raw.get('workflow_type') or spec_raw.get('task_type') or 'artifact_generation'),
        'skill_targets': [str(x) for x in _as_list(spec_raw.get('skill_targets'))],
        'difficulty_tier': pick(spec_raw.get('difficulty_tier'), {'easy', 'medium', 'hard', 'expert'}, 'medium'),
        'expected_human_deliverable': _summary(spec_raw.get('expected_human_deliverable') or expected_outputs, 'Review the generated artifacts and final answer.'),
        'expected_agent_deliverable': _summary(spec_raw.get('expected_agent_deliverable') or expected_outputs, 'Produce required task artifacts and actual_outputs.json.'),
        'input_requirements': [str(x) for x in _as_list(spec_raw.get('input_requirements') or spec_raw.get('required_inputs'))],
        'output_requirements': [str(x) for x in _as_list(spec_raw.get('output_requirements') or expected_outputs or required_artifacts)],
        'required_context': [str(x) for x in _as_list(spec_raw.get('required_context'))],
        'assumptions': [str(x) for x in _as_list(spec_raw.get('assumptions'))],
        'constraints': [str(x) for x in _as_list(spec_raw.get('constraints'))],
        'allowed_tools': [str(x) for x in _as_list(spec_raw.get('allowed_tools'))] or ['python', 'file_read', 'file_write', 'bash'],
        'disallowed_tools': [str(x) for x in _as_list(spec_raw.get('disallowed_tools'))],
        'privacy_classification': pick(spec_raw.get('privacy_classification'), {'public', 'synthetic', 'sensitive_possible', 'contains_pii', 'unknown'}, 'unknown'),
        'license': spec_raw.get('license'),
        'allowed_use': spec_raw.get('allowed_use') or 'local research dataset generation',
        # _first_present (not `or`) so that a reported score of 0 is kept
        # instead of being replaced by the 0.7 default.
        'rubricability_score': _score(_first_present(spec_raw, 'rubricability_score', 'rubricability'), 0.7),
        'verifiability_score': _score(_first_present(spec_raw, 'verifiability_score', 'verifiability'), 0.7),
        'artifactability_score': _score(_first_present(spec_raw, 'artifactability_score', 'artifactability'), 0.7),
        'needs_expert_review': _bool(spec_raw.get('needs_expert_review'), False),
        'metadata_json': metadata,
    }

    quality = {
        'task_id': task_id,
        'instruction_clarity_score': _score(quality_raw.get('instruction_clarity_score'), 0.7),
        'input_completeness_score': _score(quality_raw.get('input_completeness_score'), 0.7),
        'output_contract_score': _score(quality_raw.get('output_contract_score'), 0.7),
        # These three mirror the spec so both records always agree.
        'rubricability_score': spec['rubricability_score'],
        'verifiability_score': spec['verifiability_score'],
        'artifactability_score': spec['artifactability_score'],
        'privacy_risk_score': _score(quality_raw.get('privacy_risk_score'), 0.2),
        'license_risk_score': _score(quality_raw.get('license_risk_score'), 0.2),
        'ambiguity_score': _score(quality_raw.get('ambiguity_score'), 0.3),
        'overall_intake_quality_score': _score(quality_raw.get('overall_intake_quality_score'), 0.7),
        'quality_flags': [str(x) for x in _as_list(quality_raw.get('quality_flags'))],
        'blockers': [str(x) for x in _as_list(quality_raw.get('blockers'))],
        'recommended_fix': quality_raw.get('recommended_fix'),
        'metadata_json': _metadata(quality_raw, QUALITY_ALLOWED),
    }
    return {'task_intake_spec': spec, 'task_intake_quality_report': quality}
|
|
184
|
+
|
|
185
|
+
def _rubric_item(raw_item: Any, idx: int, weight: float, required_artifacts: list[str]) -> dict[str, Any]:
    """Normalize one rubric criterion produced by the model.

    Args:
        raw_item: The model's criterion entry. A non-dict value is used as
            the criterion name.
        idx: 1-based position, used to synthesize an id and name when the
            entry has none.
        weight: Weight to store for this item; it is stored as given and the
            entry's own ``weight`` is not read here.
        required_artifacts: Rubric-level artifacts used when the entry lists
            neither evidence nor artifacts of its own.

    Returns:
        A dict with the normalized criterion fields.
    """
    if not isinstance(raw_item, dict):
        raw_item = {'criterion_name': str(raw_item)}

    md = _metadata(raw_item, RUBRIC_ITEM_ALLOWED)
    evidence = (
        raw_item.get('observable_evidence')
        or raw_item.get('evidence')
        or raw_item.get('evidence_requirements')
        or raw_item.get('success_criteria')
        or required_artifacts
        or ['final answer and artifacts']
    )
    artifacts = (
        raw_item.get('required_artifacts')
        or raw_item.get('artifacts')
        or required_artifacts
        or ['final_answer']
    )

    known_methods = {'llm_rubric_judge', 'deterministic', 'schema_match', 'regex', 'unit_test', 'hybrid', 'human_future'}
    scoring_method = raw_item.get('scoring_method')
    # isinstance first: a non-string (possibly unhashable) value must not
    # reach the set lookup.
    if not isinstance(scoring_method, str) or scoring_method not in known_methods:
        scoring_method = 'llm_rubric_judge'

    return {
        'rubric_item_id': str(raw_item.get('rubric_item_id') or raw_item.get('id') or f'ri_{idx}'),
        'criterion_name': str(raw_item.get('criterion_name') or raw_item.get('name') or raw_item.get('criterion') or f'criterion_{idx}'),
        'criterion_description': str(raw_item.get('criterion_description') or raw_item.get('description') or raw_item.get('scoring_guidance') or 'Evaluate observable task success evidence.'),
        'weight': weight,
        'score_min': _float(raw_item.get('score_min'), 0.0),
        # A zero or missing maximum falls back to 1.0.
        'score_max': _float(raw_item.get('score_max') or raw_item.get('max_score'), 1.0) or 1.0,
        'pass_threshold': _float(raw_item.get('pass_threshold'), 0.7),
        'observable_evidence': [str(x) for x in _as_list(evidence)],
        'required_artifacts': [str(x) for x in _as_list(artifacts)],
        'scoring_method': scoring_method,
        'worker_visible': _bool(raw_item.get('worker_visible'), True),
        'verifier_only': _bool(raw_item.get('verifier_only'), False),
        'hidden_reference_required': _bool(raw_item.get('hidden_reference_required'), False),
        'failure_modes': [str(x) for x in _as_list(raw_item.get('failure_modes'))] or ['missing evidence', 'incorrect output'],
        'partial_credit_rules': [str(x) for x in _as_list(raw_item.get('partial_credit_rules'))] or ['Award partial credit for partially correct, evidence-backed artifacts.'],
        'edge_cases': [str(x) for x in _as_list(raw_item.get('edge_cases'))],
        'anti_cheat_notes': [str(x) for x in _as_list(raw_item.get('anti_cheat_notes'))],
        'metadata_json': md,
    }
|
|
213
|
+
|
|
214
|
+
def normalize_rubric_result(raw: dict[str, Any], context: RoleContext | None = None) -> dict[str, Any]:
    """Normalize raw rubric-generation output into a rubric and quality report.

    The rubric is read from ``raw['rubric_spec']`` or ``raw['rubric']`` (first
    one that is a dict), else from *raw* itself. Criteria are taken from the
    first non-empty of ``rubric_items``, ``criteria``, ``items``, ``checks``;
    when there are none, one placeholder criterion is synthesized per required
    artifact (or a single one for the task title / 'final deliverable').
    Item weights are rescaled to sum to 1.0.

    Args:
        raw: Parsed model output; ``None`` or empty is treated as ``{}``.
        context: Fallback identifiers, provider and model; defaults to
            ``RoleContext()``.

    Returns:
        ``{'rubric_spec': ..., 'rubric_quality_report': ...}``.
    """
    def pick(value: Any, allowed: set[str], default: str) -> str:
        # The isinstance check comes first because an unhashable value
        # (list/dict from the model) would raise inside a set lookup.
        return value if isinstance(value, str) and value in allowed else default

    def count(value: Any) -> int:
        # Tolerant non-negative integer: bad or non-finite input becomes 0.
        try:
            return max(0, int(_float(value, 0.0)))
        except (ValueError, OverflowError):
            return 0

    context = context or RoleContext()
    raw = dict(raw or {})
    rub_raw = _first_dict(raw, 'rubric_spec', 'rubric')
    quality_raw: dict[str, Any] = {}
    for key in ('rubric_quality_report', 'quality_report'):
        if isinstance(raw.get(key), dict):
            quality_raw = raw[key]
            break

    task_id = str(rub_raw.get('task_id') or raw.get('task_id') or context.task_id)
    required = [str(x) for x in _as_list(rub_raw.get('required_artifacts') or raw.get('required_artifacts') or rub_raw.get('expected_outputs'))]

    # _as_list guards against a single dict or string: iterating those
    # directly would yield keys or characters instead of criteria.
    raw_items = _as_list(rub_raw.get('rubric_items') or rub_raw.get('criteria') or rub_raw.get('items') or rub_raw.get('checks') or [])
    if not raw_items:
        base = required or [context.task_title or 'final deliverable']
        raw_items = [
            {
                'name': f'{name} quality',
                'description': f'{name} is present, correct, and supported by evidence.',
                'required_artifacts': [name],
                'evidence': [name],
            }
            for name in base
        ]

    parsed_weights = [_float(i.get('weight'), 0.0) if isinstance(i, dict) else 0.0 for i in raw_items]
    # Negative weights are treated as 0 so no normalized weight is negative.
    raw_weights = [max(0.0, w) for w in parsed_weights]
    total = sum(raw_weights)
    if total <= 0:
        # No usable weights: distribute evenly.
        weights = [1.0 / len(raw_items)] * len(raw_items)
        weight_normalized = True
    else:
        weights = [w / total for w in raw_weights]
        weight_normalized = abs(total - 1.0) > 1e-6 or raw_weights != parsed_weights

    items = [_rubric_item(item, i + 1, weights[i], required) for i, item in enumerate(raw_items)]
    all_required: list[str] = []
    for item in items:
        all_required.extend(item['required_artifacts'])
    # dict.fromkeys de-duplicates while keeping first-seen order.
    required = list(dict.fromkeys(required or all_required))

    md = _metadata(rub_raw, RUBRIC_ALLOWED)
    md.update({
        'grader_guidance': rub_raw.get('grader_guidance') or 'Grade against observable artifacts, trace evidence, and task requirements.',
        'verifier_guidance': rub_raw.get('verifier_guidance') or 'Verify evidence grounding, score consistency, and leakage safety.',
        'evaluator_guidance': rub_raw.get('evaluator_guidance') or 'Provide actionable feedback for revision.',
        'limitations': [str(x) for x in _as_list(rub_raw.get('limitations'))],
    })
    if weight_normalized:
        md['weight_normalization_applied'] = True

    rubric = {
        'rubric_id': str(rub_raw.get('rubric_id') or f'rubric_{task_id}'),
        'task_id': task_id,
        'task_family_id': rub_raw.get('task_family_id'),
        'rubric_version': str(rub_raw.get('rubric_version') or '0.1'),
        'rubric_items': items,
        'total_weight': 1.0,
        'pass_threshold': _float(rub_raw.get('pass_threshold'), 0.7),
        'worker_visible_rubric_ref': rub_raw.get('worker_visible_rubric_ref') or 'rubric/worker_visible_rubric.md',
        'verifier_private_rubric_ref': rub_raw.get('verifier_private_rubric_ref') or 'rubric/verifier_private_rubric.json',
        'hidden_reference_policy': rub_raw.get('hidden_reference_policy') or 'no_hidden_reference_available',
        'scoring_aggregation': pick(rub_raw.get('scoring_aggregation'), {'weighted_sum', 'sum', 'all_required', 'custom'}, 'weighted_sum'),
        'required_artifacts': required,
        'disqualifying_errors': [str(x) for x in _as_list(rub_raw.get('disqualifying_errors'))],
        'partial_credit_allowed': _bool(rub_raw.get('partial_credit_allowed'), True),
        'grader_kind': pick(rub_raw.get('grader_kind'), {'llm_rubric_judge', 'deterministic', 'hybrid', 'human_future'}, 'llm_rubric_judge'),
        'rubric_generation_source': pick(
            rub_raw.get('rubric_generation_source'),
            {'agent_assisted', 'family_template', 'task_specific_agent_draft', 'expert_override', 'deterministic_seed'},
            'task_specific_agent_draft',
        ),
        'rubric_generation_agent_provider': rub_raw.get('rubric_generation_agent_provider') or context.provider,
        'rubric_generation_agent_model': rub_raw.get('rubric_generation_agent_model') or context.model,
        'rubric_generation_confidence': _float(rub_raw.get('rubric_generation_confidence'), 0.7),
        'metadata_json': md,
    }

    quality = {
        'rubric_id': rubric['rubric_id'],
        'task_id': task_id,
        'criteria_count': len(items),
        'total_weight': 1.0,
        'weights_sum_valid': True,
        'has_observable_evidence': all(bool(i['observable_evidence']) for i in items),
        'has_required_artifacts': all(bool(i['required_artifacts']) for i in items),
        'has_partial_credit_rules': any(bool(i['partial_credit_rules']) for i in items),
        'has_disqualifying_errors': bool(rubric['disqualifying_errors']),
        'has_hidden_reference_policy': bool(rubric['hidden_reference_policy']),
        'has_worker_visible_view': True,
        'has_verifier_private_view': True,
        'ambiguous_criteria_count': count(quality_raw.get('ambiguous_criteria_count')),
        'unverifiable_criteria_count': count(quality_raw.get('unverifiable_criteria_count')),
        'rubric_quality_score': _score(quality_raw.get('rubric_quality_score'), 0.75),
        'quality_flags': [str(x) for x in _as_list(quality_raw.get('quality_flags'))],
        'blockers': [str(x) for x in _as_list(quality_raw.get('blockers'))],
        'metadata_json': _metadata(quality_raw, RUBRIC_QUALITY_ALLOWED),
    }
    return {'rubric_spec': rubric, 'rubric_quality_report': quality}
|
|
272
|
+
|
|
273
|
+
def normalize_evaluator_result(raw: dict[str, Any], context: RoleContext | None = None) -> dict[str, Any]:
    """Normalize raw evaluator output into feedback and a revision plan.

    Feedback is read from ``raw['evaluator_feedback']`` or ``raw['feedback']``
    (first one that is a dict), else from *raw* itself. The revision plan is
    read from ``raw['revision_plan']`` when it is a dict. Missing revision
    fields fall back to the feedback values.

    Args:
        raw: Parsed model output; ``None`` or empty is treated as ``{}``.
        context: Fallback identifiers, attempt kind, provider and model;
            defaults to ``RoleContext()``.

    Returns:
        ``{'evaluator_feedback': ..., 'revision_plan': ...}``.
    """
    def pick(value: Any, allowed: set[str], default: str) -> str:
        # The isinstance check comes first because an unhashable value
        # (list/dict from the model) would raise inside a set lookup.
        return value if isinstance(value, str) and value in allowed else default

    context = context or RoleContext()
    raw = dict(raw or {})
    fb_raw = _first_dict(raw, 'evaluator_feedback', 'feedback')
    rp_raw = raw.get('revision_plan') if isinstance(raw.get('revision_plan'), dict) else {}

    task_id = str(fb_raw.get('task_id') or raw.get('task_id') or context.task_id)
    attempt_id = str(fb_raw.get('attempt_id') or raw.get('attempt_id') or context.attempt_id)
    summary = str(fb_raw.get('feedback_summary') or fb_raw.get('summary') or fb_raw.get('feedback') or 'LLM evaluator feedback generated.')
    actionable = [
        str(x)
        for x in _as_list(fb_raw.get('actionable_feedback') or fb_raw.get('suggestions') or fb_raw.get('recommendations') or [])
    ]
    weak = [str(x) for x in _as_list(fb_raw.get('failed_or_weak_rubric_items') or fb_raw.get('failed_rubric_items'))]

    md = _metadata(fb_raw, EVALUATOR_ALLOWED)
    md.update({
        'attempt_kind': fb_raw.get('attempt_kind') or context.attempt_kind,
        'grader_result_id': context.grader_result_id,
        'verifier_result_id': context.verifier_result_id,
    })

    fb_plan = fb_raw.get('revision_plan')
    feedback = {
        'feedback_id': str(fb_raw.get('feedback_id') or f'feedback_{task_id}_{context.attempt_kind}'),
        'task_id': task_id,
        'attempt_id': attempt_id,
        'target_actor': pick(fb_raw.get('target_actor'), {'worker', 'reviser', 'apprentice'}, 'worker'),
        'feedback_type': pick(
            fb_raw.get('feedback_type'),
            {'criteria_failure', 'artifact_missing', 'format_error', 'logic_error', 'tool_error', 'quality_gap', 'strategy_gap', 'safety_or_privacy', 'other'},
            'other',
        ),
        # Each field gets its own list so later edits to one record cannot
        # silently change another.
        'failed_rubric_items': list(weak),
        'evidence_refs': [str(x) for x in _as_list(fb_raw.get('evidence_refs'))],
        'artifact_refs': [str(x) for x in _as_list(fb_raw.get('artifact_refs'))],
        'feedback_summary': summary,
        'actionable_feedback': list(actionable),
        'suggested_revision': str(fb_raw.get('suggested_revision') or (actionable[0] if actionable else summary)),
        'revision_priority': pick(fb_raw.get('revision_priority'), {'low', 'medium', 'high'}, 'medium'),
        'confidence': _float(fb_raw.get('confidence'), 0.7),
        'hidden_reference_used': _bool(fb_raw.get('hidden_reference_used'), False),
        'hidden_reference_leaked': _bool(fb_raw.get('hidden_reference_leaked'), False),
        'failed_or_weak_rubric_items': list(weak),
        'artifact_specific_comments': [str(x) for x in _as_list(fb_raw.get('artifact_specific_comments'))],
        'trace_specific_comments': [str(x) for x in _as_list(fb_raw.get('trace_specific_comments'))],
        # Only a plain-text plan is kept here; otherwise the summary is used.
        'revision_plan': fb_plan if isinstance(fb_plan, str) else summary,
        'model': fb_raw.get('model') or context.model,
        'provider': fb_raw.get('provider') or context.provider,
        'metadata_json': md,
    }

    rp_md = _metadata(rp_raw, REVISION_ALLOWED)
    rp_md.update({
        'source_attempt_kind': context.attempt_kind,
        'revision_goal': summary,
        'priority_items': list(weak),
        'expected_improvements': [],
    })
    revision = {
        'revision_plan_id': str(rp_raw.get('revision_plan_id') or f'revision_plan_{task_id}_{context.attempt_kind}'),
        'task_id': task_id,
        'source_attempt_id': str(rp_raw.get('source_attempt_id') or attempt_id),
        'target_attempt_id': str(rp_raw.get('target_attempt_id') or context.target_attempt_id or f'{task_id}_revised'),
        'revision_kind': pick(
            rp_raw.get('revision_kind'),
            {'local_fix', 'strategy_shift', 'tool_change', 'decomposition_change', 'artifact_rebuild', 'format_repair', 'other'},
            'local_fix',
        ),
        'revision_reason': str(rp_raw.get('revision_reason') or summary),
        'failed_rubric_items': [str(x) for x in _as_list(rp_raw.get('failed_rubric_items'))] or list(weak),
        'planned_changes': [str(x) for x in _as_list(rp_raw.get('planned_changes') or rp_raw.get('instructions'))] or list(actionable),
        'expected_score_improvement': rp_raw.get('expected_score_improvement'),
        'risk_of_regression': pick(rp_raw.get('risk_of_regression'), {'low', 'medium', 'high'}, 'medium'),
        'uses_evaluator_feedback': _bool(rp_raw.get('uses_evaluator_feedback'), True),
        'metadata_json': rp_md,
    }
    return {'evaluator_feedback': feedback, 'revision_plan': revision}
|
|
327
|
+
|
|
328
|
+
def normalize_grader_result(raw: dict[str, Any], context: RoleContext | None = None) -> dict[str, Any]:
    """Normalize raw grader model output into a grader result record.

    The semantic score is the first non-null of ``semantic_score``,
    ``final_score`` and ``score``; the final score is ``final_score`` when
    given, else the semantic score. When the artifact contract score is 0 and
    there are no evidence references at all, both scores are forced to 0.0,
    the result is marked as not passed, and a limitation note is added.

    Args:
        raw: Parsed model output; ``None`` or empty is treated as ``{}``.
        context: Fallback identifiers, evidence references, artifact content
            details, provider and model; defaults to ``RoleContext()``. A
            non-None ``context.artifact_contract_score`` takes precedence
            over the value in *raw*.

    Returns:
        A dict with the normalized grader result fields.
    """
    context = context or RoleContext()
    raw = dict(raw or {})
    md = _metadata(raw, GRADER_ALLOWED)
    refs = [str(x) for x in _as_list(raw.get('evidence_refs'))] or list(context.evidence_refs or [])
    artifact_contract_score = (
        context.artifact_contract_score
        if context.artifact_contract_score is not None
        else raw.get('artifact_contract_score')
    )

    # First non-null key wins. A nested dict.get(key, fallback) chain would
    # stop at a key that is present with a null value.
    semantic_score = _float(_first_present(raw, 'semantic_score', 'final_score', 'score'), 0.0)
    final_score = _float(raw.get('final_score'), semantic_score)
    max_score = _float(raw.get('max_score'), 1.0) or 1.0
    limitations = [str(x) for x in _as_list(raw.get('limitations'))]

    ungradable = artifact_contract_score == 0.0 and not (raw.get('evidence_refs') or context.evidence_refs)
    if ungradable:
        semantic_score = 0.0
        final_score = 0.0
        limitations.append('Semantic grading impossible because artifact content was unavailable; score reflects missing outputs/log evidence.')

    md['deterministic_precheck'] = {'artifact_contract_score': artifact_contract_score}
    basis = context.model_grading_basis or ('artifact_content' if artifact_contract_score else ('logs_only' if refs else 'missing_outputs'))
    md['evidence_materialization_status'] = 'materialized' if artifact_contract_score else 'missing_or_unverified'
    md['referenced_but_missing_artifacts'] = [str(x) for x in _as_list(raw.get('referenced_but_missing_artifacts'))]
    md['artifact_content_available'] = basis in {'artifact_content', 'artifact_preview'}
    md['semantic_grading_basis'] = basis
    md['model_grading_basis'] = basis
    md['artifact_content_refs'] = context.artifact_content_refs or []
    md['artifact_content_previews'] = context.artifact_content_previews or []
    md['artifact_content_hashes'] = context.artifact_content_hashes or {}
    md['artifact_content_preview_truncated'] = bool(context.artifact_content_preview_truncated)

    items = [_normalize_item(x, i + 1, refs) for i, x in enumerate(_as_list(raw.get('rubric_item_scores')))]

    # Threshold is relative to max_score, matching _normalize_item. A score
    # that was forced to zero can never count as a pass, whatever the model
    # claimed.
    passed = False if ungradable else _bool(raw.get('passed'), final_score >= 0.7 * max_score)

    failed = [str(x) for x in _as_list(raw.get('failed_criteria'))]
    passed_criteria = [str(x) for x in _as_list(raw.get('passed_criteria'))]
    # Derive the criteria lists from the item scores when the model gave none.
    if not failed and items:
        failed = [i['rubric_item_id'] for i in items if not i['passed']]
    if not passed_criteria and items:
        passed_criteria = [i['rubric_item_id'] for i in items if i['passed']]

    return {
        'grader_result_id': str(raw.get('grader_result_id') or f'grader_{context.task_id}_{context.attempt_kind}'),
        'task_id': str(raw.get('task_id') or context.task_id),
        'attempt_id': str(raw.get('attempt_id') or context.attempt_id),
        'attempt_kind': str(raw.get('attempt_kind') or context.attempt_kind),
        'rubric_id': str(raw.get('rubric_id') or context.rubric_id),
        'grader_kind': 'model',
        'score_source': 'model_judged',
        'score': final_score,
        'max_score': max_score,
        'passed': passed,
        'rubric_item_scores': items,
        'failed_criteria': failed,
        'passed_criteria': passed_criteria,
        'evidence_refs': refs,
        'confidence': _float(raw.get('confidence'), 0.7),
        'reasoning_summary': raw.get('reasoning_summary') or raw.get('summary') or raw.get('notes'),
        'limitations': limitations,
        'hidden_reference_used': _bool(raw.get('hidden_reference_used'), False),
        'hidden_reference_leaked': _bool(raw.get('hidden_reference_leaked'), False),
        'artifact_contract_score': artifact_contract_score,
        'semantic_score': semantic_score,
        'model_score': semantic_score,
        'legacy_semantic_score': semantic_score,
        'legacy_score_source': 'llm_semantic',
        'final_score': final_score,
        'model': raw.get('model') or context.model,
        'provider': raw.get('provider') or context.provider,
        'deterministic_precheck_ref': raw.get('deterministic_precheck_ref'),
        'llm_prompt_ref_internal': raw.get('llm_prompt_ref_internal'),
        'llm_response_ref_internal': raw.get('llm_response_ref_internal'),
        'public_prompt_hash': raw.get('public_prompt_hash'),
        'public_response_summary': raw.get('public_response_summary') or raw.get('reasoning_summary') or 'Model grader result normalized.',
        'metadata_json': md,
    }
|
|
396
|
+
|
|
397
|
+
def _first_present(raw: dict[str, Any], *keys: str) -> Any:
    """Return the first non-None value found in *raw* under any of *keys*.

    Keys are tried in the order given. A key that is absent, or whose value
    is ``None``, is skipped; falsy values such as ``0`` or ``False`` count as
    present. Returns ``None`` when no key yields a value.
    """
    for name in keys:
        candidate = raw.get(name)
        if candidate is not None:
            return candidate
    return None
|
|
402
|
+
|
|
403
|
+
def normalize_verifier_result(raw: dict[str, Any], context: RoleContext | None=None) -> dict[str, Any]:
    """Normalize a raw verifier payload into the verifier-result dict.

    The payload may use several alias keys for the same fact (for example
    ``passed`` / ``grade_accepted`` / ``accepted``); the first alias holding a
    non-None value wins. Every failed check is mirrored into ``issues`` so the
    result explains itself.

    Status resolution:
      1. A string ``verification_status`` that is one of ``verified``,
         ``partially_verified``, ``failed`` or ``not_run`` is kept as given.
      2. Otherwise the status is ``failed`` when the verifier explicitly did
         not pass the attempt, a hidden-reference leak was flagged, or there
         are unsupported claims.
      3. Otherwise it is ``verified`` when every check is OK, else
         ``partially_verified``.

    Args:
        raw: Verifier output; ``None`` or an empty mapping is treated as ``{}``.
            The function works on a shallow copy.
        context: Fallback identifiers (task, attempt, grader result, model,
            provider) used when *raw* omits them. Defaults to ``RoleContext()``.

    Returns:
        A new dict holding the normalized verifier fields.
    """
    known_statuses = ('verified', 'partially_verified', 'failed', 'not_run')

    context = context or RoleContext()
    raw = dict(raw or {})
    md = _metadata(raw, VERIFIER_ALLOWED)

    unsupported = [str(x) for x in _as_list(raw.get('unsupported_claims'))]
    issues = [str(x) for x in _as_list(raw.get('issues'))]
    # The payload may only flag that unsupported claims exist without listing
    # them; record a placeholder so the failure is still visible downstream.
    if raw.get('unsupported_claims_found') is True and not unsupported:
        unsupported.append('unsupported claims found')
    if unsupported:
        issues.extend(f'unsupported_claim: {claim}' for claim in unsupported)

    passed_raw = _first_present(raw, 'passed', 'grade_accepted', 'accepted')
    if passed_raw is False and not issues:
        issues.append('verifier marked attempt as not passed')

    # Missing check flags default to "OK" (and "not leaked") via the second
    # argument handed to _bool.
    artifact_ok = _bool(_first_present(raw, 'artifact_contract_ok'), True)
    grounding = _bool(_first_present(raw, 'semantic_evidence_grounding_ok', 'evidence_grounding_ok', 'grounded', 'evidence_grounded'), True)
    consistency = _bool(_first_present(raw, 'score_consistency_ok', 'score_consistent'), True)
    leaked = _bool(_first_present(raw, 'hidden_reference_leaked', 'hidden_reference_leak'), False)

    # Add a generic issue for each failed check unless an existing issue
    # already mentions it (substring match on the lower-cased text).
    if not grounding and not any('ground' in i.lower() for i in issues):
        issues.append('semantic evidence was not grounded')
    if not consistency and not any('score' in i.lower() for i in issues):
        issues.append('score was not internally consistent')
    if leaked and not any('hidden_reference' in i.lower() or 'leak' in i.lower() for i in issues):
        issues.append('hidden reference leakage detected')

    ok = artifact_ok and grounding and consistency and not leaked and passed_raw is not False and not unsupported

    # Only a string can match a known status. Checking the type first keeps a
    # malformed payload (e.g. a list or dict here) from raising TypeError on
    # the membership test; such values fall through to the derived status.
    declared_status = raw.get('verification_status')
    if isinstance(declared_status, str) and declared_status in known_statuses:
        status = declared_status
    elif passed_raw is False or leaked or unsupported:
        status = 'failed'
    else:
        status = 'verified' if ok else 'partially_verified'

    # An explicit boolean wins; anything else is derived from the leak flag.
    leakage_check_ok = raw.get('leakage_check_ok')
    if not isinstance(leakage_check_ok, bool):
        leakage_check_ok = not leaked

    return {
        'verifier_result_id': str(raw.get('verifier_result_id') or f'verifier_{context.task_id}_{context.attempt_kind}'),
        'task_id': str(raw.get('task_id') or context.task_id),
        'attempt_id': str(raw.get('attempt_id') or context.attempt_id),
        'attempt_kind': str(raw.get('attempt_kind') or context.attempt_kind),
        'grader_result_id': raw.get('grader_result_id') or context.grader_result_id,
        'verification_status': status,
        'artifact_contract_ok': artifact_ok,
        'evidence_grounding_ok': grounding,
        'score_consistency_ok': consistency,
        'hidden_reference_leaked': leaked,
        # dict.fromkeys drops duplicate issues while keeping first-seen order.
        'issues': list(dict.fromkeys(issues)),
        'confidence': _float(raw.get('confidence'), 0.7),
        'verifier_notes': raw.get('verifier_notes') or raw.get('notes') or raw.get('summary'),
        'semantic_evidence_grounding_ok': grounding,
        'unsupported_claims': unsupported,
        'leakage_check_ok': leakage_check_ok,
        'model': raw.get('model') or context.model,
        'provider': raw.get('provider') or context.provider,
        'metadata_json': md,
    }
|
|
454
|
+
|
|
455
|
+
def normalize_role_output(role: str, raw: dict[str, Any], context: RoleContext | dict[str, Any] | None=None) -> dict[str, Any]:
    """Route *raw* to the normalizer that matches *role*.

    A ``dict`` context is first expanded into a ``RoleContext``. Roles handled
    are ``intake_agent``, ``rubric_agent``, ``evaluator_agent``,
    ``grader_agent`` and ``verifier_agent``; for any other role *raw* is
    returned unchanged.
    """
    role_context = RoleContext(**context) if isinstance(context, dict) else context

    if role == 'intake_agent':
        normalizer = normalize_task_intake_result
    elif role == 'rubric_agent':
        normalizer = normalize_rubric_result
    elif role == 'evaluator_agent':
        normalizer = normalize_evaluator_result
    elif role == 'grader_agent':
        normalizer = normalize_grader_result
    elif role == 'verifier_agent':
        normalizer = normalize_verifier_result
    else:
        # Unknown roles pass through untouched.
        return raw
    return normalizer(raw, role_context)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Callable
|
|
4
|
+
from .schemas import RawTaskRecord, ActualOutputs, AgentTrace
|
|
5
|
+
from .config import get_settings
|
|
6
|
+
from .task_intake import task_intake
|
|
7
|
+
from .rubric_generation import generate_rubric
|
|
8
|
+
from .package_exporter import init_package, write_task_package, write_artifacts_index
|
|
9
|
+
from .apprentice_adapters import run_external_agent_attempt
|
|
10
|
+
from .codex_runner import deterministic_attempt, run_codex_attempt, run_custom_attempt
|
|
11
|
+
from .grader import grade_attempt, apply_score_reliability
|
|
12
|
+
from .verifier import verify_attempt
|
|
13
|
+
from .evaluator import evaluate_attempt
|
|
14
|
+
from .lesson_extractor import extract_lesson
|
|
15
|
+
from .training_signals import process_supervision_from_trace, reward_modeling, training_signals
|
|
16
|
+
from .revision import compute_hillclimb, preference_pair
|
|
17
|
+
from .io import write_json, append_jsonl, read_json
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _attempt(package_root, raw, spec, kind, runner, feedback=None):
    """Run one attempt of *kind* with the backend selected by *runner*.

    ``'deterministic'`` uses ``deterministic_attempt`` and is the only backend
    that receives *feedback*. ``'custom'`` uses ``run_custom_attempt``,
    ``'codex'`` uses ``run_codex_attempt``, and any other runner name is
    handed to ``run_external_agent_attempt``; these three are called with the
    configured ``task_timeout_seconds`` as their timeout.
    """
    settings = get_settings()
    if runner == 'deterministic':
        return deterministic_attempt(package_root, raw, spec, kind, feedback)

    if runner == 'custom':
        launch = run_custom_attempt
    elif runner not in {'codex', 'deterministic', 'custom'}:
        # Anything that is not a built-in runner name is an external agent.
        launch = run_external_agent_attempt
    else:
        launch = run_codex_attempt
    return launch(package_root, raw, spec, kind, timeout=settings.task_timeout_seconds)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _is_apprentice_operational_failure(actual: ActualOutputs) -> bool:
    """Tell whether the attempt's metadata carries an operational-error flag.

    Returns True when ``actual.metadata_json`` has a truthy value under
    ``apprentice_agent_operational_error`` or
    ``worker_agent_operational_error``; missing or empty metadata yields False.
    """
    flags = actual.metadata_json or {}
    error_keys = (
        "apprentice_agent_operational_error",
        "worker_agent_operational_error",
    )
    return any(flags.get(key) for key in error_keys)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _write_baseline_only_signals(pkg: Path, b_actual: ActualOutputs, b_grade, b_ver, b_trace: AgentTrace, fb) -> None:
    """Write the signal files for a run that ended after the baseline attempt.

    Writes ``signals/lesson_pack.json`` (the hill-climb is computed with the
    baseline grade on both sides, since no revised grade exists), appends the
    baseline process-supervision examples to
    ``signals/process_supervision.jsonl`` and one record to
    ``signals/reward_modeling.jsonl``.
    """
    baseline_hillclimb = compute_hillclimb(b_grade, b_grade)
    lesson_pack = extract_lesson(baseline_hillclimb)
    write_json(pkg / 'signals/lesson_pack.json', lesson_pack)

    supervision_examples = process_supervision_from_trace(b_trace, b_grade, b_ver, fb)
    for example in supervision_examples:
        append_jsonl(pkg / 'signals/process_supervision.jsonl', example)

    reward_path = pkg / 'signals/reward_modeling.jsonl'
    append_jsonl(reward_path, reward_modeling(b_actual, b_grade))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _effective_score(grade):
    """Return ``grade.final_score``, using ``grade.score`` only when it is None.

    The explicit ``is None`` test keeps a genuine final score of 0.0 from
    being mistaken for a missing value.
    """
    final = grade.final_score
    return grade.score if final is None else final


def _finalize_package(pkg, max_iter, actual_iterations, stop_reason, baseline_attempt_id, revised_attempt_ids, selected_attempt_id):
    """Write ``package_manifest.json`` and the artifacts index, then return *pkg*."""
    write_json(pkg / 'package_manifest.json', {
        'max_iterations': max_iter,
        'actual_iterations': actual_iterations,
        'loop_stop_reason': stop_reason,
        'baseline_attempt_id': baseline_attempt_id,
        'revised_attempt_ids': revised_attempt_ids,
        'selected_attempt_id': selected_attempt_id,
    })
    write_artifacts_index(pkg)
    return pkg


def run_task(
    raw: RawTaskRecord,
    run_root: Path,
    runner='deterministic',
    max_iterations: int | None=None,
    pre_attempt_callback: Callable[[Path], None] | None = None,
    revision_decision_callback: Callable[[Path], bool] | None = None,
) -> Path:
    """Run the baseline (and optionally one revised) attempt for a task.

    Steps: task intake and rubric generation, package initialisation, the
    baseline attempt, grading and verification, evaluator feedback, then, if
    allowed, a single revised attempt followed by the training-signal files.
    The run always ends by writing ``package_manifest.json`` and the
    artifacts index.

    The manifest's ``loop_stop_reason`` is one of:
      * ``apprentice_agent_operational_error``: the baseline attempt's
        metadata flags an operational error; nothing is graded.
      * ``max_iterations_reached``: the iteration budget is 1.
      * ``mentor_decision_finish``: ``revision_decision_callback`` returned
        a falsy value.
      * ``completed_baseline_and_revision``: the revised attempt ran.

    Args:
        raw: The raw task record to process.
        run_root: Directory under which role outputs and the package live.
        runner: Backend name passed to ``_attempt``.
        max_iterations: Iteration budget; ``None`` uses the configured
            ``max_iterations`` setting. Values below 1 are raised to 1.
        pre_attempt_callback: Called with the package path before the
            baseline attempt starts.
        revision_decision_callback: Called with the package path after the
            baseline feedback is written. Its return value decides whether
            to revise, but is ignored when the iteration budget is 1.

    Returns:
        The package directory path.
    """
    settings = get_settings()
    max_iter = max_iterations if max_iterations is not None else settings.max_iterations
    max_iter = max(1, int(max_iter))

    role_root = run_root / 'roles' / raw.raw_task_id
    spec, q = task_intake(raw, role_root)
    rubric, rq = generate_rubric(spec, role_root)
    pkg = init_package(run_root, spec.task_id)
    write_task_package(pkg, raw, spec, q, rubric, rq)
    if pre_attempt_callback:
        pre_attempt_callback(pkg)

    # The attempt's results are read back from the files under
    # attempts/baseline, so the call's return value is not needed.
    _attempt(pkg, raw, spec, 'baseline', runner)
    b_actual = ActualOutputs.model_validate(read_json(pkg / 'attempts/baseline/actual_outputs.json'))
    b_trace = AgentTrace.model_validate(read_json(pkg / 'attempts/baseline/agent_trace.json'))
    revised_attempt_ids = []

    if _is_apprentice_operational_failure(b_actual):
        return _finalize_package(
            pkg, max_iter, 1, 'apprentice_agent_operational_error',
            b_actual.attempt_id, revised_attempt_ids, b_actual.attempt_id,
        )

    b_grade = grade_attempt(rubric, b_actual, 'baseline', b_trace, role_root, pkg)
    b_ver = verify_attempt(b_grade, b_actual, b_trace, role_root, pkg)
    b_grade = apply_score_reliability(b_grade, b_ver)
    write_json(pkg / 'grading/baseline_grader_result.json', b_grade)
    write_json(pkg / 'grading/baseline_verifier_result.json', b_ver)

    fb, rp = evaluate_attempt(b_grade, b_ver, b_actual, b_trace, f'{spec.task_id}_revised', role_root, pkg)
    write_json(pkg / 'feedback/baseline_evaluator_feedback.json', fb)
    write_json(pkg / 'feedback/revision_plan.json', rp)

    # The callback is always invoked when supplied, even if the iteration
    # budget already rules out a revision.
    wants_revision = True
    if revision_decision_callback:
        wants_revision = bool(revision_decision_callback(pkg))

    early_stop_reason = None
    if max_iter <= 1:
        early_stop_reason = 'max_iterations_reached'
    elif not wants_revision:
        early_stop_reason = 'mentor_decision_finish'

    if early_stop_reason is not None:
        _write_baseline_only_signals(pkg, b_actual, b_grade, b_ver, b_trace, fb)
        return _finalize_package(
            pkg, max_iter, 1, early_stop_reason,
            b_actual.attempt_id, revised_attempt_ids, b_actual.attempt_id,
        )

    # As with the baseline, results are read back from attempts/revised.
    _attempt(pkg, raw, spec, 'revised', runner, fb.feedback_summary)
    r_actual = ActualOutputs.model_validate(read_json(pkg / 'attempts/revised/actual_outputs.json'))
    r_trace = AgentTrace.model_validate(read_json(pkg / 'attempts/revised/agent_trace.json'))
    r_grade = grade_attempt(rubric, r_actual, 'revised', r_trace, role_root, pkg)
    r_ver = verify_attempt(r_grade, r_actual, r_trace, role_root, pkg)
    r_grade = apply_score_reliability(r_grade, r_ver)
    write_json(pkg / 'grading/revised_grader_result.json', r_grade)
    write_json(pkg / 'grading/revised_verifier_result.json', r_ver)
    revised_attempt_ids.append(r_actual.attempt_id)

    # A tie goes to the revised attempt.
    if _effective_score(r_grade) >= _effective_score(b_grade):
        selected_attempt_id = r_actual.attempt_id
    else:
        selected_attempt_id = b_actual.attempt_id

    hill = compute_hillclimb(b_grade, r_grade)
    write_json(pkg / 'signals/hillclimb_result.json', hill)
    lesson = extract_lesson(hill)
    write_json(pkg / 'signals/lesson_pack.json', lesson)

    trace_refs = ['attempts/baseline/agent_trace.json', 'attempts/revised/agent_trace.json']
    grader_refs = ['grading/baseline_grader_result.json', 'grading/revised_grader_result.json']
    verifier_refs = ['grading/baseline_verifier_result.json', 'grading/revised_verifier_result.json']
    for sig in training_signals(hill, trace_refs, grader_refs, verifier_refs):
        append_jsonl(pkg / 'signals/training_signals.jsonl', sig)

    supervision = (
        process_supervision_from_trace(b_trace, b_grade, b_ver, fb)
        + process_supervision_from_trace(r_trace, r_grade, r_ver, fb)
    )
    for ex in supervision:
        append_jsonl(pkg / 'signals/process_supervision.jsonl', ex)

    append_jsonl(pkg / 'signals/reward_modeling.jsonl', reward_modeling(b_actual, b_grade))
    append_jsonl(pkg / 'signals/reward_modeling.jsonl', reward_modeling(r_actual, r_grade))
    append_jsonl(pkg / 'signals/revision_preference_pairs.jsonl', preference_pair(hill, 'rubric/rubric.json'))

    return _finalize_package(
        pkg, max_iter, 2, 'completed_baseline_and_revision',
        b_actual.attempt_id, revised_attempt_ids, selected_attempt_id,
    )
|