agent-apprenticeship 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +217 -0
- package/bin/agent-apprenticeship.js +131 -0
- package/package.json +30 -0
- package/pyproject.toml +23 -0
- package/src/agent_apprenticeship_trace/__init__.py +2 -0
- package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
- package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
- package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
- package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
- package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
- package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
- package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
- package/src/agent_apprenticeship_trace/certification.py +580 -0
- package/src/agent_apprenticeship_trace/cli.py +2979 -0
- package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
- package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
- package/src/agent_apprenticeship_trace/config.py +609 -0
- package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
- package/src/agent_apprenticeship_trace/env.py +46 -0
- package/src/agent_apprenticeship_trace/evaluator.py +64 -0
- package/src/agent_apprenticeship_trace/grader.py +194 -0
- package/src/agent_apprenticeship_trace/integration_status.py +193 -0
- package/src/agent_apprenticeship_trace/io.py +20 -0
- package/src/agent_apprenticeship_trace/learning.py +627 -0
- package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
- package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
- package/src/agent_apprenticeship_trace/loop.py +111 -0
- package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
- package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
- package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
- package/src/agent_apprenticeship_trace/progress.py +223 -0
- package/src/agent_apprenticeship_trace/public_run.py +1109 -0
- package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
- package/src/agent_apprenticeship_trace/recipes.py +129 -0
- package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
- package/src/agent_apprenticeship_trace/revision.py +21 -0
- package/src/agent_apprenticeship_trace/role_runners.py +7 -0
- package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
- package/src/agent_apprenticeship_trace/schemas.py +273 -0
- package/src/agent_apprenticeship_trace/session_events.py +99 -0
- package/src/agent_apprenticeship_trace/task_intake.py +112 -0
- package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
- package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
- package/src/agent_apprenticeship_trace/training_signals.py +30 -0
- package/src/agent_apprenticeship_trace/validation.py +210 -0
- package/src/agent_apprenticeship_trace/verifier.py +55 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
from .schemas import TaskIntakeSpec, RubricItem, RubricSpec, RubricQualityReport
|
|
6
|
+
from .config import get_settings
|
|
7
|
+
from .io import read_json
|
|
8
|
+
from .openai_structured import get_model_provider_status, run_structured_role
|
|
9
|
+
from .public_sanitizer import sanitize_public_obj, sha256_text
|
|
10
|
+
|
|
11
|
+
class LLMRubricOutput(BaseModel):
    """Top-level shape requested from the rubric agent.

    ``generate_rubric`` passes this class to ``run_structured_role`` as the
    structured-output model. It mirrors the two top-level keys of the JSON
    skeleton in ``_rubric_prompt``.
    """
    # The rubric drafted by the model.
    rubric_spec: RubricSpec
    # The quality report returned alongside the rubric.
    rubric_quality_report: RubricQualityReport
|
|
14
|
+
|
|
15
|
+
def _mentor_provider_can_attempt() -> bool:
    """Return True when the provider status reports ``provider_available`` as truthy."""
    status = get_model_provider_status()
    available = status.get('provider_available')
    return bool(available)
|
|
17
|
+
|
|
18
|
+
def _mentor_provider_id() -> str:
    """Return the configured ``model_provider``, or ``'openai'`` when it is unset/empty."""
    configured = get_settings().model_provider
    if configured:
        return configured
    return 'openai'
|
|
21
|
+
|
|
22
|
+
def deterministic_rubric(spec: TaskIntakeSpec) -> tuple[RubricSpec, RubricQualityReport]:
    """Build a rubric and its quality report from the task spec alone (no model call).

    One equally weighted rubric item is produced per entry of
    ``spec.output_requirements``; when that list is empty a single
    ``'final deliverable'`` item is used instead. The rubric is tagged as a
    low-confidence deterministic fallback in its metadata.

    When settings indicate that LLM rubric generation was enabled and a
    provider could have been attempted, the rubric is additionally annotated
    with the provider/model and marked ``hybrid``, and the quality report
    receives the ``deterministic_fallback_used`` flag.

    Returns:
        ``(rubric, quality_report)``.
    """
    requirements = spec.output_requirements or ['final deliverable']
    # The fallback above guarantees at least one requirement, so this never divides by zero.
    per_item_weight = 1.0 / len(requirements)

    rubric_items = []
    for index, requirement in enumerate(requirements):
        rubric_items.append(
            RubricItem(
                rubric_item_id=f'ri_{index+1}',
                criterion_name=requirement,
                criterion_description=f'Output satisfies {requirement}',
                weight=per_item_weight,
                score_min=0,
                score_max=1,
                pass_threshold=0.7,
                observable_evidence=[requirement],
                required_artifacts=[requirement],
                scoring_method='deterministic',
                worker_visible=True,
                verifier_only=False,
                hidden_reference_required=False,
                failure_modes=['missing', 'incorrect'],
                partial_credit_rules=['partial if substantially present'],
                edge_cases=[],
                anti_cheat_notes=[],
                metadata_json={},
            )
        )

    rubric = RubricSpec(
        rubric_id=f'rubric_{spec.task_id}',
        task_id=spec.task_id,
        task_family_id=None,
        rubric_version='v0.1',
        rubric_items=rubric_items,
        total_weight=1.0,
        pass_threshold=0.7,
        worker_visible_rubric_ref='rubric/worker_visible_rubric.md',
        verifier_private_rubric_ref='rubric/verifier_private_rubric.json',
        hidden_reference_policy='No hidden references for deterministic seed.',
        scoring_aggregation='weighted_sum',
        required_artifacts=requirements,
        disqualifying_errors=['secret leak'],
        partial_credit_allowed=True,
        grader_kind='deterministic',
        rubric_generation_source='deterministic_seed',
        rubric_generation_agent_provider=None,
        rubric_generation_agent_model=None,
        rubric_generation_confidence=0.4,
        metadata_json={
            'rubric_source': 'deterministic_fallback',
            'rubric_limitations': ['LLM rubric generation unavailable'],
            'rubric_confidence': 'low',
        },
    )

    report = RubricQualityReport(
        rubric_id=rubric.rubric_id,
        task_id=spec.task_id,
        criteria_count=len(rubric_items),
        total_weight=1.0,
        weights_sum_valid=True,
        has_observable_evidence=True,
        has_required_artifacts=True,
        has_partial_credit_rules=True,
        has_disqualifying_errors=True,
        has_hidden_reference_policy=True,
        has_worker_visible_view=True,
        has_verifier_private_view=True,
        ambiguous_criteria_count=0,
        unverifiable_criteria_count=0,
        rubric_quality_score=0.85,
        quality_flags=[],
        blockers=[],
        metadata_json={},
    )

    settings = get_settings()
    # Condition order is kept as-is so the provider probe only runs for LLM-capable modes.
    if (
        settings.rubric_mode in {'hybrid', 'llm_default'}
        and _mentor_provider_can_attempt()
        and settings.llm_rubric_generation_enabled
    ):
        provider = _mentor_provider_id()
        rubric.grader_kind = 'hybrid'
        rubric.rubric_generation_agent_provider = provider
        rubric.rubric_generation_agent_model = settings.model_provider_model or settings.openai_model
        rubric.metadata_json.update({
            'llm_rubric_generation_enabled': True,
            'llm_unavailable': True,
            'rubric_source': 'deterministic_fallback',
            'provider': provider,
        })
        report.quality_flags.append('deterministic_fallback_used')
    return rubric, report
|
|
34
|
+
|
|
35
|
+
def deterministic_rubric_quality_check(rubric: RubricSpec) -> RubricQualityReport:
    """Compute a ``RubricQualityReport`` for ``rubric`` using fixed, model-free checks.

    An item counts as *vague* when its description contains the phrase
    ``'good quality'`` (case-insensitive) and it lists no observable evidence.
    Any vague item zeroes the quality score and adds a flag and a blocker;
    otherwise the score is ``0.85``.

    ``weights_sum_valid`` follows the same rule as ``RubricSpec.valid_rubric``:
    item weights may sum to either ``1.0`` or ``100`` (within ``1e-6``), so a
    rubric the schema accepts is not reported as having invalid weights.

    Args:
        rubric: The rubric to inspect.

    Returns:
        The populated quality report (``metadata_json`` starts empty).
    """
    items = rubric.rubric_items
    vague = sum(
        1
        for item in items
        if 'good quality' in (item.criterion_description or '').lower()
        and not item.observable_evidence
    )
    weight_sum = sum(item.weight for item in items)
    # Accept both the fractional (1.0) and percentage (100) scales, like the schema validator.
    weights_sum_valid = abs(weight_sum - 1.0) < 1e-6 or abs(weight_sum - 100) < 1e-6
    return RubricQualityReport(
        rubric_id=rubric.rubric_id,
        task_id=rubric.task_id,
        criteria_count=len(items),
        total_weight=rubric.total_weight,
        weights_sum_valid=weights_sum_valid,
        has_observable_evidence=all(bool(item.observable_evidence) for item in items),
        has_required_artifacts=all(bool(item.required_artifacts) for item in items),
        has_partial_credit_rules=all(bool(item.partial_credit_rules) for item in items),
        has_disqualifying_errors=bool(rubric.disqualifying_errors),
        has_hidden_reference_policy=bool(rubric.hidden_reference_policy),
        has_worker_visible_view=any(item.worker_visible for item in items),
        has_verifier_private_view=bool(rubric.verifier_private_rubric_ref),
        ambiguous_criteria_count=vague,
        unverifiable_criteria_count=sum(1 for item in items if not item.observable_evidence),
        rubric_quality_score=0.0 if vague else 0.85,
        quality_flags=['vague_rubric_item'] if vague else [],
        blockers=['vague criteria without evidence'] if vague else [],
        metadata_json={},
    )
|
|
38
|
+
|
|
39
|
+
def _rubric_prompt(spec: TaskIntakeSpec) -> str:
    """Assemble the rubric-agent prompt for ``spec``.

    The prompt is a fixed instruction block (JSON-only output, the required
    skeleton, and rubric authoring guidance) followed by the task spec
    serialized as JSON with sorted keys. The spec is dumped in JSON mode and
    passed through ``sanitize_public_obj`` before being embedded.
    """
    instructions = """Return only valid JSON. Do not include markdown. Do not add extra top-level fields; place extras under metadata_json.extra_model_fields.
Required skeleton: {"rubric_spec":{"rubric_id":"rubric_<task_id>","task_id":"...","rubric_version":"0.1","rubric_items":[{"rubric_item_id":"ri_1","criterion_name":"...","criterion_description":"...","weight":1.0,"score_min":0,"score_max":1,"pass_threshold":0.7,"observable_evidence":[],"required_artifacts":[],"scoring_method":"llm_rubric_judge","worker_visible":true,"verifier_only":false,"hidden_reference_required":false,"failure_modes":[],"partial_credit_rules":[],"edge_cases":[],"anti_cheat_notes":[],"metadata_json":{}}],"total_weight":1.0,"pass_threshold":0.7,"worker_visible_rubric_ref":"rubric/worker_visible_rubric.md","verifier_private_rubric_ref":"rubric/verifier_private_rubric.json","hidden_reference_policy":"no_hidden_reference_available","scoring_aggregation":"weighted_sum","required_artifacts":[],"disqualifying_errors":[],"partial_credit_allowed":true,"grader_kind":"llm_rubric_judge","rubric_generation_source":"task_specific_agent_draft","metadata_json":{}},"rubric_quality_report":{"rubric_id":"...","task_id":"...","criteria_count":1,"total_weight":1.0,"weights_sum_valid":true,"has_observable_evidence":true,"has_required_artifacts":true,"has_partial_credit_rules":true,"has_disqualifying_errors":false,"has_hidden_reference_policy":true,"has_worker_visible_view":true,"has_verifier_private_view":true,"ambiguous_criteria_count":0,"unverifiable_criteria_count":0,"rubric_quality_score":0.75,"quality_flags":[],"blockers":[],"metadata_json":{}}}.
Create a task-specific rubric with weighted rubric_items that sum to 1.0. Include explicit required_artifacts, observable_evidence, success criteria, failure modes, grader/verifier/evaluator guidance in metadata, limitations, and public-safe metadata. Do not leak hidden/reference answers to worker-visible fields.
TaskIntakeSpec JSON:
"""
    sanitized_spec = sanitize_public_obj(spec.model_dump(mode='json'))
    serialized_spec = json.dumps(sanitized_spec, sort_keys=True)
    return instructions + serialized_spec
|
|
46
|
+
|
|
47
|
+
def generate_rubric(spec: TaskIntakeSpec, role_root: Path | None=None) -> tuple[RubricSpec, RubricQualityReport]:
    """Produce a rubric for ``spec``, using the model when allowed and the deterministic seed otherwise.

    The model path runs only when LLM rubric generation is enabled, the rubric
    mode is ``hybrid``/``llm_default``/``llm_required``, and a provider can be
    attempted. On a successful, schema-valid model response the rubric is read
    back from ``<role_root>/rubric_agent/parsed_output.json``, re-checked with
    ``deterministic_rubric_quality_check`` and annotated with provenance
    metadata.

    Args:
        spec: The task intake spec the rubric is generated for.
        role_root: Directory under which the ``rubric_agent`` role artifacts
            live; defaults to ``outputs/roles``.

    Returns:
        ``(rubric, quality_report)`` from the model path, or from
        ``deterministic_rubric`` when the model path is skipped or fails.

    Raises:
        RuntimeError: The model call did not succeed and the mode is
            ``llm_required`` or ``llm_fail_closed`` is set.
        Exception: Any error raised inside the model path is re-raised under
            the same fail-closed conditions; otherwise it is swallowed and the
            deterministic rubric is returned.
    """
    settings = get_settings()
    role_root = role_root or Path('outputs/roles')
    if settings.llm_rubric_generation_enabled and settings.rubric_mode in {'hybrid','llm_default','llm_required'} and _mentor_provider_can_attempt():
        prompt = _rubric_prompt(spec)
        try:
            provider = _mentor_provider_id()
            # The rubric-specific model override is only applied for the openai provider.
            model_override = settings.llm_rubric_model if provider == 'openai' else None
            rr = run_structured_role(
                'rubric_agent',
                prompt,
                LLMRubricOutput,
                role_root/'rubric_agent',
                allow_fallback=settings.allow_deterministic_eval_fallback,
                model_override=model_override,
                normalizer_context={
                    'task_id': spec.task_id,
                    'task_title': spec.normalized_title,
                    'task_instruction': spec.normalized_instruction,
                    'model': model_override or settings.model_provider_model,
                    'provider': provider,
                },
            )
            if rr.live_call_ok and rr.structured_output_validation_ok:
                parsed = read_json(role_root/'rubric_agent/parsed_output.json')
                rub = RubricSpec.model_validate(parsed['rubric_spec'])
                # The model's own quality report is not trusted; recompute it deterministically.
                qr = deterministic_rubric_quality_check(rub)
                rub.grader_kind = 'hybrid'
                rub.rubric_generation_agent_provider = rr.provider
                rub.rubric_generation_agent_model = rr.model
                # Only fill in the default when the model gave no confidence at all;
                # an explicit 0.0 is a real value and must not be replaced.
                if rub.rubric_generation_confidence is None:
                    rub.rubric_generation_confidence = 0.75
                rub.metadata_json.update({
                    'rubric_source': 'llm',
                    'provider': rr.provider,
                    'model': rr.model,
                    'llm_prompt_ref_internal': str(role_root/'rubric_agent/prompt.md'),
                    'llm_response_ref_internal': str(role_root/'rubric_agent/raw_output.txt'),
                    'prompt_template_id': 'rubric_generation_agent_v0',
                    'prompt_template_version': '0.1',
                    'prompt_hash': sha256_text(prompt),
                    'public_response_summary': 'Model-generated rubric passed deterministic quality checking.',
                })
                qr.metadata_json.update({
                    'rubric_source': 'llm',
                    'role_result_ref_internal': str(role_root/'rubric_agent/role_result.json'),
                })
                return rub, qr
            if settings.rubric_mode == 'llm_required' or settings.llm_fail_closed:
                raise RuntimeError(rr.error_message or 'Model rubric generation failed')
        except Exception:
            # Deliberate best-effort: outside fail-closed modes any model-path error
            # (including the RuntimeError above) falls through to the deterministic seed.
            if settings.rubric_mode == 'llm_required' or settings.llm_fail_closed:
                raise
    return deterministic_rubric(spec)
|
|
69
|
+
|
|
70
|
+
def worker_visible_markdown(rubric: RubricSpec) -> str:
    """Render the worker-facing view of ``rubric`` as Markdown.

    The output is a heading naming the task, a blank line, and one bullet per
    rubric item that is ``worker_visible`` and not ``verifier_only``. The text
    always ends with a newline.
    """
    heading = f'# Worker-visible rubric for {rubric.task_id}'
    bullets = [
        f'- **{entry.criterion_name}** ({entry.weight}): {entry.criterion_description}'
        for entry in rubric.rubric_items
        if entry.worker_visible and not entry.verifier_only
    ]
    return '\n'.join([heading, '', *bullets]) + '\n'
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Literal, Any
|
|
3
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
4
|
+
|
|
5
|
+
# Alias for string-keyed dictionaries with arbitrary values; used below for the
# free-form ``metadata_json`` / payload fields.
DictAny = dict[str, Any]
|
|
6
|
+
|
|
7
|
+
class StrictModel(BaseModel):
    """Base class for the schema models below; undeclared fields are rejected."""
    # extra="forbid" makes validation fail on input keys that are not declared fields.
    model_config = ConfigDict(extra="forbid")
|
|
9
|
+
|
|
10
|
+
class ArtifactRef(StrictModel):
    """Descriptor for one artifact belonging to a task (and optionally an attempt).

    ``artifact_kind``, ``artifact_role`` and ``media_type`` are closed
    vocabularies. Required fields are ``artifact_id``, ``task_id``,
    ``artifact_kind``, ``artifact_role``, ``package_relative_path``,
    ``media_type`` and ``secret_scan_ok``.
    """
    artifact_id: str
    task_id: str
    attempt_id: str | None = None
    artifact_kind: Literal["input","output","intermediate","log","trace","score","rubric","feedback","lesson","package_file","media","document","unknown"]
    artifact_role: Literal["task_input","worker_output","reviser_output","apprentice_output","reference","hidden_reference","grader_output","verifier_output","evaluator_output","system_log","trace_file","other"]
    # Location fields: only the package-relative path is mandatory.
    workspace_path: str | None = None
    package_relative_path: str
    release_relative_path: str | None = None
    mime_type: str | None = None
    media_type: Literal["text","code","data","document","image","audio","video","archive","binary","unknown"]
    size_bytes: int | None = None
    content_hash: str | None = None
    # Required flag; presumably the outcome of a secret scan over the artifact — confirm with the producer.
    secret_scan_ok: bool
    metadata_json: DictAny = Field(default_factory=dict)
|
|
25
|
+
|
|
26
|
+
class ActualOutputs(StrictModel):
    """Record of what one attempt produced: status, summary and references.

    All ``*_ref`` / ``*_refs`` fields are plain strings; this model does not
    check what they point to. List fields default to empty lists.
    """
    task_id: str
    attempt_id: str
    attempt_kind: str
    status: Literal["success","partial","failed","timeout","error"]
    output_summary: str
    primary_output_ref: str | None = None
    deliverable_refs: list[str] = Field(default_factory=list)
    final_message_ref: str | None = None
    artifact_refs: list[str] = Field(default_factory=list)
    # File-level changes attributed to the attempt.
    files_created: list[str] = Field(default_factory=list)
    files_modified: list[str] = Field(default_factory=list)
    files_deleted: list[str] = Field(default_factory=list)
    stdout_ref: str | None = None
    stderr_ref: str | None = None
    raw_log_refs: list[str] = Field(default_factory=list)
    error_type: str | None = None
    error_message: str | None = None
    metadata_json: DictAny = Field(default_factory=dict)
|
|
45
|
+
|
|
46
|
+
class AgentTraceStep(StrictModel):
    """One step of an agent trace.

    Invariants enforced by ``validate_step``:

    * ``step`` is at least 1.
    * A ``user_message`` step must have ``actor == "user"`` and must leave
      ``operation``, ``tool``, ``execution_mode``, ``observation``,
      ``reasoning``, ``success``, ``step_outcome`` and ``output`` unset.
    * Every other action must set ``operation``.
    """
    step: int
    turn: int
    actor: str
    action: Literal["user_message","agent_step","output","error"]
    operation: Literal["plan","analyze","search","read","write","edit","execute","verify","download","install","ask_user","answer","select","grade","evaluate","revise","other"] | None = None
    tool: str | None = None
    execution_mode: Literal["serial","parallel"] | None = None
    parallel_group: str | None = None
    observation: str | None = None
    input: str | None = None
    input_source: DictAny | None = None
    output: str | None = None
    state_change: str | None = None
    reasoning: str | None = None
    # Step numbers this step depends on; AgentTrace checks they point to earlier steps.
    caused_by: list[int] | None = None
    causal_type: Literal["user_request","follow_up_user_request","answer_to_agent_question","execution_of_plan","dependency_on_tool_result","retry_after_failure","correction_response","approval_response","verification_of_prior_step","dependency_on_multiple_prior_steps","delegation_to_subagent","delegated_work","used_subagent_result","handoff_from_subagent","parallel_work","other"] | None = None
    causal_note: str | None = None
    alternatives_considered: str | None = None
    success: bool | None = None
    step_outcome: Literal["progress","neutral","blocked","failed","corrected","completed"] | None = None
    error_type: str | None = None
    error_message: str | None = None
    message_role: Literal["direct_request","answer_to_agent_question","correction","approval","clarification","selection","status_update","new_constraint","other"] | None = None
    feedback_type: Literal["correction","approval","clarification","new_instruction","other"] | None = None
    feedback_content: str | None = None
    started_at: str | None = None
    ended_at: str | None = None
    retry_of: int | None = None
    artifact_refs: list[str] = Field(default_factory=list)
    metadata_json: DictAny = Field(default_factory=dict)

    @model_validator(mode="after")
    def validate_step(self):
        """Enforce the per-step invariants listed in the class docstring.

        Raises:
            ValueError: On a step number below 1, a malformed ``user_message``
                step, or a non-user step without ``operation``.
        """
        if self.step < 1:
            raise ValueError("step must start at 1")
        if self.action == "user_message":
            required_null = ["operation","tool","execution_mode","observation","reasoning","success","step_outcome","output"]
            bad = [name for name in required_null if getattr(self, name) is not None]
            # Report a wrong actor by name too; otherwise the message would list nothing.
            if self.actor != "user":
                bad.insert(0, "actor")
            if bad:
                raise ValueError(f"user_message invariants failed: {bad}")
        elif self.operation is None:
            raise ValueError("operation is required for non-user steps")
        return self
|
|
89
|
+
|
|
90
|
+
class AgentTrace(StrictModel):
    """A full trace of one attempt at a task: header fields plus ordered steps.

    Invariants enforced by ``validate_trace``:

    * ``steps`` is non-empty.
    * Step numbers are exactly ``1, 2, ..., len(steps)`` in order.
    * Every ``caused_by`` entry refers to an earlier step (``1 <= ref < step``).
    """
    schema_version: str = "aa-trace-v0.1"
    trace_id: str
    collection_id: str | None = None
    prior_trace_id: str | None = None
    trace_mode: Literal["live","retraced","hybrid"]
    task: str
    task_id: str
    task_family_id: str | None = None
    attempt_id: str
    attempt_kind: Literal["baseline","revised","apprentice_without_lessons","apprentice_with_lessons","other"]
    attempt_status: Literal["completed","failed","blocked","fallback","partial"] | None = None
    agent_tools: list[str]
    started_at: str | None = None
    ended_at: str | None = None
    system_prompt: str | None = None
    system_prompt_hash: str | None = None
    skills: list[str] | None = None
    memory: str | None = None
    agent_config: DictAny | None = None
    learning: str | None = None
    termination_reason: Literal["task_complete","verifier_passed","verifier_failed","max_iterations_reached","agent_blocked","timeout","error_unrecoverable","partial_then_stopped","provider_usage_limit","other"]
    steps: list[AgentTraceStep]
    actual_outputs: ActualOutputs | None = None
    artifacts: list[ArtifactRef] = Field(default_factory=list)
    # Revision bookkeeping; all optional.
    iteration_index: int | None = None
    previous_attempt_id: str | None = None
    revision_group_id: str | None = None
    completion_reason: str | None = None
    final_attempt_id: str | None = None
    preferred_attempt_id: str | None = None
    initial_attempt_id: str | None = None
    revision_attempt_ids: list[str] = Field(default_factory=list)
    metadata_json: DictAny = Field(default_factory=dict)

    @model_validator(mode="after")
    def validate_trace(self):
        """Check step numbering and causality references across the whole trace.

        Raises:
            ValueError: If there are no steps, the step numbers are not the
                contiguous sequence starting at 1, or a ``caused_by`` entry
                does not point to an earlier step.
        """
        if not self.steps:
            raise ValueError("trace must have steps")
        nums = [s.step for s in self.steps]
        if nums != list(range(1, len(nums)+1)):
            raise ValueError("step numbers must be monotonic starting at 1")
        for s in self.steps:
            # Numbering is already known to be 1..n, so "earlier" is simply 1 <= ref < s.step.
            if s.caused_by and any(c >= s.step or c < 1 for c in s.caused_by):
                raise ValueError("causality refs must point to earlier steps")
        return self
|
|
136
|
+
|
|
137
|
+
class RawTaskRecord(StrictModel):
    """A task as received from its source, before intake normalization.

    Besides the ``raw_*`` fields the model also accepts "task sheet" style
    input (``task_id``, ``normalized_title``, ``normalized_instruction``, ...);
    ``accept_task_sheet`` maps such input onto the required raw fields before
    validation.
    """
    raw_task_id: str
    source_kind: str
    source_url: str | None = None
    source_license: str | None = None
    raw_title: str
    raw_description: str
    raw_payload: DictAny = Field(default_factory=dict)
    task_id: str | None = None
    normalized_title: str | None = None
    normalized_instruction: str | None = None
    input_artifact_refs: list[str] = Field(default_factory=list)
    created_at: str | None = None
    metadata_json: DictAny = Field(default_factory=dict)
    normalized_domain: str | None = None
    normalized_subdomain: str | None = None
    apprenticeship_role: str | None = None
    task_family: str | None = None
    expected_deliverable: str | None = None
    expected_economic_value: str | None = None
    expected_economic_value_for_agent_apprentice: str | None = None
    expected_pay: str | None = None
    expected_apprentice_pay: str | None = None
    source_url_or_ref: str | None = None
    difficulty_tier: Literal["easy","medium","hard","expert"] | None = None
    needs_expert_review: bool | None = None

    @model_validator(mode="before")
    @classmethod
    def accept_task_sheet(cls, data):
        """Fill the required raw fields from task-sheet style keys when they are missing.

        Non-dict input is returned untouched. For dict input a shallow copy is
        adjusted as follows:

        * ``raw_task_id``, ``raw_title``, ``raw_description``, ``source_url``
          and the two ``expected_economic_value*`` fields are copied from their
          task-sheet counterparts when absent.
        * Selected descriptive keys are mirrored into ``raw_payload`` unless
          the payload already has them.
        * ``source_kind`` defaults to ``'manual_seed'`` when missing, ``None``
          or empty; ``raw_title`` and ``raw_description`` get fallbacks when
          missing.
        """
        if not isinstance(data, dict):
            return data
        d = dict(data)
        payload = dict(d.get('raw_payload') or {})
        if 'task_id' in d and 'raw_task_id' not in d:
            d['raw_task_id'] = d['task_id']
        if 'normalized_title' in d and 'raw_title' not in d:
            d['raw_title'] = d['normalized_title']
        if 'normalized_instruction' in d and 'raw_description' not in d:
            d['raw_description'] = d['normalized_instruction']
        if 'source_url_or_ref' in d and 'source_url' not in d:
            d['source_url'] = d['source_url_or_ref']
        if 'expected_pay' in d and 'expected_economic_value' not in d:
            d['expected_economic_value'] = d['expected_pay']
        if 'expected_apprentice_pay' in d and 'expected_economic_value_for_agent_apprentice' not in d:
            d['expected_economic_value_for_agent_apprentice'] = d['expected_apprentice_pay']
        for key in ['normalized_domain','normalized_subdomain','apprenticeship_role','task_family','expected_deliverable','expected_economic_value','expected_economic_value_for_agent_apprentice','expected_pay','expected_apprentice_pay','difficulty_tier','needs_expert_review']:
            if key in d and key not in payload:
                payload[key] = d[key]
        # Plain assignment (not setdefault) so an explicit None/'' also gets the default.
        d['source_kind'] = d.get('source_kind') or 'manual_seed'
        d.setdefault('raw_title', d.get('raw_task_id','untitled_task'))
        d.setdefault('raw_description', d.get('expected_deliverable') or '')
        d['raw_payload'] = payload
        return d
|
|
191
|
+
|
|
192
|
+
class TaskIntakeSpec(StrictModel):
    """Normalized description of a task as used by rubric generation.

    The legacy ``expected_pay`` / ``expected_apprentice_pay`` fields and the
    ``expected_economic_value*`` fields are kept in sync by
    ``accept_legacy_economic_fields``.
    """
    task_id: str
    normalized_title: str
    normalized_instruction: str
    domain: str
    subdomain: str | None = None
    professional_role: str | None = None
    apprenticeship_role: str | None = None
    task_family: str | None = None
    expected_economic_value: str | None = None
    expected_economic_value_for_agent_apprentice: str | None = None
    expected_pay: str | None = None
    expected_apprentice_pay: str | None = None
    workflow_type: str
    skill_targets: list[str] = Field(default_factory=list)
    difficulty_tier: Literal["easy","medium","hard","expert"]
    expected_human_deliverable: str
    expected_agent_deliverable: str
    input_requirements: list[str] = Field(default_factory=list)
    output_requirements: list[str] = Field(default_factory=list)
    required_context: list[str] = Field(default_factory=list)
    assumptions: list[str] = Field(default_factory=list)
    constraints: list[str] = Field(default_factory=list)
    allowed_tools: list[str] = Field(default_factory=list)
    disallowed_tools: list[str] = Field(default_factory=list)
    privacy_classification: Literal["public","synthetic","sensitive_possible","contains_pii","unknown"]
    license: str | None = None
    allowed_use: str | None = None
    rubricability_score: float
    verifiability_score: float
    artifactability_score: float
    needs_expert_review: bool
    metadata_json: DictAny = Field(default_factory=dict)

    @model_validator(mode="before")
    @classmethod
    def accept_legacy_economic_fields(cls, data):
        """Mirror each legacy pay field and its economic-value counterpart into one another.

        Non-dict input is returned unchanged. For dict input a shallow copy is
        returned in which, for each pair, a value present on only one side
        (``None`` counts as absent) is copied to the other side.
        """
        if not isinstance(data, dict):
            return data
        merged = dict(data)
        # (legacy name, current name)
        pairs = (
            ('expected_pay', 'expected_economic_value'),
            ('expected_apprentice_pay', 'expected_economic_value_for_agent_apprentice'),
        )
        # First pass: legacy -> current for both pairs.
        for legacy, current in pairs:
            if merged.get(legacy) is not None and merged.get(current) is None:
                merged[current] = merged.get(legacy)
        # Second pass: current -> legacy for both pairs.
        for legacy, current in pairs:
            if merged.get(current) is not None and merged.get(legacy) is None:
                merged[legacy] = merged.get(current)
        return merged
|
|
221
|
+
|
|
222
|
+
class TaskIntakeQualityReport(StrictModel):
    """Scores, flags and blockers recorded for one task's intake."""
    task_id: str
    instruction_clarity_score: float
    input_completeness_score: float
    output_contract_score: float
    rubricability_score: float
    verifiability_score: float
    artifactability_score: float
    privacy_risk_score: float
    license_risk_score: float
    ambiguity_score: float
    overall_intake_quality_score: float
    quality_flags: list[str] = Field(default_factory=list)
    blockers: list[str] = Field(default_factory=list)
    recommended_fix: str | None = None
    metadata_json: DictAny = Field(default_factory=dict)
|
|
226
|
+
|
|
227
|
+
class RubricItem(StrictModel):
    """A single weighted criterion of a rubric.

    ``observable_evidence`` and ``required_artifacts`` have no defaults here;
    ``RubricSpec`` additionally rejects items where either list is empty.
    """
    rubric_item_id: str
    criterion_name: str
    criterion_description: str
    weight: float
    score_min: float
    score_max: float
    pass_threshold: float
    observable_evidence: list[str]
    required_artifacts: list[str]
    scoring_method: Literal["llm_rubric_judge","deterministic","schema_match","regex","unit_test","hybrid","human_future"]
    worker_visible: bool
    verifier_only: bool
    hidden_reference_required: bool
    failure_modes: list[str] = Field(default_factory=list)
    partial_credit_rules: list[str] = Field(default_factory=list)
    edge_cases: list[str] = Field(default_factory=list)
    anti_cheat_notes: list[str] = Field(default_factory=list)
    metadata_json: DictAny = Field(default_factory=dict)
|
|
233
|
+
|
|
234
|
+
class RubricSpec(StrictModel):
    """A complete rubric for one task: items, thresholds and provenance.

    ``valid_rubric`` requires at least one item, item weights summing to
    ``1.0`` or ``100`` (within ``1e-6``), and non-empty evidence and artifact
    lists on every item.
    """
    rubric_id: str
    task_id: str
    task_family_id: str | None = None
    rubric_version: str
    rubric_items: list[RubricItem]
    total_weight: float
    pass_threshold: float
    worker_visible_rubric_ref: str
    verifier_private_rubric_ref: str
    hidden_reference_policy: str
    scoring_aggregation: Literal["weighted_sum","sum","all_required","custom"]
    required_artifacts: list[str]
    disqualifying_errors: list[str] = Field(default_factory=list)
    partial_credit_allowed: bool
    grader_kind: Literal["llm_rubric_judge","deterministic","hybrid","human_future"]
    rubric_generation_source: Literal["agent_assisted","family_template","task_specific_agent_draft","expert_override","deterministic_seed"]
    rubric_generation_agent_provider: str | None = None
    rubric_generation_agent_model: str | None = None
    rubric_generation_confidence: float | None = None
    metadata_json: DictAny = Field(default_factory=dict)

    @model_validator(mode="after")
    def valid_rubric(self):
        """Validate item presence, the weight total, and per-item evidence/artifacts.

        Raises:
            ValueError: If any of the three checks fails (checked in that order).
        """
        if not self.rubric_items:
            raise ValueError("rubric_items required")

        weight_total = sum(entry.weight for entry in self.rubric_items)
        sums_to_one = abs(weight_total - 1.0) < 1e-6
        sums_to_hundred = abs(weight_total - 100) < 1e-6
        if not (sums_to_one or sums_to_hundred):
            raise ValueError("weights must sum to 1.0 or 100")

        for entry in self.rubric_items:
            if not entry.observable_evidence or not entry.required_artifacts:
                raise ValueError("observable evidence and artifacts required")
        return self
|
|
248
|
+
|
|
249
|
+
class RubricQualityReport(StrictModel):
    """Structural quality summary for one rubric: counts, boolean checks, score and flags."""
    rubric_id: str
    task_id: str
    criteria_count: int
    total_weight: float
    weights_sum_valid: bool
    has_observable_evidence: bool
    has_required_artifacts: bool
    has_partial_credit_rules: bool
    has_disqualifying_errors: bool
    has_hidden_reference_policy: bool
    has_worker_visible_view: bool
    has_verifier_private_view: bool
    ambiguous_criteria_count: int
    unverifiable_criteria_count: int
    rubric_quality_score: float
    quality_flags: list[str] = Field(default_factory=list)
    blockers: list[str] = Field(default_factory=list)
    metadata_json: DictAny = Field(default_factory=dict)
|
|
251
|
+
|
|
252
|
+
class RubricItemScore(StrictModel):
    """Score awarded for one rubric item, with optional evidence and commentary."""
    rubric_item_id: str
    criterion_name: str
    score: float
    max_score: float
    passed: bool
    evidence_refs: list[str] = Field(default_factory=list)
    failure_mode: str | None = None
    notes: str | None = None
    confidence: float | None = None
    artifact_presence_ok: bool | None = None
    semantic_correctness_score: float | None = None
    reasoning_summary: str | None = None
    improvement_suggestion: str | None = None
|
|
254
|
+
class GraderResult(StrictModel):
    """Outcome of grading one attempt against a rubric.

    Holds the overall score and pass flag, the per-item scores, and optional
    component scores, model/provider provenance and verifier summary fields.
    """
    grader_result_id: str
    task_id: str
    attempt_id: str
    attempt_kind: str
    rubric_id: str
    grader_kind: Literal["llm_rubric_judge","llm","model","deterministic","hybrid","human_future"]
    score_source: Literal["llm","llm_semantic","model_judged","deterministic","deterministic_artifact_contract","deterministic_fallback","hybrid","human_future"]
    score: float
    max_score: float
    passed: bool
    rubric_item_scores: list[RubricItemScore]
    failed_criteria: list[str]
    passed_criteria: list[str]
    evidence_refs: list[str]
    confidence: float
    reasoning_summary: str | None = None
    limitations: list[str] = Field(default_factory=list)
    hidden_reference_used: bool
    hidden_reference_leaked: bool
    # Optional component / legacy scores.
    artifact_contract_score: float | None = None
    semantic_score: float | None = None
    model_score: float | None = None
    legacy_semantic_score: float | None = None
    legacy_score_source: str | None = None
    final_score: float | None = None
    # Optional provenance of the grading call.
    model: str | None = None
    provider: str | None = None
    deterministic_precheck_ref: str | None = None
    llm_prompt_ref_internal: str | None = None
    llm_response_ref_internal: str | None = None
    public_prompt_hash: str | None = None
    public_response_summary: str | None = None
    # Optional verifier summary.
    score_reliability: Literal["verified","unverified","needs_review","failed_verification"] | None = None
    verifier_status: str | None = None
    verifier_confidence: float | None = None
    verifier_issue_count: int | None = None
    verifier_issues_summary: str | None = None
    metadata_json: DictAny = Field(default_factory=dict)
|
|
256
|
+
class VerifierResult(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Fields without a default are required.
    verifier_result_id: str
    task_id: str
    attempt_id: str
    attempt_kind: str
    grader_result_id: str | None = None
    verification_status: Literal[
        "verified",
        "partially_verified",
        "failed",
        "not_run",
    ]
    # Required boolean check outcomes.
    artifact_contract_ok: bool
    evidence_grounding_ok: bool
    score_consistency_ok: bool
    hidden_reference_leaked: bool
    issues: list[str] = Field(default_factory=list)
    confidence: float
    # Optional details; None / empty list when not supplied.
    verifier_notes: str | None = None
    semantic_evidence_grounding_ok: bool | None = None
    unsupported_claims: list[str] = Field(default_factory=list)
    leakage_check_ok: bool | None = None
    model: str | None = None
    provider: str | None = None
    metadata_json: DictAny = Field(default_factory=dict)
|
|
258
|
+
class EvaluatorFeedback(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Fields without a default are required.
    feedback_id: str
    task_id: str
    attempt_id: str
    target_actor: Literal["worker", "reviser", "apprentice"]
    feedback_type: Literal[
        "criteria_failure",
        "artifact_missing",
        "format_error",
        "logic_error",
        "tool_error",
        "quality_gap",
        "strategy_gap",
        "safety_or_privacy",
        "other",
    ]
    failed_rubric_items: list[str]
    evidence_refs: list[str]
    artifact_refs: list[str]
    feedback_summary: str
    actionable_feedback: list[str]
    suggested_revision: str
    revision_priority: Literal["low", "medium", "high"]
    confidence: float
    hidden_reference_used: bool
    hidden_reference_leaked: bool
    # Optional extras; empty list / None when not supplied.
    failed_or_weak_rubric_items: list[str] = Field(default_factory=list)
    artifact_specific_comments: list[str] = Field(default_factory=list)
    trace_specific_comments: list[str] = Field(default_factory=list)
    revision_plan: str | None = None
    model: str | None = None
    provider: str | None = None
    metadata_json: DictAny = Field(default_factory=dict)
|
|
260
|
+
class RevisionPlan(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Fields without a default are required.
    revision_plan_id: str
    task_id: str
    source_attempt_id: str
    target_attempt_id: str
    revision_kind: Literal[
        "local_fix",
        "strategy_shift",
        "tool_change",
        "decomposition_change",
        "artifact_rebuild",
        "format_repair",
        "other",
    ]
    revision_reason: str
    failed_rubric_items: list[str]
    planned_changes: list[str]
    expected_score_improvement: float | None = None
    risk_of_regression: Literal["low", "medium", "high"]
    uses_evaluator_feedback: bool
    metadata_json: DictAny = Field(default_factory=dict)
|
|
262
|
+
class HillclimbResult(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Only metadata_json has a default.
    hillclimb_id: str
    task_id: str
    baseline_attempt_id: str
    revised_attempt_id: str
    # Scores and pass flags for the two attempts.
    baseline_score: float
    revised_score: float
    revision_score_delta: float
    baseline_passed: bool
    revised_passed: bool
    # Criteria lists before/after the revision.
    failed_criteria_before: list[str]
    failed_criteria_after: list[str]
    criteria_improved: list[str]
    criteria_regressed: list[str]
    # Artifact completeness before/after the revision.
    artifact_completeness_before: float
    artifact_completeness_after: float
    artifact_completeness_delta: float
    regression_count: int
    improvement_kind: Literal[
        "score_delta",
        "pass_delta",
        "criteria_delta",
        "artifact_delta",
        "none",
        "regression",
    ]
    hillclimb_evidence_strength: Literal[
        "observed_improvement",
        "no_observed_improvement",
        "regression_observed",
    ]
    revision_success: bool
    metadata_json: DictAny = Field(default_factory=dict)
|
|
264
|
+
class LessonPack(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Only metadata_json has a default.
    lesson_id: str
    task_id: str
    source_attempt_ids: list[str]
    lesson_summary: str
    strategy_lessons: list[str]
    common_failure_modes: list[str]
    rubric_reminders: list[str]
    artifact_requirements: list[str]
    verifier_feedback_summary: str
    hidden_reference_leaked: bool
    metadata_json: DictAny = Field(default_factory=dict)
|
|
266
|
+
class TrainingSignal(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Fields without a default are required.
    signal_id: str
    task_id: str
    signal_type: Literal[
        "rollout",
        "process_supervision",
        "verifier_training",
        "reward_modeling",
        "revision_preference",
        "lesson_transfer",
    ]
    source_attempt_ids: list[str]
    # Optional score information; None when not supplied.
    baseline_score: float | None = None
    revised_score: float | None = None
    score_delta: float | None = None
    # Criteria lists; each defaults to a fresh empty list.
    criteria_improved: list[str] = Field(default_factory=list)
    criteria_regressed: list[str] = Field(default_factory=list)
    failed_criteria_before: list[str] = Field(default_factory=list)
    failed_criteria_after: list[str] = Field(default_factory=list)
    # Optional references to related records.
    feedback_ref: str | None = None
    revision_plan_ref: str | None = None
    grader_result_refs: list[str] = Field(default_factory=list)
    verifier_result_refs: list[str] = Field(default_factory=list)
    trace_refs: list[str] = Field(default_factory=list)
    artifact_refs: list[str] = Field(default_factory=list)
    # Required provenance strings (free-form here, unlike GraderResult's Literals).
    score_source: str
    grader_kind: str
    confidence: float | None = None
    training_use_cases: list[str] = Field(default_factory=list)
    limitations: list[str] = Field(default_factory=list)
    metadata_json: DictAny = Field(default_factory=dict)
|
|
268
|
+
class ProcessSupervisionExample(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Fields without a default are required.
    example_id: str
    task_id: str
    attempt_id: str
    attempt_kind: str | None = None
    trace_id: str
    step: int
    actor: str
    action: str
    # Optional per-step details; all default to None.
    operation: str | None = None
    tool: str | None = None
    observation: str | None = None
    # NOTE: `input` shadows the builtin inside the class body, but it is a
    # schema field name and therefore cannot be renamed here.
    input: str | None = None
    output: str | None = None
    state_change: str | None = None
    reasoning: str | None = None
    caused_by: list[int] | None = None
    causal_type: str | None = None
    success: bool | None = None
    step_outcome: str | None = None
    step_quality_label: Literal["positive", "neutral", "negative", "unknown"]
    # Optional reward / feedback annotations; all default to None.
    local_reward: float | None = None
    failure_mode: str | None = None
    grader_feedback: str | None = None
    verifier_feedback: str | None = None
    evaluator_feedback: str | None = None
    revision_reason: str | None = None
    final_outcome_score: float | None = None
    label_source: Literal[
        "grader",
        "verifier",
        "evaluator",
        "heuristic",
        "human_future",
        "none",
    ]
    label: str = ""
    metadata_json: DictAny = Field(default_factory=dict)
|
|
270
|
+
class RewardModelingExample(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Fields without a default are required.
    example_id: str
    task_id: str
    attempt_id: str
    rubric_ref: str
    output_refs: list[str]
    attempt_summary: str
    rubric_item_scores: list[RubricItemScore]
    final_score: float
    passed: bool
    failure_modes: list[str]
    grader_notes: str | None = None
    evidence_refs: list[str]
    score_source: str
    grader_kind: str
    confidence: float
    # Optional verifier-derived annotations; all default to None.
    score_reliability: str | None = None
    verifier_status: str | None = None
    verifier_confidence: float | None = None
    verifier_issue_count: int | None = None
    verifier_issues_summary: str | None = None
    metadata_json: DictAny = Field(default_factory=dict)
|
|
272
|
+
class RevisionPreferencePair(StrictModel):
    # One field per line; declaration order is unchanged from the original
    # single-line definition. Only metadata_json has a default.
    pair_id: str
    task_id: str
    rubric_ref: str
    baseline_attempt_ref: str
    revised_attempt_ref: str
    chosen_attempt_id: str
    rejected_attempt_id: str
    baseline_score: float
    revised_score: float
    score_delta: float
    criteria_improved: list[str]
    criteria_regressed: list[str]
    preference_reason: str
    score_source: str
    grader_kind: str
    confidence: float
    metadata_json: DictAny = Field(default_factory=dict)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Literal
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
9
|
+
|
|
10
|
+
from .io import append_jsonl, read_jsonl
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Closed set of values accepted for SessionEvent.event_type.
SessionEventType = Literal[
    "task_instruction", "task_assets_added",
    "agent_attempt", "evaluation", "revision",
    "user_followup", "followup_assets_added",
    "session_finished",
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SessionEvent(BaseModel):
    # One row of the session event log. Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Required on every event.
    event_type: SessionEventType
    event_index: int
    created_at: str
    run_id: str

    # Optional identifiers and feedback descriptors (None when absent).
    task_id: str | None = None
    session_id: str | None = None
    attempt_id: str | None = None
    feedback_source: str | None = None
    feedback_type: str | None = None
    applies_to_attempt: str | None = None
    followup_index: int | None = None

    # Optional instruction text.
    instruction: str | None = None
    followup_instruction: str | None = None

    # Containers default to a fresh empty list / dict per instance.
    assets: list[str] = Field(default_factory=list)
    followup_assets: list[str] = Field(default_factory=list)
    metadata_json: dict[str, Any] = Field(default_factory=dict)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix."""
    moment = datetime.now(timezone.utc)
    iso_text = moment.isoformat()
    # isoformat() renders the UTC offset as "+00:00"; swap it for "Z".
    return iso_text.replace("+00:00", "Z")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def events_path(run_root: Path) -> Path:
    """Return the location of ``session_events.jsonl`` inside *run_root*."""
    return run_root.joinpath("session_events.jsonl")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def read_session_events(run_root: Path) -> list[dict[str, Any]]:
    """Load every row of the session event log under *run_root* via ``read_jsonl``."""
    log_file = events_path(run_root)
    return read_jsonl(log_file)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def next_event_index(run_root: Path) -> int:
    """Return the 1-based index for the next event: number of logged rows plus one."""
    logged = read_session_events(run_root)
    return 1 + len(logged)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def append_session_event(run_root: Path, **kwargs: Any) -> SessionEvent:
    """Build a ``SessionEvent`` from *kwargs* and append it to the run's event log.

    ``event_index`` and ``created_at`` are filled in here; every other
    ``SessionEvent`` field comes from *kwargs*. The run directory is created
    if it does not exist yet. Returns the event that was appended.
    """
    run_root.mkdir(parents=True, exist_ok=True)
    # Index first, timestamp second - same evaluation order as the constructor call.
    index = next_event_index(run_root)
    timestamp = utc_now()
    event = SessionEvent(event_index=index, created_at=timestamp, **kwargs)
    append_jsonl(events_path(run_root), event)
    return event
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def backfill_session_event_task_ids(run_root: Path, task_id: str | None) -> None:
    """Fill in ``task_id`` on logged events that do not have one yet.

    Does nothing when *task_id* is falsy, when the log has no rows, or when
    no row needs updating. Rows that are not JSON objects are written back
    unchanged.

    The log is rewritten through a temporary sibling file that is then moved
    over the original, so an interruption mid-write cannot leave a truncated
    event log behind.
    """
    if not task_id:
        return
    path = events_path(run_root)
    rows = read_jsonl(path)
    if not rows:
        return

    changed = False
    for row in rows:
        if isinstance(row, dict) and not row.get("task_id"):
            row["task_id"] = task_id
            changed = True
    if not changed:
        return

    tmp_path = path.with_name(path.name + ".tmp")
    try:
        # json.dumps escapes non-ASCII by default, so the bytes written are
        # the same regardless of the encoding chosen here.
        with tmp_path.open("w", encoding="utf-8") as f:
            f.writelines(json.dumps(row, sort_keys=True) + "\n" for row in rows)
        tmp_path.replace(path)
    except BaseException:
        # Do not leave a stray temp file behind if the rewrite failed.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def next_followup_index(run_root: Path) -> int:
    """Return one more than the highest ``followup_index`` among ``user_followup`` events.

    Returns 1 when no such event has been logged. A missing or falsy
    ``followup_index`` counts as 0. Rows that are not JSON objects are
    ignored, matching how ``backfill_session_event_task_ids`` treats them.
    """
    events = read_session_events(run_root)
    indices = [
        int(event.get("followup_index") or 0)
        for event in events
        if isinstance(event, dict) and event.get("event_type") == "user_followup"
    ]
    return max(indices, default=0) + 1
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
from .schemas import RawTaskRecord, TaskIntakeSpec, TaskIntakeQualityReport
|
|
6
|
+
from .config import get_settings
|
|
7
|
+
from .io import read_json
|
|
8
|
+
from .openai_structured import get_model_provider_status, run_structured_role
|
|
9
|
+
from .public_sanitizer import sanitize_public_obj, sha256_text
|
|
10
|
+
|
|
11
|
+
class LLMTaskIntakeOutput(BaseModel):
    # Structured output requested from the intake model: the spec plus its
    # quality report, both required.
    task_intake_spec: TaskIntakeSpec
    task_intake_quality_report: TaskIntakeQualityReport
|
|
14
|
+
|
|
15
|
+
# Dict keys removed by _drop_source_fields before a raw task is put in the intake prompt.
SOURCE_FIELD_KEYS = {
    'source_kind',
    'source_license',
    'source_ref',
    'source_url',
    'source_url_or_ref',
}
|
|
16
|
+
|
|
17
|
+
def _mentor_provider_can_attempt() -> bool:
    """Return True when the provider status reports a truthy ``provider_available``."""
    status = get_model_provider_status()
    available = status.get('provider_available')
    return bool(available)
|
|
19
|
+
|
|
20
|
+
def _mentor_provider_id() -> str:
    """Return the configured model provider, or ``'openai'`` when none is set."""
    configured = get_settings().model_provider
    return configured or 'openai'
|
|
23
|
+
|
|
24
|
+
def _drop_source_fields(obj):
    """Recursively copy *obj* without source-provenance keys.

    In dicts, entries whose key is in ``SOURCE_FIELD_KEYS`` or whose value is
    None are dropped. Lists are processed element by element (None elements
    are kept). Anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            if key in SOURCE_FIELD_KEYS or value is None:
                continue
            cleaned[key] = _drop_source_fields(value)
        return cleaned
    if isinstance(obj, list):
        return [_drop_source_fields(item) for item in obj]
    return obj
|
|
30
|
+
|
|
31
|
+
def _task_record_for_intake(raw: RawTaskRecord) -> dict:
    """Dump *raw* to JSON-compatible data, strip source fields, then sanitize it."""
    dumped = raw.model_dump(mode='json')
    without_sources = _drop_source_fields(dumped)
    return sanitize_public_obj(without_sources)
|
|
34
|
+
|
|
35
|
+
def direct_task_sheet_metadata(raw: RawTaskRecord) -> dict[str, object]:
    """Collect task-sheet metadata from *raw*, falling back to its raw payload.

    For each field the attribute on *raw* is preferred and the matching
    payload key is used when the attribute is falsy. ``needs_expert_review``
    is the exception: an explicit ``False`` on *raw* is kept, and the payload
    is consulted only when the attribute is None. ``expected_pay`` and
    ``expected_apprentice_pay`` repeat the two resolved economic-value
    entries. Keys are always present; unresolved values are None.
    """
    payload = raw.raw_payload or {}
    from_payload = payload.get

    economic_value = (
        raw.expected_economic_value
        or from_payload('expected_economic_value')
        or raw.expected_pay
        or from_payload('expected_pay')
    )
    apprentice_value = (
        raw.expected_economic_value_for_agent_apprentice
        or from_payload('expected_economic_value_for_agent_apprentice')
        or raw.expected_apprentice_pay
        or from_payload('expected_apprentice_pay')
    )

    if raw.needs_expert_review is not None:
        expert_review = raw.needs_expert_review
    else:
        expert_review = from_payload('needs_expert_review')

    metadata: dict[str, object] = {}
    metadata['domain'] = raw.normalized_domain or from_payload('normalized_domain') or from_payload('domain')
    metadata['subdomain'] = raw.normalized_subdomain or from_payload('normalized_subdomain') or from_payload('subdomain')
    metadata['apprenticeship_role'] = raw.apprenticeship_role or from_payload('apprenticeship_role')
    metadata['expected_economic_value'] = economic_value
    metadata['expected_economic_value_for_agent_apprentice'] = apprentice_value
    metadata['expected_pay'] = economic_value
    metadata['expected_apprentice_pay'] = apprentice_value
    metadata['task_family'] = raw.task_family or from_payload('task_family')
    metadata['difficulty_tier'] = raw.difficulty_tier or from_payload('difficulty_tier')
    metadata['needs_expert_review'] = expert_review
    return metadata
|
|
56
|
+
|
|
57
|
+
def apply_direct_task_sheet_metadata(spec: TaskIntakeSpec, raw: RawTaskRecord) -> TaskIntakeSpec:
    """Return a copy of *spec* with non-None task-sheet metadata from *raw* applied.

    The copy's ``metadata_json`` additionally records the applied values
    under ``direct_task_sheet_fields`` (without the two ``*_pay`` keys) and
    whether any were present under ``direct_task_sheet_fields_preserved``.
    """
    direct = {}
    for key, value in direct_task_sheet_metadata(raw).items():
        if value is not None:
            direct[key] = value

    # Spec fields that take the same-named direct value when one is available.
    copied_fields = (
        'domain',
        'subdomain',
        'apprenticeship_role',
        'expected_economic_value',
        'expected_economic_value_for_agent_apprentice',
        'expected_pay',
        'expected_apprentice_pay',
        'task_family',
        'difficulty_tier',
        'needs_expert_review',
    )
    updates = {name: direct[name] for name in copied_fields if name in direct}

    # Keys left out of the recorded direct_task_sheet_fields mapping.
    unrecorded_keys = {'expected_pay', 'expected_apprentice_pay'}
    md = dict(spec.metadata_json or {})
    md['direct_task_sheet_fields'] = {k: v for k, v in direct.items() if k not in unrecorded_keys}
    md['direct_task_sheet_fields_preserved'] = bool(direct)

    updates['metadata_json'] = md
    return spec.model_copy(update=updates)
|
|
67
|
+
|
|
68
|
+
def deterministic_intake(raw: RawTaskRecord) -> tuple[TaskIntakeSpec, TaskIntakeQualityReport]:
    """Build a task intake spec and quality report without calling a model.

    Values come from direct task-sheet metadata first, then from the raw
    payload, then from fixed defaults. The scores in both returned objects
    are constants. A missing (None) ``raw_payload`` is treated as an empty
    dict, the same way ``direct_task_sheet_metadata`` treats it.

    Returns:
        ``(spec, quality_report)`` for the task id derived from
        ``raw.raw_task_id`` by replacing ``raw_`` with ``task_``.
    """
    payload = raw.raw_payload or {}
    tid = raw.raw_task_id.replace('raw_', 'task_')
    direct = direct_task_sheet_metadata(raw)
    expected = (
        payload.get('expected_deliverable')
        or raw.expected_deliverable
        or payload.get('expected_agent_deliverable', 'Completed deliverables and audit notes')
    )
    spec = TaskIntakeSpec(
        task_id=tid,
        normalized_title=raw.raw_title,
        normalized_instruction=raw.raw_description,
        domain=direct.get('domain') or payload.get('domain', 'general'),
        subdomain=direct.get('subdomain') or payload.get('subdomain'),
        professional_role=payload.get('professional_role'),
        apprenticeship_role=direct.get('apprenticeship_role'),
        task_family=direct.get('task_family'),
        expected_economic_value=direct.get('expected_economic_value'),
        expected_economic_value_for_agent_apprentice=direct.get('expected_economic_value_for_agent_apprentice'),
        expected_pay=direct.get('expected_pay'),
        expected_apprentice_pay=direct.get('expected_apprentice_pay'),
        workflow_type=payload.get('workflow_type', 'analysis'),
        skill_targets=payload.get('skill_targets', ['analysis', 'artifact_generation']),
        difficulty_tier=direct.get('difficulty_tier') or payload.get('difficulty_tier', 'medium'),
        expected_human_deliverable=payload.get('expected_human_deliverable', expected),
        expected_agent_deliverable=expected,
        input_requirements=payload.get('input_requirements', []),
        output_requirements=payload.get('output_requirements', []),
        required_context=payload.get('required_context', []),
        assumptions=[],
        constraints=payload.get('constraints', []),
        allowed_tools=['python', 'file_read', 'file_write', 'bash'],
        disallowed_tools=['browser'],
        privacy_classification=payload.get('privacy_classification', 'unknown'),
        license=None,
        allowed_use='local apprenticeship data generation',
        rubricability_score=0.8,
        verifiability_score=0.8,
        artifactability_score=0.9,
        # bool(None) is False, so a missing value still yields False.
        needs_expert_review=bool(direct.get('needs_expert_review')),
        metadata_json={'runner': 'deterministic', 'intake_source': 'deterministic_fallback'},
    )
    spec = apply_direct_task_sheet_metadata(spec, raw)

    settings = get_settings()
    # Model intake was configured and a provider reported available, yet this
    # deterministic path produced the spec: record that in the metadata.
    if (
        settings.rubric_mode in {'hybrid', 'llm_default'}
        and _mentor_provider_can_attempt()
        and settings.llm_task_intake_enabled
    ):
        spec.metadata_json.update({
            'llm_task_intake_enabled': True,
            'llm_unavailable': True,
            'intake_source': 'deterministic_fallback',
            'provider': _mentor_provider_id(),
        })

    q = TaskIntakeQualityReport(
        task_id=tid,
        instruction_clarity_score=0.8,
        input_completeness_score=0.8,
        output_contract_score=0.9,
        rubricability_score=spec.rubricability_score,
        verifiability_score=spec.verifiability_score,
        artifactability_score=spec.artifactability_score,
        privacy_risk_score=0.1,
        license_risk_score=0.1,
        ambiguity_score=0.2,
        overall_intake_quality_score=0.82,
        quality_flags=[],
        blockers=[],
        recommended_fix=None,
        metadata_json={},
    )
    return spec, q
|
|
79
|
+
|
|
80
|
+
def _intake_prompt(raw: RawTaskRecord) -> str:
    """Return the intake prompt: fixed instructions followed by the sanitized task JSON."""
    instructions = """Return only valid JSON. Do not include markdown. Do not add extra top-level fields; place extras under metadata_json.extra_model_fields.
Required skeleton: {"task_intake_spec":{"task_id":"...","normalized_title":"...","normalized_instruction":"...","domain":"general","workflow_type":"artifact_generation","difficulty_tier":"medium","expected_human_deliverable":"...","expected_agent_deliverable":"...","input_requirements":[],"output_requirements":[],"required_context":[],"assumptions":[],"constraints":[],"allowed_tools":[],"disallowed_tools":[],"privacy_classification":"unknown","rubricability_score":0.7,"verifiability_score":0.7,"artifactability_score":0.7,"needs_expert_review":false,"metadata_json":{}},"task_intake_quality_report":{"task_id":"...","instruction_clarity_score":0.7,"input_completeness_score":0.7,"output_contract_score":0.7,"rubricability_score":0.7,"verifiability_score":0.7,"artifactability_score":0.7,"privacy_risk_score":0.2,"license_risk_score":0.2,"ambiguity_score":0.3,"overall_intake_quality_score":0.7,"quality_flags":[],"blockers":[],"metadata_json":{}}}.
Create a publishable task intake spec for reusable agent work experience. Populate title, domain, subdomain, agent apprenticeship role, expected_economic_value, and expected_economic_value_for_agent_apprentice when inferable from the task. Include task_family_guess, task_type, required_artifacts, hidden_reference_policy, risk_notes, subjectivity_level, evaluation_difficulty, and suggested_evaluator_type under metadata_json when not schema fields.
Raw task JSON:
"""
    # Keys are sorted so the serialized task does not depend on dict insertion order.
    task_json = json.dumps(_task_record_for_intake(raw), sort_keys=True)
    return instructions + task_json
|
|
86
|
+
|
|
87
|
+
def task_intake(raw: RawTaskRecord, role_root: Path | None=None) -> tuple[TaskIntakeSpec, TaskIntakeQualityReport]:
    """Produce a task intake spec and quality report, using a model when configured.

    When model intake is enabled, the rubric mode allows it and a provider is
    available, the ``intake_agent`` role is run and its parsed output is
    validated and returned with direct task-sheet metadata applied. Otherwise,
    or when the model path does not succeed, the result of
    ``deterministic_intake`` is returned.

    Args:
        raw: The raw task record to normalize.
        role_root: Directory holding per-role artifacts; defaults to
            ``outputs/roles``.

    Raises:
        RuntimeError: The model call did not succeed and the settings are
            fail-closed (``rubric_mode == 'llm_required'`` or
            ``llm_fail_closed``).
        Exception: Any error from the model path is re-raised unchanged under
            the same fail-closed settings.
    """
    settings = get_settings()
    role_root = role_root or Path('outputs/roles')
    agent_dir = role_root / 'intake_agent'

    if (
        settings.llm_task_intake_enabled
        and settings.rubric_mode in {'hybrid', 'llm_default', 'llm_required'}
        and _mentor_provider_can_attempt()
    ):
        prompt = _intake_prompt(raw)
        try:
            provider = _mentor_provider_id()
            model_override = settings.llm_task_intake_model if provider == 'openai' else None
            rr = run_structured_role(
                'intake_agent',
                prompt,
                LLMTaskIntakeOutput,
                agent_dir,
                allow_fallback=settings.allow_deterministic_eval_fallback,
                model_override=model_override,
                normalizer_context={
                    'task_id': raw.raw_task_id.replace('raw_', 'task_'),
                    'task_title': raw.raw_title,
                    'task_instruction': raw.raw_description,
                    'model': model_override or settings.model_provider_model,
                    'provider': provider,
                },
            )
            if rr.live_call_ok and rr.structured_output_validation_ok:
                parsed = read_json(agent_dir / 'parsed_output.json')
                spec = TaskIntakeSpec.model_validate(parsed['task_intake_spec'])
                spec = apply_direct_task_sheet_metadata(spec, raw)
                q = TaskIntakeQualityReport.model_validate(parsed['task_intake_quality_report'])
                spec.metadata_json.update({
                    'intake_source': 'llm',
                    'provider': rr.provider,
                    'model': rr.model,
                    'llm_prompt_ref_internal': str(agent_dir / 'prompt.md'),
                    'llm_response_ref_internal': str(agent_dir / 'raw_output.txt'),
                    'prompt_template_id': 'task_intake_agent_v0',
                    'prompt_template_version': '0.1',
                    'prompt_hash': sha256_text(prompt),
                    'public_response_summary': 'Model-assisted task intake generated structured task spec.',
                })
                q.metadata_json.update({
                    'intake_source': 'llm',
                    'role_result_ref_internal': str(agent_dir / 'role_result.json'),
                })
                return spec, q
            if settings.rubric_mode == 'llm_required' or settings.llm_fail_closed:
                raise RuntimeError(rr.error_message or 'Model task intake failed')
        except Exception:
            if settings.rubric_mode == 'llm_required' or settings.llm_fail_closed:
                raise
            # Best-effort mode: keep going with the deterministic fallback, but
            # leave a trace of why the model path was abandoned.
            import logging
            logging.getLogger(__name__).warning(
                'Model task intake failed; using deterministic fallback',
                exc_info=True,
            )

    spec, q = deterministic_intake(raw)
    if settings.llm_task_intake_enabled and _mentor_provider_can_attempt():
        spec.metadata_json.update({
            'intake_source': 'deterministic_fallback',
            'llm_unavailable': True,
            'provider': _mentor_provider_id(),
        })
    return spec, q
|