agent-apprenticeship 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +217 -0
  3. package/bin/agent-apprenticeship.js +131 -0
  4. package/package.json +30 -0
  5. package/pyproject.toml +23 -0
  6. package/src/agent_apprenticeship_trace/__init__.py +2 -0
  7. package/src/agent_apprenticeship_trace/actual_outputs_normalizer.py +240 -0
  8. package/src/agent_apprenticeship_trace/apprentice_adapters.py +348 -0
  9. package/src/agent_apprenticeship_trace/artifact_capture.py +23 -0
  10. package/src/agent_apprenticeship_trace/artifact_previews.py +80 -0
  11. package/src/agent_apprenticeship_trace/artifact_resolver.py +142 -0
  12. package/src/agent_apprenticeship_trace/batch_runner.py +116 -0
  13. package/src/agent_apprenticeship_trace/bundle_exporter.py +254 -0
  14. package/src/agent_apprenticeship_trace/certification.py +580 -0
  15. package/src/agent_apprenticeship_trace/cli.py +2979 -0
  16. package/src/agent_apprenticeship_trace/codex_runner.py +428 -0
  17. package/src/agent_apprenticeship_trace/command_discovery.py +94 -0
  18. package/src/agent_apprenticeship_trace/config.py +609 -0
  19. package/src/agent_apprenticeship_trace/contract_diagnostics.py +69 -0
  20. package/src/agent_apprenticeship_trace/env.py +46 -0
  21. package/src/agent_apprenticeship_trace/evaluator.py +64 -0
  22. package/src/agent_apprenticeship_trace/grader.py +194 -0
  23. package/src/agent_apprenticeship_trace/integration_status.py +193 -0
  24. package/src/agent_apprenticeship_trace/io.py +20 -0
  25. package/src/agent_apprenticeship_trace/learning.py +627 -0
  26. package/src/agent_apprenticeship_trace/lesson_extractor.py +5 -0
  27. package/src/agent_apprenticeship_trace/llm_output_normalizer.py +467 -0
  28. package/src/agent_apprenticeship_trace/loop.py +111 -0
  29. package/src/agent_apprenticeship_trace/mentor_checkpoints.py +354 -0
  30. package/src/agent_apprenticeship_trace/openai_structured.py +783 -0
  31. package/src/agent_apprenticeship_trace/package_exporter.py +303 -0
  32. package/src/agent_apprenticeship_trace/progress.py +223 -0
  33. package/src/agent_apprenticeship_trace/public_run.py +1109 -0
  34. package/src/agent_apprenticeship_trace/public_sanitizer.py +139 -0
  35. package/src/agent_apprenticeship_trace/recipes.py +129 -0
  36. package/src/agent_apprenticeship_trace/release_exporter.py +259 -0
  37. package/src/agent_apprenticeship_trace/revision.py +21 -0
  38. package/src/agent_apprenticeship_trace/role_runners.py +7 -0
  39. package/src/agent_apprenticeship_trace/rubric_generation.py +75 -0
  40. package/src/agent_apprenticeship_trace/schemas.py +273 -0
  41. package/src/agent_apprenticeship_trace/session_events.py +99 -0
  42. package/src/agent_apprenticeship_trace/task_intake.py +112 -0
  43. package/src/agent_apprenticeship_trace/trace_normalizer.py +669 -0
  44. package/src/agent_apprenticeship_trace/trace_prompt.py +51 -0
  45. package/src/agent_apprenticeship_trace/training_signals.py +30 -0
  46. package/src/agent_apprenticeship_trace/validation.py +210 -0
  47. package/src/agent_apprenticeship_trace/verifier.py +55 -0
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
# Worker-facing guidance describing how to record agent_trace.json: what a step
# is, which step fields to fill, where inputs and deliverables live, and the
# canonical step field names. build_worker_prompt interpolates this text
# verbatim into every prompt, so any edit here changes runtime behavior.
TRACE_SCHEMA_GUIDANCE = """You are the worker/reviser agent for an agent apprenticeship data-generation run.
Produce high-quality work artifacts for the task and preserve useful execution evidence while you work.
Write agent_trace.json as a structured workflow trace following the AgentTrace schema.
Each step should represent one meaningful action: reading context, using a tool, creating or editing an artifact, validating output, handling an error, or delivering the final result.
Capture exact commands, inputs, outputs, state changes, errors, retries, artifact references, and causality whenever available.
When a step uses or observes a tool, integration, capability, task asset, or artifact, record that context in the existing step fields:
- tool: name the actual tool, interface, integration, or environment used.
- operation: use read, write, execute, verify, or other to describe the step type.
- input: include the command, file path, URL, asset name, tool request, or user instruction used.
- output: include the relevant result, summary, error, validation evidence, or produced artifact.
- artifact_refs: list artifacts or task assets created, read, transformed, or validated by the step.
- state_change: describe files, records, UI state, environment state, or artifact state that changed.
- metadata_json: optionally include small structured step context when useful; do not add large metadata blocks.
For richer environments such as browser/search/screenshot/DOM/network actions, computer-use or UI automation, MCP/tool-call integrations, spreadsheets/workbooks, simulations, package/library/tool availability checks, external integrations, or specialized runtimes, capture the concrete interface and evidence naturally in tool, input, output, state_change, artifact_refs, and metadata_json.
If your worker framework uses multiple subagents, keep the same trace schema. Set step.actor to the subagent or role when known, set step.tool to the framework/tool involved, use metadata_json.subagent for small subagent context when useful, and attach subagent outputs through artifact_refs.
When reading task assets such as task_brief.md, task.json, attached files, screenshots, PDFs, spreadsheets, or other input files, record which asset was inspected, what was learned, relevant artifact/file refs, and any parsing or validation result in the normal step fields.
Use concise visible decision summaries in reasoning fields. Do not include private hidden chain-of-thought.
Record success, step_outcome, error_type, and error_message where applicable.
Do not grade your own work in the trace. Model-based grader, verifier, and evaluator roles will assess the attempt later.
Input files are available under ./input/.
Write final deliverables under ./artifacts/.
Write ./agent_trace.json and ./actual_outputs.json in the current attempt directory.
Do not write final outputs to the package root or any parent directory.

Use exactly these field names in every trace step:
step, turn, actor, action, operation, tool, observation, input, output, state_change, reasoning, caused_by, causal_type, causal_note, success, step_outcome, error_type, error_message, artifact_refs, metadata_json.
Do not use these noncanonical top-level step fields: step_number, command, inputs, outputs, state_changes, decision_summary, eval, eval_reason, directive.
Your raw trace will be preserved exactly. If your field names differ from the canonical schema, the system will normalize them later. Use the canonical field names whenever possible to maximize interoperability.
"""

# Secret-masking reminder. build_worker_prompt appends it under a
# "## Sensitive info masking" heading only when sensitive_info_masking == "standard".
MASKING_GUIDANCE = """Sensitive info masking is standard for this run. Do not expose API keys, private tokens, credentials, or secrets in traces, logs, or artifacts. Preserve useful operational context while masking super-sensitive values."""
34
+
35
+
36
def build_worker_prompt(
    task_instruction: str,
    rubric_md: str = "",
    attempt_kind: str = "baseline",
    input_files: list[str] | None = None,
    deliverables: list[str] | None = None,
    sensitive_info_masking: str = "standard",
    workspace_path: str | None = None,
) -> str:
    """Assemble the markdown prompt handed to the worker/reviser agent.

    Args:
        task_instruction: Text placed under the "## Task" heading.
        rubric_md: Text placed under the worker-visible rubric heading.
        attempt_kind: Label printed on the "Attempt:" line.
        input_files: Names listed as ``./input/<name>`` bullets; a placeholder
            bullet is used when none are given.
        deliverables: Names listed as ``./artifacts/<name>`` bullets; a generic
            instruction bullet is used when none are given.
        sensitive_info_masking: The masking section is included only when this
            equals ``"standard"``.
        workspace_path: When truthy, a "Current task workspace" line is added.

    Returns:
        The complete prompt text, ending with a newline.
    """
    input_section = "\n".join(f"- ./input/{name}" for name in (input_files or []))
    if not input_section:
        input_section = "- No explicit input files declared."

    deliverable_section = "\n".join(f"- ./artifacts/{name}" for name in (deliverables or []))
    if not deliverable_section:
        deliverable_section = "- Write required task deliverables under ./artifacts/."

    workspace_section = ""
    if workspace_path:
        workspace_section = f"\nCurrent task workspace: {workspace_path}\n"

    masking_section = ""
    if sensitive_info_masking == "standard":
        masking_section = f"\n\n## Sensitive info masking\n{MASKING_GUIDANCE}"

    # Segments are concatenated with no separator; every newline is explicit so
    # the resulting text is identical to a single formatted template.
    segments = [
        "# Agent Apprenticeship Workflow Task\n\n",
        f"Attempt: {attempt_kind}\n",
        f"{workspace_section}\n",
        f"{TRACE_SCHEMA_GUIDANCE}{masking_section}\n\n",
        f"## Input files\n{input_section}\n\n",
        "## Required output contract\n",
        "You must write these files in the current task workspace before finishing:\n",
        "- ./agent_trace.json\n",
        "- ./actual_outputs.json\n",
        "- ./artifacts/\n",
        f"{deliverable_section}\n\n",
        "Before finishing, verify ./agent_trace.json, ./actual_outputs.json, and ./artifacts/ exist. "
        "Do not only answer in chat/stdout. "
        "Do not say the task is complete unless those files exist.\n\n",
        f"## Task\n{task_instruction}\n\n",
        f"## Worker-visible evaluation rubric\n{rubric_md}\n\n",
        "Final instruction: write the required files in this workspace, verify they exist, then stop.\n",
    ]
    return "".join(segments)
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+ from .schemas import *
3
+ from .revision import preference_pair
4
+
5
def _current_evaluator_feedback_text(grader: GraderResult | None, evaluator: EvaluatorFeedback | None) -> str | None:
    """Return the evaluator's feedback summary, or a placeholder when it is stale.

    The summary is considered stale when the grader metadata reports
    ``artifact_contract_passed`` as ``True`` while the summary text still
    mentions ``artifact_contract_score 0.0``.

    Args:
        grader: Grader result whose ``metadata_json`` is consulted; may be None.
        evaluator: Evaluator feedback providing ``feedback_summary``; may be None.

    Returns:
        None when there is no evaluator, the summary unchanged when it is empty
        or not stale, otherwise a fixed "omitted" notice.
    """
    if not evaluator:
        return None
    text = evaluator.feedback_summary
    if not text:
        # A missing/empty summary cannot reference stale state; returning it
        # as-is also avoids a TypeError from the substring test on None.
        return text
    md = (grader.metadata_json if grader else {}) or {}
    if md.get('artifact_contract_passed') is True and 'artifact_contract_score 0.0' in text:
        return 'Evaluator feedback omitted because it referenced stale artifact-contract state.'
    return text
13
+
14
def process_supervision_from_trace(trace: AgentTrace, grader: GraderResult | None=None, verifier: VerifierResult | None=None, evaluator: EvaluatorFeedback | None=None) -> list[ProcessSupervisionExample]:
    """Build one process-supervision example per step of ``trace``.

    Each step gets a heuristic quality label: ``user_message`` actions are
    neutral (reward 0.0), ``failed``/``blocked`` outcomes are negative (-1.0),
    ``progress``/``completed``/``corrected`` outcomes are positive (1.0), and
    anything else is ``unknown`` with no reward. Grader, verifier and evaluator
    feedback (when supplied) is copied onto every example.

    Returns:
        The examples in trace-step order.
    """
    final_outcome = grader.score if grader else None
    evaluator_note = _current_evaluator_feedback_text(grader, evaluator)

    examples = []
    for step in trace.steps:
        # The action check takes precedence over the outcome checks.
        if step.action == 'user_message':
            quality, local_reward, label_origin = 'neutral', 0.0, 'none'
        elif step.step_outcome in ('failed', 'blocked'):
            quality, local_reward, label_origin = 'negative', -1.0, 'heuristic'
        elif step.step_outcome in ('progress', 'completed', 'corrected'):
            quality, local_reward, label_origin = 'positive', 1.0, 'heuristic'
        else:
            quality, local_reward, label_origin = 'unknown', None, 'none'

        raw_step = step.metadata_json.get('raw_step') if step.metadata_json else None
        examples.append(
            ProcessSupervisionExample(
                example_id=f'ps_{trace.trace_id}_{step.step}',
                task_id=trace.task_id,
                attempt_id=trace.attempt_id,
                attempt_kind=trace.attempt_kind,
                trace_id=trace.trace_id,
                step=step.step,
                actor=step.actor,
                action=step.action,
                operation=step.operation,
                tool=step.tool,
                observation=step.observation,
                input=step.input,
                output=step.output,
                state_change=step.state_change,
                reasoning=step.reasoning,
                caused_by=step.caused_by,
                causal_type=step.causal_type,
                success=step.success,
                step_outcome=step.step_outcome,
                step_quality_label=quality,
                local_reward=local_reward,
                failure_mode=step.error_type,
                grader_feedback=grader.reasoning_summary if grader else None,
                verifier_feedback=verifier.verifier_notes if verifier else None,
                evaluator_feedback=evaluator_note,
                revision_reason=None,
                final_outcome_score=final_outcome,
                label_source=label_origin,
                label=quality,
                metadata_json={'raw_step': raw_step},
            )
        )
    return examples
25
+
26
def reward_modeling(outputs: ActualOutputs, grader: GraderResult) -> RewardModelingExample:
    """Convert one graded attempt into a reward-modeling example.

    ``final_score`` falls back to ``grader.score`` when ``grader.final_score``
    is None, and the ``model_score`` metadata entry falls back to
    ``grader.semantic_score`` when ``grader.model_score`` is None.
    """
    resolved_final = grader.score if grader.final_score is None else grader.final_score
    resolved_model = grader.semantic_score if grader.model_score is None else grader.model_score

    score_breakdown = {
        'artifact_contract_score': grader.artifact_contract_score,
        'model_score': resolved_model,
        'legacy_semantic_score': grader.semantic_score,
        # Deliberately the raw value (possibly None), not the resolved fallback.
        'final_score': grader.final_score,
        'verifier_confidence': grader.verifier_confidence,
        'score_reliability': grader.score_reliability,
    }

    return RewardModelingExample(
        example_id=f'rm_{outputs.attempt_id}',
        task_id=outputs.task_id,
        attempt_id=outputs.attempt_id,
        rubric_ref='rubric/rubric.json',
        output_refs=outputs.deliverable_refs,
        attempt_summary=outputs.output_summary,
        rubric_item_scores=grader.rubric_item_scores,
        final_score=resolved_final,
        passed=grader.passed,
        failure_modes=grader.failed_criteria,
        grader_notes=grader.reasoning_summary,
        evidence_refs=grader.evidence_refs,
        score_source=grader.score_source,
        grader_kind=grader.grader_kind,
        confidence=grader.confidence,
        score_reliability=grader.score_reliability,
        verifier_status=grader.verifier_status,
        verifier_confidence=grader.verifier_confidence,
        verifier_issue_count=grader.verifier_issue_count,
        verifier_issues_summary=grader.verifier_issues_summary,
        metadata_json=score_breakdown,
    )
28
+
29
def training_signals(hill: HillclimbResult, traces: list[str], graders: list[str], verifiers: list[str]) -> list[TrainingSignal]:
    """Wrap a hill-climb result in a single ``rollout`` training signal.

    Args:
        hill: Baseline-vs-revised comparison supplying ids, scores and criteria.
        traces: References stored as ``trace_refs``.
        graders: References stored as ``grader_result_refs``.
        verifiers: References stored as ``verifier_result_refs``.

    Returns:
        A one-element list containing the rollout signal.
    """
    rollout_signal = TrainingSignal(
        signal_id=f'sig_{hill.task_id}_rollout',
        task_id=hill.task_id,
        signal_type='rollout',
        source_attempt_ids=[hill.baseline_attempt_id, hill.revised_attempt_id],
        baseline_score=hill.baseline_score,
        revised_score=hill.revised_score,
        score_delta=hill.revision_score_delta,
        criteria_improved=hill.criteria_improved,
        criteria_regressed=hill.criteria_regressed,
        failed_criteria_before=hill.failed_criteria_before,
        failed_criteria_after=hill.failed_criteria_after,
        feedback_ref='feedback/baseline_evaluator_feedback.json',
        revision_plan_ref='feedback/revision_plan.json',
        grader_result_refs=graders,
        verifier_result_refs=verifiers,
        trace_refs=traces,
        artifact_refs=[],
        score_source='deterministic',
        grader_kind='deterministic',
        confidence=0.8,
        training_use_cases=['rollout', 'reward_modeling', 'revision_preference'],
        limitations=['Artifact-contract labels are recorded as deterministic precheck evidence.'],
        metadata_json={},
    )
    return [rollout_signal]
@@ -0,0 +1,210 @@
1
+ from __future__ import annotations
2
+ import json
3
+ from pathlib import Path
4
+ from .schemas import AgentTrace
5
+ from .io import read_jsonl
6
+ from .env import contains_secret
7
+ from .public_sanitizer import has_prompt_leak
8
+ from .artifact_resolver import artifact_ref_resolves, is_artifact_evidence_ref, normalize_artifact_ref
9
# File names that validate_release expects directly under the release root;
# each one that does not exist marks the release invalid.
REQUIRED=['dataset_manifest.json','dataset_card.md','quality_report.json','full_task_records.jsonl','tasks.jsonl','task_intake_specs.jsonl','rubrics.jsonl','rubric_items.jsonl','raw_agent_traces.jsonl','agent_traces.jsonl','trace_normalization_reports.jsonl','actual_outputs_normalization_reports.jsonl','actual_outputs.jsonl','grader_results.jsonl','verifier_results.jsonl','hillclimb_results.jsonl','training_signals.jsonl','process_supervision.jsonl','reward_modeling.jsonl','revision_preference_pairs.jsonl','role_results_index.jsonl','artifacts_index.json','packages_index.jsonl','forsy_like_collections.jsonl']

# Module-private aliases for the artifact_resolver helpers; validate_release
# calls them through these names.
_normalize_artifact_evidence_ref = normalize_artifact_ref
_is_artifact_evidence_ref = is_artifact_evidence_ref
_artifact_ref_resolves = artifact_ref_resolves
14
+
15
def scan_tree_for_secrets(root: Path) -> bool:
    """Report whether the tree under ``root`` is free of detectable secrets.

    Every regular file smaller than 5,000,000 bytes is read as text (decoding
    errors ignored) and passed to ``contains_secret``. Larger files and
    non-files are skipped.

    Returns:
        False as soon as one file is flagged, True when nothing is flagged.
    """
    for candidate in root.rglob('*'):
        if not candidate.is_file():
            continue
        if candidate.stat().st_size >= 5_000_000:
            continue
        content = candidate.read_text(errors='ignore')
        if contains_secret(content):
            return False
    return True
20
+
21
+
22
def validate_release(root: Path) -> dict[str, object]:
    """Validate an exported release directory and return its quality counters.

    The function starts from a dict of default counters (``release_valid`` and
    ``public_release_valid`` begin as True) and flips flags to False as checks
    fail. It never raises for problems inside the main aggregation: any
    exception there only sets ``release_valid`` to False, leaving counters that
    were not yet computed at their defaults.

    Checks performed, in order:
      1. ``src/pydantic`` / ``src/typer`` must not exist under the current
         working directory (``dependency_shadow_ok``).
      2. Every name in ``REQUIRED`` must exist under ``root``.
      3. JSONL/JSON indexes are read and aggregated into counters (roles,
         grading, traces, normalization reports, artifacts, packages).
      4. Artifact-contract consistency between grader metadata, evidence refs
         and verifier results.
      5. Secret scan of ``root``; prompt-leak and secret scan of ``root/public``.
      6. ``scale_warnings`` / ``scale_blockers`` / ``scale_ready`` derivation.

    Args:
        root: Release directory to validate.

    Returns:
        The counters dict (ints, bools, lists and nested dicts).
    """
    counters={'release_valid': True,'public_release_valid': True, 'task_count':0,'trace_count':0,'trace_valid_count':0,'trace_invalid_count':0,'trace_missing_count':0,'incomplete_package_count':0,'raw_trace_count':0,'raw_trace_step_count':0,'normalized_trace_count':0,'normalized_trace_step_count':0,'fallback_trace_count':0,'fallback_trace_step_count':0,'discarded_step_count':0,'raw_trace_parse_error_count':0,'trace_normalization_error_count':0,'trace_normalization_partial_count':0,'trace_lossless_count':0,'trace_lossless_failure_count':0,'trace_step_count_total':0,'trace_steps_with_tool_count':0,'trace_steps_with_input_count':0,'trace_steps_with_output_count':0,'trace_steps_with_state_change_count':0,'trace_steps_with_reasoning_count':0,'artifact_count':0,'artifact_missing_count':0,'grader_result_count':0,'verifier_result_count':0,'hillclimb_result_count':0,'process_supervision_count':0,'reward_modeling_count':0,'revision_preference_pair_count':0,'secret_scan_ok':True,'llm_task_intake_count':0,'llm_rubric_generation_count':0,'llm_evaluator_result_count':0,'llm_grader_result_count':0,'llm_verifier_result_count':0,'deterministic_precheck_count':0,'semantic_score_count':0,'artifact_contract_score_count':0,'deterministic_fallback_count':0,'llm_unavailable_count':0,'score_source_counts':{},'dependency_shadow_ok':True,'actual_outputs_raw_count':0,'actual_outputs_normalized_count':0,'actual_outputs_schema_valid_count':0,'actual_outputs_fallback_count':0,'actual_outputs_inferred_artifact_count':0,'actual_outputs_discarded_field_count':0,'artifact_contract_consistency_ok':True,'artifact_contract_consistency_issues':[],'scale_ready':False,'scale_blockers':[],'fallback_only_task_count':0,'rich_trace_task_count':0,'workflow_trace_rich_count':0,'llm_role_completeness_ok':False,'semantic_grading_grounded_count':0,'semantic_grading_logs_only_count':0,'semantic_grading_unavailable_count':0,'model_task_intake_count':0,'model_rubric_generation_count':0,'model_evaluator_result_count':0,
        'model_grader_result_count':0,'model_verifier_result_count':0,'model_role_completeness_ok':False,'model_score_count':0,'model_grading_grounded_count':0,'model_grading_logs_only_count':0,'model_grading_unavailable_count':0,'verifier_verified_count':0,'verifier_failed_count':0,'model_score_verified_count':0,'model_score_needs_review_count':0,'score_reliability_counts':{},'scale_warnings':[],'operation_other_count':0,'operation_mapped_count':0}
    # The shadow check looks at the process working directory, not at `root`.
    repo_root=Path.cwd()
    counters['dependency_shadow_ok']=not (repo_root/'src/pydantic').exists() and not (repo_root/'src/typer').exists()
    if not counters['dependency_shadow_ok']:
        counters['release_valid']=False
    for f in REQUIRED:
        if not (root/f).exists(): counters['release_valid']=False
    try:
        tasks=read_jsonl(root/'full_task_records.jsonl'); counters['task_count']=len(tasks)
        intake_specs=read_jsonl(root/'task_intake_specs.jsonl'); rubrics_rows=read_jsonl(root/'rubrics.jsonl')
        counters['llm_task_intake_count']=sum(1 for s in intake_specs if (s.get('metadata_json') or {}).get('intake_source')=='llm')
        counters['llm_rubric_generation_count']=sum(1 for r in rubrics_rows if (r.get('metadata_json') or {}).get('rubric_source')=='llm')
        # NOTE(review): 'evaluator_feedback.jsonl' is read here but is not listed
        # in REQUIRED; how read_jsonl treats a missing file is not visible in
        # this module — confirm it does not abort the whole aggregation.
        graders=read_jsonl(root/'grader_results.jsonl'); verifiers=read_jsonl(root/'verifier_results.jsonl'); evaluators=read_jsonl(root/'evaluator_feedback.jsonl'); roles=read_jsonl(root/'role_results_index.jsonl')
        counters['grader_result_count']=len(graders); counters['verifier_result_count']=len(verifiers)
        counters['llm_grader_result_count']=sum(1 for g in graders if g.get('provider')=='openai' and g.get('semantic_score') is not None)
        counters['llm_verifier_result_count']=sum(1 for v in verifiers if v.get('provider')=='openai')
        counters['llm_evaluator_result_count']=sum(1 for e in evaluators if e.get('provider')=='openai')
        # Each llm_* count is the larger of the per-record count above and the
        # number of role-index rows for that role with a successful live call
        # and validated structured output.
        counters['llm_task_intake_count']=max(counters['llm_task_intake_count'], sum(1 for r in roles if r.get('role')=='intake_agent' and r.get('live_call_ok') and r.get('structured_output_validation_ok')))
        counters['llm_rubric_generation_count']=max(counters['llm_rubric_generation_count'], sum(1 for r in roles if r.get('role')=='rubric_agent' and r.get('live_call_ok') and r.get('structured_output_validation_ok')))
        counters['llm_evaluator_result_count']=max(counters['llm_evaluator_result_count'], sum(1 for r in roles if r.get('role')=='evaluator_agent' and r.get('live_call_ok') and r.get('structured_output_validation_ok')))
        counters['llm_grader_result_count']=max(counters['llm_grader_result_count'], sum(1 for r in roles if r.get('role')=='grader_agent' and r.get('live_call_ok') and r.get('structured_output_validation_ok')))
        counters['llm_verifier_result_count']=max(counters['llm_verifier_result_count'], sum(1 for r in roles if r.get('role')=='verifier_agent' and r.get('live_call_ok') and r.get('structured_output_validation_ok')))
        # deterministic_precheck_count and artifact_contract_score_count are
        # computed from the same predicate and therefore always equal.
        counters['deterministic_precheck_count']=sum(1 for g in graders if g.get('artifact_contract_score') is not None)
        counters['semantic_score_count']=sum(1 for g in graders if (g.get('model_score') is not None or g.get('semantic_score') is not None))
        counters['artifact_contract_score_count']=sum(1 for g in graders if g.get('artifact_contract_score') is not None)
        counters['deterministic_fallback_count']=sum(1 for g in graders if g.get('score_source')=='deterministic_fallback' or (g.get('metadata_json') or {}).get('deterministic_fallback')) + sum(1 for r in roles if r.get('fallback_used'))
        counters['llm_unavailable_count']=sum(1 for g in graders if (g.get('metadata_json') or {}).get('llm_unavailable')) + sum(1 for r in roles if r.get('provider')=='unavailable')
        counters['verifier_verified_count']=sum(1 for v in verifiers if v.get('verification_status')=='verified')
        counters['verifier_failed_count']=sum(1 for v in verifiers if v.get('verification_status')=='failed')
        counters['semantic_grading_grounded_count']=sum(1 for v in verifiers if v.get('verification_status')=='verified' and (v.get('semantic_evidence_grounding_ok') or v.get('evidence_grounding_ok')))
        # Histogram of grader score reliability; a grader with no value at
        # either location is counted as 'unverified'.
        rel_counts={}
        for g in graders:
            rel=(g.get('score_reliability') or (g.get('metadata_json') or {}).get('score_reliability') or 'unverified')
            rel_counts[rel]=rel_counts.get(rel,0)+1
        counters['score_reliability_counts']=rel_counts
        counters['model_score_verified_count']=rel_counts.get('verified',0)
        counters['model_score_needs_review_count']=sum(v for k,v in rel_counts.items() if k != 'verified')
        counters['semantic_grading_logs_only_count']=sum(1 for g in graders if ((g.get('metadata_json') or {}).get('model_grading_basis') or (g.get('metadata_json') or {}).get('semantic_grading_basis'))=='logs_only')
        counters['semantic_grading_unavailable_count']=sum(1 for g in graders if ((g.get('metadata_json') or {}).get('model_grading_basis') or (g.get('metadata_json') or {}).get('semantic_grading_basis'))=='missing_outputs')
        # Histogram of grader score_source values (a missing value is keyed as None).
        sc={};
        for g in graders: sc[g.get('score_source')]=sc.get(g.get('score_source'),0)+1
        counters['score_source_counts']=sc; counters['hillclimb_result_count']=len(read_jsonl(root/'hillclimb_results.jsonl')); counters['process_supervision_count']=len(read_jsonl(root/'process_supervision.jsonl')); counters['reward_modeling_count']=len(read_jsonl(root/'reward_modeling.jsonl')); counters['revision_preference_pair_count']=len(read_jsonl(root/'revision_preference_pairs.jsonl'))
        traces=read_jsonl(root/'agent_traces.jsonl'); counters['trace_count']=len(traces)
        for t in traces:
            # A trace that fails schema validation invalidates the release, but
            # its steps are still included in the per-field step statistics.
            try: AgentTrace.model_validate(t); counters['trace_valid_count']+=1
            except Exception: counters['trace_invalid_count']+=1; counters['release_valid']=False
            steps=t.get('steps') or []; counters['trace_step_count_total'] += len(steps)
            counters['trace_steps_with_tool_count'] += sum(1 for s in steps if s.get('tool'))
            counters['trace_steps_with_input_count'] += sum(1 for s in steps if s.get('input'))
            counters['trace_steps_with_output_count'] += sum(1 for s in steps if s.get('output'))
            counters['trace_steps_with_state_change_count'] += sum(1 for s in steps if s.get('state_change'))
            counters['trace_steps_with_reasoning_count'] += sum(1 for s in steps if s.get('reasoning'))
            counters['operation_other_count'] += sum(1 for s in steps if s.get('operation') == 'other')
            counters['operation_mapped_count'] += sum(1 for s in steps if s.get('operation') and s.get('operation') != 'other')
        reports=read_jsonl(root/'trace_normalization_reports.jsonl')
        ao_reports=read_jsonl(root/'actual_outputs_normalization_reports.jsonl')
        # For the next counters a report may carry either an explicit *_count
        # or a boolean flag; the flag contributes 1 when the count is absent/zero.
        counters['actual_outputs_raw_count']=sum(int(r.get('actual_outputs_raw_count') or 0) for r in ao_reports)
        counters['actual_outputs_normalized_count']=sum(int(r.get('actual_outputs_normalized_count') or (1 if r.get('actual_outputs_normalized') else 0)) for r in ao_reports)
        counters['actual_outputs_schema_valid_count']=sum(int(r.get('actual_outputs_schema_valid_count') or (1 if r.get('actual_outputs_schema_valid') else 0)) for r in ao_reports)
        counters['actual_outputs_fallback_count']=sum(int(r.get('actual_outputs_fallback_count') or (1 if r.get('actual_outputs_fallback') else 0)) for r in ao_reports)
        counters['actual_outputs_inferred_artifact_count']=sum(int(r.get('actual_outputs_inferred_artifact_count') or 0) for r in ao_reports)
        counters['actual_outputs_discarded_field_count']=sum(int(r.get('actual_outputs_discarded_field_count') or 0) for r in ao_reports)
        # Any discarded actual-outputs field invalidates the release.
        if counters['actual_outputs_discarded_field_count']:
            counters['release_valid']=False
        counters['raw_trace_count']=len(read_jsonl(root/'raw_agent_traces.jsonl'))
        counters['normalized_trace_count']=sum(1 for r in reports if r.get('trace_normalized') or r.get('normalized_trace_ref'))
        counters['fallback_trace_count']=sum(1 for r in reports if r.get('fallback_trace'))
        counters['fallback_trace_step_count']=sum(int(r.get('normalized_step_count') or 0) for r in reports if r.get('fallback_trace'))
        counters['discarded_step_count']=sum(int(r.get('discarded_step_count') or 0) for r in reports)
        counters['raw_trace_step_count']=sum(int(r.get('raw_step_count') or 0) for r in reports)
        counters['normalized_trace_step_count']=sum(int(r.get('normalized_step_count') or 0) for r in reports)
        counters['raw_trace_parse_error_count']=sum(1 for r in reports if r.get('raw_trace_parse_error'))
        counters['trace_normalization_error_count']=sum(1 for r in reports if r.get('trace_normalization_error'))
        counters['trace_normalization_partial_count']=sum(1 for r in reports if r.get('trace_normalization_partial'))
        counters['trace_lossless_count']=sum(1 for r in reports if r.get('trace_lossless'))
        # Reports without a 'trace_lossless' key count as lossless failures.
        counters['trace_lossless_failure_count']=sum(1 for r in reports if not r.get('trace_lossless'))
        # An artifact is missing when flagged as such, or when its
        # package_relative_path does not exist under packages/<task_id>/.
        arts=json.loads((root/'artifacts_index.json').read_text() or '[]'); counters['artifact_count']=len(arts); counters['artifact_missing_count']=sum(1 for a in arts if a.get('artifact_missing') or (a.get('package_relative_path') and not (root/'packages'/str(a.get('task_id',''))/a.get('package_relative_path')).exists()))
        actual_outputs=read_jsonl(root/'actual_outputs.jsonl')
        # Collect every reference considered "known": artifact index paths (bare
        # and prefixed with packages/<task_id>/) plus refs declared in actual outputs.
        existing_refs=set()
        for a in arts:
            tid=str(a.get('task_id') or '')
            rel=a.get('package_relative_path')
            if rel:
                existing_refs.add(str(rel)); existing_refs.add(f'packages/{tid}/{rel}')
        for ao in actual_outputs:
            for ref in (ao.get('deliverable_refs') or []) + (ao.get('artifact_refs') or []) + (ao.get('files_created') or []):
                if ref:
                    existing_refs.add(str(ref))
        # If several verifier rows share an attempt_id, the last one wins.
        verifiers_by_attempt={v.get('attempt_id'): v for v in verifiers}
        issues=[]
        for g in graders:
            md=g.get('metadata_json') or {}
            task_id=g.get('task_id'); attempt_kind=g.get('attempt_kind'); attempt_id=g.get('attempt_id')
            missing=[m for m in (md.get('missing_artifacts') or []) if m]
            # Mismatches marked 'acceptable_alias' or 'resolved' are not issues.
            mismatches=[m for m in (md.get('deliverable_mismatches') or []) if m and m.get('status') not in {'acceptable_alias','resolved'}]
            for m in missing:
                issues.append({'task_id':task_id,'attempt_kind':attempt_kind,'file':m,'field':'metadata_json.missing_artifacts','reason':'required artifact missing','evidence_ref':None})
            for m in mismatches:
                issues.append({'task_id':task_id,'attempt_kind':attempt_kind,'file':m.get('expected'),'field':'metadata_json.deliverable_mismatches','reason':'unresolved deliverable mismatch','evidence_ref':None})
            for ref in g.get('evidence_refs') or []:
                raw_ref=str(ref)
                normalized_ref=_normalize_artifact_evidence_ref(raw_ref)
                # Log/prompt/metadata refs are not artifact-contract evidence.
                # Artifact refs may be decorated as preview refs and should resolve directly or by package suffix.
                if _is_artifact_evidence_ref(raw_ref) and not _artifact_ref_resolves(normalized_ref, existing_refs):
                    issues.append({'task_id':task_id,'attempt_kind':attempt_kind,'file':None,'field':'evidence_refs','reason':'evidence_ref points to missing artifact','evidence_ref':raw_ref,'normalized_evidence_ref':normalized_ref})
            verifier=verifiers_by_attempt.get(attempt_id)
            if verifier and verifier.get('artifact_contract_ok') is True and missing:
                issues.append({'task_id':task_id,'attempt_kind':attempt_kind,'file':None,'field':'verifier.artifact_contract_ok','reason':'verifier artifact_contract_ok=true while required artifacts are missing','evidence_ref':None})
        counters['artifact_contract_consistency_issues']=issues
        counters['artifact_contract_consistency_ok']=len(issues)==0
        if issues:
            counters['release_valid']=False
        packages=read_jsonl(root/'packages_index.jsonl')
        counters['incomplete_package_count']=sum(1 for p in packages if p.get('export_ready') is False)
        # Expects two traces per package — presumably baseline and revised; TODO confirm.
        counters['trace_missing_count']=max(0, len(packages)*2 - counters['trace_count'])
        # Normalization must not lose steps.
        if counters['discarded_step_count'] > 0 or counters['normalized_trace_step_count'] < counters['raw_trace_step_count']:
            counters['release_valid']=False
        if counters['incomplete_package_count']: counters['release_valid']=False
        if counters['artifact_missing_count']: counters['release_valid']=False
        if len(packages)!=counters['task_count']: counters['release_valid']=False
        # These two are all-or-nothing at release level: either every task is
        # counted or none is; they are not computed per task.
        counters['fallback_only_task_count']=counters['task_count'] if counters['task_count'] and counters['trace_count'] and counters['fallback_trace_count'] == counters['trace_count'] else 0
        counters['rich_trace_task_count']=counters['task_count'] if counters['raw_trace_step_count'] > 0 and counters['fallback_only_task_count'] == 0 else 0
        counters['workflow_trace_rich_count']=sum(1 for r in reports if not r.get('fallback_trace') and int(r.get('normalized_step_count') or 0) > 2)
        # Mirror the llm_*/semantic_* counters under their model_* names. The two
        # model_score_*_count self-assignments leave the values set earlier unchanged.
        counters['model_task_intake_count']=counters['llm_task_intake_count']; counters['model_rubric_generation_count']=counters['llm_rubric_generation_count']; counters['model_evaluator_result_count']=counters['llm_evaluator_result_count']; counters['model_grader_result_count']=counters['llm_grader_result_count']; counters['model_verifier_result_count']=counters['llm_verifier_result_count']; counters['model_score_count']=counters['semantic_score_count']; counters['model_grading_grounded_count']=counters['semantic_grading_grounded_count']; counters['model_score_verified_count']=counters.get('model_score_verified_count',0); counters['model_score_needs_review_count']=counters.get('model_score_needs_review_count',0); counters['model_grading_logs_only_count']=counters['semantic_grading_logs_only_count']; counters['model_grading_unavailable_count']=counters['semantic_grading_unavailable_count']
        # Completeness needs at least one intake, rubric and evaluator result and
        # at least two grader and two verifier results.
        counters['llm_role_completeness_ok']=counters['llm_task_intake_count']>=1 and counters['llm_rubric_generation_count']>=1 and counters['llm_evaluator_result_count']>=1 and counters['llm_grader_result_count']>=2 and counters['llm_verifier_result_count']>=2; counters['model_role_completeness_ok']=counters['llm_role_completeness_ok']
    except Exception:
        # NOTE(review): the exception is discarded, so the returned counters do
        # not say which read or computation failed; counters not reached before
        # the failure keep their default values.
        counters['release_valid']=False
    counters['secret_scan_ok']=scan_tree_for_secrets(root)
    if not counters['secret_scan_ok']: counters['release_valid']=False
    public=root/'public'
    if public.exists():
        # The public_* keys below are only added when the public directory exists.
        counters['public_prompt_leak_ok']=not has_prompt_leak(public)
        counters['public_secret_scan_ok']=scan_tree_for_secrets(public)
        try:
            ptraces=read_jsonl(public/'agent_traces.jsonl')
            counters['public_trace_count']=len(ptraces)
            counters['public_system_prompt_redacted_count']=sum(1 for t in ptraces if t.get('system_prompt') is None and t.get('system_prompt_hash'))
            counters['public_prompt_metadata_count']=sum(1 for t in ptraces if (t.get('metadata_json') or {}).get('prompt_template_id') or (t.get('metadata_json') or {}).get('prompt_template_version') or (t.get('metadata_json') or {}).get('prompt_publication_status'))
        except Exception:
            counters['public_release_valid']=False
        if not counters.get('public_prompt_leak_ok') or not counters.get('public_secret_scan_ok'):
            counters['public_release_valid']=False
    else:
        # A release without a public directory is never publicly valid.
        counters['public_release_valid']=False
    # An invalid public release also invalidates the release as a whole.
    if not counters['public_release_valid']:
        counters['release_valid']=False
    if counters['trace_invalid_count'] or counters['trace_valid_count'] != counters['trace_count']: counters['release_valid']=False
    # Warnings are informational; only blockers affect scale_ready.
    warnings=[]
    if counters.get('model_score_needs_review_count', 0) > 0:
        warnings.append('Some model-judged scores failed verifier grounding/consistency checks.')
    if counters.get('operation_other_count', 0) > counters.get('operation_mapped_count', 0) and counters.get('workflow_trace_rich_count', 0) > 0:
        warnings.append('Many rich-trace operations are still mapped to other.')
    counters['scale_warnings']=warnings
    blockers=[]
    if not counters['release_valid']: blockers.append('release_valid=false')
    if not counters['public_release_valid']: blockers.append('public_release_valid=false')
    if counters['discarded_step_count'] != 0: blockers.append('discarded_step_count_nonzero')
    if counters['fallback_only_task_count'] != 0: blockers.append('fallback_only_task_count_nonzero')
    if counters['raw_trace_step_count'] <= 0: blockers.append('raw_trace_step_count_zero')
    if counters['normalized_trace_count'] < 1: blockers.append('normalized_trace_count_lt_1')
    if counters['process_supervision_count'] <= 4: blockers.append('process_supervision_count_lte_4')
    if counters['actual_outputs_schema_valid_count'] < 1: blockers.append('actual_outputs_schema_valid_count_lt_1')
    if not counters['artifact_contract_consistency_ok']: blockers.append('artifact_contract_consistency_not_ok')

    # Model-role blockers apply only when the release shows any sign of model
    # involvement (a 'model_judged' score source or any non-zero model_* count).
    requires_model_role_completeness = bool(
        counters.get('score_source_counts', {}).get('model_judged', 0)
        or counters.get('model_task_intake_count', 0)
        or counters.get('model_rubric_generation_count', 0)
        or counters.get('model_evaluator_result_count', 0)
        or counters.get('model_grader_result_count', 0)
        or counters.get('model_verifier_result_count', 0)
    )

    if requires_model_role_completeness and not counters['model_role_completeness_ok']:
        blockers.append('model_role_completeness_not_ok')
    if requires_model_role_completeness and counters['semantic_score_count'] < 2:
        blockers.append('model_score_count_lt_2')
    if requires_model_role_completeness and counters.get('model_grading_logs_only_count', 0) > 0 and counters.get('artifact_count', 0) > 0:
        blockers.append('model_grading_logs_only_with_artifacts')
    # Also fires when the public directory is absent, because the key is then unset.
    if not counters.get('public_prompt_leak_ok', False): blockers.append('public_prompt_leak')
    counters['scale_blockers']=blockers
    counters['scale_ready']=not blockers
    return counters
207
+
208
def format_counters(c: dict[str,object]) -> str:
    """Render counters as newline-separated ``key=value`` lines.

    Booleans are written lowercase (``true``/``false``); every other value is
    written with ``str()``. Keys keep the dict's iteration order.
    """
    rendered = []
    for key, value in c.items():
        text = str(value)
        if isinstance(value, bool):
            text = text.lower()
        rendered.append(f'{key}={text}')
    return '\n'.join(rendered)
@@ -0,0 +1,55 @@
1
+ from __future__ import annotations
2
+ import json
3
+ from pathlib import Path
4
+ from .schemas import VerifierResult, GraderResult, ActualOutputs, AgentTrace
5
+ from .config import get_settings
6
+ from .io import read_json
7
+ from .openai_structured import get_model_provider_status, run_structured_role
8
+ from .public_sanitizer import sha256_text
9
+ from .artifact_previews import build_artifact_previews
10
+
11
def _mentor_provider_can_attempt() -> bool:
    """Return True when the model provider status has a truthy ``provider_available`` entry."""
    status = get_model_provider_status()
    available = status.get('provider_available')
    return True if available else False
13
+
14
def _mentor_provider_id() -> str:
    """Return the configured model provider, or ``'openai'`` when none is set."""
    configured = get_settings().model_provider
    if configured:
        return configured
    return 'openai'
17
+
18
+ def _model_verifier_active(settings) -> bool:
19
+ return settings.mentor_mode in {'model_assisted', 'hybrid'} or settings.evaluation_mode in {'hybrid', 'llm_required'}
20
+
21
def deterministic_verify(grader: GraderResult, outputs: ActualOutputs) -> VerifierResult:
    """Build a VerifierResult from rule-based checks only, without a model call.

    Three checks decide the outcome:

    * artifact contract: the grader has evidence refs, its metadata lists no
      ``missing_artifacts``, and ``outputs.status`` is ``'success'``;
    * score consistency: ``0 <= grader.score <= grader.max_score``;
    * leakage: ``grader.hidden_reference_leaked`` is falsy.

    The result is ``'verified'`` only when all three pass. ``issues`` names
    every failing check, so a leak-only failure is no longer reported as a
    score inconsistency and simultaneous failures are all listed.

    Args:
        grader: Grader result whose evidence refs, score and leak flag are checked.
        outputs: Actual outputs of the attempt; supplies ids, kind and status.

    Returns:
        A VerifierResult with ``model`` and ``provider`` set to None and
        ``semantic_evidence_grounding_ok`` left as None.
    """
    has_evidence = bool(grader.evidence_refs)
    artifact_ok = (
        has_evidence
        and not grader.metadata_json.get('missing_artifacts')
        and outputs.status == 'success'
    )
    score_ok = 0 <= grader.score <= grader.max_score
    leaked = grader.hidden_reference_leaked
    ok = artifact_ok and score_ok and not leaked

    # Report each failed check on its own so the message matches the actual cause.
    issues = []
    if not artifact_ok:
        issues.append('missing deliverable evidence')
    if not score_ok:
        issues.append('score inconsistency')
    if leaked:
        issues.append('hidden reference leaked')

    return VerifierResult(
        verifier_result_id=f'verifier_{outputs.attempt_id}',
        task_id=outputs.task_id,
        attempt_id=outputs.attempt_id,
        attempt_kind=outputs.attempt_kind,
        grader_result_id=grader.grader_result_id,
        verification_status='verified' if ok else 'failed',
        artifact_contract_ok=artifact_ok,
        evidence_grounding_ok=has_evidence,
        score_consistency_ok=score_ok,
        hidden_reference_leaked=leaked,
        issues=issues,
        confidence=0.8,
        verifier_notes='Verifier checked artifact-contract evidence refs and score consistency; model grounding requires model verifier when available.',
        semantic_evidence_grounding_ok=None,
        unsupported_claims=[],
        leakage_check_ok=not leaked,
        model=None,
        provider=None,
        metadata_json={'score_source': grader.score_source},
    )
26
+
27
+ def _verifier_prompt(grader: GraderResult, outputs: ActualOutputs, trace: AgentTrace | None, artifact_preview_bundle: dict | None=None) -> str:  # Build the model-verifier prompt: fixed instructions followed by JSON dumps of the grader result, actual outputs, artifact previews and trace.
28
+ return """Return only valid JSON matching VerifierResult. Do not include markdown.
29
+ Check whether the grader evidence is grounded in actual_outputs/artifacts/trace and especially the artifact content previews, whether score consistency is internally supported, whether unsupported claims exist, and whether hidden-reference leakage is absent. Return exactly the VerifierResult schema fields. Do not add top-level fields not in schema. If you want to report extra checks such as safe_to_publish, reproducible, within_time_limit, uses_only_allowed_apis, passed_checks, or within_memory_limit, put them inside metadata_json.extra_model_fields.
30
+ GraderResult JSON:
31
+ """ + json.dumps(grader.model_dump(), sort_keys=True) + "\nActualOutputs JSON:\n" + json.dumps(outputs.model_dump(), sort_keys=True) + "\nArtifact content previews JSON:\n" + json.dumps(artifact_preview_bundle or {}, sort_keys=True)[:16000] + "\nTrace summary JSON:\n" + json.dumps((trace.model_dump() if trace else {}), sort_keys=True)[:12000]  # grader and outputs JSON are included in full; preview JSON is cut to 16000 characters and trace JSON to 12000; a missing bundle or trace serializes as {}
32
+
33
+ def verify_attempt(grader: GraderResult, outputs: ActualOutputs, trace: AgentTrace | None=None, role_root: Path | None=None, package_root: Path | None=None) -> VerifierResult:  # Verify a graded attempt: compute the deterministic result first, then try the model verifier when it is enabled; returns either the model result or the deterministic one.
34
+ settings=get_settings(); pre=deterministic_verify(grader, outputs); role_root=role_root or Path('outputs/roles')  # role_root falls back to outputs/roles when not supplied
35
+ artifact_preview_bundle=build_artifact_previews(package_root, grader.evidence_refs or outputs.deliverable_refs or outputs.artifact_refs or outputs.files_created)  # previews use the first non-empty of evidence_refs, deliverable_refs, artifact_refs, files_created
36
+ if _model_verifier_active(settings) and _mentor_provider_can_attempt() and settings.llm_verifier_enabled:  # model path needs an active verifier mode, an available provider and llm_verifier_enabled
37
+ prompt=_verifier_prompt(grader, outputs, trace, artifact_preview_bundle)
38
+ try:
39
+ provider=_mentor_provider_id()
40
+ model_override=settings.llm_verifier_model if provider == 'openai' else None  # the verifier-specific model is applied only for the 'openai' provider
41
+ rr=run_structured_role('verifier_agent', prompt, VerifierResult, role_root/'verifier_agent'/outputs.attempt_kind, allow_fallback=settings.allow_deterministic_eval_fallback, model_override=model_override, normalizer_context={'task_id': outputs.task_id, 'attempt_id': outputs.attempt_id, 'attempt_kind': outputs.attempt_kind, 'grader_result_id': grader.grader_result_id, 'artifact_contract_score': grader.artifact_contract_score, 'evidence_refs': grader.evidence_refs, 'artifact_content_refs': artifact_preview_bundle.get('artifact_content_refs'), 'artifact_content_previews': artifact_preview_bundle.get('artifact_content_previews'), 'artifact_content_hashes': artifact_preview_bundle.get('artifact_content_hashes'), 'artifact_content_preview_truncated': artifact_preview_bundle.get('artifact_content_preview_truncated'), 'model_grading_basis': artifact_preview_bundle.get('model_grading_basis'), 'model': model_override or settings.model_provider_model, 'provider': provider})
42
+ if rr.live_call_ok and rr.structured_output_validation_ok:  # accept the model result only when the live call succeeded and the structured output validated
43
+ parsed=read_json(role_root/'verifier_agent'/outputs.attempt_kind/'parsed_output.json')  # the parsed output is re-read from the role directory passed to run_structured_role
44
+ v=VerifierResult.model_validate(parsed)
45
+ v.provider=rr.provider; v.model=rr.model; v.semantic_evidence_grounding_ok=v.semantic_evidence_grounding_ok if v.semantic_evidence_grounding_ok is not None else v.evidence_grounding_ok; v.leakage_check_ok=v.leakage_check_ok if v.leakage_check_ok is not None else not v.hidden_reference_leaked  # stamp provider/model from the role run; fill the two optional flags from their counterparts when they are None
46
+ v.metadata_json.update({'llm_prompt_ref_internal':str(role_root/'verifier_agent'/outputs.attempt_kind/'prompt.md'),'llm_response_ref_internal':str(role_root/'verifier_agent'/outputs.attempt_kind/'raw_output.txt'),'prompt_hash':sha256_text(prompt),'public_response_summary':'Model verifier checked grounding and score consistency.', **artifact_preview_bundle})  # attach prompt/response paths, the prompt hash and the preview bundle to metadata_json
47
+ return v
48
+ if settings.evaluation_mode == 'llm_required' or settings.llm_fail_closed:  # unusable model result: raise when evaluation is llm_required or llm_fail_closed is set
49
+ raise RuntimeError(rr.error_message or 'Model verifier failed')
50
+ except Exception:  # NOTE(review): also catches the RuntimeError raised just above; it is re-raised only under the same fail-closed condition, otherwise the error is dropped without logging
51
+ if settings.evaluation_mode == 'llm_required' or settings.llm_fail_closed:
52
+ raise
53
+ if _mentor_provider_can_attempt() and settings.llm_verifier_enabled and settings.evaluation_mode != 'deterministic_only':  # NOTE(review): indentation is lost in this diff rendering, so the nesting of this check cannot be confirmed from here
54
+ pre.metadata_json.update({'model_verifier_enabled': True, 'llm_verifier_enabled': True, 'llm_unavailable': True, 'deterministic_fallback': True, **artifact_preview_bundle})  # flags the deterministic result as a fallback and merges the preview bundle into its metadata
55
+ return pre  # deterministic result