mednotes-opencode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.opencode/agents/med-chat-triager.md +204 -0
- package/.opencode/agents/med-flashcard-maker.md +63 -0
- package/.opencode/agents/med-knowledge-architect.md +230 -0
- package/.opencode/agents/med-link-graph-curator.md +177 -0
- package/.opencode/agents/med-publish-guard.md +62 -0
- package/.opencode/commands/flashcards.md +25 -0
- package/.opencode/commands/mednotes/create.md +25 -0
- package/.opencode/commands/mednotes/enrich.md +27 -0
- package/.opencode/commands/mednotes/fix-wiki.md +27 -0
- package/.opencode/commands/mednotes/history.md +22 -0
- package/.opencode/commands/mednotes/link-body.md +25 -0
- package/.opencode/commands/mednotes/link-related.md +27 -0
- package/.opencode/commands/mednotes/link.md +27 -0
- package/.opencode/commands/mednotes/pdf-library.md +27 -0
- package/.opencode/commands/mednotes/process-chats.md +23 -0
- package/.opencode/commands/mednotes/setup.md +21 -0
- package/.opencode/commands/mednotes/status.md +27 -0
- package/.opencode/commands/mednotes/telemetry.md +27 -0
- package/.opencode/commands/report.md +26 -0
- package/.opencode/mednotes/AGENTS.md +57 -0
- package/.opencode/mednotes/agents/med-chat-triager.md +197 -0
- package/.opencode/mednotes/agents/med-flashcard-maker.md +56 -0
- package/.opencode/mednotes/agents/med-knowledge-architect.md +224 -0
- package/.opencode/mednotes/agents/med-link-graph-curator.md +171 -0
- package/.opencode/mednotes/agents/med-publish-guard.md +55 -0
- package/.opencode/mednotes/contracts/.gitkeep +1 -0
- package/.opencode/mednotes/contracts/agents.json +116 -0
- package/.opencode/mednotes/contracts/opencode-plugin.json +70 -0
- package/.opencode/mednotes/docs/agent-prompt-hardening.md +567 -0
- package/.opencode/mednotes/docs/agent-role-contracts.md +94 -0
- package/.opencode/mednotes/docs/anki-mcp-twenty-rules.md +214 -0
- package/.opencode/mednotes/docs/anki-templates/README.md +39 -0
- package/.opencode/mednotes/docs/anki-templates/cloze.back.html +23 -0
- package/.opencode/mednotes/docs/anki-templates/cloze.front.html +14 -0
- package/.opencode/mednotes/docs/anki-templates/qa.back.html +24 -0
- package/.opencode/mednotes/docs/anki-templates/qa.front.html +14 -0
- package/.opencode/mednotes/docs/anki-templates/style.css +182 -0
- package/.opencode/mednotes/docs/atomicity-splitting-policy.md +113 -0
- package/.opencode/mednotes/docs/extension-docs.md +40 -0
- package/.opencode/mednotes/docs/flashcard-ingestion.md +278 -0
- package/.opencode/mednotes/docs/knowledge-architect.md +208 -0
- package/.opencode/mednotes/docs/merge-policy.md +110 -0
- package/.opencode/mednotes/docs/public-vocabulary.md +104 -0
- package/.opencode/mednotes/docs/semantic-linker.md +141 -0
- package/.opencode/mednotes/docs/taxonomy-policy.md +90 -0
- package/.opencode/mednotes/docs/triage-policy.md +187 -0
- package/.opencode/mednotes/docs/vault-version-control.md +758 -0
- package/.opencode/mednotes/docs/vocabulary-db-recovery.md +58 -0
- package/.opencode/mednotes/docs/workflow-output-contract.md +779 -0
- package/.opencode/mednotes/hooks/hooks.json +79 -0
- package/.opencode/mednotes/package-lock.json +6361 -0
- package/.opencode/mednotes/package.json +15 -0
- package/.opencode/mednotes/pyproject.toml +48 -0
- package/.opencode/mednotes/scripts/bootstrap_windows_python_uv.cmd +13 -0
- package/.opencode/mednotes/scripts/bootstrap_windows_python_uv.ps1 +172 -0
- package/.opencode/mednotes/scripts/enrich_notes.py +23 -0
- package/.opencode/mednotes/scripts/full_reset_windows_python_uv.cmd +13 -0
- package/.opencode/mednotes/scripts/hooks/antigravity_hook_status.mjs +212 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/adapters/antigravity.mjs +169 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/adapters/harness_payload.mjs +103 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/adapters/opencode_plugin.mjs +341 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/adapters/opencode_user_config_sync.mjs +177 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/anki_preflight.mjs +214 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/cli.mjs +143 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/diagnostics.mjs +11 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/domain/agent_directive_core.mjs +160 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/fsm_directive.mjs +1470 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/hook_errors.mjs +120 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/retention.mjs +114 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/runtime.mjs +174 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/telemetry_capture.mjs +511 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook/vault_guard.mjs +624 -0
- package/.opencode/mednotes/scripts/hooks/mednotes_hook.mjs +5 -0
- package/.opencode/mednotes/scripts/mednotes/_runtime_paths.py +24 -0
- package/.opencode/mednotes/scripts/mednotes/anki_model_validator.py +18 -0
- package/.opencode/mednotes/scripts/mednotes/capture_extension_diff.py +1562 -0
- package/.opencode/mednotes/scripts/mednotes/feedback_report.py +16 -0
- package/.opencode/mednotes/scripts/mednotes/flashcard_index.py +18 -0
- package/.opencode/mednotes/scripts/mednotes/flashcard_pipeline.py +18 -0
- package/.opencode/mednotes/scripts/mednotes/flashcard_report.py +18 -0
- package/.opencode/mednotes/scripts/mednotes/flashcard_sources.py +18 -0
- package/.opencode/mednotes/scripts/mednotes/obsidian/README.md +6 -0
- package/.opencode/mednotes/scripts/mednotes/obsidian_note_utils.py +20 -0
- package/.opencode/mednotes/scripts/mednotes/pdf_library/cli.py +16 -0
- package/.opencode/mednotes/scripts/mednotes/project_fsm.py +229 -0
- package/.opencode/mednotes/scripts/mednotes/setup_telemetry_email.py +404 -0
- package/.opencode/mednotes/scripts/mednotes/sync_anki_twenty_rules.py +18 -0
- package/.opencode/mednotes/scripts/mednotes/sync_opencode_user_config.py +36 -0
- package/.opencode/mednotes/scripts/mednotes/wiki/cli.py +20 -0
- package/.opencode/mednotes/scripts/mednotes/wiki_graph.py +18 -0
- package/.opencode/mednotes/scripts/mednotes/wiki_tree.py +134 -0
- package/.opencode/mednotes/scripts/reset_windows_python_uv.ps1 +625 -0
- package/.opencode/mednotes/scripts/run_python.mjs +109 -0
- package/.opencode/mednotes/scripts/vault/vault_commit.ps1 +19 -0
- package/.opencode/mednotes/scripts/vault/vault_commit.sh +18 -0
- package/.opencode/mednotes/scripts/vault/vault_git.ps1 +19 -0
- package/.opencode/mednotes/scripts/vault/vault_git.py +3107 -0
- package/.opencode/mednotes/scripts/vault/vault_git.sh +18 -0
- package/.opencode/mednotes/scripts/vault/vault_precommit.ps1 +19 -0
- package/.opencode/mednotes/scripts/vault/vault_precommit.sh +18 -0
- package/.opencode/mednotes/skills/THIRD_PARTY_NOTICES.md +45 -0
- package/.opencode/mednotes/skills/create-medical-flashcards/SKILL.md +113 -0
- package/.opencode/mednotes/skills/create-medical-note/SKILL.md +90 -0
- package/.opencode/mednotes/skills/enrich-medical-note/SKILL.md +120 -0
- package/.opencode/mednotes/skills/fix-medical-wiki/SKILL.md +559 -0
- package/.opencode/mednotes/skills/link-medical-wiki/SKILL.md +224 -0
- package/.opencode/mednotes/skills/obsidian-cli/SKILL.md +118 -0
- package/.opencode/mednotes/skills/obsidian-markdown/SKILL.md +207 -0
- package/.opencode/mednotes/skills/obsidian-markdown/references/CALLOUTS.md +58 -0
- package/.opencode/mednotes/skills/obsidian-markdown/references/EMBEDS.md +63 -0
- package/.opencode/mednotes/skills/obsidian-markdown/references/PROPERTIES.md +61 -0
- package/.opencode/mednotes/skills/obsidian-ops/SKILL.md +136 -0
- package/.opencode/mednotes/skills/pdf-library/SKILL.md +45 -0
- package/.opencode/mednotes/skills/process-medical-chats/SKILL.md +246 -0
- package/.opencode/mednotes/skills/workflow-report/SKILL.md +100 -0
- package/.opencode/mednotes/src/mednotes/__init__.py +5 -0
- package/.opencode/mednotes/src/mednotes/domains/__init__.py +5 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/README.md +26 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/__init__.py +2 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/build_demo_apkg.py +177 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/contracts.py +385 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/flashcards_machine.py +522 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/fsm.py +817 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/index.py +630 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/install_models.py +445 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/model.py +359 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/obsidian_links.py +135 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/obsidian_note_utils.py +546 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/pipeline.py +580 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/report.py +510 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/sources.py +682 -0
- package/.opencode/mednotes/src/mednotes/domains/flashcards/sync_rules.py +184 -0
- package/.opencode/mednotes/src/mednotes/domains/history/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/history/history_fsm.py +852 -0
- package/.opencode/mednotes/src/mednotes/domains/history/history_machine.py +453 -0
- package/.opencode/mednotes/src/mednotes/domains/setup/__init__.py +7 -0
- package/.opencode/mednotes/src/mednotes/domains/setup/setup_fsm.py +808 -0
- package/.opencode/mednotes/src/mednotes/domains/setup/setup_machine.py +973 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/README.md +64 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/api.py +668 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/batch_state.py +102 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/atomicity/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/atomicity/atomicity.py +877 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/body_link/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/body_link/body_linker.py +1562 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/effects/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/effects/effect_adapters.py +949 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/effects/fix_wiki_runtime_adapters.py +433 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/graph/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/graph/coverage.py +413 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/graph/graph.py +396 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/graph/graph_fixes.py +161 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/hygiene/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/hygiene/hygiene.py +483 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/__init__.py +2 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/anchors.py +185 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/core/__init__.py +0 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/core/cache.py +223 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/core/config.py +131 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/core/download.py +224 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/core/frontmatter.py +59 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/core/insert.py +227 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/core/local_import.py +54 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/sources/__init__.py +42 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/sources/web_profiles.py +99 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/sources/web_search.py +203 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/illustrate/sources/wikimedia.py +102 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/markdown/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/markdown/markdown_db_adapter.mjs +434 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/markdown/markdown_node_runtime.py +274 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/markdown/markdown_query.py +227 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/artifacts.py +605 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/canonical_merge.py +277 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/markdown_zones.py +85 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/meaning_planner.py +307 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_iter.py +67 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_merge.py +278 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_plan.py +409 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_policy.py +22 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_style/__init__.py +79 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_style/fixes.py +264 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_style/frontmatter.py +435 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_style/models.py +208 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_style/prompts.py +37 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_style/tables.py +236 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/note_style/validate.py +404 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/provenance.py +478 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/raw_chats.py +273 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/notes/sources_backfill.py +235 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/__init__.py +10 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/anchors.py +16 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/captions.py +47 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/cli.py +179 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/cloud.py +52 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/config.py +196 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/context_packets.py +76 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/db.py +81 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/doctor.py +102 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/figure_ids.py +42 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/ingest.py +326 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/insert.py +316 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/mentions.py +57 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/ocr.py +71 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/paths.py +35 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/pdf_engine.py +77 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/schema.py +155 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/search.py +188 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/tui/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/tui/app.py +89 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/tui/image_backend.py +29 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/pdf/tui/state.py +65 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/publish/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/publish/publish.py +1139 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/publish/publish_receipts.py +365 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/publish/publish_recovery.py +240 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/agent_behavior_corpus.py +2069 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/agent_report_validation.py +4448 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/agent_run_audit.py +852 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/architect_prompt_eval.py +341 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/body_linker_eval.py +240 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/curator_output_validation.py +175 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/curator_prompt_eval.py +865 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/triager_prompt_eval.py +1295 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/related_notes/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/related_notes/related_notes.py +1920 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/related_notes/related_notes_headless.py +1186 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/specialist/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/specialist/plan_attestation.py +148 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/specialist/specialist_receipts.py +360 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/specialist/specialist_runtime.py +52 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/specialist/specialist_task_runner.py +2470 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/style/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/style/style.py +1952 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/subagents/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/subagents/agents.py +1767 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/alias_projection.py +331 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/link_terms.py +151 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/llm_disambiguation.py +182 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/taxonomy/__init__.py +116 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/taxonomy/audit.py +201 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/taxonomy/migration.py +314 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/taxonomy/normalize.py +72 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/taxonomy/policy.py +135 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/taxonomy/resolve.py +413 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/taxonomy/schema.py +157 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/taxonomy/status.py +137 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/vocabulary_bootstrap.py +509 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/vocabulary_curator_batch.py +1115 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/vocabulary_ingestion.py +632 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/vocabulary_map.py +930 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/vocabulary/vocabulary_recovery.py +1388 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/cli.py +6665 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/common.py +69 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/config.py +210 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/__init__.py +74 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/agent_report.py +242 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/agent_run_audit.py +196 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/agents.py +601 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/curator.py +256 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/effect_payloads.py +519 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/happy_path.py +190 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/link_git.py +110 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/link_runtime_artifact.py +52 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/note_plan.py +75 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/paths.py +114 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/public_report.py +53 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/publish.py +111 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/raw_coverage.py +217 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/related_notes.py +136 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/related_notes_headless.py +153 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/related_notes_runtime.py +395 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/schema_registry.py +637 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/specialist.py +432 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/status.py +62 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/style_rewrite.py +568 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/vocabulary_ingestion.py +223 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/workflow_blockers.py +510 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/workflow_guardrails.py +637 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/workflow_outcomes.py +121 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/contracts/workflow_receipts.py +100 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/__main__.py +4 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/cli.py +275 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/__init__.py +2 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/candidates.py +193 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/cli.py +189 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/gemini.py +220 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/inputs.py +120 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/models.py +34 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/parsing.py +48 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/prompts.py +216 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/quality.py +54 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/reporting.py +24 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/runner.py +433 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/utils.py +39 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/enrich/workflow/vault_guard_bridge.py +17 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_context_packets.py +454 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_decision_projection.py +133 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_effects.py +1260 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_fsm.py +2768 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_machine.py +1588 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_plan.py +306 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_primary_objective.py +316 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_problem.py +153 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_receipt_evidence.py +306 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_states.py +290 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/fix_wiki_user_report.py +342 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/fix_wiki/health.py +6332 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/link_fsm.py +1119 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/link_git.py +638 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/link_machine.py +1106 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/link_retry_governance.py +374 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/link_runtime_result.py +485 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/link_triggers.py +183 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/linking.py +2758 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/reference_repair.py +718 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link/related_notes_fsm.py +1855 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link_related/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/link_related/link_related_machine.py +834 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/process_chats/__init__.py +1 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/process_chats/process_chats_fsm.py +1592 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/process_chats/process_chats_machine.py +3097 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/process_chats/process_chats_primary_objective.py +28 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/flows/process_chats/process_chats_runtime_result.py +185 -0
- package/.opencode/mednotes/src/mednotes/domains/wiki/performance.py +97 -0
- package/.opencode/mednotes/src/mednotes/kernel/__init__.py +6 -0
- package/.opencode/mednotes/src/mednotes/kernel/agent_directive.py +336 -0
- package/.opencode/mednotes/src/mednotes/kernel/base.py +51 -0
- package/.opencode/mednotes/src/mednotes/kernel/blockers.py +39 -0
- package/.opencode/mednotes/src/mednotes/kernel/effect_executor.py +55 -0
- package/.opencode/mednotes/src/mednotes/kernel/effect_intent.py +69 -0
- package/.opencode/mednotes/src/mednotes/kernel/effects.py +160 -0
- package/.opencode/mednotes/src/mednotes/kernel/errors.py +38 -0
- package/.opencode/mednotes/src/mednotes/kernel/fsm_event.py +35 -0
- package/.opencode/mednotes/src/mednotes/kernel/fsm_model.py +55 -0
- package/.opencode/mednotes/src/mednotes/kernel/fsm_transition_result.py +75 -0
- package/.opencode/mednotes/src/mednotes/kernel/guardrails.py +188 -0
- package/.opencode/mednotes/src/mednotes/kernel/progress.py +319 -0
- package/.opencode/mednotes/src/mednotes/kernel/public_report.py +346 -0
- package/.opencode/mednotes/src/mednotes/kernel/state_machine.py +164 -0
- package/.opencode/mednotes/src/mednotes/kernel/workflow.py +619 -0
- package/.opencode/mednotes/src/mednotes/platform/__init__.py +5 -0
- package/.opencode/mednotes/src/mednotes/platform/backup_policy.py +382 -0
- package/.opencode/mednotes/src/mednotes/platform/feedback/__init__.py +62 -0
- package/.opencode/mednotes/src/mednotes/platform/feedback/cli.py +275 -0
- package/.opencode/mednotes/src/mednotes/platform/feedback/contracts.py +83 -0
- package/.opencode/mednotes/src/mednotes/platform/feedback/core.py +4168 -0
- package/.opencode/mednotes/src/mednotes/platform/feedback/integrity.py +989 -0
- package/.opencode/mednotes/src/mednotes/platform/feedback/operational_contract.py +2293 -0
- package/.opencode/mednotes/src/mednotes/platform/feedback/telemetry.py +875 -0
- package/.opencode/mednotes/src/mednotes/platform/feedback/telemetry_config.py +65 -0
- package/.opencode/mednotes/src/mednotes/platform/opencode_runtime_config.py +182 -0
- package/.opencode/mednotes/src/mednotes/platform/paths/__init__.py +1560 -0
- package/.opencode/mednotes/src/mednotes/platform/secrets.py +89 -0
- package/.opencode/mednotes/src/mednotes/platform/user_config.py +103 -0
- package/.opencode/mednotes/src/mednotes/platform/vault_guard.py +214 -0
- package/.opencode/mednotes/uv.lock +932 -0
- package/.opencode/mednotes.generated.json +395 -0
- package/.opencode/opencode.json +31 -0
- package/.opencode/plugins/mednotes-fsm.mjs +7 -0
- package/.opencode/plugins/mednotes_hook/adapters/antigravity.mjs +169 -0
- package/.opencode/plugins/mednotes_hook/adapters/harness_payload.mjs +103 -0
- package/.opencode/plugins/mednotes_hook/adapters/opencode_plugin.mjs +341 -0
- package/.opencode/plugins/mednotes_hook/adapters/opencode_user_config_sync.mjs +177 -0
- package/.opencode/plugins/mednotes_hook/anki_preflight.mjs +214 -0
- package/.opencode/plugins/mednotes_hook/cli.mjs +143 -0
- package/.opencode/plugins/mednotes_hook/diagnostics.mjs +11 -0
- package/.opencode/plugins/mednotes_hook/domain/agent_directive_core.mjs +160 -0
- package/.opencode/plugins/mednotes_hook/fsm_directive.mjs +1470 -0
- package/.opencode/plugins/mednotes_hook/hook_errors.mjs +120 -0
- package/.opencode/plugins/mednotes_hook/retention.mjs +114 -0
- package/.opencode/plugins/mednotes_hook/runtime.mjs +174 -0
- package/.opencode/plugins/mednotes_hook/telemetry_capture.mjs +511 -0
- package/.opencode/plugins/mednotes_hook/vault_guard.mjs +624 -0
- package/AGENTS.md +57 -0
- package/README.md +194 -0
- package/adapters/antigravity/agents.json +80 -0
- package/adapters/antigravity/templates/med-chat-triager.md +214 -0
- package/adapters/antigravity/templates/med-flashcard-maker.md +72 -0
- package/adapters/antigravity/templates/med-knowledge-architect.md +241 -0
- package/adapters/antigravity/templates/med-link-graph-curator.md +187 -0
- package/adapters/antigravity/templates/med-publish-guard.md +71 -0
- package/adapters/gemini-cli/gemini-extension.json +14 -0
- package/adapters/gemini-cli/package.json +15 -0
- package/adapters/gemini-cli/pyproject.toml +48 -0
- package/bin/mednotes-opencode.mjs +155 -0
- package/contracts/agents.json +116 -0
- package/core/agents/med-chat-triager.md +197 -0
- package/core/agents/med-flashcard-maker.md +56 -0
- package/core/agents/med-knowledge-architect.md +224 -0
- package/core/agents/med-link-graph-curator.md +171 -0
- package/core/agents/med-publish-guard.md +55 -0
- package/core/commands/flashcards.toml +22 -0
- package/core/commands/mednotes/create.toml +22 -0
- package/core/commands/mednotes/enrich.toml +24 -0
- package/core/commands/mednotes/fix-wiki.toml +24 -0
- package/core/commands/mednotes/history.toml +19 -0
- package/core/commands/mednotes/link-body.toml +22 -0
- package/core/commands/mednotes/link-related.toml +24 -0
- package/core/commands/mednotes/link.toml +24 -0
- package/core/commands/mednotes/pdf-library.toml +24 -0
- package/core/commands/mednotes/process-chats.toml +20 -0
- package/core/commands/mednotes/setup.toml +18 -0
- package/core/commands/mednotes/status.toml +24 -0
- package/core/commands/mednotes/telemetry.toml +24 -0
- package/core/commands/report.toml +23 -0
- package/core/skills/THIRD_PARTY_NOTICES.md +45 -0
- package/core/skills/create-medical-flashcards/SKILL.md +113 -0
- package/core/skills/create-medical-note/SKILL.md +90 -0
- package/core/skills/enrich-medical-note/SKILL.md +120 -0
- package/core/skills/fix-medical-wiki/SKILL.md +559 -0
- package/core/skills/link-medical-wiki/SKILL.md +224 -0
- package/core/skills/obsidian-cli/SKILL.md +118 -0
- package/core/skills/obsidian-markdown/SKILL.md +207 -0
- package/core/skills/obsidian-markdown/references/CALLOUTS.md +58 -0
- package/core/skills/obsidian-markdown/references/EMBEDS.md +63 -0
- package/core/skills/obsidian-markdown/references/PROPERTIES.md +61 -0
- package/core/skills/obsidian-ops/SKILL.md +136 -0
- package/core/skills/pdf-library/SKILL.md +45 -0
- package/core/skills/process-medical-chats/SKILL.md +246 -0
- package/core/skills/workflow-report/SKILL.md +100 -0
- package/package.json +45 -0
package/.opencode/mednotes/src/mednotes/domains/wiki/capabilities/quality/agent_behavior_corpus.py
ADDED
|
@@ -0,0 +1,2069 @@
|
|
|
1
|
+
"""Versioned offline behavior corpus gates for agent prompt changes."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import tempfile
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from pydantic import ConfigDict, Field, NonNegativeInt, StrictStr
|
|
13
|
+
from pydantic import ValidationError as PydanticValidationError
|
|
14
|
+
|
|
15
|
+
from mednotes.domains.wiki.capabilities.quality.curator_prompt_eval import (
|
|
16
|
+
evaluate_curator_prompt_outputs,
|
|
17
|
+
load_curator_prompt_expectations,
|
|
18
|
+
)
|
|
19
|
+
from mednotes.domains.wiki.capabilities.vocabulary.vocabulary_curator_batch import (
|
|
20
|
+
VOCABULARY_CURATOR_BATCH_OUTPUT_MANIFEST_SCHEMA,
|
|
21
|
+
build_curator_prompt_identity,
|
|
22
|
+
curator_plan_hash,
|
|
23
|
+
)
|
|
24
|
+
from mednotes.domains.wiki.common import ValidationError
|
|
25
|
+
from mednotes.kernel.base import ContractModel, JsonObject, JsonObjectAdapter, JsonValue, contract_error
|
|
26
|
+
|
|
27
|
+
# --- Schema identifiers -------------------------------------------------------
# Each constant is the value expected under the JSON "schema" key of the
# corresponding document kind.
AGENT_BEHAVIOR_CORPUS_SCHEMA = "medical-notes-workbench.agent-behavior-corpus.v1"
AGENT_BEHAVIOR_CORPUS_REPORT_SCHEMA = "medical-notes-workbench.agent-behavior-corpus-report.v1"
AGENT_BEHAVIOR_CONTRACT_EVAL_SCHEMA = "medical-notes-workbench.agent-behavior-contract-eval.v1"
AGENT_BEHAVIOR_CASE_DRAFT_SCHEMA = "medical-notes-workbench.agent-behavior-case-draft.v1"
AGENT_BEHAVIOR_CASE_DRAFT_REPORT_SCHEMA = "medical-notes-workbench.agent-behavior-case-draft-report.v1"

DEFAULT_TELEMETRY_APP = "medical-notes-workbench"

# --- Severity model -----------------------------------------------------------
# Numeric ordering of the severity labels (higher means more severe).
SEVERITY_RANK = {"low": 1, "medium": 2, "high": 3, "critical": 4}

# Severity label associated with each known signal code.
DEFAULT_SIGNAL_SEVERITY = {
    # agent.* signals
    "agent.retry_loop": "high",
    "agent.retry_without_input_change": "high",
    "agent.ignored_next_action": "high",
    "agent.wrong_phase": "high",
    "agent.generated_script_workaround": "high",
    "agent.unsafe_generated_script_recovery_bypass": "high",
    "agent.missing_error_context": "high",
    "agent.script_or_prompt_drift": "high",
    "agent.unexpected_mutation": "high",
    "agent.command_failed": "medium",
    "agent.workflow_blocked": "medium",
    "agent.dry_run_without_apply": "medium",
    # un-prefixed codes
    "dry_run_without_apply": "medium",
    "extension_prompt_or_script_drift": "high",
    # resource.* signals
    "resource.version_control_policy_bypassed": "critical",
    "resource.guard_missing": "critical",
    "resource.run_finish_missing": "high",
    "resource.restore_point_after_mutation": "critical",
    "resource.direct_mutation_attempt": "high",
}

# Risk codes that, per the constant's name, lead to a behavior-case draft.
RISK_CODES_THAT_CREATE_DRAFTS = {
    "mass_markdown_mutation",
    "hardcoded_user_path",
    "reads_obsidian_plugin_data",
    "writes_related_notes_section",
    "external_api_or_embedding_call",
    "no_dry_run",
    "encoding_corruption",
    "extension_prompt_or_script_drift",
    "direct_sql_mutation",
    "queue_truth_bypass",
    "unsafe_mass_wikilink_rewrite",
}

# --- Prompt source lookup tables ---------------------------------------------
# Slash command -> relative path of the command definition file.
COMMAND_PROMPT_SOURCES = {
    "/flashcards": "commands/flashcards.toml",
    "/report": "commands/report.toml",
    "/mednotes:create": "commands/mednotes/create.toml",
    "/mednotes:enrich": "commands/mednotes/enrich.toml",
    "/mednotes:fix-wiki": "commands/mednotes/fix-wiki.toml",
    "/mednotes:history": "commands/mednotes/history.toml",
    "/mednotes:link": "commands/mednotes/link.toml",
    "/mednotes:link-body": "commands/mednotes/link-body.toml",
    "/mednotes:link-related": "commands/mednotes/link-related.toml",
    "/mednotes:pdf-library": "commands/mednotes/pdf-library.toml",
    "/mednotes:process-chats": "commands/mednotes/process-chats.toml",
    "/mednotes:setup": "commands/mednotes/setup.toml",
    "/mednotes:status": "commands/mednotes/status.toml",
    "/mednotes:telemetry": "commands/mednotes/telemetry.toml",
}

# Workflow name -> relative path of the skill document; several workflows
# share one skill file.
WORKFLOW_SKILL_PROMPT_SOURCES = {
    "flashcards": "skills/create-medical-flashcards/SKILL.md",
    "create": "skills/create-medical-note/SKILL.md",
    "enrich": "skills/enrich-medical-note/SKILL.md",
    "fix-wiki": "skills/fix-medical-wiki/SKILL.md",
    "link": "skills/link-medical-wiki/SKILL.md",
    "link-body": "skills/link-medical-wiki/SKILL.md",
    "link-related": "skills/link-medical-wiki/SKILL.md",
    "pdf-library": "skills/pdf-library/SKILL.md",
    "process-chats": "skills/process-medical-chats/SKILL.md",
    "report": "skills/workflow-report/SKILL.md",
    "setup": "skills/obsidian-ops/SKILL.md",
    "status": "skills/obsidian-ops/SKILL.md",
    "telemetry": "skills/obsidian-ops/SKILL.md",
}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class _AgentBehaviorCorpusFields(ContractModel):
    """Typed view of one corpus descriptor document.

    Only ``schema`` is required; every other field falls back to an empty
    string, zero, or an empty list.
    """

    # Read from and written back to the JSON key "schema".
    schema_id: StrictStr = Field(alias="schema", serialization_alias="schema")
    suite_id: StrictStr = ""
    agent: StrictStr = ""
    surface_type: StrictStr = ""
    evaluator: StrictStr = ""
    prompt_sources: list[StrictStr] = Field(default_factory=list)
    prompt_identity_hash: StrictStr = ""
    # Path-valued fields; callers in this module resolve them against the
    # directory that contains the corpus file.
    cases_path: StrictStr = ""
    plan_path: StrictStr = ""
    manifest_path: StrictStr = ""
    expectations_path: StrictStr = ""
    baseline_eval_path: StrictStr = ""
    case_count: NonNegativeInt = 0
    cases: list[JsonObject] = Field(default_factory=list)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class _AgentBehaviorAssertionFields(ContractModel):
    """One assertion of a behavior case, validated before it is evaluated."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    op: StrictStr = ""  # assertion operator name
    path: StrictStr = ""  # dotted path into the output payload
    value: JsonValue = None  # expected value / operand for the operator
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class _AgentBehaviorCaseFields(ContractModel):
    """A single fixture case; raw JSON is validated here before it drives scoring."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    case_id: StrictStr = ""
    behavior: StrictStr = ""
    output_path: StrictStr = ""
    assertions: list[_AgentBehaviorAssertionFields] = Field(default_factory=list)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class _AgentBehaviorCasesPayloadFields(ContractModel):
    """Top-level shape of the cases file read by the offline contract evaluator."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # Required; populated from the JSON key "schema".
    schema_id: StrictStr = Field(alias="schema")
    cases: list[_AgentBehaviorCaseFields] = Field(default_factory=list)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class _CuratorOutputManifestItemFields(ContractModel):
    """Typed view of the manifest item fields used to locate generated output."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    output_path: StrictStr = ""
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class _CuratorOutputManifestFields(ContractModel):
    """Validated edge of a curator manifest; the raw document is kept only for audit."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # Required; populated from the JSON key "schema".
    schema_id: StrictStr = Field(alias="schema")
    # Items stay as raw JSON objects; each is validated separately on use.
    items: list[JsonObject] = Field(default_factory=list)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class _TelemetryAgentEventFields(ContractModel):
    """Subset of a telemetry event that can be turned into a behavior-corpus signal."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # All fields are optional strict strings defaulting to "".
    code: StrictStr = ""
    type: StrictStr = ""
    severity: StrictStr = ""
    phase: StrictStr = ""
    expected_phase: StrictStr = ""
    next_action_expected: StrictStr = ""
    recovery_command: StrictStr = ""
    command_family: StrictStr = ""
    path: StrictStr = ""
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class _TelemetryClientLens(ContractModel):
    """App metadata found nested inside telemetry evidence payloads."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    app: StrictStr = ""
    app_version: StrictStr = ""
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class _TelemetryPayloadLens(ContractModel):
    """Validated view of an external telemetry envelope, taken before its metadata drives routing."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # Optional here (defaults to ""); read from and written to the key "schema".
    schema_id: StrictStr = Field(default="", alias="schema", serialization_alias="schema")
    app: StrictStr = ""
    app_version: StrictStr = ""
    client: _TelemetryClientLens | None = None
    # Records stay as raw JSON objects at this level.
    records: list[JsonObject] = Field(default_factory=list)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class _GeneratedScriptEvidenceLens(ContractModel):
    """Redacted evidence about a generated script, used for prevention suggestions."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    path: StrictStr = ""
    risk_codes: list[StrictStr] = Field(default_factory=list)
    function_or_command: StrictStr = ""
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class _CommandEventEvidenceLens(ContractModel):
    """Redacted evidence about a command event, used for prevention suggestions."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    command: StrictStr = ""
    command_family: StrictStr = ""
    path: StrictStr = ""
    status: StrictStr = ""
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class _TelemetryEnvironmentIntegrityLens(ContractModel):
    """Slice of environment-integrity data kept solely for version provenance."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    app_version: StrictStr = ""
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class _TelemetryEnvironmentContextLens(ContractModel):
    """Slice of a record's environment context consumed by draft provenance."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    extension_integrity: _TelemetryEnvironmentIntegrityLens | None = None
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
class _TelemetryRecordMetadataLens(ContractModel):
    """Record fields permitted to influence draft naming, suite routing, and provenance."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    workflow: StrictStr = ""
    agent: StrictStr = ""
    phase: StrictStr = ""
    recorded_at: StrictStr = ""
    app: StrictStr = ""
    app_version: StrictStr = ""
    # Optional nested lenses; None when the record omits them.
    client: _TelemetryClientLens | None = None
    environment_context: _TelemetryEnvironmentContextLens | None = None
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
class _BehaviorCandidatePayloadLens(ContractModel):
    """Validated edge for behavior-case candidate envelopes arriving via telemetry or email."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # Each list holds raw JSON objects and defaults to empty.
    behavior_case_candidates: list[JsonObject] = Field(default_factory=list)
    first_pass_prevention_candidates: list[JsonObject] = Field(default_factory=list)
    messages: list[JsonObject] = Field(default_factory=list)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class _BehaviorCandidateMessageLens(ContractModel):
    """Candidate lists carried by an individual inbox/telemetry message record."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    id: StrictStr = ""
    source_kind: StrictStr = ""
    behavior_case_candidates: list[JsonObject] = Field(default_factory=list)
    first_pass_prevention_candidates: list[JsonObject] = Field(default_factory=list)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _telemetry_payload_lens(payload: object) -> _TelemetryPayloadLens:
    """Return a payload lens for ``payload``.

    Dict input is validated against the lens model; anything else yields a
    lens populated with defaults only.
    """
    if isinstance(payload, dict):
        return _TelemetryPayloadLens.model_validate(payload)
    return _TelemetryPayloadLens()
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _telemetry_record_lens(record: object) -> _TelemetryRecordMetadataLens:
    """Return a metadata lens for ``record``.

    Dict input is validated against the lens model; anything else yields a
    lens populated with defaults only.
    """
    if isinstance(record, dict):
        return _TelemetryRecordMetadataLens.model_validate(record)
    return _TelemetryRecordMetadataLens()
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _agent_behavior_corpus_fields(corpus: JsonObject) -> _AgentBehaviorCorpusFields:
    """Validate a corpus document, re-raising pydantic failures via ``contract_error``."""
    try:
        fields = _AgentBehaviorCorpusFields.model_validate(corpus)
    except PydanticValidationError as exc:
        raise contract_error(exc, prefix="agent behavior corpus") from exc
    return fields
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _agent_behavior_cases_payload_fields(payload: JsonObject) -> _AgentBehaviorCasesPayloadFields:
    """Validate a cases document, re-raising pydantic failures via ``contract_error``."""
    try:
        fields = _AgentBehaviorCasesPayloadFields.model_validate(payload)
    except PydanticValidationError as exc:
        raise contract_error(exc, prefix="agent behavior cases") from exc
    return fields
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _curator_output_manifest_fields(payload: JsonObject) -> _CuratorOutputManifestFields:
    """Validate a curator manifest, re-raising pydantic failures via ``contract_error``."""
    try:
        fields = _CuratorOutputManifestFields.model_validate(payload)
    except PydanticValidationError as exc:
        raise contract_error(exc, prefix="agent behavior corpus manifest") from exc
    return fields
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _telemetry_agent_event_fields(payload: JsonObject) -> _TelemetryAgentEventFields:
    """Validate a telemetry event, re-raising pydantic failures via ``contract_error``."""
    try:
        fields = _TelemetryAgentEventFields.model_validate(payload)
    except PydanticValidationError as exc:
        raise contract_error(exc, prefix="agent behavior telemetry event") from exc
    return fields
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _read_json_object(path: Path, *, label: str) -> JsonObject:
    """Load ``path`` as UTF-8 JSON and require the top-level value to be an object.

    Args:
        path: File to read.
        label: Human-readable document name used as the prefix of error messages.

    Returns:
        The parsed object after validation through ``JsonObjectAdapter``.

    Raises:
        ValidationError: If the file is missing, cannot be read (for example
            because the path is a directory), is not valid UTF-8, is not valid
            JSON, or its top-level value is not an object.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise ValidationError(f"{label} not found: {path}") from exc
    except UnicodeDecodeError as exc:
        raise ValidationError(f"{label} is not valid UTF-8: {path}: {exc}") from exc
    except OSError as exc:
        # An empty path field resolves to the suite directory itself, which
        # makes read_text() fail with IsADirectoryError/PermissionError. Report
        # that as a labelled validation failure instead of a raw OS traceback.
        raise ValidationError(f"{label} could not be read: {path}: {exc}") from exc
    try:
        payload = json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValidationError(f"{label} is invalid JSON: {path}: {exc}") from exc
    if not isinstance(payload, dict):
        raise ValidationError(f"{label} must be a JSON object: {path}")
    return JsonObjectAdapter.validate_python(payload)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _corpus_files(path: Path) -> list[Path]:
    """Expand ``path`` into the corpus files it designates.

    A non-directory path is returned as-is. For a directory, a ``corpus.json``
    directly inside it wins; otherwise every nested ``corpus.json`` is returned
    in sorted order. When nothing is found the direct ``corpus.json`` candidate
    is still returned (even though it does not exist).
    """
    if not path.is_dir():
        return [path]
    top_level = path / "corpus.json"
    if top_level.is_file():
        return [top_level]
    nested = [candidate for candidate in path.rglob("corpus.json") if candidate.is_file()]
    nested.sort()
    return nested or [top_level]
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _resolve(base: Path, value: Any) -> Path:
    """Interpret ``value`` as a path; relative paths are joined onto ``base``.

    Falsy values are treated as the empty path, so the result is ``base``.
    """
    candidate = Path(str(value or ""))
    if candidate.is_absolute():
        return candidate
    return base / candidate
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _serialized_output_path(base: Path, value: object) -> str:
    """Render an output reference as a POSIX path relative to the suite directory.

    Raises:
        ValidationError: If the resolved location is not under ``base``.
    """
    candidate = Path(str(value or ""))
    if not candidate.is_absolute():
        candidate = base / candidate
    try:
        relative = candidate.resolve().relative_to(base.resolve())
    except ValueError as exc:
        raise ValidationError(f"agent behavior output_path must stay under corpus suite root: {value}") from exc
    return relative.as_posix()
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _serialized_evidence_source_path(source_path: Path) -> str:
    """Reduce absolute paths to their file name so local directories are not exposed.

    Relative paths are returned unchanged in POSIX form.
    """
    if source_path.is_absolute():
        return source_path.name
    return source_path.as_posix()
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _relativize_output_paths(value: Any, *, base: Path) -> Any:
    """Return a copy of ``value`` with every ``output_path`` entry made suite-relative.

    Lists and dicts are walked recursively; any other value is returned
    unchanged. Only dict entries whose key is exactly ``"output_path"`` are
    rewritten.
    """
    if isinstance(value, dict):
        return {
            key: (
                _serialized_output_path(base, item)
                if key == "output_path"
                else _relativize_output_paths(item, base=base)
            )
            for key, item in value.items()
        }
    if isinstance(value, list):
        return [_relativize_output_paths(element, base=base) for element in value]
    return value
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def agent_behavior_baseline_paths(corpus_path: Path) -> list[Path]:
    """Collect the baseline files declared by a corpus file or a bank of corpora.

    Returns:
        The de-duplicated, sorted list of resolved ``baseline_eval_path``
        values; corpora that declare no baseline contribute nothing.

    Raises:
        ValidationError: If any corpus file does not use the corpus schema.
    """
    unique: set[Path] = set()
    for corpus_file in _corpus_files(corpus_path):
        fields = _agent_behavior_corpus_fields(
            _read_json_object(corpus_file, label="agent behavior corpus")
        )
        if fields.schema_id != AGENT_BEHAVIOR_CORPUS_SCHEMA:
            raise ValidationError(f"agent behavior corpus must use schema {AGENT_BEHAVIOR_CORPUS_SCHEMA}")
        if not fields.baseline_eval_path:
            continue
        # Relative baselines are anchored at the directory of their corpus file.
        declared = _resolve(corpus_file.parent, fields.baseline_eval_path)
        unique.add(declared.expanduser().resolve())
    return sorted(unique)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def validate_agent_behavior_report_path(*, corpus_path: Path, report_path: Path) -> None:
    """Refuse a report destination that coincides with a declared behavior baseline.

    Raises:
        ValidationError: If ``report_path`` resolves to one of the baseline
            files declared by the corpus at ``corpus_path``.
    """
    target = report_path.expanduser().resolve()
    collides = any(target == baseline for baseline in agent_behavior_baseline_paths(corpus_path))
    if collides:
        raise ValidationError(
            "agent_behavior_corpus.report_would_overwrite_baseline: "
            "--report writes agent-behavior-corpus-report.v1, but this path is baseline_eval_path. "
            "Write the corpus report to a separate file and promote the nested suite eval as the baseline."
        )
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def _with_current_prompt_identity(plan: dict[str, Any], prompt_identity: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of ``plan`` stamped with ``prompt_identity``.

    The identity is copied onto the plan itself and onto every dict entry of
    ``work_items``; non-dict entries pass through untouched. A missing or
    non-list ``work_items`` becomes an empty list. The input plan and its
    items are not mutated.
    """
    updated = dict(plan)
    updated["prompt_identity"] = dict(prompt_identity)
    raw_items = updated.get("work_items")
    if not isinstance(raw_items, list):
        raw_items = []
    updated["work_items"] = [
        {**entry, "prompt_identity": dict(prompt_identity)} if isinstance(entry, dict) else entry
        for entry in raw_items
    ]
    return updated
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def _manifest_with_absolute_outputs(*, base: Path, manifest_path: Path, output_dir: Path) -> tuple[Path, dict[str, Any]]:
    """Write a copy of the manifest whose item ``output_path`` values are resolved against ``base``.

    Returns:
        A pair of (path of the written ``manifest.absolute.json`` inside
        ``output_dir``, the original manifest as read from disk).

    Raises:
        ValidationError: If the manifest does not use the curator batch output
            manifest schema.
    """
    manifest = _read_json_object(manifest_path, label="agent behavior corpus manifest")
    fields = _curator_output_manifest_fields(manifest)
    if fields.schema_id != VOCABULARY_CURATOR_BATCH_OUTPUT_MANIFEST_SCHEMA:
        raise ValidationError(
            f"agent behavior corpus manifest must use schema {VOCABULARY_CURATOR_BATCH_OUTPUT_MANIFEST_SCHEMA}"
        )

    def absolutized(raw: Any) -> dict[str, Any]:
        # Only output_path is read through the typed lens; all other keys of
        # the raw item are carried over verbatim.
        declared = _CuratorOutputManifestItemFields.model_validate(raw).output_path
        return {**raw, "output_path": str(_resolve(base, declared))}

    rewritten = {**manifest, "items": [absolutized(raw) for raw in fields.items]}
    destination = output_dir / "manifest.absolute.json"
    destination.write_text(json.dumps(rewritten, ensure_ascii=False, indent=2), encoding="utf-8")
    return destination, manifest
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def _issue(*, code: str, message: str) -> JsonObject:
    """Build an issue record with ``code`` and ``message`` keys."""
    return dict(code=code, message=message)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _canonical_payload_hash(payload: Any) -> str:
    """Hash ``payload`` via its canonical JSON form.

    The payload is dumped with sorted keys, compact separators and non-ASCII
    characters preserved, encoded as UTF-8, and digested with SHA-256. The
    result carries a ``sha256:`` prefix.
    """
    canonical = json.dumps(payload, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return "sha256:" + digest
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _sha256_bytes(content: bytes) -> str:
    """Return the SHA-256 hex digest of ``content`` with a ``sha256:`` prefix."""
    hasher = hashlib.sha256()
    hasher.update(content)
    return "sha256:" + hasher.hexdigest()
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def _extension_root() -> Path:
    """Return the extension root reported by ``mednotes.platform.paths``."""
    # Imported at call time rather than at module import.
    from mednotes.platform.paths import extension_root as locate_extension_root

    return locate_extension_root()
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _source_fingerprint(relative_path: str) -> JsonObject:
    """Describe the file at ``relative_path`` under the extension root.

    Returns:
        A record with ``path``, ``exists``, ``sha256``, ``byte_count`` and
        ``word_count``. When the file is absent the hash is empty and both
        counts are zero. Words are counted by whitespace splitting after a
        lenient UTF-8 decode.
    """
    source = _extension_root() / relative_path
    if source.is_file():
        raw = source.read_bytes()
        decoded = raw.decode("utf-8", errors="replace")
        return {
            "path": relative_path,
            "exists": True,
            "sha256": _sha256_bytes(raw),
            "byte_count": len(raw),
            "word_count": len(decoded.split()),
        }
    return {"path": relative_path, "exists": False, "sha256": "", "byte_count": 0, "word_count": 0}
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def _prompt_identity_for_corpus(corpus: _AgentBehaviorCorpusFields) -> JsonObject:
    """Build the prompt identity record for a corpus.

    When the corpus lists no prompt sources the curator's own identity is
    used. Otherwise each non-empty source is fingerprinted and the aggregate
    hash covers the path, existence flag and content hash of every source.
    """
    if corpus.prompt_sources:
        fingerprints = [_source_fingerprint(source) for source in corpus.prompt_sources if source]
        # Byte and word counts are reported per source but excluded from the hash.
        hash_material = [
            {field: fingerprint[field] for field in ("path", "exists", "sha256")}
            for fingerprint in fingerprints
        ]
        identity = {
            "schema": "medical-notes-workbench.agent-prompt-identity.v1",
            "agent": corpus.agent,
            "aggregate_hash": _canonical_payload_hash(hash_material),
            "sources": fingerprints,
        }
    else:
        identity = build_curator_prompt_identity()
    return JsonObjectAdapter.validate_python(identity)
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def _get_path(payload: JsonValue, path: str) -> tuple[bool, JsonValue]:
    """Look up a dotted ``path`` inside a JSON value.

    Each segment is used as a key when the current value is a dict, or as a
    non-negative index when it is a list. An empty path addresses the payload
    itself.

    Args:
        payload: JSON value to traverse.
        path: Dot-separated segments, e.g. ``"items.0.name"``.

    Returns:
        ``(True, value)`` when every segment resolves (``value`` may be
        ``None`` for an explicit JSON null), otherwise ``(False, None)``.
    """
    current = payload
    if not path:
        return True, current
    for part in path.split("."):
        if isinstance(current, dict):
            if part not in current:
                return False, None
            current = current[part]
        # isdecimal() matches exactly what int() can parse; isdigit() also
        # accepts characters such as "²" for which int() raises ValueError.
        elif isinstance(current, list) and part.isdecimal():
            index = int(part)
            if index >= len(current):
                return False, None
            current = current[index]
        else:
            return False, None
    return True, current
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def _assertion_issue(case_id: str, assertion: _AgentBehaviorAssertionFields, message: str) -> dict[str, str]:
    """Build the issue record reported when an assertion of ``case_id`` fails."""
    return dict(
        code="behavior_contract_failed",
        case_id=case_id,
        assertion=assertion.op,
        path=assertion.path,
        message=message,
    )
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def _expected_array_length(case_id: str, assertion: _AgentBehaviorAssertionFields) -> tuple[bool, int, list[dict[str, str]]]:
    """Check that an array-length assertion carries a real integer operand.

    No coercion is attempted: booleans and numeric strings are rejected.

    Returns:
        ``(True, length, [])`` for an integer operand, otherwise
        ``(False, 0, [issue])`` describing the rejected operand.
    """
    declared = assertion.value
    if isinstance(declared, bool) or not isinstance(declared, int):
        rejection = _assertion_issue(case_id, assertion, f"expected integer length, got {declared!r}")
        return False, 0, [rejection]
    return True, declared, []
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def _evaluate_assertion(*, case_id: str, payload: JsonObject, assertion: _AgentBehaviorAssertionFields) -> list[dict[str, str]]:
|
|
528
|
+
op = assertion.op
|
|
529
|
+
path = assertion.path
|
|
530
|
+
exists, value = _get_path(payload, path)
|
|
531
|
+
expected = assertion.value
|
|
532
|
+
if op == "path_present":
|
|
533
|
+
return [] if exists and value is not None else [_assertion_issue(case_id, assertion, "expected path to be present")]
|
|
534
|
+
if op == "path_absent":
|
|
535
|
+
return [] if not exists else [_assertion_issue(case_id, assertion, "expected path to be absent")]
|
|
536
|
+
if op == "path_equals":
|
|
537
|
+
return [] if exists and value == expected else [_assertion_issue(case_id, assertion, f"expected {expected!r}, got {value!r}")]
|
|
538
|
+
if op == "path_in":
|
|
539
|
+
choices = expected if isinstance(expected, list) else []
|
|
540
|
+
return [] if exists and value in choices else [_assertion_issue(case_id, assertion, f"expected value in {choices!r}")]
|
|
541
|
+
if op == "array_len_equals":
|
|
542
|
+
valid, expected_len, issues = _expected_array_length(case_id, assertion)
|
|
543
|
+
if not valid:
|
|
544
|
+
return issues
|
|
545
|
+
return [] if isinstance(value, list) and len(value) == expected_len else [
|
|
546
|
+
_assertion_issue(case_id, assertion, f"expected list length {expected!r}")
|
|
547
|
+
]
|
|
548
|
+
if op == "array_len_at_least":
|
|
549
|
+
valid, expected_len, issues = _expected_array_length(case_id, assertion)
|
|
550
|
+
if not valid:
|
|
551
|
+
return issues
|
|
552
|
+
return [] if isinstance(value, list) and len(value) >= expected_len else [
|
|
553
|
+
_assertion_issue(case_id, assertion, f"expected list length >= {expected!r}")
|
|
554
|
+
]
|
|
555
|
+
if op == "array_len_at_most":
|
|
556
|
+
valid, expected_len, issues = _expected_array_length(case_id, assertion)
|
|
557
|
+
if not valid:
|
|
558
|
+
return issues
|
|
559
|
+
return [] if isinstance(value, list) and len(value) <= expected_len else [
|
|
560
|
+
_assertion_issue(case_id, assertion, f"expected list length <= {expected!r}")
|
|
561
|
+
]
|
|
562
|
+
if op == "json_not_contains":
|
|
563
|
+
if not isinstance(expected, str):
|
|
564
|
+
return [_assertion_issue(case_id, assertion, f"expected forbidden text string, got {expected!r}")]
|
|
565
|
+
text = json.dumps(payload, ensure_ascii=False)
|
|
566
|
+
return [] if expected not in text else [
|
|
567
|
+
_assertion_issue(case_id, assertion, f"forbidden text was present: {expected!r}")
|
|
568
|
+
]
|
|
569
|
+
return [_assertion_issue(case_id, assertion, f"unknown assertion op: {op}")]
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def _score(issue_count: int) -> int:
    """Score a case: start at 100, subtract 25 per issue, never go below zero."""
    remaining = 100 - issue_count * 25
    return remaining if remaining > 0 else 0
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
class _PromptIdentityFields(ContractModel):
    """Typed view of the prompt identity hash carried inside corpus reports."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    aggregate_hash: StrictStr = ""
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
class _BaselineMetadataFields(ContractModel):
    """Typed view of a baseline's promotion state."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    status: StrictStr = ""
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
class _ContractEvalAggregateFields(ContractModel):
    """Aggregate counters used to decide corpus regression status."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # Every counter is a strict, non-negative integer defaulting to zero.
    case_count: int = Field(default=0, ge=0, strict=True)
    item_count: int = Field(default=0, ge=0, strict=True)
    issue_count: int = Field(default=0, ge=0, strict=True)
    score: int = Field(default=0, ge=0, strict=True)
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
class _ContractEvalReportFields(ContractModel):
    """Status and counters of an eval report, validated before they decide pass/fail."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # Optional here (defaults to ""); populated from the JSON key "schema".
    schema_id: StrictStr = Field(default="", alias="schema")
    status: StrictStr = ""
    # Missing sections fall back to default-constructed sub-models.
    aggregate: _ContractEvalAggregateFields = Field(default_factory=_ContractEvalAggregateFields)
    prompt_identity: _PromptIdentityFields = Field(default_factory=_PromptIdentityFields)
    baseline_metadata: _BaselineMetadataFields = Field(default_factory=_BaselineMetadataFields)
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
class _BaselineComparisonFields(ContractModel):
    """Typed outcome of comparing a current eval against its locked baseline."""

    # Keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    status: StrictStr = ""
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def _compare_contract_baseline(*, current: JsonObject, baseline_path: Path) -> JsonObject:
    """Compare a freshly computed eval report with the baseline stored at ``baseline_path``.

    Returns:
        A record with both statuses, the score and issue-count deltas
        (current minus baseline), the comparability and regression flags, and
        an overall ``status``: ``not_comparable`` when any comparability flag
        is set, else ``regressed`` when any regression flag is set, else
        ``improved_or_equal``.

    Raises:
        ValidationError: If the baseline does not use the contract-eval schema.
    """
    baseline = _read_json_object(baseline_path, label="agent behavior contract baseline")
    now = _ContractEvalReportFields.model_validate(current)
    locked = _ContractEvalReportFields.model_validate(baseline)
    if locked.schema_id != AGENT_BEHAVIOR_CONTRACT_EVAL_SCHEMA:
        raise ValidationError(f"agent behavior contract baseline must use schema {AGENT_BEHAVIOR_CONTRACT_EVAL_SCHEMA}")

    score_delta = now.aggregate.score - locked.aggregate.score
    issue_count_delta = now.aggregate.issue_count - locked.aggregate.issue_count

    # Each table pairs a flag name with the condition that raises it; the
    # order of the rows is the order of the flags in the output.
    comparability_checks = (
        ("baseline_not_promoted", locked.baseline_metadata.status != "active"),
        ("prompt_identity_changed", now.prompt_identity.aggregate_hash != locked.prompt_identity.aggregate_hash),
    )
    regression_checks = (
        ("status_regression", locked.status == "pass" and now.status != "pass"),
        ("score_regression", score_delta < 0),
        ("issue_count_regression", issue_count_delta > 0),
    )
    comparability_flags: list[str] = [flag for flag, raised in comparability_checks if raised]
    regression_flags: list[str] = [flag for flag, raised in regression_checks if raised]

    if comparability_flags:
        comparison_status = "not_comparable"
    elif regression_flags:
        comparison_status = "regressed"
    else:
        comparison_status = "improved_or_equal"

    return JsonObjectAdapter.validate_python(
        {
            "baseline_status": locked.status,
            "current_status": now.status,
            "score_delta": score_delta,
            "issue_count_delta": issue_count_delta,
            "comparability_flags": comparability_flags,
            "regression_flags": regression_flags,
            "status": comparison_status,
        }
    )
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
def _promote_contract_baseline(report: dict[str, Any], *, source_path: Path) -> dict[str, Any]:
    """Return a copy of ``report`` marked as an active baseline.

    The added ``baseline_metadata`` records the path the eval came from and
    the canonical hash of the report as passed in; ``report`` itself is not
    modified.
    """
    promotion = {
        "status": "active",
        "source_eval_path": str(source_path),
        "source_eval_hash": _canonical_payload_hash(report),
    }
    return {**report, "baseline_metadata": promotion}
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def evaluate_json_contract_corpus(
    *,
    corpus: _AgentBehaviorCorpusFields,
    base: Path,
    prompt_identity: JsonObject,
    baseline_path: Path | None = None,
) -> dict[str, Any]:
    """Evaluate every case of a JSON-contract corpus and build the eval report.

    Each case's output file is loaded and checked against the case's
    assertions. The report status is ``"pass"`` only when no assertion raised
    an issue and, if a baseline file exists at *baseline_path*, the baseline
    comparison status is ``"improved_or_equal"``.

    Raises:
        ValidationError: when the cases file does not declare the expected
            schema id.
    """
    cases_file = _resolve(base, corpus.cases_path)
    cases_fields = _agent_behavior_cases_payload_fields(
        _read_json_object(cases_file, label="agent behavior cases")
    )
    if cases_fields.schema_id != "medical-notes-workbench.agent-behavior-cases.v1":
        raise ValidationError("agent behavior cases must use schema medical-notes-workbench.agent-behavior-cases.v1")

    case_reports: list[JsonObject] = []
    scores: list[int] = []
    assertion_total = 0
    all_issues: list[dict[str, str]] = []
    for case in cases_fields.cases:
        case_id = case.case_id
        output_file = _resolve(base, case.output_path)
        payload = _read_json_object(output_file, label=f"agent behavior output {case_id}")
        found: list[dict[str, str]] = [
            issue
            for assertion in case.assertions
            for issue in _evaluate_assertion(case_id=case_id, payload=payload, assertion=assertion)
        ]
        all_issues.extend(found)
        score = _score(len(found))
        checked = len(case.assertions)
        scores.append(score)
        assertion_total += checked
        case_reports.append(
            JsonObjectAdapter.validate_python(
                {
                    "case_id": case_id,
                    "behavior": case.behavior,
                    "output_path": _serialized_output_path(base, output_file),
                    "status": "needs_review" if found else "pass",
                    "score": score,
                    "issues": found,
                    "assertion_count": checked,
                }
            )
        )

    issue_count = len(all_issues)
    case_count = len(case_reports)
    # A corpus without cases scores 100 by definition.
    aggregate_score = round(sum(scores) / len(scores)) if scores else 100
    if issue_count == 0:
        status = "pass"
        next_action = ""
    else:
        status = "needs_review"
        next_action = "review behavior contract failures before accepting prompt changes"

    comparison: JsonObject | None = None
    if baseline_path is not None and baseline_path.is_file():
        # The comparison sees the assertion-only status, before any
        # baseline-driven downgrade below.
        current = JsonObjectAdapter.validate_python(
            {
                "schema": AGENT_BEHAVIOR_CONTRACT_EVAL_SCHEMA,
                "status": status,
                "aggregate": {
                    "case_count": case_count,
                    "issue_count": issue_count,
                    "score": aggregate_score,
                },
                "prompt_identity": prompt_identity,
            }
        )
        comparison = _compare_contract_baseline(current=current, baseline_path=baseline_path)
        if _BaselineComparisonFields.model_validate(comparison).status != "improved_or_equal":
            status = "needs_review"
            next_action = "review behavior corpus baseline before accepting prompt changes"

    report = {
        "schema": AGENT_BEHAVIOR_CONTRACT_EVAL_SCHEMA,
        "suite_id": corpus.suite_id,
        "agent": corpus.agent,
        "evaluator": "json_contract",
        "prompt_identity": prompt_identity,
        "status": status,
        "aggregate": {
            "case_count": case_count,
            "issue_count": issue_count,
            "score": aggregate_score,
            "assertion_count": assertion_total,
        },
        "cases": case_reports,
        "issues": all_issues,
        "next_action": next_action,
    }
    if comparison is not None:
        report["comparison"] = comparison
    return report
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
def _blocked_report(
    *,
    corpus: _AgentBehaviorCorpusFields,
    prompt_identity_hash: str,
    issues: list[JsonObject],
) -> JsonObject:
    """Build a ``needs_review`` corpus report for a run that was not evaluated.

    The report carries no suites; it lists the blocking *issues* and their
    codes, and falls back to the corpus' declared ``case_count``.
    """
    aggregate = {
        "suite_count": 1,
        "case_count": corpus.case_count,
        "prompt_identity_hash": prompt_identity_hash,
        "issue_codes": [entry["code"] for entry in issues],
    }
    payload = {
        "schema": AGENT_BEHAVIOR_CORPUS_REPORT_SCHEMA,
        "status": "needs_review",
        "suite_id": corpus.suite_id,
        "agent": corpus.agent,
        "aggregate": aggregate,
        "suites": [],
        "issues": issues,
        "next_action": "rerun the agent behavior corpus with the current prompt and promote a fresh baseline",
    }
    return JsonObjectAdapter.validate_python(payload)
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
def _evaluate_single_agent_behavior_corpus(corpus_file: Path) -> dict[str, Any]:
    """Evaluate one corpus definition file and return its corpus report.

    The run is short-circuited with a blocked ``needs_review`` report when the
    corpus' locked prompt hash, the baseline file, the manifest prompt hash or
    the baseline prompt identity do not line up with the current prompt
    fingerprint. Otherwise the corpus is handed to the evaluator it declares
    (``json_contract`` or ``curator_prompt_eval``).

    Raises:
        ValidationError: when the corpus schema id or evaluator is unsupported.
    """
    base = corpus_file.parent
    corpus_fields = _agent_behavior_corpus_fields(
        _read_json_object(corpus_file, label="agent behavior corpus")
    )
    if corpus_fields.schema_id != AGENT_BEHAVIOR_CORPUS_SCHEMA:
        raise ValidationError(f"agent behavior corpus must use schema {AGENT_BEHAVIOR_CORPUS_SCHEMA}")
    evaluator = corpus_fields.evaluator
    if evaluator not in {"curator_prompt_eval", "json_contract"}:
        raise ValidationError("agent behavior corpus supports evaluator=curator_prompt_eval or json_contract")

    prompt_identity = _prompt_identity_for_corpus(corpus_fields)
    prompt_identity_hash = str(prompt_identity.get("aggregate_hash") or "")
    issues: list[JsonObject] = []

    def _blocked(code: str, message: str) -> JsonObject:
        # Record the blocking issue and produce the short-circuit report.
        issues.append(_issue(code=code, message=message))
        return _blocked_report(corpus=corpus_fields, prompt_identity_hash=prompt_identity_hash, issues=issues)

    if corpus_fields.prompt_identity_hash != prompt_identity_hash:
        return _blocked(
            "stale_prompt_identity",
            "corpus prompt_identity_hash does not match the current prompt/runbook fingerprint",
        )

    baseline_path = _resolve(base, corpus_fields.baseline_eval_path)
    if not baseline_path.is_file():
        return _blocked("missing_behavior_baseline", f"baseline eval not found: {baseline_path}")

    if evaluator == "json_contract":
        eval_report = evaluate_json_contract_corpus(
            corpus=corpus_fields,
            base=base,
            prompt_identity=prompt_identity,
            baseline_path=baseline_path,
        )
        eval_fields = _ContractEvalReportFields.model_validate(eval_report)
        suite_status = "pass" if eval_fields.status == "pass" else "needs_review"
        report_issues = list(issues)
        if suite_status != "pass":
            report_issues.append(
                _issue(
                    code="behavior_contract_failed",
                    message="agent behavior contract returned needs_review",
                )
            )
        return {
            "schema": AGENT_BEHAVIOR_CORPUS_REPORT_SCHEMA,
            "status": "needs_review" if report_issues else "pass",
            "suite_id": corpus_fields.suite_id,
            "agent": corpus_fields.agent,
            "aggregate": {
                "suite_count": 1,
                "case_count": eval_fields.aggregate.case_count or corpus_fields.case_count,
                "prompt_identity_hash": prompt_identity_hash,
                "issue_codes": [entry["code"] for entry in report_issues],
            },
            "suites": [
                {
                    "suite_id": corpus_fields.suite_id,
                    "agent": corpus_fields.agent,
                    "evaluator": evaluator,
                    "status": suite_status,
                    "prompt_identity_hash": prompt_identity_hash,
                    "eval": eval_report,
                }
            ],
            "issues": report_issues,
            "next_action": (
                "review agent behavior corpus failures before accepting prompt changes"
                if report_issues
                else ""
            ),
        }

    # curator_prompt_eval path.
    plan_path = _resolve(base, corpus_fields.plan_path)
    manifest_path = _resolve(base, corpus_fields.manifest_path)
    expectations_path = _resolve(base, corpus_fields.expectations_path)
    plan = _with_current_prompt_identity(_read_json_object(plan_path, label="agent behavior corpus plan"), prompt_identity)
    # The expected plan hash is taken before the expectations are attached to the plan.
    plan["evaluation_expectations_by_work_id"] = load_curator_prompt_expectations(
        expectations_path,
        expected_plan_hash=curator_plan_hash(plan),
    )

    with tempfile.TemporaryDirectory(prefix="agent-behavior-corpus-") as temp_dir:
        normalized_manifest_path, manifest = _manifest_with_absolute_outputs(
            base=base,
            manifest_path=manifest_path,
            output_dir=Path(temp_dir),
        )
        if str(manifest.get("prompt_identity_hash") or "") != prompt_identity_hash:
            return _blocked(
                "stale_behavior_outputs",
                "manifest prompt_identity_hash does not match current prompt/runbook fingerprint",
            )
        baseline = _read_json_object(baseline_path, label="agent behavior corpus baseline")
        baseline_prompt = baseline.get("prompt_identity") if isinstance(baseline.get("prompt_identity"), dict) else {}
        if str(baseline_prompt.get("aggregate_hash") or "") != prompt_identity_hash:
            return _blocked(
                "stale_behavior_baseline",
                "baseline prompt_identity does not match current prompt/runbook fingerprint",
            )
        eval_report = evaluate_curator_prompt_outputs(
            plan=plan,
            manifest_path=normalized_manifest_path,
            baseline_eval_path=baseline_path,
        )
        eval_report = _relativize_output_paths(eval_report, base=base)

    eval_fields = _ContractEvalReportFields.model_validate(eval_report)
    suite_status = "pass" if eval_fields.status == "pass" else "needs_review"
    case_count = eval_fields.aggregate.item_count or corpus_fields.case_count
    report_issues = list(issues)
    if suite_status != "pass":
        report_issues.append(
            _issue(
                code="behavior_corpus_eval_needs_review",
                message="curator behavior corpus eval returned needs_review",
            )
        )
    return {
        "schema": AGENT_BEHAVIOR_CORPUS_REPORT_SCHEMA,
        "status": "needs_review" if report_issues else "pass",
        "suite_id": corpus_fields.suite_id,
        "agent": corpus_fields.agent,
        "aggregate": {
            "suite_count": 1,
            "case_count": case_count,
            "prompt_identity_hash": prompt_identity_hash,
            "issue_codes": [entry["code"] for entry in report_issues],
        },
        "suites": [
            {
                "suite_id": corpus_fields.suite_id,
                "agent": corpus_fields.agent,
                "evaluator": corpus_fields.evaluator,
                "status": suite_status,
                "plan_hash": curator_plan_hash(plan),
                "prompt_identity_hash": prompt_identity_hash,
                "eval": eval_report,
            }
        ],
        "issues": report_issues,
        "next_action": (
            "review agent behavior corpus failures before accepting prompt changes"
            if report_issues
            else ""
        ),
    }
|
|
931
|
+
|
|
932
|
+
|
|
933
|
+
class _CorpusAggregateFields(ContractModel):
    """Typed status/count lens for bank-level corpus aggregation."""

    # Keys other than the four below are ignored rather than rejected
    # (presumably pydantic's extra="ignore" semantics; confirm against ContractModel).
    model_config = ConfigDict(extra="ignore")

    # Report status; the aggregator compares it against "pass".
    status: str = ""
    # Raw aggregate mapping; the aggregator reads case_count,
    # prompt_identity_hash and issue_codes from it.
    aggregate: JsonObject = Field(default_factory=dict)
    # Issue objects carried over into the bank-level report.
    issues: list[JsonObject] = Field(default_factory=list)
    # Suite entries carried over into the bank-level report.
    suites: list[JsonObject] = Field(default_factory=list)
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
def _aggregate_corpus_reports(reports: list[dict[str, Any]]) -> dict[str, Any]:
    """Merge several single-corpus reports into one bank-level report.

    Case counts are summed; issue codes, issues and suites are concatenated in
    report order; the prompt identity hash is the first non-empty one seen.
    The merged status is ``"pass"`` only when every report passed. Note that
    an empty *reports* list therefore also yields ``"pass"``.
    """
    typed_reports = [_CorpusAggregateFields.model_validate(raw) for raw in reports]

    merged_codes: list[str] = []
    merged_issues: list[dict[str, str]] = []
    merged_suites: list[dict[str, Any]] = []
    total_cases = 0
    first_hash = ""
    for typed in typed_reports:
        summary = typed.aggregate
        total_cases += int(summary.get("case_count") or 0)
        if not first_hash:
            first_hash = str(summary.get("prompt_identity_hash") or "")
        merged_codes.extend(str(code) for code in summary.get("issue_codes", []) if str(code))
        merged_issues.extend(entry for entry in typed.issues if isinstance(entry, dict))
        merged_suites.extend(entry for entry in typed.suites if isinstance(entry, dict))

    any_failed = any(typed.status != "pass" for typed in typed_reports)
    status = "needs_review" if any_failed else "pass"
    if status == "pass":
        next_action = ""
    else:
        next_action = "review agent behavior corpus failures before accepting prompt changes"
    return {
        "schema": AGENT_BEHAVIOR_CORPUS_REPORT_SCHEMA,
        "status": status,
        "suite_id": "agent_behavior_corpus_bank",
        "agent": "multiple",
        "aggregate": {
            "suite_count": len(reports),
            "case_count": total_cases,
            "prompt_identity_hash": first_hash,
            "issue_codes": merged_codes,
        },
        "suites": merged_suites,
        "issues": merged_issues,
        "next_action": next_action,
    }
|
|
977
|
+
|
|
978
|
+
|
|
979
|
+
def evaluate_agent_behavior_corpus(corpus_path: Path) -> dict[str, Any]:
    """Evaluate the corpus file(s) found for *corpus_path*.

    A single corpus file yields its own report unchanged; any other number of
    files yields the bank-level aggregate of all individual reports.
    """
    reports = [
        _evaluate_single_agent_behavior_corpus(corpus_file)
        for corpus_file in _corpus_files(corpus_path)
    ]
    return reports[0] if len(reports) == 1 else _aggregate_corpus_reports(reports)
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
def _json_payload_files(input_path: Path) -> list[Path]:
|
|
988
|
+
if input_path.is_dir():
|
|
989
|
+
return sorted(path for path in input_path.rglob("*.json") if path.is_file())
|
|
990
|
+
return [input_path]
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
def _evidence_payload_files(input_path: Path) -> list[Path]:
|
|
994
|
+
if input_path.is_dir():
|
|
995
|
+
return sorted(
|
|
996
|
+
path
|
|
997
|
+
for path in input_path.rglob("*")
|
|
998
|
+
if path.is_file() and path.suffix.lower() in {".json", ".md", ".markdown", ".txt"}
|
|
999
|
+
)
|
|
1000
|
+
return [input_path]
|
|
1001
|
+
|
|
1002
|
+
|
|
1003
|
+
def _read_json_any(path: Path) -> Any:
|
|
1004
|
+
try:
|
|
1005
|
+
return json.loads(path.read_text(encoding="utf-8"))
|
|
1006
|
+
except FileNotFoundError as exc:
|
|
1007
|
+
raise ValidationError(f"telemetry input not found: {path}") from exc
|
|
1008
|
+
except json.JSONDecodeError as exc:
|
|
1009
|
+
raise ValidationError(f"telemetry input is invalid JSON: {path}: {exc}") from exc
|
|
1010
|
+
|
|
1011
|
+
|
|
1012
|
+
def _schema_app(payload: dict[str, Any]) -> str:
    """Return the part of the payload's schema id that precedes a known marker.

    The markers are ``.workflow-telemetry-envelope.`` and
    ``.workflow-run-record.``, tried in that order; ``""`` is returned when the
    schema id contains neither.
    """
    schema = _telemetry_payload_lens(payload).schema_id
    for marker in (".workflow-telemetry-envelope.", ".workflow-run-record."):
        prefix, found, _rest = schema.partition(marker)
        if found:
            return prefix
    return ""
|
|
1020
|
+
|
|
1021
|
+
|
|
1022
|
+
def _payload_app(payload: dict[str, Any]) -> str:
    """Return the first non-empty app name found for *payload*.

    Lookup order: the payload's own ``app``, its ``client.app``, then the app
    derived from its schema id. ``""`` when none is set.
    """
    fields = _telemetry_payload_lens(payload)
    client = fields.client
    candidates = (
        fields.app,
        client.app if client is not None else "",
        _schema_app(payload),
    )
    return next((name for name in candidates if name), "")
|
|
1029
|
+
|
|
1030
|
+
|
|
1031
|
+
def _telemetry_records(input_path: Path) -> list[tuple[dict[str, Any], dict[str, Any], Path]]:
    """Collect ``(record, envelope, source_path)`` triples from telemetry JSON.

    Three payload shapes are accepted per file:

    * a dict whose lens exposes a non-empty ``records`` list: each dict entry
      is a record and the whole payload is its envelope;
    * any other dict: the payload itself is the record, with an empty envelope;
    * a list: each dict entry is a record, with an empty envelope.

    Non-dict entries and other payload types are skipped.
    """
    records: list[tuple[dict[str, Any], dict[str, Any], Path]] = []
    for path in _json_payload_files(input_path):
        payload = _read_json_any(path)
        if isinstance(payload, dict):
            # The lens is only consulted for dict payloads. Calling it on a
            # list or scalar first would make the list branch below depend on
            # the lens tolerating non-dict input (not visible from here).
            envelope_records = _telemetry_payload_lens(payload).records
            if envelope_records:
                records.extend(
                    (record, payload, path)
                    for record in envelope_records
                    if isinstance(record, dict)
                )
            else:
                records.append((payload, {}, path))
        elif isinstance(payload, list):
            records.extend((record, {}, path) for record in payload if isinstance(record, dict))
    return records
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def _record_app(record: dict[str, Any], envelope: dict[str, Any]) -> str:
    """Return the app for a record: its own, else the envelope's, else the default."""
    from_record = _payload_app(record)
    from_envelope = _payload_app(envelope)
    return from_record or from_envelope or DEFAULT_TELEMETRY_APP
|
|
1058
|
+
|
|
1059
|
+
|
|
1060
|
+
def _record_app_version(record: dict[str, Any], envelope: dict[str, Any]) -> str:
    """Return the first non-empty app version known for a record.

    Lookup order: the record's ``app_version``, the record client's version,
    the envelope client's version, then the version reported by the record's
    ``environment_context.extension_integrity``. ``"unknown"`` when all are
    empty or absent.
    """
    record_fields = _telemetry_record_lens(record)
    envelope_fields = _telemetry_payload_lens(envelope)
    record_client = record_fields.client
    envelope_client = envelope_fields.client
    environment = record_fields.environment_context
    integrity = environment.extension_integrity if environment is not None else None
    candidates = (
        record_fields.app_version,
        record_client.app_version if record_client is not None else "",
        envelope_client.app_version if envelope_client is not None else "",
        integrity.app_version if integrity is not None else "",
    )
    return next((version for version in candidates if version), "unknown")
|
|
1079
|
+
|
|
1080
|
+
|
|
1081
|
+
def _list_strings(value: Any) -> list[str]:
|
|
1082
|
+
if isinstance(value, list):
|
|
1083
|
+
return [str(item) for item in value if str(item or "")]
|
|
1084
|
+
if str(value or ""):
|
|
1085
|
+
return [str(value)]
|
|
1086
|
+
return []
|
|
1087
|
+
|
|
1088
|
+
|
|
1089
|
+
def _script_risk_codes(record: dict[str, Any]) -> list[str]:
    """Return the distinct ``risk_codes`` of the record's generated scripts.

    Codes keep first-seen order. A missing or non-list ``generated_scripts``
    yields an empty list; non-dict script entries are skipped.
    """
    scripts = record.get("generated_scripts")
    if not isinstance(scripts, list):
        return []
    ordered = dict.fromkeys(
        code
        for script in scripts
        if isinstance(script, dict)
        for code in _list_strings(script.get("risk_codes"))
    )
    return list(ordered)
|
|
1101
|
+
|
|
1102
|
+
|
|
1103
|
+
def _agent_events(record: dict[str, Any]) -> list[_TelemetryAgentEventFields]:
    """Return the record's ``agent_events`` as typed event fields.

    A missing or non-list ``agent_events`` yields an empty list; non-dict
    entries are skipped.
    """
    raw_events = record.get("agent_events")
    if not isinstance(raw_events, list):
        return []
    return [
        _telemetry_agent_event_fields(JsonObjectAdapter.validate_python(raw))
        for raw in raw_events
        if isinstance(raw, dict)
    ]
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
def _signals_for_record(record: dict[str, Any]) -> list[str]:
    """Collect the distinct behavior signals a record carries, in discovery order.

    Sources, in order: the agent behavior context codes, the diagnostic root
    cause (only when it starts with ``agent.`` or has a default severity), the
    agent event codes, and two signals derived from generated-script risk
    codes.
    """
    diagnostic = record.get("diagnostic_context")
    if not isinstance(diagnostic, dict):
        diagnostic = {}
    behavior = diagnostic.get("agent_behavior_context")
    if not isinstance(behavior, dict):
        behavior = {}

    signals: list[str] = []

    def _remember(code: str) -> None:
        # Keep first-seen order while dropping duplicates.
        if code not in signals:
            signals.append(code)

    for code in _list_strings(behavior.get("codes")):
        _remember(code)

    root = str(diagnostic.get("root_cause_code") or "")
    if root and (root.startswith("agent.") or root in DEFAULT_SIGNAL_SEVERITY):
        _remember(root)

    for event in _agent_events(record):
        if event.code:
            _remember(event.code)

    risk_codes = set(_script_risk_codes(record))
    if risk_codes & RISK_CODES_THAT_CREATE_DRAFTS:
        _remember("agent.generated_script_workaround")
    if "extension_prompt_or_script_drift" in risk_codes:
        _remember("extension_prompt_or_script_drift")
    return signals
|
|
1138
|
+
|
|
1139
|
+
|
|
1140
|
+
def _severity_for_signal(record: dict[str, Any], signal: str) -> str:
    """Return the highest-ranked severity known for *signal* in *record*.

    Candidates are the lower-cased severities of agent events whose ``code``
    or ``type`` equals *signal*, plus the signal's default severity (``"low"``
    when it has none). Severities missing from ``SEVERITY_RANK`` rank as 0.
    """
    reported = (
        event.severity.lower()
        for event in _agent_events(record)
        if event.code == signal or event.type == signal
    )
    candidates = [severity for severity in reported if severity]
    candidates.append(DEFAULT_SIGNAL_SEVERITY.get(signal, "low"))

    def _rank(severity: str) -> int:
        return SEVERITY_RANK.get(severity, 0)

    return max(candidates, key=_rank)
|
|
1149
|
+
|
|
1150
|
+
|
|
1151
|
+
def _passes_min_severity(record: dict[str, Any], signal: str, min_severity: str) -> bool:
    """Tell whether *signal*'s severity in *record* reaches *min_severity*.

    An unranked signal severity counts as 0; an unranked *min_severity* is
    treated as rank 2.
    """
    actual_rank = SEVERITY_RANK.get(_severity_for_signal(record, signal), 0)
    required_rank = SEVERITY_RANK.get(min_severity, 2)
    return actual_rank >= required_rank
|
|
1153
|
+
|
|
1154
|
+
|
|
1155
|
+
def _clean_text(value: Any, *, max_chars: int = 320) -> str:
|
|
1156
|
+
text = str(value or "").replace("\r", " ").replace("\n", " ").strip()
|
|
1157
|
+
text = re.sub(
|
|
1158
|
+
r"(?i)(token|auth[_-]?token|api[_-]?key|secret|authorization|bearer)\s*[:=]\s*['\"]?[^'\"\s]+",
|
|
1159
|
+
r"\1=<redacted>",
|
|
1160
|
+
text,
|
|
1161
|
+
)
|
|
1162
|
+
text = re.sub(r"(?i)(RESEND_API_KEY|INGEST_TOKEN|OPENAI_API_KEY|ANTHROPIC_API_KEY)[^,\s]*", r"\1=<redacted>", text)
|
|
1163
|
+
return text[:max_chars]
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
def _event_sample(event: _TelemetryAgentEventFields) -> dict[str, str]:
    """Return a cleaned copy of the event's non-empty whitelisted fields."""
    sample: dict[str, str] = {}
    for key in (
        "code",
        "type",
        "severity",
        "phase",
        "expected_phase",
        "next_action_expected",
        "recovery_command",
        "command_family",
        "path",
    ):
        raw = getattr(event, key)
        if raw:
            sample[key] = _clean_text(raw)
    return sample
|
|
1179
|
+
|
|
1180
|
+
|
|
1181
|
+
def _redacted_evidence(record: dict[str, Any], envelope: dict[str, Any], *, signal: str, source_path: Path) -> dict[str, Any]:
    """Build the redacted evidence entry for one signal of one record.

    Free-text fields go through ``_clean_text``; at most three agent events
    are sampled. Entries whose value is ``""``, ``[]`` or ``{}`` are left out.
    """
    diagnostic = record.get("diagnostic_context")
    if not isinstance(diagnostic, dict):
        diagnostic = {}
    behavior = diagnostic.get("agent_behavior_context")
    if not isinstance(behavior, dict):
        behavior = {}

    fields = {
        "source_path": _serialized_evidence_source_path(source_path),
        "run_id": _clean_text(record.get("run_id")),
        "workflow": _clean_text(record.get("workflow")),
        "status": _clean_text(record.get("status")),
        "phase": _clean_text(record.get("phase")),
        "blocked_reason": _clean_text(record.get("blocked_reason")),
        "next_action": _clean_text(record.get("next_action")),
        "root_cause_code": _clean_text(diagnostic.get("root_cause_code")),
        "recovery_command": _clean_text(diagnostic.get("recovery_command")),
        "agent_behavior_codes": _list_strings(behavior.get("codes")),
        "risk_codes": _script_risk_codes(record),
        "event_samples": [_event_sample(event) for event in _agent_events(record)[:3]],
        "payload_level": _clean_text(envelope.get("payload_level")),
        "signal": signal,
    }
    return {name: content for name, content in fields.items() if content not in ("", [], {})}
|
|
1205
|
+
|
|
1206
|
+
|
|
1207
|
+
def _workflow_key(workflow: str) -> str:
    """Reduce a workflow/command string to its short lookup key.

    * blank input gives ``""``;
    * an exact ``COMMAND_PROMPT_SOURCES`` key gives its last ``:`` segment
      without leading slashes;
    * ``/mednotes:<name> ...`` gives ``<name>``, or ``""`` when nothing
      follows the colon;
    * anything starting with ``/flashcards`` gives ``"flashcards"``;
    * otherwise the first token, with ``/mednotes:`` and all ``/`` removed.
    """
    value = workflow.strip()
    if not value:
        return ""
    if value in COMMAND_PROMPT_SOURCES:
        return value.rsplit(":", 1)[-1].lstrip("/")
    if value.startswith("/mednotes:"):
        # A bare "/mednotes:" has no name after the colon; indexing the empty
        # token list used to raise IndexError here.
        tokens = value.split(":", 1)[1].split()
        return tokens[0] if tokens else ""
    if value.startswith("/flashcards"):
        return "flashcards"
    return value.split()[0].replace("/mednotes:", "").replace("/", "")
|
|
1218
|
+
|
|
1219
|
+
|
|
1220
|
+
def _command_source_for_workflow(workflow: str) -> str:
    """Return the command prompt source for the workflow's first token, or ``""``."""
    tokens = workflow.split()
    command = tokens[0] if tokens else ""
    return COMMAND_PROMPT_SOURCES.get(command, "")
|
|
1223
|
+
|
|
1224
|
+
|
|
1225
|
+
def _suggested_prompt_sources(workflow: str) -> list[str]:
    """List the prompt sources linked to *workflow*: command source first, then skill source.

    Empty sources are omitted and the skill source is not repeated when it
    equals the command source.
    """
    command_source = _command_source_for_workflow(workflow)
    sources = [command_source] if command_source else []
    skill_source = WORKFLOW_SKILL_PROMPT_SOURCES.get(_workflow_key(workflow))
    if skill_source and skill_source not in sources:
        sources.append(skill_source)
    return sources
|
|
1234
|
+
|
|
1235
|
+
|
|
1236
|
+
def _prompt_snippet(relative_path: str, *, signal: str) -> str:
    """Return a short, cleaned excerpt of a prompt file relevant to *signal*.

    The file is resolved under the extension root. The excerpt is up to three
    lines starting one line before the first line containing a keyword
    (case-insensitive); when no line matches, it starts at the top of the
    file. Blank lines are dropped and the result is capped at 420 characters.

    Returns:
        The excerpt, or ``""`` when the file cannot be read or decoded.
    """
    path = _extension_root() / relative_path
    try:
        lines = path.read_text(encoding="utf-8").splitlines()
    except (OSError, UnicodeDecodeError):
        # Best-effort lookup: a missing, unreadable or non-UTF-8 file should
        # not abort the surrounding report. UnicodeDecodeError is a ValueError,
        # so catching OSError alone let it escape.
        return ""
    keywords = ["next_action", "blocked", "bloque", "script", "comando", "workflow"]
    if "tool" in signal or "command" in signal:
        keywords.extend(["exit code", "shell", "terminal"])
    if "script" in signal:
        keywords.extend(["workaround", "oficial", "manual"])
    selected = 0
    for index, line in enumerate(lines):
        lowered = line.casefold()
        if any(keyword in lowered for keyword in keywords):
            # Include one line of leading context when there is one.
            selected = max(0, index - 1)
            break
    snippet = " ".join(line.strip() for line in lines[selected : selected + 3] if line.strip())
    return _clean_text(snippet, max_chars=420)
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
def _surface_items(value: Any, *, kind: str) -> list[dict[str, str]]:
    """Sanitize a list of suspect-surface entries.

    Only whitelisted keys are kept (``path``/``snippet``/``reason`` for
    ``kind == "prompt"``, otherwise ``path``/``function_or_command``/
    ``reason``), each cleaned and capped at 420 characters. Entries that are
    not dicts or end up without a ``path`` are dropped; a non-list *value*
    yields an empty list.
    """
    if not isinstance(value, list):
        return []
    if kind == "prompt":
        allowed = ("path", "snippet", "reason")
    else:
        allowed = ("path", "function_or_command", "reason")

    items: list[dict[str, str]] = []
    for entry in value:
        if not isinstance(entry, dict):
            continue
        clean: dict[str, str] = {}
        for key in allowed:
            text = _clean_text(entry.get(key), max_chars=420)
            if text:
                clean[key] = text
        if clean.get("path"):
            items.append(clean)
    return items
|
|
1273
|
+
|
|
1274
|
+
|
|
1275
|
+
def _suspect_prompts_from_sources(prompt_sources: list[str], *, signal: str) -> list[dict[str, str]]:
    """Describe each prompt source as a suspect surface for *signal*.

    Sources that clean to an empty path are skipped. When no snippet can be
    extracted, a fixed placeholder text is used instead.
    """
    reason = f"Fonte de prompt vinculada ao workflow/sinal {signal}; revisar se deveria prevenir o desvio."
    suspects: list[dict[str, str]] = []
    for source in prompt_sources:
        cleaned_path = _clean_text(source)
        if not cleaned_path:
            continue
        excerpt = _prompt_snippet(cleaned_path, signal=signal)
        if not excerpt:
            excerpt = "Trecho não disponível no bundle local."
        suspects.append({"path": cleaned_path, "snippet": excerpt, "reason": reason})
    return suspects
|
|
1290
|
+
|
|
1291
|
+
|
|
1292
|
+
def _suspect_scripts_from_record(record: dict[str, Any]) -> list[dict[str, str]]:
    """List generated scripts and command events of a record as suspect surfaces.

    Generated scripts come first (skipped when they have no path), followed by
    command events (skipped when they have neither command nor command
    family). Non-list containers and non-dict entries are ignored.
    """
    suspects: list[dict[str, str]] = []

    generated = record.get("generated_scripts")
    for raw in generated if isinstance(generated, list) else []:
        if not isinstance(raw, dict):
            continue
        script = _GeneratedScriptEvidenceLens.model_validate(raw)
        script_path = _clean_text(script.path)
        if not script_path:
            continue
        # At most five risk codes are quoted in the reason.
        risks = ", ".join(_list_strings(script.risk_codes)[:5])
        if risks:
            reason = f"Script capturado na evidência; risk_codes={risks}"
        else:
            reason = "Script capturado na evidência."
        suspects.append(
            {
                "path": script_path,
                "function_or_command": _clean_text(script.function_or_command or "generated_script"),
                "reason": reason,
            }
        )

    commands = record.get("command_events")
    for raw in commands if isinstance(commands, list) else []:
        if not isinstance(raw, dict):
            continue
        event = _CommandEventEvidenceLens.model_validate(raw)
        command = _clean_text(event.command or event.command_family, max_chars=260)
        if not command:
            continue
        suspects.append(
            {
                "path": _clean_text(event.path or "terminal"),
                "function_or_command": command,
                "reason": _clean_text(event.status or "command_event"),
            }
        )
    return suspects
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
def _prevention_owner_note(*, prompts: list[dict[str, str]], scripts: list[dict[str, str]]) -> str:
|
|
1331
|
+
if prompts or scripts:
|
|
1332
|
+
return "Superfícies suspeitas listadas para revisão; isso não prova culpa sem reprodução."
|
|
1333
|
+
return "Nenhum prompt ou script encarregado de prevenir este comportamento foi identificado na evidência redigida."
|
|
1334
|
+
|
|
1335
|
+
|
|
1336
|
+
def _target_suite(record: dict[str, Any]) -> str:
    """Pick the behavior suite id a record should be filed under.

    Records whose workflow maps to a command prompt source go to the commands
    suite; otherwise a named agent gets its own suite (dashes become
    underscores); everything else goes to the skills suite.
    """
    fields = _telemetry_record_lens(record)
    if _command_source_for_workflow(fields.workflow):
        return "extension_commands.core_behavior.v1"
    if not fields.agent:
        return "extension_skills.core_behavior.v1"
    suite_prefix = fields.agent.replace("-", "_")
    return f"{suite_prefix}.core_behavior.v1"
|
|
1346
|
+
|
|
1347
|
+
|
|
1348
|
+
def _suggested_assertions(signal: str) -> list[dict[str, Any]]:
|
|
1349
|
+
shared_block = [
|
|
1350
|
+
{"op": "path_equals", "path": "status", "value": "blocked"},
|
|
1351
|
+
{"op": "path_present", "path": "next_action"},
|
|
1352
|
+
]
|
|
1353
|
+
mapping: dict[str, list[dict[str, Any]]] = {
|
|
1354
|
+
"agent.retry_loop": shared_block
|
|
1355
|
+
+ [
|
|
1356
|
+
{"op": "path_present", "path": "diagnostic_context.agent_behavior_context.codes"},
|
|
1357
|
+
{"op": "path_present", "path": "error_context.retry_scope"},
|
|
1358
|
+
],
|
|
1359
|
+
"agent.retry_without_input_change": shared_block
|
|
1360
|
+
+ [
|
|
1361
|
+
{"op": "path_equals", "path": "blocked_reason", "value": "retry_without_input_change"},
|
|
1362
|
+
{"op": "path_present", "path": "error_context.input_hash"},
|
|
1363
|
+
],
|
|
1364
|
+
"agent.ignored_next_action": shared_block
|
|
1365
|
+
+ [
|
|
1366
|
+
{"op": "path_present", "path": "next_action_expected"},
|
|
1367
|
+
{"op": "path_equals", "path": "followed_next_action", "value": True},
|
|
1368
|
+
],
|
|
1369
|
+
"agent.wrong_phase": shared_block
|
|
1370
|
+
+ [
|
|
1371
|
+
{"op": "path_present", "path": "expected_phase"},
|
|
1372
|
+
{"op": "path_equals", "path": "mutated", "value": False},
|
|
1373
|
+
],
|
|
1374
|
+
"agent.generated_script_workaround": shared_block
|
|
1375
|
+
+ [
|
|
1376
|
+
{"op": "path_equals", "path": "used_official_recovery_command", "value": True},
|
|
1377
|
+
{"op": "path_equals", "path": "unsafe_workaround_created", "value": False},
|
|
1378
|
+
],
|
|
1379
|
+
"agent.unsafe_generated_script_recovery_bypass": shared_block
|
|
1380
|
+
+ [
|
|
1381
|
+
{"op": "path_equals", "path": "used_official_recovery_command", "value": True},
|
|
1382
|
+
{"op": "path_equals", "path": "unsafe_workaround_created", "value": False},
|
|
1383
|
+
],
|
|
1384
|
+
"agent.missing_error_context": shared_block
|
|
1385
|
+
+ [
|
|
1386
|
+
{"op": "path_present", "path": "error_context.cause"},
|
|
1387
|
+
{"op": "path_present", "path": "error_context.retry_scope"},
|
|
1388
|
+
],
|
|
1389
|
+
"agent.script_or_prompt_drift": shared_block
|
|
1390
|
+
+ [
|
|
1391
|
+
{"op": "path_equals", "path": "drift_classified", "value": True},
|
|
1392
|
+
{"op": "path_present", "path": "recovery_command"},
|
|
1393
|
+
],
|
|
1394
|
+
"extension_prompt_or_script_drift": shared_block
|
|
1395
|
+
+ [
|
|
1396
|
+
{"op": "path_equals", "path": "drift_classified", "value": True},
|
|
1397
|
+
{"op": "path_present", "path": "recovery_command"},
|
|
1398
|
+
],
|
|
1399
|
+
"resource.version_control_policy_bypassed": [
|
|
1400
|
+
{"op": "path_equals", "path": "status", "value": "blocked"},
|
|
1401
|
+
{"op": "path_present", "path": "version_control_safety"},
|
|
1402
|
+
{"op": "path_equals", "path": "version_control_safety.mutation_without_guard", "value": False},
|
|
1403
|
+
{"op": "path_equals", "path": "version_control_safety.run_start_seen", "value": True},
|
|
1404
|
+
{"op": "path_equals", "path": "version_control_safety.run_finish_seen", "value": True},
|
|
1405
|
+
],
|
|
1406
|
+
"resource.guard_missing": [
|
|
1407
|
+
{"op": "path_equals", "path": "status", "value": "blocked"},
|
|
1408
|
+
{"op": "path_equals", "path": "blocked_reason", "value": "vault_guard_required"},
|
|
1409
|
+
{"op": "path_equals", "path": "version_control_safety.mutation_without_guard", "value": False},
|
|
1410
|
+
{"op": "path_present", "path": "recovery_command"},
|
|
1411
|
+
],
|
|
1412
|
+
"resource.run_finish_missing": [
|
|
1413
|
+
{"op": "path_equals", "path": "status", "value": "blocked"},
|
|
1414
|
+
{"op": "path_equals", "path": "version_control_safety.run_start_seen", "value": True},
|
|
1415
|
+
{"op": "path_equals", "path": "version_control_safety.run_finish_seen", "value": True},
|
|
1416
|
+
{"op": "path_present", "path": "version_control_safety.restore_point_after"},
|
|
1417
|
+
],
|
|
1418
|
+
"resource.restore_point_after_mutation": [
|
|
1419
|
+
{"op": "path_equals", "path": "status", "value": "blocked"},
|
|
1420
|
+
{"op": "path_equals", "path": "version_control_safety.restore_point_before", "value": True},
|
|
1421
|
+
{"op": "path_equals", "path": "version_control_safety.restore_point_after", "value": True},
|
|
1422
|
+
],
|
|
1423
|
+
"resource.direct_mutation_attempt": [
|
|
1424
|
+
{"op": "path_equals", "path": "status", "value": "blocked"},
|
|
1425
|
+
{"op": "path_equals", "path": "blocked_reason", "value": "direct_mutation_forbidden"},
|
|
1426
|
+
{"op": "path_equals", "path": "version_control_safety.direct_mutation_forbidden", "value": True},
|
|
1427
|
+
{"op": "path_present", "path": "recovery_command"},
|
|
1428
|
+
],
|
|
1429
|
+
"agent.dry_run_without_apply": [
|
|
1430
|
+
{"op": "path_in", "path": "status", "value": ["ready_to_apply", "blocked", "discarded"]},
|
|
1431
|
+
{"op": "path_present", "path": "next_action"},
|
|
1432
|
+
{"op": "path_equals", "path": "dry_run_called_completed", "value": False},
|
|
1433
|
+
],
|
|
1434
|
+
"dry_run_without_apply": [
|
|
1435
|
+
{"op": "path_in", "path": "status", "value": ["ready_to_apply", "blocked", "discarded"]},
|
|
1436
|
+
{"op": "path_present", "path": "next_action"},
|
|
1437
|
+
{"op": "path_equals", "path": "dry_run_called_completed", "value": False},
|
|
1438
|
+
],
|
|
1439
|
+
}
|
|
1440
|
+
return mapping.get(
|
|
1441
|
+
signal,
|
|
1442
|
+
shared_block
|
|
1443
|
+
+ [
|
|
1444
|
+
{"op": "path_present", "path": "diagnostic_context.root_cause_code"},
|
|
1445
|
+
{"op": "path_present", "path": "error_context.next_action"},
|
|
1446
|
+
],
|
|
1447
|
+
)
|
|
1448
|
+
|
|
1449
|
+
|
|
1450
|
+
def _promotion_checklist(signal: str) -> list[str]:
|
|
1451
|
+
return [
|
|
1452
|
+
"Confirmar que a evidência está redigida e não contém conteúdo clínico bruto, HTML, tokens ou chaves.",
|
|
1453
|
+
"Escolher a suite final e criar output fixture que reproduza o comportamento corrigido.",
|
|
1454
|
+
"Manter ao menos duas assertions fortes e promover baseline somente após o corpus passar.",
|
|
1455
|
+
f"Verificar que o caso falharia antes da correção do prompt para {signal}.",
|
|
1456
|
+
]
|
|
1457
|
+
|
|
1458
|
+
|
|
1459
|
+
def _slug(value: str) -> str:
|
|
1460
|
+
slug = re.sub(r"[^a-zA-Z0-9]+", "-", value.lower()).strip("-")
|
|
1461
|
+
return slug[:80] or "telemetry"
|
|
1462
|
+
|
|
1463
|
+
|
|
1464
|
+
def _draft_date(record: dict[str, Any], envelope: dict[str, Any]) -> str:
    """Return the ``YYYY-MM-DD`` prefix used when naming a draft file.

    The record's ``recorded_at`` is tried first, then the envelope's
    ``generated_at`` (only when it is a string). The first value that starts
    with a ``dddd-dd-dd`` pattern wins and its first ten characters are
    returned; the pattern is not checked for being a real calendar date.
    When neither matches, today's UTC date is used.
    """
    recorded_at = _telemetry_record_lens(record).recorded_at
    generated_at = envelope.get("generated_at")
    if not isinstance(generated_at, str):
        generated_at = ""
    starts_with_iso_day = re.compile(r"\d{4}-\d{2}-\d{2}").match
    for stamp in (recorded_at, generated_at):
        if starts_with_iso_day(stamp):
            return stamp[:10]
    return datetime.now(UTC).date().isoformat()
|
|
1471
|
+
|
|
1472
|
+
|
|
1473
|
+
def _unique_draft_output_path(output_dir: Path, stem: str, reserved: set[Path]) -> Path:
|
|
1474
|
+
output_path = output_dir / f"{stem}.json"
|
|
1475
|
+
suffix = 2
|
|
1476
|
+
while output_path.exists() or output_path in reserved:
|
|
1477
|
+
output_path = output_dir / f"{stem}-{suffix}.json"
|
|
1478
|
+
suffix += 1
|
|
1479
|
+
reserved.add(output_path)
|
|
1480
|
+
return output_path
|
|
1481
|
+
|
|
1482
|
+
|
|
1483
|
+
def _draft_for_signal(
    *,
    record: dict[str, Any],
    envelope: dict[str, Any],
    signal: str,
    source_path: Path,
) -> dict[str, Any]:
    """Assemble one draft behavior case for ``signal`` from a telemetry record.

    The draft always has status ``"draft"`` and source ``"telemetry"``. Suspect
    prompts are derived from the prompt sources suggested for the record's
    workflow; suspect scripts come from the record itself.
    """
    lens = _telemetry_record_lens(record)
    workflow = lens.workflow
    suggested_sources = _suggested_prompt_sources(workflow)
    prompts_under_review = _suspect_prompts_from_sources(suggested_sources, signal=signal)
    scripts_under_review = _suspect_scripts_from_record(record)
    draft: dict[str, Any] = {
        "schema": AGENT_BEHAVIOR_CASE_DRAFT_SCHEMA,
        "status": "draft",
        "source": "telemetry",
        "app": _record_app(record, envelope),
        "app_version": _record_app_version(record, envelope),
        "workflow": _clean_text(workflow),
        "phase": _clean_text(lens.phase),
        "signal": signal,
        "severity": _severity_for_signal(record, signal),
        "target_suite": _target_suite(record),
        "prompt_sources_suggested": suggested_sources,
        "suspect_prompts": prompts_under_review,
        "suspect_scripts": scripts_under_review,
        "prevention_owner_note": _prevention_owner_note(
            prompts=prompts_under_review,
            scripts=scripts_under_review,
        ),
        "redacted_evidence": _redacted_evidence(record, envelope, signal=signal, source_path=source_path),
        "suggested_assertions": _suggested_assertions(signal),
        "promotion_checklist": _promotion_checklist(signal),
    }
    return draft
|
|
1514
|
+
|
|
1515
|
+
|
|
1516
|
+
def suggest_agent_behavior_cases_from_telemetry(
    input_path: Path,
    *,
    output_dir: Path,
    app: str = DEFAULT_TELEMETRY_APP,
    app_version: str | None = None,
    min_severity: str = "medium",
) -> dict[str, Any]:
    """Create reviewable behavior-corpus draft cases from redacted telemetry JSON.

    Args:
        input_path: Location handed to ``_telemetry_records``.
        output_dir: Directory that receives one JSON file per draft; created
            only when at least one draft is produced.
        app: Records whose app differs are skipped.
        app_version: When truthy, records with a different version are skipped.
        min_severity: Threshold forwarded to ``_passes_min_severity``.

    Returns:
        A report dict with ``status`` (``"drafts_created"`` or ``"no_drafts"``),
        the filters used, aggregate counts, one summary per written draft and a
        ``next_action`` hint that is empty when nothing was written.
    """
    planned: list[dict[str, Any]] = []
    claimed_paths: set[Path] = set()
    skipped_records = 0
    for record, envelope, source_path in _telemetry_records(input_path):
        # A record is counted as skipped once, whichever filter rejects it.
        if _record_app(record, envelope) != app or (
            app_version and _record_app_version(record, envelope) != app_version
        ):
            skipped_records += 1
            continue
        qualifying = [
            found
            for found in _signals_for_record(record)
            if _passes_min_severity(record, found, min_severity)
        ]
        if not qualifying:
            skipped_records += 1
            continue
        for signal in qualifying:
            draft = _draft_for_signal(record=record, envelope=envelope, signal=signal, source_path=source_path)
            date_prefix = _draft_date(record, envelope)
            workflow_slug = _slug(_telemetry_record_lens(record).workflow or "workflow")
            stem = "-".join((date_prefix, _slug(signal), workflow_slug))
            target = _unique_draft_output_path(output_dir, stem, claimed_paths)
            planned.append({"path": str(target), "draft": draft})

    # All files are written after planning so paths are reserved consistently.
    if planned:
        output_dir.mkdir(parents=True, exist_ok=True)
        for entry in planned:
            serialized = json.dumps(entry["draft"], ensure_ascii=False, indent=2) + "\n"
            Path(entry["path"]).write_text(serialized, encoding="utf-8")

    summaries = []
    for entry in planned:
        written = entry["draft"]
        summaries.append(
            {
                "path": entry["path"],
                "signal": written["signal"],
                "target_suite": written["target_suite"],
                "app_version": written["app_version"],
            }
        )
    if planned:
        status = "drafts_created"
        next_action = "review drafts, promote selected cases into a corpus suite, then rerun eval-agent-behavior-corpus"
    else:
        status = "no_drafts"
        next_action = ""
    return {
        "schema": AGENT_BEHAVIOR_CASE_DRAFT_REPORT_SCHEMA,
        "status": status,
        "app": app,
        "app_version": app_version or "",
        "min_severity": min_severity,
        "aggregate": {
            "draft_count": len(planned),
            "skipped_record_count": skipped_records,
        },
        "drafts": summaries,
        "next_action": next_action,
    }
|
|
1579
|
+
|
|
1580
|
+
|
|
1581
|
+
def _looks_like_telemetry_payload(payload: Any) -> bool:
|
|
1582
|
+
if not isinstance(payload, dict):
|
|
1583
|
+
return False
|
|
1584
|
+
fields = _telemetry_payload_lens(payload)
|
|
1585
|
+
schema = fields.schema_id
|
|
1586
|
+
return ".workflow-telemetry-envelope." in schema or ".workflow-run-record." in schema or bool(fields.records)
|
|
1587
|
+
|
|
1588
|
+
|
|
1589
|
+
def _json_blocks_from_markdown(text: str) -> list[Any]:
|
|
1590
|
+
payloads: list[Any] = []
|
|
1591
|
+
stripped = text.strip()
|
|
1592
|
+
if stripped.startswith(("{", "[")):
|
|
1593
|
+
try:
|
|
1594
|
+
payloads.append(json.loads(stripped))
|
|
1595
|
+
except json.JSONDecodeError:
|
|
1596
|
+
pass
|
|
1597
|
+
for match in re.finditer(r"```(?:json)?\s*(.*?)```", text, flags=re.S | re.I):
|
|
1598
|
+
block = match.group(1).strip()
|
|
1599
|
+
if not block:
|
|
1600
|
+
continue
|
|
1601
|
+
try:
|
|
1602
|
+
payloads.append(json.loads(block))
|
|
1603
|
+
except json.JSONDecodeError:
|
|
1604
|
+
continue
|
|
1605
|
+
return payloads
|
|
1606
|
+
|
|
1607
|
+
|
|
1608
|
+
def _candidate_payloads(payload: Any) -> list[JsonObject]:
|
|
1609
|
+
candidates: list[JsonObject] = []
|
|
1610
|
+
if isinstance(payload, dict):
|
|
1611
|
+
fields = _BehaviorCandidatePayloadLens.model_validate(payload)
|
|
1612
|
+
candidates.extend(JsonObjectAdapter.validate_python(item) for item in fields.behavior_case_candidates)
|
|
1613
|
+
for item in fields.first_pass_prevention_candidates:
|
|
1614
|
+
enriched = dict(item)
|
|
1615
|
+
enriched.setdefault("case_kind", "first_pass_prevention")
|
|
1616
|
+
candidates.append(JsonObjectAdapter.validate_python(enriched))
|
|
1617
|
+
for message in fields.messages:
|
|
1618
|
+
message_fields = _BehaviorCandidateMessageLens.model_validate(message)
|
|
1619
|
+
for item in message_fields.behavior_case_candidates:
|
|
1620
|
+
enriched = dict(item)
|
|
1621
|
+
enriched.setdefault("source_message_id", message_fields.id)
|
|
1622
|
+
enriched.setdefault("source_kind", message_fields.source_kind)
|
|
1623
|
+
candidates.append(JsonObjectAdapter.validate_python(enriched))
|
|
1624
|
+
for item in message_fields.first_pass_prevention_candidates:
|
|
1625
|
+
enriched = dict(item)
|
|
1626
|
+
enriched.setdefault("case_kind", "first_pass_prevention")
|
|
1627
|
+
enriched.setdefault("source_message_id", message_fields.id)
|
|
1628
|
+
enriched.setdefault("source_kind", message_fields.source_kind)
|
|
1629
|
+
candidates.append(JsonObjectAdapter.validate_python(enriched))
|
|
1630
|
+
elif isinstance(payload, list):
|
|
1631
|
+
candidates.extend(JsonObjectAdapter.validate_python(item) for item in payload if isinstance(item, dict) and item.get("signal"))
|
|
1632
|
+
return candidates
|
|
1633
|
+
|
|
1634
|
+
|
|
1635
|
+
def _sanitize_evidence(value: Any) -> Any:
|
|
1636
|
+
if isinstance(value, dict):
|
|
1637
|
+
sanitized: dict[str, Any] = {}
|
|
1638
|
+
for key, item in value.items():
|
|
1639
|
+
lower = str(key).lower()
|
|
1640
|
+
if any(token in lower for token in ("content", "body", "html", "markdown", "raw", "token", "secret", "api_key", "script")):
|
|
1641
|
+
continue
|
|
1642
|
+
sanitized[str(key)] = _sanitize_evidence(item)
|
|
1643
|
+
return {key: item for key, item in sanitized.items() if item not in ("", [], {})}
|
|
1644
|
+
if isinstance(value, list):
|
|
1645
|
+
return [_sanitize_evidence(item) for item in value if _sanitize_evidence(item) not in ("", [], {})]
|
|
1646
|
+
if isinstance(value, str):
|
|
1647
|
+
return _clean_text(value, max_chars=700)
|
|
1648
|
+
return value
|
|
1649
|
+
|
|
1650
|
+
|
|
1651
|
+
def _candidate_text_list(candidate: dict[str, Any], key: str) -> list[str]:
|
|
1652
|
+
value = candidate.get(key)
|
|
1653
|
+
if isinstance(value, list):
|
|
1654
|
+
return [_clean_text(item) for item in value if _clean_text(item)]
|
|
1655
|
+
if isinstance(value, str) and value.strip():
|
|
1656
|
+
return [_clean_text(value)]
|
|
1657
|
+
return []
|
|
1658
|
+
|
|
1659
|
+
|
|
1660
|
+
def _candidate_count_map(candidate: JsonObject, key: str) -> dict[str, int]:
|
|
1661
|
+
value = candidate.get(key)
|
|
1662
|
+
if not isinstance(value, dict):
|
|
1663
|
+
return {}
|
|
1664
|
+
counts: dict[str, int] = {}
|
|
1665
|
+
for raw_key, raw_count in value.items():
|
|
1666
|
+
name = _clean_text(raw_key)
|
|
1667
|
+
if not name:
|
|
1668
|
+
continue
|
|
1669
|
+
try:
|
|
1670
|
+
counts[name] = int(raw_count)
|
|
1671
|
+
except (TypeError, ValueError):
|
|
1672
|
+
counts[name] = 1
|
|
1673
|
+
return counts
|
|
1674
|
+
|
|
1675
|
+
|
|
1676
|
+
def _candidate_signal(candidate: dict[str, Any]) -> str:
|
|
1677
|
+
signal = str(candidate.get("signal") or candidate.get("root_cause") or candidate.get("root_cause_code") or "")
|
|
1678
|
+
if signal:
|
|
1679
|
+
return signal
|
|
1680
|
+
evidence = json.dumps(candidate, ensure_ascii=False).lower()
|
|
1681
|
+
if "retry loop" in evidence or ("loop" in evidence and "retry" in evidence):
|
|
1682
|
+
return "agent.retry_loop"
|
|
1683
|
+
if "ignored next_action" in evidence or "ignorou next_action" in evidence:
|
|
1684
|
+
return "agent.ignored_next_action"
|
|
1685
|
+
if "wrong phase" in evidence or "fase errada" in evidence:
|
|
1686
|
+
return "agent.wrong_phase"
|
|
1687
|
+
if "generated script" in evidence or "script gerado" in evidence:
|
|
1688
|
+
return "agent.generated_script_workaround"
|
|
1689
|
+
if "missing error_context" in evidence or "sem error_context" in evidence:
|
|
1690
|
+
return "agent.missing_error_context"
|
|
1691
|
+
if "dry-run" in evidence and "apply" in evidence:
|
|
1692
|
+
return "dry_run_without_apply"
|
|
1693
|
+
return "agent.workflow_blocked"
|
|
1694
|
+
|
|
1695
|
+
|
|
1696
|
+
def _candidate_workflow(candidate: JsonObject) -> str:
|
|
1697
|
+
raw_workflow = candidate.get("workflow")
|
|
1698
|
+
workflow = raw_workflow.strip() if isinstance(raw_workflow, str) else ""
|
|
1699
|
+
if workflow:
|
|
1700
|
+
return workflow
|
|
1701
|
+
text = json.dumps(candidate, ensure_ascii=False)
|
|
1702
|
+
match = re.search(r"/(?:mednotes:[a-z0-9_-]+|flashcards)", text, flags=re.I)
|
|
1703
|
+
return match.group(0) if match else ""
|
|
1704
|
+
|
|
1705
|
+
|
|
1706
|
+
def _candidate_app_version(candidate: dict[str, Any]) -> str:
|
|
1707
|
+
for key in ("app_version", "version"):
|
|
1708
|
+
if str(candidate.get(key) or ""):
|
|
1709
|
+
return str(candidate[key])
|
|
1710
|
+
text = json.dumps(candidate, ensure_ascii=False)
|
|
1711
|
+
match = re.search(r"(?:app[_ ]version|vers[aã]o)\s*[:=` ]+\s*([0-9]+(?:\.[0-9]+){1,3})", text, flags=re.I)
|
|
1712
|
+
return match.group(1) if match else "unknown"
|
|
1713
|
+
|
|
1714
|
+
|
|
1715
|
+
def _draft_from_candidate(
    candidate: dict[str, Any],
    *,
    source_path: Path,
    confidence: str,
) -> dict[str, Any]:
    """Build a draft behavior case from a structured or inferred candidate.

    Values supplied by the candidate (assertions, prompt sources, suspect
    prompts, target suite, severity, app) take precedence; anything missing or
    of the wrong shape is derived from the signal and workflow. Candidates of
    kind ``first_pass_prevention`` additionally get a ``first_pass_prevention``
    section with empty values removed.
    """
    signal = _candidate_signal(candidate)
    workflow = _candidate_workflow(candidate)
    source = str(candidate.get("source_kind") or candidate.get("source") or "agent_report")

    # Prefer the candidate's own redacted evidence; fall back to sanitizing
    # the whole candidate when none (or an empty one) was provided.
    supplied_evidence = candidate.get("redacted_evidence")
    if not isinstance(supplied_evidence, dict):
        supplied_evidence = {}
    sanitized_evidence = _sanitize_evidence(supplied_evidence or candidate)
    if not isinstance(sanitized_evidence, dict):
        sanitized_evidence = {
            "summary": _clean_text(sanitized_evidence),
            "source_path": _serialized_evidence_source_path(source_path),
            "signal": signal,
        }
    else:
        sanitized_evidence.setdefault("source_path", _serialized_evidence_source_path(source_path))
        sanitized_evidence.setdefault("signal", signal)

    assertions = candidate.get("suggested_assertions")
    assertions_usable = isinstance(assertions, list) and all(isinstance(item, dict) for item in assertions)
    if not assertions_usable:
        assertions = _suggested_assertions(signal)

    prompt_sources = None
    for field in ("prompt_sources_suggested", "prompt_surface"):
        listed = candidate.get(field)
        if isinstance(listed, list):
            prompt_sources = listed
            break
    if prompt_sources is None:
        prompt_sources = _suggested_prompt_sources(workflow)
    prompt_sources = [text for text in map(str, prompt_sources) if text]

    suspect_prompts = _surface_items(candidate.get("suspect_prompts"), kind="prompt")
    if not suspect_prompts:
        suspect_prompts = _suspect_prompts_from_sources(prompt_sources, signal=signal)
    suspect_scripts = _surface_items(candidate.get("suspect_scripts"), kind="script")

    target_suite = str(candidate.get("target_suite") or "")
    if not target_suite:
        if _command_source_for_workflow(workflow):
            target_suite = "extension_commands.core_behavior.v1"
        else:
            target_suite = "extension_skills.core_behavior.v1"

    draft = {
        "schema": AGENT_BEHAVIOR_CASE_DRAFT_SCHEMA,
        "status": "draft",
        "source": source,
        "confidence": confidence,
        "case_kind": str(candidate.get("case_kind") or "behavior_regression"),
        "app": str(candidate.get("app") or DEFAULT_TELEMETRY_APP),
        "app_version": _candidate_app_version(candidate),
        "workflow": _clean_text(workflow),
        "phase": _clean_text(candidate.get("phase")),
        "signal": signal,
        "severity": str(candidate.get("severity") or DEFAULT_SIGNAL_SEVERITY.get(signal, "medium")),
        "target_suite": target_suite,
        "prompt_sources_suggested": prompt_sources,
        "suspect_prompts": suspect_prompts,
        "suspect_scripts": suspect_scripts,
        "prevention_owner_note": _prevention_owner_note(prompts=suspect_prompts, scripts=suspect_scripts),
        "redacted_evidence": sanitized_evidence,
        "suggested_assertions": assertions,
        "promotion_checklist": _promotion_checklist(signal),
    }

    if draft["case_kind"] == "first_pass_prevention":
        is_recovery_only = (
            str(candidate.get("optimization_class") or "").lower() == "recovery_governance"
            or _clean_text(candidate.get("prevention_type")) == "recovery_only"
        )
        prevention = {
            "prevention_type": _clean_text(candidate.get("prevention_type")),
            "optimization_class": _clean_text(candidate.get("optimization_class") or "first_pass_prevention"),
            "first_pass_failure_mode": _clean_text(candidate.get("first_pass_failure_mode"), max_chars=700),
            "bad_artifact_type": _clean_text(candidate.get("bad_artifact_type")),
            "failure_facets": _candidate_text_list(candidate, "failure_facets"),
            "suspected_upstream_prompt_source": _candidate_text_list(candidate, "suspected_upstream_prompt_source"),
            "desired_first_pass_behavior": _clean_text(candidate.get("desired_first_pass_behavior"), max_chars=700),
            "recommended_prompt_change": _clean_text(candidate.get("recommended_prompt_change"), max_chars=700),
            "recommended_contract_change": _clean_text(candidate.get("recommended_contract_change"), max_chars=700),
            "suggested_fixture": _clean_text(candidate.get("suggested_fixture")),
            "root_cause_counts": _candidate_count_map(candidate, "root_cause_counts"),
            "workflow_counts": _candidate_count_map(candidate, "workflow_counts"),
            "example_records": _sanitize_evidence(candidate.get("example_records") or []),
            "prompt_optimization_ready": bool(prompt_sources and assertions),
            "recovery_only": is_recovery_only,
        }
        # Only empty strings/lists/dicts are dropped; False booleans are kept.
        draft["first_pass_prevention"] = {
            name: entry for name, entry in prevention.items() if entry not in ("", [], {})
        }

    message_id = str(candidate.get("source_message_id") or "")
    if message_id:
        draft["source_message_id"] = str(candidate["source_message_id"])
    return draft
|
|
1797
|
+
|
|
1798
|
+
|
|
1799
|
+
def _freeform_mentions_workbench(text: str) -> bool:
|
|
1800
|
+
lowered = text.lower()
|
|
1801
|
+
return any(token in lowered for token in ("medical-notes-workbench", "wiki_medicina", "/mednotes:", "linker", "workbench"))
|
|
1802
|
+
|
|
1803
|
+
|
|
1804
|
+
def _freeform_candidate(text: str, *, source_kind: str) -> dict[str, Any] | None:
    """Infer a single candidate from freeform text, or return ``None``.

    ``None`` is returned when the text does not mention the workbench or when
    no phrase rule matches. Otherwise the candidate carries the inferred
    signal, its default severity, the first workflow and version mentions
    found in the text (``""`` / ``"unknown"`` when absent) and a cleaned
    summary capped at 700 characters.
    """
    if not _freeform_mentions_workbench(text):
        return None

    haystack = text.lower()
    # Each rule: (signal, alternatives). An alternative matches when all of
    # its terms occur in the text; rules are tried in order.
    phrase_rules = (
        ("agent.retry_loop", (("retry loop",), ("retry", "loop"), ("repetiu diagnóstico",))),
        ("agent.ignored_next_action", (("ignored next_action",), ("ignorou next_action",))),
        ("agent.wrong_phase", (("wrong phase",), ("fase errada",))),
        ("agent.generated_script_workaround", (("generated script",), ("script gerado",), ("criou script",))),
        ("agent.missing_error_context", (("missing error_context",), ("sem error_context",))),
    )
    signal = next(
        (
            label
            for label, alternatives in phrase_rules
            if any(all(term in haystack for term in terms) for terms in alternatives)
        ),
        "",
    )
    if not signal:
        return None

    workflow_match = re.search(r"/(?:mednotes:[a-z0-9_-]+|flashcards)", text, flags=re.I)
    version_match = re.search(r"(?:app[_ ]version|vers[aã]o)\s*[:=` ]+\s*([0-9]+(?:\.[0-9]+){1,3})", text, flags=re.I)
    app_version = "unknown" if version_match is None else version_match.group(1)
    workflow = "" if workflow_match is None else workflow_match.group(0)
    return {
        "source_kind": source_kind,
        "app_version": app_version,
        "workflow": workflow,
        "signal": signal,
        "severity": DEFAULT_SIGNAL_SEVERITY.get(signal, "medium"),
        "redacted_evidence": {"summary": _clean_text(text, max_chars=700)},
    }
|
|
1831
|
+
|
|
1832
|
+
|
|
1833
|
+
def _draft_items_from_evidence_payload(
    payload: Any,
    *,
    source_path: Path,
    source_kind: str,
) -> tuple[list[dict[str, Any]], str]:
    """Turn one evidence payload into drafts plus the mode that produced them.

    Returns:
        ``(drafts, "structured_candidates")`` with medium-confidence drafts
        when the payload contains structured candidates;
        ``([draft], "freeform_inference")`` with one low-confidence draft when
        a signal can be inferred from the payload's text;
        ``([], "no_candidate_signal")`` otherwise.
    """
    structured = _candidate_payloads(payload)
    if structured:
        drafts = [
            _draft_from_candidate(entry, source_path=source_path, confidence="medium")
            for entry in structured
        ]
        return drafts, "structured_candidates"

    # Non-string payloads are serialized so the phrase rules can scan them.
    raw_text = payload if isinstance(payload, str) else json.dumps(payload, ensure_ascii=False)
    inferred = _freeform_candidate(raw_text, source_kind=source_kind)
    if not inferred:
        return [], "no_candidate_signal"
    return [_draft_from_candidate(inferred, source_path=source_path, confidence="low")], "freeform_inference"
|
|
1853
|
+
|
|
1854
|
+
|
|
1855
|
+
class _DraftReportAggregateFields(ContractModel):
    """Counts that decide the wrapper report for draft generation.

    Mirrors the ``aggregate`` section of a draft-generation report. Both
    counters default to 0 when the key is absent.
    """

    # Keys other than the two counters below are ignored during validation.
    model_config = ConfigDict(extra="ignore")

    # Number of drafts in the report; must be a non-negative int (strict, so
    # values such as "3" are not coerced).
    draft_count: int = Field(default=0, ge=0, strict=True)
    # Number of records skipped while producing the report; same constraints.
    skipped_record_count: int = Field(default=0, ge=0, strict=True)
|
|
1862
|
+
|
|
1863
|
+
|
|
1864
|
+
class _DraftReportFields(ContractModel):
    """Typed lens for draft-generation reports before directory aggregation.

    Only the ``aggregate`` and ``drafts`` parts of a report are read; both
    fall back to empty defaults when missing.
    """

    # Other report keys (schema, status, app, ...) are ignored by this lens.
    model_config = ConfigDict(extra="ignore")

    # Aggregate counters; defaults to an all-zero aggregate.
    aggregate: _DraftReportAggregateFields = Field(default_factory=_DraftReportAggregateFields)
    # Per-draft summary objects as listed in the report.
    drafts: list[JsonObject] = Field(default_factory=list)
|
|
1871
|
+
|
|
1872
|
+
|
|
1873
|
+
def _write_draft_items(
    draft_payloads: list[dict[str, Any]],
    *,
    output_dir: Path,
    source_path: Path,
    app: str,
    app_version: str | None,
    min_severity: str,
    skipped: int,
    mode: str,
) -> dict[str, Any]:
    """Filter prepared drafts, write the survivors to disk and report on them.

    Args:
        draft_payloads: Draft dicts to consider.
        output_dir: Directory that receives one JSON file per kept draft;
            created only when at least one draft is kept.
        source_path: Evidence file the drafts came from; its stem is the
            file-name fallback when a draft has no workflow.
        app: Drafts whose ``app`` differs are skipped.
        app_version: When truthy, drafts with a different version are skipped.
        min_severity: Minimum severity name; unknown names rank as 2.
        skipped: Skips already counted by the caller; filtered drafts are
            added to it.
        mode: Label copied into the report.

    Returns:
        A report dict with status, filters, ``mode``, aggregate counts, one
        summary per written draft and a ``next_action`` hint (empty when
        nothing was written).
    """
    # Both values are the same for every draft in the batch. Computing the
    # date once also keeps all file names of one run on the same day.
    minimum_rank = SEVERITY_RANK.get(min_severity, 2)
    date_prefix = datetime.now(UTC).date().isoformat()

    drafts: list[dict[str, Any]] = []
    reserved_paths: set[Path] = set()
    for draft in draft_payloads:
        if draft.get("app") != app:
            skipped += 1
            continue
        if app_version and draft.get("app_version") != app_version:
            skipped += 1
            continue
        # Drafts without a severity are ranked as "low".
        if SEVERITY_RANK.get(str(draft.get("severity") or "low"), 0) < minimum_rank:
            skipped += 1
            continue
        signal_slug = _slug(str(draft.get("signal") or "evidence"))
        workflow_slug = _slug(str(draft.get("workflow") or source_path.stem))
        output_path = _unique_draft_output_path(
            output_dir,
            f"{date_prefix}-{signal_slug}-{workflow_slug}",
            reserved_paths,
        )
        drafts.append({"path": str(output_path), "draft": draft})
    if drafts:
        output_dir.mkdir(parents=True, exist_ok=True)
    for item in drafts:
        Path(item["path"]).write_text(json.dumps(item["draft"], ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    return {
        "schema": AGENT_BEHAVIOR_CASE_DRAFT_REPORT_SCHEMA,
        "status": "drafts_created" if drafts else "no_drafts",
        "app": app,
        "app_version": app_version or "",
        "min_severity": min_severity,
        "mode": mode,
        "aggregate": {
            "draft_count": len(drafts),
            "skipped_record_count": skipped,
        },
        "drafts": [
            {
                "path": item["path"],
                "signal": item["draft"]["signal"],
                "target_suite": item["draft"]["target_suite"],
                "app_version": item["draft"]["app_version"],
                "source": item["draft"].get("source", ""),
                "confidence": item["draft"].get("confidence", ""),
            }
            for item in drafts
        ],
        "next_action": "review drafts, promote selected cases into a corpus suite, then rerun eval-agent-behavior-corpus"
        if drafts
        else "",
    }
|
|
1935
|
+
|
|
1936
|
+
|
|
1937
|
+
def _merge_existing_draft_report(result: JsonObject, existing_drafts: list[JsonObject]) -> JsonObject:
    """Merge telemetry-created drafts into the directory-level report without dict mutation.

    ``existing_drafts`` are listed first, followed by the drafts already in
    ``result``. The returned report is a new object: ``result`` itself is left
    untouched, ``draft_count`` reflects the combined list, and ``status`` /
    ``next_action`` are always set to the "drafts created" values.
    """
    # NOTE(review): status is forced to "drafts_created" even if the combined
    # list is empty; presumably callers only merge when drafts exist - confirm.
    parsed = _DraftReportFields.model_validate(result)
    combined = list(existing_drafts) + list(parsed.drafts)
    aggregate = parsed.aggregate.model_dump(mode="json")
    aggregate["draft_count"] = len(combined)
    merged = {
        **result,
        "status": "drafts_created",
        "aggregate": aggregate,
        "drafts": combined,
        "next_action": "review drafts, promote selected cases into a corpus suite, then rerun eval-agent-behavior-corpus",
    }
    return JsonObjectAdapter.validate_python(merged)
|
|
1955
|
+
|
|
1956
|
+
|
|
1957
|
+
def _draft_items_from_freeform_file(path: Path, *, source_kind: str) -> tuple[list[dict[str, Any]], str]:
    """Build draft items from a text file by reading embedded JSON blocks, then freeform text.

    Every payload yielded by ``_json_blocks_from_markdown`` is converted with
    ``_draft_items_from_evidence_payload``. Only when none of them produces an item is the
    whole text handed to ``_freeform_candidate``; a resulting candidate becomes a single
    low-confidence draft.

    Args:
        path: File to read as UTF-8 text; also recorded as the source of each draft.
        source_kind: Caller-supplied source kind; ``"auto"`` selects ``"inbox_report"`` for
            embedded JSON blocks and ``"agent_report"`` for the freeform candidate.

    Returns:
        ``(items, mode)`` where ``mode`` is the mode reported for the last JSON block that
        produced items, or ``"freeform_inference"`` when no block did.
    """
    text = path.read_text(encoding="utf-8")
    block_kind = "inbox_report" if source_kind == "auto" else source_kind
    items: list[dict[str, Any]] = []
    mode = "freeform_inference"
    for block_payload in _json_blocks_from_markdown(text):
        block_items, block_mode = _draft_items_from_evidence_payload(
            block_payload,
            source_path=path,
            source_kind=block_kind,
        )
        if block_items:
            mode = block_mode
            items.extend(block_items)
    if not items:
        candidate = _freeform_candidate(
            text,
            source_kind="agent_report" if source_kind == "auto" else source_kind,
        )
        if candidate:
            items.append(_draft_from_candidate(candidate, source_path=path, confidence="low"))
    return items, mode


def suggest_agent_behavior_cases_from_evidence(
    input_path: Path,
    *,
    output_dir: Path,
    app: str = DEFAULT_TELEMETRY_APP,
    app_version: str | None = None,
    min_severity: str = "medium",
    source_kind: str = "auto",
) -> dict[str, Any]:
    """Create reviewable behavior-corpus drafts from telemetry, reports, manifests, or freeform evidence.

    Args:
        input_path: A single evidence file, or a directory whose files (as listed by
            ``_evidence_payload_files``) are each processed in turn.
        output_dir: Directory passed on to the draft writers.
        app: Application name recorded in the report.
        app_version: Optional application version recorded in the report.
        min_severity: Severity threshold forwarded to the telemetry path and the report.
        source_kind: Source kind for generated drafts; ``"auto"`` picks a kind per input form.

    Returns:
        The draft report from ``_write_draft_items``. For a directory that contained
        telemetry payloads, the telemetry drafts are merged into that report. For a single
        telemetry file, the result of ``suggest_agent_behavior_cases_from_telemetry`` is
        returned directly.
    """
    structured_kind = "agent_report" if source_kind == "auto" else source_kind

    if input_path.is_dir():
        draft_payloads: list[dict[str, Any]] = []
        existing_drafts: list[dict[str, Any]] = []
        skipped = 0
        modes: set[str] = set()
        for path in _evidence_payload_files(input_path):
            try:
                payload = _read_json_any(path)
                if _looks_like_telemetry_payload(payload):
                    telemetry_result = suggest_agent_behavior_cases_from_telemetry(
                        path,
                        output_dir=output_dir,
                        app=app,
                        app_version=app_version,
                        min_severity=min_severity,
                    )
                    telemetry_fields = _DraftReportFields.model_validate(telemetry_result)
                    modes.add("telemetry")
                    skipped += telemetry_fields.aggregate.skipped_record_count
                    # Telemetry drafts are tracked separately and merged into the report below.
                    existing_drafts.extend(telemetry_fields.drafts)
                    continue
                items, mode = _draft_items_from_evidence_payload(
                    payload,
                    source_path=path,
                    source_kind=structured_kind,
                )
            except ValidationError:
                # NOTE(review): presumably raised when the file is not a structured JSON
                # payload -- confirm against _read_json_any. Fall back to text handling.
                items, mode = _draft_items_from_freeform_file(path, source_kind=source_kind)
            if items:
                draft_payloads.extend(items)
                modes.add(mode)
        result = _write_draft_items(
            draft_payloads,
            output_dir=output_dir,
            source_path=input_path,
            app=app,
            app_version=app_version,
            min_severity=min_severity,
            skipped=skipped,
            mode="+".join(sorted(modes)) if modes else "no_candidate_signal",
        )
        if existing_drafts:
            result = _merge_existing_draft_report(JsonObjectAdapter.validate_python(result), existing_drafts)
        return result

    try:
        payload = _read_json_any(input_path)
        if _looks_like_telemetry_payload(payload):
            return suggest_agent_behavior_cases_from_telemetry(
                input_path,
                output_dir=output_dir,
                app=app,
                app_version=app_version,
                min_severity=min_severity,
            )
        draft_payloads, mode = _draft_items_from_evidence_payload(
            payload,
            source_path=input_path,
            source_kind=structured_kind,
        )
    except ValidationError:
        draft_payloads, mode = _draft_items_from_freeform_file(input_path, source_kind=source_kind)
    # ``mode`` is bound on both paths above, so no ``locals()`` probe is needed here.
    return _write_draft_items(
        draft_payloads,
        output_dir=output_dir,
        source_path=input_path,
        app=app,
        app_version=app_version,
        min_severity=min_severity,
        skipped=0,
        mode=mode,
    )
|