devflow-engine 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devflow_engine/__init__.py +3 -0
- devflow_engine/agentic_prompts.py +100 -0
- devflow_engine/agentic_runtime.py +398 -0
- devflow_engine/api_key_flow_harness.py +539 -0
- devflow_engine/api_keys.py +357 -0
- devflow_engine/bootstrap/__init__.py +2 -0
- devflow_engine/bootstrap/provision_from_template.py +84 -0
- devflow_engine/cli/__init__.py +0 -0
- devflow_engine/cli/app.py +7270 -0
- devflow_engine/core/__init__.py +0 -0
- devflow_engine/core/config.py +86 -0
- devflow_engine/core/logging.py +29 -0
- devflow_engine/core/paths.py +45 -0
- devflow_engine/core/toml_kv.py +33 -0
- devflow_engine/devflow_event_worker.py +1292 -0
- devflow_engine/devflow_state.py +201 -0
- devflow_engine/devin2/__init__.py +9 -0
- devflow_engine/devin2/agent_definition.py +120 -0
- devflow_engine/devin2/pi_runner.py +204 -0
- devflow_engine/devin_orchestration.py +69 -0
- devflow_engine/docs/prompts/anti-patterns.md +42 -0
- devflow_engine/docs/prompts/devin-agent-prompt.md +55 -0
- devflow_engine/docs/prompts/devin2-agent-prompt.md +81 -0
- devflow_engine/docs/prompts/examples/devin-vapi-clone-reference-exchange.json +85 -0
- devflow_engine/doctor/__init__.py +2 -0
- devflow_engine/doctor/triage.py +140 -0
- devflow_engine/error/__init__.py +0 -0
- devflow_engine/error/remediation.py +21 -0
- devflow_engine/errors/error_solver_dag.py +522 -0
- devflow_engine/errors/runtime_observability.py +67 -0
- devflow_engine/idea/__init__.py +4 -0
- devflow_engine/idea/actors.py +481 -0
- devflow_engine/idea/agentic.py +465 -0
- devflow_engine/idea/analyze.py +93 -0
- devflow_engine/idea/devin_chat_dag.py +1 -0
- devflow_engine/idea/diff.py +99 -0
- devflow_engine/idea/drafts.py +446 -0
- devflow_engine/idea/idea_creation_dag.py +643 -0
- devflow_engine/idea/ideation_enrichment.py +355 -0
- devflow_engine/idea/ideation_enrichment_worker.py +19 -0
- devflow_engine/idea/paths.py +28 -0
- devflow_engine/idea/promote.py +53 -0
- devflow_engine/idea/redaction.py +27 -0
- devflow_engine/idea/repo_tools.py +1277 -0
- devflow_engine/idea/response_mode.py +30 -0
- devflow_engine/idea/story_pipeline.py +1585 -0
- devflow_engine/idea/sufficiency.py +376 -0
- devflow_engine/idea/traditional_stories.py +1257 -0
- devflow_engine/implementation/__init__.py +0 -0
- devflow_engine/implementation/alembic_preflight.py +700 -0
- devflow_engine/implementation/dag.py +8450 -0
- devflow_engine/implementation/green_gate.py +93 -0
- devflow_engine/implementation/prompts.py +108 -0
- devflow_engine/implementation/test_runtime.py +623 -0
- devflow_engine/integration/__init__.py +19 -0
- devflow_engine/integration/agentic.py +66 -0
- devflow_engine/integration/dag.py +3539 -0
- devflow_engine/integration/prompts.py +114 -0
- devflow_engine/integration/supabase_schema.sql +31 -0
- devflow_engine/integration/supabase_sync.py +177 -0
- devflow_engine/llm/__init__.py +1 -0
- devflow_engine/llm/cli_one_shot.py +84 -0
- devflow_engine/llm/cli_stream.py +371 -0
- devflow_engine/llm/execution_context.py +26 -0
- devflow_engine/llm/invoke.py +1322 -0
- devflow_engine/llm/provider_api.py +304 -0
- devflow_engine/llm/repo_knowledge.py +588 -0
- devflow_engine/llm_primitives.py +315 -0
- devflow_engine/orchestration.py +62 -0
- devflow_engine/planning/__init__.py +0 -0
- devflow_engine/planning/analyze_repo.py +92 -0
- devflow_engine/planning/render_drafts.py +133 -0
- devflow_engine/playground/__init__.py +0 -0
- devflow_engine/playground/hooks.py +26 -0
- devflow_engine/playwright_workflow/__init__.py +5 -0
- devflow_engine/playwright_workflow/dag.py +1317 -0
- devflow_engine/process/__init__.py +5 -0
- devflow_engine/process/dag.py +59 -0
- devflow_engine/project_registration/__init__.py +3 -0
- devflow_engine/project_registration/dag.py +1581 -0
- devflow_engine/project_registry.py +109 -0
- devflow_engine/prompts/devin/generic/prompt.md +6 -0
- devflow_engine/prompts/devin/ideation/prompt.md +263 -0
- devflow_engine/prompts/devin/ideation/scenarios.md +5 -0
- devflow_engine/prompts/devin/ideation_loop/prompt.md +6 -0
- devflow_engine/prompts/devin/insight/prompt.md +11 -0
- devflow_engine/prompts/devin/insight/scenarios.md +5 -0
- devflow_engine/prompts/devin/intake/prompt.md +15 -0
- devflow_engine/prompts/devin/iterate/prompt.md +12 -0
- devflow_engine/prompts/devin/shared/eval_doctrine.md +9 -0
- devflow_engine/prompts/devin/shared/principles.md +246 -0
- devflow_engine/prompts/devin_eval/assessment/prompt.md +18 -0
- devflow_engine/prompts/idea/api_ideation_agent/prompt.md +8 -0
- devflow_engine/prompts/idea/api_insight_agent/prompt.md +8 -0
- devflow_engine/prompts/idea/response_doctrine/prompt.md +18 -0
- devflow_engine/prompts/implementation/dependency_assessment/prompt.md +12 -0
- devflow_engine/prompts/implementation/green/green/prompt.md +11 -0
- devflow_engine/prompts/implementation/green/node_config/prompt.md +3 -0
- devflow_engine/prompts/implementation/green_review/outcome_review/prompt.md +5 -0
- devflow_engine/prompts/implementation/green_review/prior_run_review/prompt.md +5 -0
- devflow_engine/prompts/implementation/red/prompt.md +27 -0
- devflow_engine/prompts/implementation/redreview/prompt.md +23 -0
- devflow_engine/prompts/implementation/redreview_repair/prompt.md +16 -0
- devflow_engine/prompts/implementation/setupdoc/prompt.md +10 -0
- devflow_engine/prompts/implementation/story_planning/prompt.md +13 -0
- devflow_engine/prompts/implementation/test_design/prompt.md +27 -0
- devflow_engine/prompts/integration/README.md +185 -0
- devflow_engine/prompts/integration/green/example.md +67 -0
- devflow_engine/prompts/integration/green/green/prompt.md +10 -0
- devflow_engine/prompts/integration/green/node_config/prompt.md +42 -0
- devflow_engine/prompts/integration/green/past_prompts/20260417T212300/green/prompt.md +15 -0
- devflow_engine/prompts/integration/green/past_prompts/20260417T212300/node_config/prompt.md +42 -0
- devflow_engine/prompts/integration/green_enrich/example.md +79 -0
- devflow_engine/prompts/integration/green_enrich/green_enrich/prompt.md +9 -0
- devflow_engine/prompts/integration/green_enrich/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/green_enrich/prompt.md +14 -0
- devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red/code_repair/prompt.md +12 -0
- devflow_engine/prompts/integration/red/example.md +152 -0
- devflow_engine/prompts/integration/red/node_config/prompt.md +86 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/node_config/prompt.md +84 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red/prompt.md +16 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red_repair/prompt.md +15 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/node_config/prompt.md +84 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/red_repair/prompt.md +11 -0
- devflow_engine/prompts/integration/red/red/prompt.md +11 -0
- devflow_engine/prompts/integration/red/red_repair/prompt.md +12 -0
- devflow_engine/prompts/integration/red_review/example.md +71 -0
- devflow_engine/prompts/integration/red_review/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/red_review/prompt.md +15 -0
- devflow_engine/prompts/integration/red_review/red_review/prompt.md +9 -0
- devflow_engine/prompts/integration/resolve/example.md +111 -0
- devflow_engine/prompts/integration/resolve/node_config/prompt.md +64 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/node_config/prompt.md +64 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_implicated_users/prompt.md +15 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_side_effects/prompt.md +15 -0
- devflow_engine/prompts/integration/resolve/resolve_implicated_users/prompt.md +10 -0
- devflow_engine/prompts/integration/resolve/resolve_side_effects/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/build_idea_acceptance_coverage/prompt.md +12 -0
- devflow_engine/prompts/integration/validate/code_repair/prompt.md +13 -0
- devflow_engine/prompts/integration/validate/example.md +143 -0
- devflow_engine/prompts/integration/validate/node_config/prompt.md +87 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/node_config/prompt.md +67 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_enrich_gate/prompt.md +17 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_repair/prompt.md +16 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/node_config/prompt.md +67 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/validate_repair/prompt.md +9 -0
- devflow_engine/prompts/integration/validate/validate_enrich_gate/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/validate_repair/prompt.md +20 -0
- devflow_engine/prompts/integration/write_workflows/example.md +100 -0
- devflow_engine/prompts/integration/write_workflows/node_config/prompt.md +44 -0
- devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/node_config/prompt.md +44 -0
- devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/write_workflows/prompt.md +17 -0
- devflow_engine/prompts/integration/write_workflows/write_workflows/prompt.md +11 -0
- devflow_engine/prompts/iterate/README.md +7 -0
- devflow_engine/prompts/iterate/coder/prompt.md +11 -0
- devflow_engine/prompts/iterate/framer/prompt.md +11 -0
- devflow_engine/prompts/iterate/iterator/prompt.md +13 -0
- devflow_engine/prompts/iterate/observer/prompt.md +11 -0
- devflow_engine/prompts/recovery/diagnosis/prompt.md +7 -0
- devflow_engine/prompts/recovery/execution/prompt.md +8 -0
- devflow_engine/prompts/recovery/execution_verification/prompt.md +7 -0
- devflow_engine/prompts/recovery/failure_investigation/prompt.md +10 -0
- devflow_engine/prompts/recovery/preflight_health_repo_repair/prompt.md +8 -0
- devflow_engine/prompts/recovery/remediation_execution/prompt.md +11 -0
- devflow_engine/prompts/recovery/root_cause_investigation/prompt.md +12 -0
- devflow_engine/prompts/scope_idea/doctrine/prompt.md +7 -0
- devflow_engine/prompts/source_doc_eval/document/prompt.md +6 -0
- devflow_engine/prompts/source_doc_eval/targeted_mutation/prompt.md +9 -0
- devflow_engine/prompts/source_doc_mutation/domain_entities/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/product_brief/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/project_doc_coherence/prompt.md +7 -0
- devflow_engine/prompts/source_doc_mutation/project_doc_render/prompt.md +9 -0
- devflow_engine/prompts/source_doc_mutation/source_doc_coherence/prompt.md +5 -0
- devflow_engine/prompts/source_doc_mutation/source_doc_enrichment_coherence/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/user_workflows/prompt.md +6 -0
- devflow_engine/prompts/source_scope/doctrine/prompt.md +10 -0
- devflow_engine/prompts/ui_grounding/doctrine/prompt.md +7 -0
- devflow_engine/recovery/__init__.py +3 -0
- devflow_engine/recovery/dag.py +2609 -0
- devflow_engine/recovery/models.py +220 -0
- devflow_engine/refactor.py +93 -0
- devflow_engine/registry/__init__.py +1 -0
- devflow_engine/registry/cards.py +238 -0
- devflow_engine/registry/domain_normalize.py +60 -0
- devflow_engine/registry/effects.py +65 -0
- devflow_engine/registry/enforce_report.py +150 -0
- devflow_engine/registry/module_cards_classify.py +164 -0
- devflow_engine/registry/module_cards_draft.py +184 -0
- devflow_engine/registry/module_cards_gate.py +59 -0
- devflow_engine/registry/packages.py +347 -0
- devflow_engine/registry/pathways.py +323 -0
- devflow_engine/review/__init__.py +11 -0
- devflow_engine/review/dag.py +588 -0
- devflow_engine/review/review_story.py +67 -0
- devflow_engine/scope_idea/__init__.py +3 -0
- devflow_engine/scope_idea/agentic.py +39 -0
- devflow_engine/scope_idea/dag.py +1069 -0
- devflow_engine/scope_idea/models.py +175 -0
- devflow_engine/skills/builtins/devflow/queue_failure_investigation/SKILL.md +112 -0
- devflow_engine/skills/builtins/devflow/queue_idea_to_story/SKILL.md +120 -0
- devflow_engine/skills/builtins/devflow/queue_integration/SKILL.md +105 -0
- devflow_engine/skills/builtins/devflow/queue_recovery/SKILL.md +108 -0
- devflow_engine/skills/builtins/devflow/queue_runtime_core/SKILL.md +155 -0
- devflow_engine/skills/builtins/devflow/queue_story_implementation/SKILL.md +122 -0
- devflow_engine/skills/builtins/devin/idea_to_story_handoff/SKILL.md +120 -0
- devflow_engine/skills/builtins/devin/ideation/SKILL.md +168 -0
- devflow_engine/skills/builtins/devin/ideation/state-and-phrasing-reference.md +18 -0
- devflow_engine/skills/builtins/devin/insight/SKILL.md +22 -0
- devflow_engine/skills/registry.example.yaml +42 -0
- devflow_engine/source_doc_assumptions.py +291 -0
- devflow_engine/source_doc_mutation_dag.py +1606 -0
- devflow_engine/source_doc_mutation_eval.py +417 -0
- devflow_engine/source_doc_mutation_worker.py +25 -0
- devflow_engine/source_docs_schema.py +207 -0
- devflow_engine/source_docs_updater.py +309 -0
- devflow_engine/source_scope/__init__.py +15 -0
- devflow_engine/source_scope/agentic.py +45 -0
- devflow_engine/source_scope/dag.py +1626 -0
- devflow_engine/source_scope/models.py +177 -0
- devflow_engine/stores/__init__.py +0 -0
- devflow_engine/stores/execution_store.py +3534 -0
- devflow_engine/story/__init__.py +0 -0
- devflow_engine/story/contracts.py +160 -0
- devflow_engine/story/discovery.py +47 -0
- devflow_engine/story/evidence.py +118 -0
- devflow_engine/story/hashing.py +27 -0
- devflow_engine/story/implemented_queue_purge.py +148 -0
- devflow_engine/story/indexer.py +105 -0
- devflow_engine/story/io.py +20 -0
- devflow_engine/story/markdown_contracts.py +298 -0
- devflow_engine/story/reconciliation.py +408 -0
- devflow_engine/story/validate_stories.py +149 -0
- devflow_engine/story/validate_tests_story.py +512 -0
- devflow_engine/story/validation.py +133 -0
- devflow_engine/ui_grounding/__init__.py +11 -0
- devflow_engine/ui_grounding/agentic.py +31 -0
- devflow_engine/ui_grounding/dag.py +874 -0
- devflow_engine/ui_grounding/models.py +224 -0
- devflow_engine/ui_grounding/pencil_bridge.py +247 -0
- devflow_engine/vendor/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/__init__.py +11 -0
- devflow_engine/vendor/datalumina_genai/core/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/core/exceptions.py +9 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/agent.py +48 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/agent_streaming_node.py +26 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/base.py +89 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/concurrent.py +30 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/router.py +69 -0
- devflow_engine/vendor/datalumina_genai/core/schema.py +72 -0
- devflow_engine/vendor/datalumina_genai/core/task.py +52 -0
- devflow_engine/vendor/datalumina_genai/core/validate.py +139 -0
- devflow_engine/vendor/datalumina_genai/core/workflow.py +200 -0
- devflow_engine/worker.py +1086 -0
- devflow_engine/worker_guard.py +233 -0
- devflow_engine-1.0.0.dist-info/METADATA +235 -0
- devflow_engine-1.0.0.dist-info/RECORD +393 -0
- devflow_engine-1.0.0.dist-info/WHEEL +4 -0
- devflow_engine-1.0.0.dist-info/entry_points.txt +3 -0
- devin/__init__.py +6 -0
- devin/dag.py +58 -0
- devin/dag_two_arm.py +138 -0
- devin/devin_chat_scenario_catalog.json +588 -0
- devin/devin_eval.py +677 -0
- devin/nodes/__init__.py +0 -0
- devin/nodes/ideation/__init__.py +0 -0
- devin/nodes/ideation/node.py +195 -0
- devin/nodes/ideation/playground.py +267 -0
- devin/nodes/ideation/prompt.md +65 -0
- devin/nodes/ideation/scenarios/continue_refinement.py +13 -0
- devin/nodes/ideation/scenarios/continue_refinement_evals.py +18 -0
- devin/nodes/ideation/scenarios/idea_fits_existing_patterns.py +17 -0
- devin/nodes/ideation/scenarios/idea_fits_existing_patterns_evals.py +16 -0
- devin/nodes/ideation/scenarios/large_idea_split.py +4 -0
- devin/nodes/ideation/scenarios/large_idea_split_evals.py +17 -0
- devin/nodes/ideation/scenarios/source_documentation_added.py +4 -0
- devin/nodes/ideation/scenarios/source_documentation_added_evals.py +16 -0
- devin/nodes/ideation/scenarios/user_says_create_it.py +30 -0
- devin/nodes/ideation/scenarios/user_says_create_it_evals.py +23 -0
- devin/nodes/ideation/scenarios/vague_idea.py +16 -0
- devin/nodes/ideation/scenarios/vague_idea_evals.py +47 -0
- devin/nodes/ideation/tools.json +312 -0
- devin/nodes/insight/__init__.py +0 -0
- devin/nodes/insight/node.py +49 -0
- devin/nodes/insight/playground.py +154 -0
- devin/nodes/insight/prompt.md +61 -0
- devin/nodes/insight/scenarios/architecture_pattern_query.py +15 -0
- devin/nodes/insight/scenarios/architecture_pattern_query_evals.py +25 -0
- devin/nodes/insight/scenarios/codebase_exploration.py +15 -0
- devin/nodes/insight/scenarios/codebase_exploration_evals.py +23 -0
- devin/nodes/insight/scenarios/devin_ideation_routing.py +19 -0
- devin/nodes/insight/scenarios/devin_ideation_routing_evals.py +39 -0
- devin/nodes/insight/scenarios/devin_insight_routing.py +20 -0
- devin/nodes/insight/scenarios/devin_insight_routing_evals.py +40 -0
- devin/nodes/insight/scenarios/operational_debugging.py +15 -0
- devin/nodes/insight/scenarios/operational_debugging_evals.py +23 -0
- devin/nodes/insight/scenarios/operational_question.py +9 -0
- devin/nodes/insight/scenarios/operational_question_evals.py +8 -0
- devin/nodes/insight/scenarios/queue_status.py +15 -0
- devin/nodes/insight/scenarios/queue_status_evals.py +23 -0
- devin/nodes/insight/scenarios/source_doc_explanation.py +14 -0
- devin/nodes/insight/scenarios/source_doc_explanation_evals.py +21 -0
- devin/nodes/insight/scenarios/worker_state_check.py +15 -0
- devin/nodes/insight/scenarios/worker_state_check_evals.py +22 -0
- devin/nodes/insight/tools.json +126 -0
- devin/nodes/intake/__init__.py +0 -0
- devin/nodes/intake/node.py +27 -0
- devin/nodes/intake/playground.py +47 -0
- devin/nodes/intake/prompt.md +12 -0
- devin/nodes/intake/scenarios/ideation_routing.py +4 -0
- devin/nodes/intake/scenarios/ideation_routing_evals.py +5 -0
- devin/nodes/intake/scenarios/insight_routing.py +4 -0
- devin/nodes/intake/scenarios/insight_routing_evals.py +5 -0
- devin/nodes/iterate/README.md +44 -0
- devin/nodes/iterate/__init__.py +1 -0
- devin/nodes/iterate/_archived_design_stages/01-objectives-requirements.md +112 -0
- devin/nodes/iterate/_archived_design_stages/02-evals.md +131 -0
- devin/nodes/iterate/_archived_design_stages/03-tools-and-boundaries.md +110 -0
- devin/nodes/iterate/_archived_design_stages/04-harness-and-playground.md +32 -0
- devin/nodes/iterate/_archived_design_stages/05-prompt-deferred.md +11 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/03-tools-and-boundaries.md +13 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/01-objectives-requirements.md +25 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/02-evals.md +9 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/04-harness-and-playground.md +13 -0
- devin/nodes/iterate/agent-roles.md +89 -0
- devin/nodes/iterate/agents/README.md +10 -0
- devin/nodes/iterate/artifacts.md +504 -0
- devin/nodes/iterate/contract.md +100 -0
- devin/nodes/iterate/eval-plan.md +74 -0
- devin/nodes/iterate/node.py +100 -0
- devin/nodes/iterate/pipeline/README.md +13 -0
- devin/nodes/iterate/playground-contract.md +76 -0
- devin/nodes/iterate/prompt.md +11 -0
- devin/nodes/iterate/scenarios/README.md +38 -0
- devin/nodes/iterate/scenarios/artifact-and-loop-scenarios.md +101 -0
- devin/nodes/iterate/scenarios/coder_artifact_alignment.py +32 -0
- devin/nodes/iterate/scenarios/coder_artifact_alignment_evals.py +45 -0
- devin/nodes/iterate/scenarios/coder_bounded_fix.py +27 -0
- devin/nodes/iterate/scenarios/coder_bounded_fix_evals.py +45 -0
- devin/nodes/iterate/scenarios/devin_iterate_routing.py +21 -0
- devin/nodes/iterate/scenarios/devin_iterate_routing_evals.py +36 -0
- devin/nodes/iterate/scenarios/framer_scope_boundary.py +25 -0
- devin/nodes/iterate/scenarios/framer_scope_boundary_evals.py +57 -0
- devin/nodes/iterate/scenarios/framer_task_framing.py +25 -0
- devin/nodes/iterate/scenarios/framer_task_framing_evals.py +58 -0
- devin/nodes/iterate/scenarios/iterate_error_fix.py +21 -0
- devin/nodes/iterate/scenarios/iterate_error_fix_evals.py +39 -0
- devin/nodes/iterate/scenarios/iterate_quick_change.py +21 -0
- devin/nodes/iterate/scenarios/iterate_quick_change_evals.py +35 -0
- devin/nodes/iterate/scenarios/iterate_to_idea_promotion.py +23 -0
- devin/nodes/iterate/scenarios/iterate_to_idea_promotion_evals.py +53 -0
- devin/nodes/iterate/scenarios/iterate_to_insight_reroute.py +23 -0
- devin/nodes/iterate/scenarios/iterate_to_insight_reroute_evals.py +53 -0
- devin/nodes/iterate/scenarios/observer_evidence_seam.py +28 -0
- devin/nodes/iterate/scenarios/observer_evidence_seam_evals.py +55 -0
- devin/nodes/iterate/scenarios/observer_repro_creation.py +28 -0
- devin/nodes/iterate/scenarios/observer_repro_creation_evals.py +45 -0
- devin/nodes/iterate/scenarios/routing-matrix.md +45 -0
- devin/nodes/shared/__init__.py +0 -0
- devin/nodes/shared/filemaker_expert.md +80 -0
- devin/nodes/shared/filemaker_expert.py +354 -0
- devin/nodes/shared/filemaker_expert_eval/runner.py +176 -0
- devin/nodes/shared/filemaker_expert_eval/scenarios.json +65 -0
- devin/nodes/shared/goldilocks_advisor_eval/runner.py +214 -0
- devin/nodes/shared/goldilocks_advisor_eval/scenarios.json +58 -0
- devin/nodes/shared/helpers.py +156 -0
- devin/nodes/shared/idea_compliance_advisor_eval/runner.py +252 -0
- devin/nodes/shared/idea_compliance_advisor_eval/scenarios.json +75 -0
- devin/nodes/shared/models.py +44 -0
- devin/nodes/shared/post.py +40 -0
- devin/nodes/shared/router.py +107 -0
- devin/nodes/shared/tools.py +191 -0
- devin/shared/devin-chat-rubric.md +237 -0
- devin/shared/devin-chat-scenario-suite.md +90 -0
- devin/shared/eval_doctrine.md +9 -0
|
@@ -0,0 +1,2609 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import sqlite3
|
|
7
|
+
import subprocess
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
from urllib import error as urllib_error
|
|
12
|
+
from urllib import request as urllib_request
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
from ..agentic_prompts import load_agentic_prompt_lines
|
|
17
|
+
from ..agentic_runtime import run_agent_step
|
|
18
|
+
from ..devflow_state import publish_devflow_state
|
|
19
|
+
from ..implementation.dag import LocalSetupContract, _get_docker_service_logs
|
|
20
|
+
from ..llm.cli_stream import llm_sessions_db
|
|
21
|
+
from ..implementation.test_runtime import (
|
|
22
|
+
discover_story_scoped_test_paths,
|
|
23
|
+
load_story_test_runtime_contract,
|
|
24
|
+
normalize_recovery_story_runtime_contract,
|
|
25
|
+
persist_story_runtime_contract,
|
|
26
|
+
resolve_story_runtime_contract,
|
|
27
|
+
story_test_runtime_contract_path,
|
|
28
|
+
)
|
|
29
|
+
from ..stores.execution_store import ExecutionStore
|
|
30
|
+
from ..vendor.datalumina_genai.core.nodes.agent import AgentConfig, AgentNode
|
|
31
|
+
from ..vendor.datalumina_genai.core.nodes.base import Node
|
|
32
|
+
from ..vendor.datalumina_genai.core.nodes.router import BaseRouter, RouterNode
|
|
33
|
+
from ..vendor.datalumina_genai.core.schema import NodeConfig, WorkflowSchema
|
|
34
|
+
from ..vendor.datalumina_genai.core.task import TaskContext
|
|
35
|
+
from ..vendor.datalumina_genai.core.workflow import Workflow
|
|
36
|
+
from .models import (
|
|
37
|
+
FailedQueueItemArtifact,
|
|
38
|
+
RecoveryInvestigationArtifact,
|
|
39
|
+
RecoveryNonConvergenceArtifact,
|
|
40
|
+
RecoveryDiagnosisArtifact,
|
|
41
|
+
RecoverySuccessCriterion,
|
|
42
|
+
RecoveryExecutionArtifact,
|
|
43
|
+
PreReplayCheckArtifact,
|
|
44
|
+
RecoveryOutcomeArtifact,
|
|
45
|
+
RemediationPlanArtifact,
|
|
46
|
+
ReenqueueArtifact,
|
|
47
|
+
RecoveryHandoffArtifact,
|
|
48
|
+
SystemicPatternArtifact,
|
|
49
|
+
CodeRootCauseArtifact,
|
|
50
|
+
RemediationResultArtifact,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Identifier string for this recovery workflow.
DAG_ID = "post_queue_failure_recovery_dag"

# Module-level runtime bindings. They start unset (None); _store_run() reads the
# store/run-id pair and _repo_root() reads the repo root, and both raise when the
# binding is still None.
# NOTE(review): where these are assigned is outside this view - confirm the setter.
_CURRENT_STORE: ExecutionStore | None = None
_CURRENT_RUN_ID: str | None = None
_CURRENT_STRENGTH: str | None = None
_CURRENT_REPO_ROOT: Path | None = None
|
|
58
|
+
|
|
59
|
+
# Maps a queue kind to a pair of strings.
# NOTE(review): the pair looks like (table name, primary-key column) - confirm
# against the code that consumes this mapping.
_QUEUE_TABLE_BY_KIND: dict[str, tuple[str, str]] = {
    "scope": ("scope_queue", "scope_queue_id"),
    "idea_creation": ("idea_creation_queue", "idea_creation_queue_id"),
    "idea": ("idea_queue", "idea_queue_id"),
    "story": ("story_queue", "story_queue_id"),
    "integration": ("integration_queue", "integration_queue_id"),
    "recovery": ("recovery_queue", "recovery_queue_id"),
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Pydantic model with four required string fields.
# NOTE(review): presumably the input event that identifies one failed queue item
# for the recovery workflow - confirm against the caller.
class FailureRecoveryDagEvent(BaseModel):
    # Repository root, carried as a plain string.
    repo_root: str
    # Project the item belongs to.
    project_id: str
    # Queue kind; presumably one of the _QUEUE_TABLE_BY_KIND keys - TODO confirm.
    queue_type: str
    # Identifier of the queue item.
    item_id: str
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Immutable (frozen) result record with four fields.
# NOTE(review): presumably what a recovery DAG run returns - the producer is not
# visible here.
@dataclass(frozen=True)
class FailureRecoveryDagResult:
    # Integer exit status.
    exit_code: int
    # Identifier of the run that produced this result.
    run_id: str
    # Free-form outcome payload.
    outcome: dict[str, Any]
    # Human-readable message.
    message: str
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _store_run() -> tuple[ExecutionStore, str]:
    """Return the currently bound ``(store, run_id)`` pair.

    Raises:
        RuntimeError: if either ``_CURRENT_STORE`` or ``_CURRENT_RUN_ID`` is
            still ``None``.
    """
    store, run_id = _CURRENT_STORE, _CURRENT_RUN_ID
    if store is not None and run_id is not None:
        return store, run_id
    raise RuntimeError("recovery workflow missing runtime bindings")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _repo_root() -> Path:
    """Return the currently bound repository root.

    Raises:
        RuntimeError: if ``_CURRENT_REPO_ROOT`` is still ``None``.
    """
    bound_root = _CURRENT_REPO_ROOT
    if bound_root is not None:
        return bound_root
    raise RuntimeError("recovery workflow missing repo root binding")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _persist_node(*, node_id: str, node_name: str, fn):
    """Run ``fn`` as a recorded node attempt and return its task context.

    A node attempt (always ``attempt=1``) is created in the bound execution
    store, then ``fn`` is called with the new attempt id. ``fn`` must return an
    ``(output, task_context)`` pair.

    Args:
        node_id: Node identifier stored with the attempt.
        node_name: Node name stored with the attempt.
        fn: Callable taking the node execution id.

    Returns:
        The ``task_context`` element returned by ``fn``.

    Raises:
        Exception: anything raised by ``fn`` (or by unpacking its result) is
            recorded on the attempt as ``failed`` and then re-raised unchanged.
    """
    store, run_id = _store_run()
    attempt_id = store.create_node_attempt(
        run_id=run_id,
        node_id=node_id,
        node_name=node_name,
        attempt=1,
    )
    try:
        output, task_context = fn(attempt_id)
    except Exception as exc:
        # Record the failure before letting the original exception propagate.
        store.mark_node_finished(
            node_exec_id=attempt_id,
            status="failed",
            error={"message": str(exc)},
        )
        raise
    else:
        store.mark_node_finished(node_exec_id=attempt_id, status="succeeded", output=output)
        return task_context
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _recovery_display_path(recovery_id: str) -> str:
    """Build the display path string for a recovery id."""
    return "recovery:recovery_{}".format(recovery_id)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _publish(
    project_id: str,
    run_id: str,
    state: str,
    status: str,
    summary: str,
    error: str | None = None,
    recovery_id: str | None = None,
) -> None:
    """Publish a devflow state update for this recovery run.

    Forwards to ``publish_devflow_state`` with ``display="project"``.

    Args:
        project_id: Project the update belongs to.
        run_id: Run identifier.
        state: Forwarded as ``current_state``.
        status: Forwarded as ``current_status``.
        summary: Forwarded as ``run_summary``.
        error: Forwarded as ``error_message``.
        recovery_id: Id used for the display path; a falsy value falls back
            to ``run_id``.
    """
    display_target = recovery_id if recovery_id else run_id
    publish_devflow_state(
        project_id=project_id,
        run_id=run_id,
        current_state=state,
        current_status=status,
        run_summary=summary,
        error_message=error,
        display="project",
        display_path=_recovery_display_path(display_target),
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _publish_node(
    project_id: str,
    run_id: str,
    summary: str,
    recovery_id: str | None = None,
) -> None:
    """Publish an in-progress state update for this recovery run.

    Always reports ``current_state="running"`` and
    ``current_status="processing"`` with ``display="project"``.

    Args:
        project_id: Project the update belongs to.
        run_id: Run identifier.
        summary: Forwarded as ``run_summary``.
        recovery_id: Id used for the display path; a falsy value falls back
            to ``run_id``.
    """
    display_target = recovery_id if recovery_id else run_id
    publish_devflow_state(
        project_id=project_id,
        run_id=run_id,
        current_state="running",
        current_status="processing",
        run_summary=summary,
        display="project",
        display_path=_recovery_display_path(display_target),
    )
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _normalize_text(value: Any) -> str:
    """Return ``value`` as stripped, lower-cased text.

    Any falsy value (``None``, ``""``, ``0``, empty containers) becomes ``""``.
    """
    text = str(value) if value else ""
    return text.strip().lower()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Lower-case substrings searched for in normalized blocking-reason text by
# _is_soft_provenance_reason(). A reason needs at least one of these to count
# as "soft".
_SOFT_PROVENANCE_REASON_MARKERS: tuple[str, ...] = (
    "provenance",
    "byte identity",
    "byte-ident",
    "byte-for-byte",
    "forensic",
    "checksum",
    "digest",
    "hash certainty",
    "identity certainty",
    "cannot prove byte identity",
    "cannot prove identity",
    "cannot prove provenance",
    "cannot verify provenance",
    "audit trail",
    "metadata certainty",
)
|
|
158
|
+
|
|
159
|
+
# Lower-case substrings that veto the "soft" classification in
# _is_soft_provenance_reason(): a reason containing any of these is never soft,
# even when it also contains a soft marker.
_HARD_BLOCK_REASON_MARKERS: tuple[str, ...] = (
    "manual review",
    "human",
    "operator",
    "approval",
    "contradict",
    "conflict",
    "inconsistent",
    "mismatch",
    "failed",
    "failure",
    "not viable",
    "unsafe",
    "missing artifact",
    "missing runtime",
    "unauthorized",
    "forbidden",
    "denied",
    "exhausted",
)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _is_soft_provenance_reason(reason: str) -> bool:
    """Tell whether a blocking reason is a "soft" provenance concern.

    The reason is normalized with ``_normalize_text`` and is soft only when it
    is non-empty, contains at least one ``_SOFT_PROVENANCE_REASON_MARKERS``
    substring, and contains no ``_HARD_BLOCK_REASON_MARKERS`` substring.
    """
    text = _normalize_text(reason)
    if not text:
        return False
    if not any(marker in text for marker in _SOFT_PROVENANCE_REASON_MARKERS):
        return False
    # A single hard marker overrides any number of soft markers.
    return not any(marker in text for marker in _HARD_BLOCK_REASON_MARKERS)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _verification_allows_reenqueue(
    *,
    execution: RecoveryExecutionArtifact,
    verified: PreReplayCheckArtifact,
    diagnosis: RecoveryDiagnosisArtifact | None,
) -> bool:
    """Decide whether the pre-replay verification permits re-enqueueing.

    Rules, applied in order:

    1. ``execution.outcome`` must be ``"reenqueued"``, otherwise ``False``.
    2. ``verified.ready`` being truthy allows it outright.
    3. A diagnosis whose ``suggested_action`` is ``"manual_review_required"``
       forbids it.
    4. A verification with no ``checks`` forbids it.
    5. Otherwise it is allowed only when there is at least one non-blank
       blocking reason and every one is a soft provenance reason.
    """
    if execution.outcome != "reenqueued":
        return False
    if verified.ready:
        return True

    needs_manual_review = diagnosis is not None and diagnosis.suggested_action == "manual_review_required"
    if needs_manual_review or not verified.checks:
        return False

    # Drop reasons that are blank once stringified and stripped.
    reasons = []
    for raw_reason in verified.blocking_reasons:
        cleaned = str(raw_reason).strip()
        if cleaned:
            reasons.append(cleaned)
    if not reasons:
        return False

    for reason in reasons:
        if not _is_soft_provenance_reason(reason):
            return False
    return True
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _normalized_failure_signature(*, failure_message: str | None, failure_context: dict[str, Any] | None) -> str:
|
|
209
|
+
ctx = failure_context if isinstance(failure_context, dict) else {}
|
|
210
|
+
message = str(failure_message or "").strip()
|
|
211
|
+
error_type = str(ctx.get("error_type") or "").strip()
|
|
212
|
+
failed_stage = str(ctx.get("failed_stage") or "").strip().lower()
|
|
213
|
+
actual_failed_node = str(ctx.get("actual_failed_node") or "").strip().lower() or failed_stage
|
|
214
|
+
if "Prompt is too long" in message:
|
|
215
|
+
base = "prompt_too_long"
|
|
216
|
+
elif "NameError" in message:
|
|
217
|
+
base = f"name_error:{error_type or 'unknown'}"
|
|
218
|
+
elif error_type:
|
|
219
|
+
base = error_type.lower().replace(" ", "_")[:80]
|
|
220
|
+
else:
|
|
221
|
+
base = message[:80].lower().replace(" ", "_")
|
|
222
|
+
return f"{actual_failed_node}:{base}" if actual_failed_node else base
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
# Occurrence count at (or above) which the recovery churn gate reports
# "threshold_met" for a durable identity / failure signature pair.
_RECOVERY_CHURN_GATE_THRESHOLD = 3
|
|
226
|
+
# Version tag embedded in churn keys and in the fingerprint inputs.
_RECOVERY_CHURN_GATE_VERSION = 1
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _durable_recovery_identity(item: FailedQueueItemArtifact) -> str:
|
|
230
|
+
if item.queue_type == "story" and str(item.story_id or "").strip():
|
|
231
|
+
return f"story:{str(item.story_id or '').strip()}"
|
|
232
|
+
raw = item.raw_row if isinstance(item.raw_row, dict) else {}
|
|
233
|
+
for key in ("scope_id", "idea_id", "integration_id"):
|
|
234
|
+
value = str(raw.get(key) or "").strip()
|
|
235
|
+
if value:
|
|
236
|
+
return f"{item.queue_type}:{value}"
|
|
237
|
+
return f"{item.queue_type}:{item.item_id}"
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _recovery_churn_key(*, item: FailedQueueItemArtifact, failure_signature: str) -> str:
    """Return the human-readable churn-gate key for *item* and *failure_signature*.

    Layout: ``recovery_churn_gate:v<version>:<durable identity>:<signature>:no_material_change``.
    """
    return "recovery_churn_gate:v{}:{}:{}:no_material_change".format(
        _RECOVERY_CHURN_GATE_VERSION,
        _durable_recovery_identity(item),
        failure_signature,
    )
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _recovery_churn_fingerprint_inputs(*, item: FailedQueueItemArtifact, failure_signature: str) -> dict[str, Any]:
    """Assemble the mapping that is fingerprinted to identify a churn-gate record.

    The same mapping is used both when recording a strike and when looking the
    record up again, so the two must stay in sync.
    """
    queue_type = item.queue_type
    identity = _durable_recovery_identity(item)
    return {
        "surface": "recovery_churn_gate",
        "version": _RECOVERY_CHURN_GATE_VERSION,
        "queue_type": queue_type,
        "durable_identity": identity,
        "failure_signature": failure_signature,
        "no_material_change": True,
    }
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _load_recovery_churn_gate_state(
    *,
    store: ExecutionStore,
    project_id: str,
    item: FailedQueueItemArtifact,
    failure_signature: str,
) -> dict[str, Any]:
    """Look up the most recent churn-gate error task for *item* and summarize it.

    The record is located by project id and by the fingerprint the store
    derives from ``_recovery_churn_fingerprint_inputs``; the newest row (by
    ``created_at``) wins.

    Returns:
        A dict with ``fingerprint``, ``churn_key``, ``error_task_id``,
        ``status``, ``occurrence_count``, ``threshold`` and ``threshold_met``.
        When no row exists, ``error_task_id`` and ``status`` are ``None``,
        ``occurrence_count`` is ``0`` and ``threshold_met`` is ``False``.
    """
    fingerprint = store._fingerprint_from_inputs(  # type: ignore[attr-defined]
        _recovery_churn_fingerprint_inputs(item=item, failure_signature=failure_signature)
    )
    lookup_sql = (
        "SELECT error_task_id, status, occurrence_count "
        "FROM error_tasks WHERE project_id=? AND fingerprint=? "
        "ORDER BY created_at DESC LIMIT 1"
    )
    with store._connect() as conn:
        row = conn.execute(lookup_sql, (project_id, fingerprint)).fetchone()

    found = row is not None
    occurrences = int(row["occurrence_count"] or 0) if found else 0
    return {
        "fingerprint": fingerprint,
        "churn_key": _recovery_churn_key(item=item, failure_signature=failure_signature),
        "error_task_id": str(row["error_task_id"] or "") if found else None,
        "status": str(row["status"] or "") if found else None,
        "occurrence_count": occurrences,
        "threshold": _RECOVERY_CHURN_GATE_THRESHOLD,
        "threshold_met": found and occurrences >= _RECOVERY_CHURN_GATE_THRESHOLD,
    }
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _record_recovery_churn_strike(
    *,
    store: ExecutionStore,
    project_id: str,
    run_id: str,
    item: FailedQueueItemArtifact,
    failure_signature: str,
    message: str,
) -> dict[str, Any]:
    """Record one no-progress recovery occurrence and return the resulting gate state.

    An error task is created through ``store.create_error_task_from_failure``
    using the churn-gate fingerprint inputs, then the gate state is re-read.
    The returned state's ``error_task_id`` is overwritten with the id that the
    store call returned.

    NOTE(review): occurrence counting is presumably handled by the store when
    the same fingerprint is submitted again - confirm against ExecutionStore.
    """
    identity = _durable_recovery_identity(item)
    fingerprint_inputs = _recovery_churn_fingerprint_inputs(item=item, failure_signature=failure_signature)
    operator_guidance = [
        "Review repeated recovery churn on the same durable identity.",
        "Apply a material fix before retrying recovery again.",
    ]
    task_id = store.create_error_task_from_failure(
        project_id=project_id,
        run_id=run_id,
        plane="process_error",
        source_kind="recovery",
        source_ref=item.item_id,
        title=f"Recovery churn gate: {identity}",
        severity="high",
        error_type="recovery_churn_no_material_change",
        message=message,
        stacktrace=None,
        next_steps=operator_guidance,
        fingerprint_inputs=fingerprint_inputs,
    )
    gate_state = _load_recovery_churn_gate_state(
        store=store,
        project_id=project_id,
        item=item,
        failure_signature=failure_signature,
    )
    gate_state["error_task_id"] = task_id
    return gate_state
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _decode_failure_context_blob(raw: Any) -> dict[str, Any]:
|
|
325
|
+
if isinstance(raw, dict):
|
|
326
|
+
return dict(raw)
|
|
327
|
+
try:
|
|
328
|
+
payload = json.loads(str(raw or "{}") or "{}")
|
|
329
|
+
return payload if isinstance(payload, dict) else {}
|
|
330
|
+
except Exception:
|
|
331
|
+
return {}
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _story_churn_source_item_ids(*, conn, item: FailedQueueItemArtifact) -> list[str]:
|
|
335
|
+
if item.queue_type == "story" and str(item.story_id or "").strip():
|
|
336
|
+
rows = conn.execute(
|
|
337
|
+
"SELECT story_queue_id FROM story_queue WHERE story_id=?",
|
|
338
|
+
(str(item.story_id or "").strip(),),
|
|
339
|
+
).fetchall()
|
|
340
|
+
ids = [str(row["story_queue_id"] or "") for row in rows if str(row["story_queue_id"] or "").strip()]
|
|
341
|
+
if ids:
|
|
342
|
+
return ids
|
|
343
|
+
return [item.item_id]
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _collect_failure_evidence(
|
|
347
|
+
*,
|
|
348
|
+
item: FailedQueueItemArtifact,
|
|
349
|
+
investigation: RecoveryInvestigationArtifact | None = None,
|
|
350
|
+
extra_evidence: list[str] | None = None,
|
|
351
|
+
) -> list[str]:
|
|
352
|
+
evidence: list[str] = []
|
|
353
|
+
failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
|
|
354
|
+
for raw in (
|
|
355
|
+
item.failure_message,
|
|
356
|
+
failure_context.get("error"),
|
|
357
|
+
failure_context.get("error_type"),
|
|
358
|
+
failure_context.get("failed_stage"),
|
|
359
|
+
json.dumps(failure_context, sort_keys=True) if failure_context else None,
|
|
360
|
+
None if investigation is None else investigation.summary,
|
|
361
|
+
None if investigation is None else investigation.failure_nature,
|
|
362
|
+
*(extra_evidence or []),
|
|
363
|
+
):
|
|
364
|
+
text = str(raw or "").strip()
|
|
365
|
+
if text:
|
|
366
|
+
evidence.append(text)
|
|
367
|
+
return evidence
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _load_json_file(path: Path) -> dict[str, Any] | None:
|
|
371
|
+
try:
|
|
372
|
+
payload = json.loads(path.read_text(encoding="utf-8"))
|
|
373
|
+
except Exception:
|
|
374
|
+
return None
|
|
375
|
+
return payload if isinstance(payload, dict) else None
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _preflight_artifact_paths(*, repo_root: Path, item: FailedQueueItemArtifact) -> list[Path]:
|
|
379
|
+
candidates: list[Path] = []
|
|
380
|
+
failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
|
|
381
|
+
for raw_path in failure_context.get("artifact_paths") or []:
|
|
382
|
+
text = str(raw_path or "").strip()
|
|
383
|
+
if not text:
|
|
384
|
+
continue
|
|
385
|
+
path = Path(text)
|
|
386
|
+
candidates.append(path if path.is_absolute() else repo_root / path)
|
|
387
|
+
story_id = str(item.story_id or "").strip()
|
|
388
|
+
if story_id:
|
|
389
|
+
candidates.append(repo_root / ".devflow" / "stories" / story_id / "preflight.json")
|
|
390
|
+
seen: set[str] = set()
|
|
391
|
+
ordered: list[Path] = []
|
|
392
|
+
for candidate in candidates:
|
|
393
|
+
key = str(candidate)
|
|
394
|
+
if key in seen:
|
|
395
|
+
continue
|
|
396
|
+
seen.add(key)
|
|
397
|
+
ordered.append(candidate)
|
|
398
|
+
return ordered
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def _load_preflight_health_failure_details(*, repo_root: Path, item: FailedQueueItemArtifact) -> dict[str, Any] | None:
|
|
402
|
+
failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
|
|
403
|
+
if str(failure_context.get("failed_stage") or "").strip().lower() != "preflight":
|
|
404
|
+
return None
|
|
405
|
+
for artifact_path in _preflight_artifact_paths(repo_root=repo_root, item=item):
|
|
406
|
+
payload = _load_json_file(artifact_path)
|
|
407
|
+
if payload is None:
|
|
408
|
+
continue
|
|
409
|
+
blocking_issues = payload.get("blocking_issues") if isinstance(payload.get("blocking_issues"), list) else []
|
|
410
|
+
health_issues = [issue for issue in blocking_issues if isinstance(issue, dict) and str(issue.get("kind") or "").strip() == "health_check_failed"]
|
|
411
|
+
if not health_issues:
|
|
412
|
+
continue
|
|
413
|
+
return {
|
|
414
|
+
"artifact_path": artifact_path,
|
|
415
|
+
"report": payload,
|
|
416
|
+
"health_issues": health_issues,
|
|
417
|
+
}
|
|
418
|
+
return None
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def _get_recovery_runtime(failure_context: dict[str, Any] | None) -> dict[str, Any]:
|
|
422
|
+
if not isinstance(failure_context, dict):
|
|
423
|
+
return {"strategy_history": [], "last_success": None}
|
|
424
|
+
payload = failure_context.get("recovery_runtime")
|
|
425
|
+
if not isinstance(payload, dict):
|
|
426
|
+
return {"strategy_history": [], "last_success": None}
|
|
427
|
+
return {
|
|
428
|
+
"strategy_history": list(payload.get("strategy_history") or []),
|
|
429
|
+
"last_success": payload.get("last_success"),
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def _write_recovery_runtime(failure_context: dict[str, Any] | None, runtime: dict[str, Any]) -> dict[str, Any]:
|
|
434
|
+
payload = dict(failure_context or {})
|
|
435
|
+
payload["recovery_runtime"] = {
|
|
436
|
+
"strategy_history": list(runtime.get("strategy_history") or []),
|
|
437
|
+
"last_success": runtime.get("last_success"),
|
|
438
|
+
}
|
|
439
|
+
return payload
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def _persist_queue_failure_context(*, store: ExecutionStore, item: FailedQueueItemArtifact, failure_context: dict[str, Any]) -> None:
    """Write *failure_context* back onto the queue row that backs *item*.

    The target table and id column are looked up in ``_QUEUE_TABLE_BY_KIND`` by
    ``item.queue_type``; unknown queue types are silently ignored. The context
    is stored as sorted-key JSON in ``failure_context_json`` and ``updated_at``
    is set to the current Unix time in whole seconds.
    """
    import time  # function-scope import instead of an inline __import__("time")

    mapping = _QUEUE_TABLE_BY_KIND.get(item.queue_type)
    if mapping is None:
        return
    table, id_col = mapping
    now = int(time.time())
    with store._connect() as conn:
        # Identifiers cannot be bound as SQL parameters; table / id_col come
        # from the module-level mapping, never from the item. Values are bound.
        conn.execute(
            f"UPDATE {table} SET failure_context_json=?, updated_at=? WHERE {id_col}=?",
            (json.dumps(failure_context, sort_keys=True), now, item.item_id),
        )
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def _dedupe_strings(values: list[Any]) -> list[str]:
|
|
456
|
+
seen: set[str] = set()
|
|
457
|
+
ordered: list[str] = []
|
|
458
|
+
for raw in values:
|
|
459
|
+
value = str(raw or "").strip()
|
|
460
|
+
if not value or value in seen:
|
|
461
|
+
continue
|
|
462
|
+
seen.add(value)
|
|
463
|
+
ordered.append(value)
|
|
464
|
+
return ordered
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _normalize_success_criteria(
    *,
    verification_targets: list[RecoverySuccessCriterion] | list[dict[str, Any]] | None,
    fallback_targets: list[RecoverySuccessCriterion] | list[dict[str, Any]] | None,
) -> list[RecoverySuccessCriterion]:
    """Validate the success criteria to use for a recovery attempt.

    The first non-empty source wins: *verification_targets*, then
    *fallback_targets*, then a single built-in default criterion. Each entry is
    validated with ``RecoverySuccessCriterion.model_validate``.

    Raises:
        ValueError: if any criterion has a blank ``oracle``.
    """
    default_targets = [
        {
            "criterion": "Recovery action completed",
            "oracle": "The queue item state matches the selected recovery strategy family.",
        }
    ]
    selected = verification_targets or fallback_targets or default_targets

    def _validated(target: Any) -> RecoverySuccessCriterion:
        criterion = RecoverySuccessCriterion.model_validate(target)
        if not str(criterion.oracle or "").strip():
            raise ValueError("Recovery success criteria must include a non-empty oracle.")
        return criterion

    return [_validated(target) for target in selected]
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def _recoverable_story_state_root(*, repo_root: Path) -> Path:
|
|
488
|
+
return repo_root / ".devflow" / "stories"
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def _recovery_handoff_artifact_path(*, repo_root: Path, item: FailedQueueItemArtifact) -> Path:
|
|
492
|
+
story_id = str(item.story_id or "").strip()
|
|
493
|
+
if story_id:
|
|
494
|
+
safe_story_id = re.sub(r"[^A-Za-z0-9_.-]+", "_", story_id) or "unknown_story"
|
|
495
|
+
return _recoverable_story_state_root(repo_root=repo_root) / safe_story_id / "recovery_handoff.json"
|
|
496
|
+
safe_item_id = re.sub(r"[^A-Za-z0-9_.-]+", "_", item.item_id) or "unknown_item"
|
|
497
|
+
return repo_root / ".devflow" / "recovery_handoffs" / item.queue_type / f"{safe_item_id}.json"
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _extract_log_refs(payload: Any) -> tuple[list[str], list[str]]:
|
|
501
|
+
log_paths: list[str] = []
|
|
502
|
+
session_ids: list[str] = []
|
|
503
|
+
|
|
504
|
+
def _walk(value: Any, *, key: str | None = None) -> None:
|
|
505
|
+
if isinstance(value, dict):
|
|
506
|
+
for child_key, child_value in value.items():
|
|
507
|
+
_walk(child_value, key=str(child_key))
|
|
508
|
+
return
|
|
509
|
+
if isinstance(value, list):
|
|
510
|
+
for item in value:
|
|
511
|
+
_walk(item, key=key)
|
|
512
|
+
return
|
|
513
|
+
if not isinstance(value, str):
|
|
514
|
+
return
|
|
515
|
+
lowered = str(key or "").lower()
|
|
516
|
+
if lowered in {"log_path", "logfile", "log_file", "journal_path"} and value.strip():
|
|
517
|
+
log_paths.append(value.strip())
|
|
518
|
+
elif lowered == "session_id" and value.strip():
|
|
519
|
+
session_ids.append(value.strip())
|
|
520
|
+
|
|
521
|
+
_walk(payload)
|
|
522
|
+
return _dedupe_strings(log_paths), _dedupe_strings(session_ids)
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def _read_log_excerpt(*, path: Path, max_lines: int = 60, max_chars: int = 6000) -> str:
|
|
526
|
+
try:
|
|
527
|
+
if path.suffix == ".jsonl":
|
|
528
|
+
lines = path.read_text(encoding="utf-8").splitlines()[-max_lines:]
|
|
529
|
+
rendered: list[str] = []
|
|
530
|
+
for raw in lines:
|
|
531
|
+
try:
|
|
532
|
+
record = json.loads(raw)
|
|
533
|
+
except Exception:
|
|
534
|
+
rendered.append(raw)
|
|
535
|
+
continue
|
|
536
|
+
stream = str(record.get("stream") or "log").strip()
|
|
537
|
+
line = str(record.get("line") or "").rstrip()
|
|
538
|
+
if line:
|
|
539
|
+
rendered.append(f"[{stream}] {line}")
|
|
540
|
+
return "\\n".join(rendered)[-max_chars:]
|
|
541
|
+
return path.read_text(encoding="utf-8")[-max_chars:]
|
|
542
|
+
except Exception:
|
|
543
|
+
return ""
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def _load_llm_session_log_refs(*, run_id: str | None, node_exec_id: str | None) -> tuple[list[str], list[str]]:
    """Look up journal log paths and session ids for a run and/or node execution.

    Queries ``dev_journal_entries`` in the database returned by
    ``llm_sessions_db()`` for rows matching *node_exec_id* OR *run_id*
    (whichever are given), newest first, at most six rows.

    Returns:
        ``(log_paths, session_ids)``, de-duplicated via ``_dedupe_strings``.
        ``([], [])`` when the database file does not exist, when neither id is
        given, or when the query fails for any reason.
    """
    db_path = llm_sessions_db()
    if not db_path.exists() or not (run_id or node_exec_id):
        return [], []

    filters = [
        (column, value)
        for column, value in (("node_exec_id", node_exec_id), ("run_id", run_id))
        if value
    ]
    where_clause = " OR ".join(f"{column}=?" for column, _ in filters)
    query = (
        "SELECT session_id, log_path FROM dev_journal_entries WHERE "
        + where_clause
        + " ORDER BY started_at DESC LIMIT 6"
    )
    params = tuple(value for _, value in filters)

    try:
        conn = sqlite3.connect(str(db_path))
        conn.row_factory = sqlite3.Row
        try:
            rows = conn.execute(query, params).fetchall()
        finally:
            conn.close()
    except Exception:
        # Best-effort lookup: a missing table or locked database yields no refs.
        return [], []

    # _dedupe_strings also strips values and drops blanks / NULLs.
    log_paths = _dedupe_strings([row["log_path"] for row in rows])
    session_ids = _dedupe_strings([row["session_id"] for row in rows])
    return log_paths, session_ids
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def _gather_log_first_recovery_evidence(*, store: ExecutionStore, repo_root: Path, item: FailedQueueItemArtifact) -> dict[str, Any]:
|
|
575
|
+
failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
|
|
576
|
+
run_id = str(failure_context.get("implementation_run_id") or "").strip()
|
|
577
|
+
failed_stage = str(failure_context.get("actual_failed_node") or failure_context.get("failed_stage") or "").strip()
|
|
578
|
+
if not run_id or not failed_stage:
|
|
579
|
+
return {"available": False, "source": None, "refs": [], "session_ids": [], "excerpt": "", "stage": failed_stage or None}
|
|
580
|
+
node = store.get_latest_node_attempt(run_id=run_id, node_id=failed_stage)
|
|
581
|
+
node_output = node.get("output") if isinstance(node, dict) and isinstance(node.get("output"), dict) else {}
|
|
582
|
+
log_paths, session_ids = _extract_log_refs(node_output)
|
|
583
|
+
journal_log_paths, journal_session_ids = _load_llm_session_log_refs(run_id=run_id, node_exec_id=None if not isinstance(node, dict) else str(node.get("node_exec_id") or "") or None)
|
|
584
|
+
log_paths = _dedupe_strings([*log_paths, *journal_log_paths])
|
|
585
|
+
session_ids = _dedupe_strings([*session_ids, *journal_session_ids])
|
|
586
|
+
excerpt = ""
|
|
587
|
+
used_refs: list[str] = []
|
|
588
|
+
for raw_path in log_paths:
|
|
589
|
+
path = Path(raw_path).expanduser()
|
|
590
|
+
if not path.is_absolute():
|
|
591
|
+
path = repo_root / raw_path
|
|
592
|
+
if not path.exists() or not path.is_file():
|
|
593
|
+
continue
|
|
594
|
+
excerpt = _read_log_excerpt(path=path)
|
|
595
|
+
if excerpt:
|
|
596
|
+
used_refs.append(str(path))
|
|
597
|
+
break
|
|
598
|
+
refs = _dedupe_strings([*used_refs, *session_ids])
|
|
599
|
+
source = "streamed_agent_logs" if used_refs else ("llm_session_journal" if session_ids else None)
|
|
600
|
+
return {
|
|
601
|
+
"available": bool(source),
|
|
602
|
+
"source": source,
|
|
603
|
+
"refs": refs,
|
|
604
|
+
"session_ids": session_ids,
|
|
605
|
+
"excerpt": excerpt,
|
|
606
|
+
"stage": failed_stage,
|
|
607
|
+
"implementation_run_id": run_id,
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def _enrich_investigation_with_log_evidence(*, investigation: RecoveryInvestigationArtifact, log_evidence: dict[str, Any]) -> RecoveryInvestigationArtifact:
|
|
612
|
+
if not log_evidence.get("available"):
|
|
613
|
+
return investigation
|
|
614
|
+
evidence = list(investigation.evidence or [])
|
|
615
|
+
source = str(log_evidence.get("source") or "streamed_agent_logs")
|
|
616
|
+
source_ref = ", ".join(str(ref) for ref in (log_evidence.get("refs") or []) if str(ref).strip())
|
|
617
|
+
source_line = f"primary_evidence_source={source}" + (f" ({source_ref})" if source_ref else "")
|
|
618
|
+
if source_line not in evidence:
|
|
619
|
+
evidence.insert(0, source_line)
|
|
620
|
+
return investigation.model_copy(update={
|
|
621
|
+
"evidence": evidence[:8],
|
|
622
|
+
"primary_evidence_source": investigation.primary_evidence_source or source,
|
|
623
|
+
"primary_evidence_refs": list(investigation.primary_evidence_refs or []) or [str(ref) for ref in (log_evidence.get("refs") or []) if str(ref).strip()],
|
|
624
|
+
"primary_log_insight": investigation.primary_log_insight or investigation.summary,
|
|
625
|
+
})
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def _persist_recovery_handoff_artifact(
    *,
    repo_root: Path,
    recovery_run_id: str,
    item: FailedQueueItemArtifact,
    investigation: RecoveryInvestigationArtifact | None,
    diagnosis: RecoveryDiagnosisArtifact | None,
    execution: RecoveryExecutionArtifact | None,
    pre_replay: PreReplayCheckArtifact | None,
) -> Path | None:
    """Write a recovery handoff JSON file summarizing what recovery learned.

    Nothing is written (and ``None`` is returned) unless at least one of these
    holds: an investigation exists, a diagnosis exists, the execution carries a
    non-blank verification/execution summary, or the pre-replay check lists
    blocking reasons.

    Field fallbacks:

    * ``failing_surface_summary``: investigation summary, else the item's
      failure message, else the execution summary, else the item id.
    * ``likely_seam``: investigation's affected boundary, else the failed stage
      from the failure context, else the diagnosis strategy, else ``None``.

    Returns:
        The path of the written file (see ``_recovery_handoff_artifact_path``).
    """
    has_execution_summary = execution is not None and bool(
        str(execution.verification_summary or execution.execution_summary or "").strip()
    )
    has_blockers = pre_replay is not None and bool(pre_replay.blocking_reasons)
    if investigation is None and diagnosis is None and not has_execution_summary and not has_blockers:
        return None

    failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
    failed_stage = (
        str(failure_context.get("actual_failed_node") or failure_context.get("failed_stage") or "").strip() or None
    )

    non_convergence = None if investigation is None else investigation.non_convergence
    disproven_dead_ends: list[str] = []
    if non_convergence is not None and non_convergence.unchanged_test_surface:
        disproven_dead_ends.append("Repeated retries did not materially change the failing surface.")
    if non_convergence is not None and non_convergence.wrong_seam:
        disproven_dead_ends.append("Prior recovery attempts stayed on the wrong seam.")

    # A conditional expression binds looser than `or`. The previous
    # `str(a or b if execution else c)` therefore ignored the failure message
    # whenever there was no execution; each fallback is now parenthesized.
    failing_surface_summary = (None if investigation is None else investigation.summary) or str(
        item.failure_message
        or (None if execution is None else execution.execution_summary)
        or item.item_id
    )
    # Same precedence problem: the failed stage used to be dropped whenever no
    # diagnosis was available.
    likely_seam = (
        (None if investigation is None else investigation.affected_boundary)
        or failed_stage
        or str((None if diagnosis is None else diagnosis.strategy) or "").strip()
        or None
    )

    payload = RecoveryHandoffArtifact(
        queue_type=item.queue_type,
        item_id=item.item_id,
        story_id=item.story_id,
        implementation_run_id=str(failure_context.get("implementation_run_id") or "").strip() or None,
        failed_stage=failed_stage,
        primary_evidence_source=(None if investigation is None else investigation.primary_evidence_source)
        or (None if execution is None else execution.primary_evidence_source),
        primary_evidence_refs=([] if investigation is None else investigation.primary_evidence_refs)
        or ([] if execution is None else execution.primary_evidence_refs),
        key_log_insight=(None if investigation is None else investigation.primary_log_insight)
        or (None if execution is None else execution.primary_log_insight),
        failing_surface_summary=failing_surface_summary,
        likely_seam=likely_seam,
        disproven_dead_ends=disproven_dead_ends[:4],
        verification_blockers=[]
        if pre_replay is None
        else [str(reason) for reason in pre_replay.blocking_reasons if str(reason).strip()][:4],
        non_convergence_insight=None if non_convergence is None else non_convergence.reason,
        produced_by_recovery_run_id=recovery_run_id,
    )
    path = _recovery_handoff_artifact_path(repo_root=repo_root, item=item)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload.model_dump(), indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return path
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def _build_diagnosis(
    *,
    item: FailedQueueItemArtifact,
    investigation: RecoveryInvestigationArtifact | None,
    prior_execution: RecoveryExecutionArtifact | None = None,
    prior_verification: PreReplayCheckArtifact | None = None,
    attempt: int = 1,
) -> tuple[RecoveryDiagnosisArtifact, RemediationPlanArtifact]:
    """Run the ``recovery_diagnosis`` agent step and derive a remediation plan.

    The agent's diagnosis is then normalized: queue type and item id are forced
    to match *item*; blank summary, rationale, replay path and evidence fall
    back to the investigation's values (evidence is capped at 8 entries); and
    verification targets are validated via ``_normalize_success_criteria``.

    Returns:
        ``(diagnosis, plan)`` where the plan's steps are the criterion texts of
        the diagnosis' verification targets.
    """

    def _dump(artifact: Any) -> Any:
        return None if artifact is None else artifact.model_dump()

    def _investigated(attribute: str) -> Any:
        # Value taken from the investigation, if there is one.
        return None if investigation is None else getattr(investigation, attribute)

    repo_root = _repo_root()
    context_payload = {
        "failed_item": item.model_dump(),
        "investigation": _dump(investigation),
        "prior_execution": _dump(prior_execution),
        "prior_verification": _dump(prior_verification),
        "attempt": attempt,
    }
    diagnosis, _ = run_agent_step(
        repo_root=repo_root,
        stage_name="recovery_diagnosis",
        output_model=RecoveryDiagnosisArtifact,
        context_payload=context_payload,
        guidance=load_agentic_prompt_lines("recovery_diagnosis"),
        timeout_seconds=300,
        strength=_CURRENT_STRENGTH,
    )

    targets = _normalize_success_criteria(
        verification_targets=diagnosis.verification_targets,
        fallback_targets=_investigated("success_criteria"),
    )
    summary = str(diagnosis.summary or _investigated("summary") or "Recovery diagnosis").strip()
    rationale = str(diagnosis.rationale or _investigated("failure_nature") or "Recovery diagnosis").strip()
    replay_path = str(diagnosis.replay_path or _investigated("replay_path") or "") or None
    raw_evidence = diagnosis.evidence or _investigated("evidence") or []
    evidence = [str(entry) for entry in raw_evidence if str(entry).strip()][:8]

    enforced = diagnosis.model_copy(
        update={
            "queue_type": item.queue_type,
            "item_id": item.item_id,
            "summary": summary,
            "rationale": rationale,
            "verification_targets": targets,
            "replay_path": replay_path,
            "evidence": evidence,
        }
    )
    plan = RemediationPlanArtifact(
        queue_type=item.queue_type,
        action=enforced.suggested_action,
        summary=enforced.summary,
        steps=[criterion.criterion for criterion in enforced.verification_targets],
        replay_path=enforced.replay_path,
    )
    return enforced, plan
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def _record_recovery_attempt(
    *,
    item: FailedQueueItemArtifact,
    diagnosis: RecoveryDiagnosisArtifact | None = None,
    success: bool = False,
    failure_signature: str | None = None,
    material_change: bool | None = None,
    remediation_artifact: str | None = None,
) -> dict[str, Any]:
    """Return the item's failure context with this recovery attempt recorded.

    With a diagnosis, an entry (``strategy``, ``summary``, ``success`` plus any
    of ``failure_signature`` / ``material_change`` / ``remediation_artifact``
    that are not ``None``) is appended to ``recovery_runtime.strategy_history``.
    If the attempt succeeded, ``recovery_runtime.last_success`` is updated too.
    Without a diagnosis the runtime is only re-normalized.

    The item itself is not modified; the updated context is returned.
    """
    context = item.failure_context if isinstance(item.failure_context, dict) else {}
    runtime = _get_recovery_runtime(context)

    if diagnosis is not None:
        optional_fields = {
            "failure_signature": failure_signature,
            "material_change": material_change,
            "remediation_artifact": remediation_artifact,
        }
        entry: dict[str, Any] = {
            "strategy": diagnosis.strategy,
            "summary": diagnosis.summary,
            "success": success,
            **{key: value for key, value in optional_fields.items() if value is not None},
        }
        runtime["strategy_history"].append(entry)
        if success:
            runtime["last_success"] = {
                "strategy": diagnosis.strategy,
                "failure_signature": failure_signature,
                "material_change": material_change,
            }

    return _write_recovery_runtime(context, runtime)
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
# Lower-cased failed_stage values that the non-convergence analysis treats as a
# downstream blocker.
_DOWNSTREAM_RECOVERY_STAGES = {"redreview", "security", "verifygreen", "gitcommit(refactor)"}
|
|
757
|
+
|
|
758
|
+
_SCHEMA_RUNTIME_DRIFT_MARKERS = (
|
|
759
|
+
"schema drift",
|
|
760
|
+
"runtime drift",
|
|
761
|
+
"contract drift",
|
|
762
|
+
"migration mismatch",
|
|
763
|
+
"serialization",
|
|
764
|
+
"deserial",
|
|
765
|
+
"no tests collected",
|
|
766
|
+
"collected 0 items",
|
|
767
|
+
"pytest import error",
|
|
768
|
+
"vitest",
|
|
769
|
+
)
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
def _build_non_convergence_analysis(
    *,
    item: FailedQueueItemArtifact,
    failure_signature: str,
    occurrence_count: int,
    remediation_artifact: str | None = None,
) -> RecoveryNonConvergenceArtifact:
    """Explain why repeated recovery attempts on *item* did not converge.

    Works from the ``recovery_runtime.strategy_history`` stored in the item's
    failure context, restricted to entries whose failure signature equals
    *failure_signature* or is blank. From those it derives:

    * ``same_failure_surface``: a signature is given and no reviewed attempt
      carries a different one;
    * ``unchanged_test_surface``: same surface and every reviewed attempt
      recorded ``material_change`` as ``False``;
    * ``measurable_progress``: any reviewed attempt succeeded or recorded
      ``material_change`` as ``True``;
    * ``downstream_blocker``: ``failed_stage`` is in ``_DOWNSTREAM_RECOVERY_STAGES``;
    * ``schema_or_runtime_drift``: a ``_SCHEMA_RUNTIME_DRIFT_MARKERS`` substring
      occurs in the failure text or attempt summaries/artifacts;
    * ``wrong_seam``: same surface with no progress, no downstream blocker and
      no drift, while attempts used a single strategy or a single remediation
      artifact (or a current *remediation_artifact* is given).
    """

    def _text(entry: dict[str, Any], key: str) -> str:
        return str(entry.get(key) or "").strip()

    def _distinct(entries: list[dict[str, Any]], key: str) -> set[str]:
        return {value for value in (_text(entry, key) for entry in entries) if value}

    context = item.failure_context if isinstance(item.failure_context, dict) else {}
    runtime = _get_recovery_runtime(context)
    history = [entry for entry in (runtime.get("strategy_history") or []) if isinstance(entry, dict)]
    matching_history = [
        entry for entry in history if _text(entry, "failure_signature") in ("", failure_signature)
    ]

    attempts_reviewed = max(int(occurrence_count or 0), len(matching_history))
    # NOTE: the window is never smaller than len(matching_history), so every
    # matching entry ends up in recent_attempts.
    window = max(attempts_reviewed, _RECOVERY_CHURN_GATE_THRESHOLD)
    recent_attempts = matching_history[-window:]

    seen_signatures = _distinct(recent_attempts, "failure_signature")
    same_failure_surface = bool(failure_signature) and (
        not seen_signatures or seen_signatures == {failure_signature}
    )
    unchanged_test_surface = same_failure_surface and all(
        entry.get("material_change") is False for entry in recent_attempts
    )
    measurable_progress = any(
        bool(entry.get("success")) or entry.get("material_change") is True for entry in recent_attempts
    )

    failed_stage = str(context.get("failed_stage") or "").strip().lower() or None
    downstream_blocker = bool(failed_stage and failed_stage in _DOWNSTREAM_RECOVERY_STAGES)
    strategy_set = _distinct(recent_attempts, "strategy")
    artifact_set = _distinct(recent_attempts, "remediation_artifact")

    text_parts = [
        str(item.failure_message or ""),
        str(context.get("error") or ""),
        str(context.get("error_type") or ""),
        str(context.get("failed_stage") or ""),
    ]
    text_parts.extend(str(entry.get("summary") or "") for entry in recent_attempts)
    text_parts.extend(str(entry.get("remediation_artifact") or "") for entry in recent_attempts)
    combined_text = "\n".join(text_parts).lower()
    schema_or_runtime_drift = any(marker in combined_text for marker in _SCHEMA_RUNTIME_DRIFT_MARKERS)

    single_approach = len(strategy_set) == 1 or len(artifact_set) == 1 or remediation_artifact
    wrong_seam = bool(
        same_failure_surface
        and not measurable_progress
        and not downstream_blocker
        and not schema_or_runtime_drift
        and single_approach
    )

    reason_candidates = [
        (unchanged_test_surface, "the same failure surface stayed unchanged across attempts"),
        (wrong_seam, "recovery kept editing the wrong seam"),
        (
            downstream_blocker and failed_stage,
            f"the blocking node stayed downstream at {failed_stage} rather than the green boundary",
        ),
        (schema_or_runtime_drift, "schema/runtime drift kept invalidating the attempted fixes"),
        (not measurable_progress, "there was no measurable improvement across the three attempts"),
    ]
    summary_bits = [text for applies, text in reason_candidates if applies]
    if not summary_bits:
        summary_bits = ["three attempts did not materially change the failing boundary"]

    evidence_candidates = [
        (True, f"failure_signature={failure_signature}"),
        (True, f"attempts_reviewed={attempts_reviewed}"),
        (failed_stage, f"failed_stage={failed_stage}"),
        (strategy_set, "strategies=" + ", ".join(sorted(strategy_set))),
        (artifact_set, "remediation_artifacts=" + ", ".join(sorted(artifact_set))),
        (unchanged_test_surface, "all reviewed attempts recorded material_change=false"),
        (
            remediation_artifact and remediation_artifact not in artifact_set,
            f"current_remediation_artifact={remediation_artifact}",
        ),
    ]
    evidence = [text for applies, text in evidence_candidates if applies]

    return RecoveryNonConvergenceArtifact(
        summary="Non-convergence analysis for repeated recovery churn.",
        reason="; ".join(summary_bits),
        attempts_reviewed=attempts_reviewed,
        same_failure_surface=same_failure_surface,
        unchanged_test_surface=unchanged_test_surface,
        wrong_seam=wrong_seam,
        downstream_blocker=downstream_blocker,
        downstream_blocker_stage=failed_stage if downstream_blocker else None,
        schema_or_runtime_drift=schema_or_runtime_drift,
        measurable_progress=measurable_progress,
        evidence=evidence[:8],
    )
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def _build_churn_gate_investigation(
    *,
    item: FailedQueueItemArtifact,
    failure_signature: str,
    occurrence_count: int,
    threshold: int,
    churn_key: str | None = None,
    remediation_artifact: str | None = None,
) -> RecoveryInvestigationArtifact:
    """Assemble the investigation artifact for an item that tripped the recovery churn gate.

    The artifact is built from the stored failure data and the non-convergence
    analysis; the replay path is always ``manual_review_required``.

    Args:
        item: The failed queue item under recovery.
        failure_signature: Signature of the repeated failure.
        occurrence_count: Number of no-progress attempts recorded so far.
        threshold: Attempt count at which the churn gate trips.
        churn_key: Optional churn identity, forwarded as extra evidence.
        remediation_artifact: Optional artifact reference forwarded to the
            non-convergence analysis.

    Returns:
        A ``RecoveryInvestigationArtifact`` carrying the non-convergence analysis.
    """
    raw_context = item.failure_context
    context = raw_context if isinstance(raw_context, dict) else {}
    # Normalised stage name; ``None`` when the context does not name one.
    stage = str(context.get("failed_stage") or "").strip().lower() or None

    analysis = _build_non_convergence_analysis(
        item=item,
        failure_signature=failure_signature,
        occurrence_count=occurrence_count,
        remediation_artifact=remediation_artifact,
    )

    # First non-empty description of the failure, falling back to a fixed label.
    nature = str(
        item.failure_message
        or context.get("error")
        or failure_signature
        or "Repeated recovery failure"
    ).strip()
    collected = _collect_failure_evidence(
        item=item,
        extra_evidence=[*analysis.evidence, churn_key or ""],
    )

    headline = (
        f"Recovery churn gate hit after {occurrence_count}/{threshold} no-progress attempts on "
        f"{stage or failure_signature or item.item_id}."
    )
    boundary = stage or failure_signature or item.queue_type
    if occurrence_count >= threshold:
        confidence = "high"
    else:
        confidence = "medium"

    criteria = [
        {
            "criterion": "Underlying failure boundary identified",
            "oracle": "The investigation names the concrete failing node/stage and the defect surface that stayed broken.",
        },
        {
            "criterion": "Non-convergence reason identified",
            "oracle": "The investigation explains why three attempts made no measurable progress and whether the repeated work stayed on the wrong seam, on an unchanged test surface, or behind downstream/runtime drift.",
        },
    ]
    verification = [
        f"Churn gate occurrence count reached {occurrence_count}/{threshold}.",
        analysis.reason,
    ]
    escalation = [
        "Do not requeue again until the non-convergence reason has a concrete fix plan.",
        "Require a materially different intervention when the same story/node has failed three times with no measurable improvement.",
    ]

    return RecoveryInvestigationArtifact(
        queue_type=item.queue_type,
        item_id=item.item_id,
        summary=headline,
        failure_nature=nature,
        evidence=collected[:8],
        primary_evidence_source="recovery_churn_history",
        primary_log_insight=analysis.reason,
        affected_boundary=boundary,
        likely_failed_stage=stage,
        confidence=confidence,
        recovery_goal="Explain both the underlying failure and why implementation did not converge before any further replay.",
        success_criteria=criteria,
        verification_evidence=verification,
        replay_path="manual_review_required",
        escalation_conditions=escalation,
        non_convergence=analysis,
    )
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def _build_story_replay_metadata(*, item: FailedQueueItemArtifact, diagnosis: RecoveryDiagnosisArtifact | None, execution: RecoveryExecutionArtifact | None) -> dict[str, Any] | None:
|
|
919
|
+
if item.queue_type != "story":
|
|
920
|
+
return None
|
|
921
|
+
failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
|
|
922
|
+
prior_run_id = str(failure_context.get("implementation_run_id") or "").strip()
|
|
923
|
+
failed_stage = str(failure_context.get("failed_stage") or "").strip().lower()
|
|
924
|
+
if not prior_run_id or not failed_stage:
|
|
925
|
+
return None
|
|
926
|
+
canonical_order = [
|
|
927
|
+
"normalize",
|
|
928
|
+
"preflight",
|
|
929
|
+
"dependencyassessment",
|
|
930
|
+
"storyimplementationplanning",
|
|
931
|
+
"testdesign",
|
|
932
|
+
"red",
|
|
933
|
+
"redreview",
|
|
934
|
+
"storysufficiencyreconciliation",
|
|
935
|
+
"green",
|
|
936
|
+
"refactor",
|
|
937
|
+
"security",
|
|
938
|
+
"gitcommit(refactor)",
|
|
939
|
+
]
|
|
940
|
+
if failed_stage not in canonical_order:
|
|
941
|
+
return None
|
|
942
|
+
invalidated = {failed_stage}
|
|
943
|
+
invalidated.update(str(item).strip().lower() for item in (failure_context.get("invalidated_stages") or []) if str(item).strip())
|
|
944
|
+
for criterion in ((execution.success_criteria if execution is not None else None) or []):
|
|
945
|
+
criterion_text = f"{criterion.criterion} {criterion.oracle}".lower()
|
|
946
|
+
for stage_id in canonical_order:
|
|
947
|
+
if stage_id in criterion_text and stage_id != failed_stage:
|
|
948
|
+
invalidated.add(stage_id)
|
|
949
|
+
valid_prior = [stage_id for stage_id in canonical_order if stage_id not in invalidated and canonical_order.index(stage_id) < canonical_order.index(failed_stage)]
|
|
950
|
+
repaired_artifacts = failure_context.get("repaired_artifacts")
|
|
951
|
+
if not isinstance(repaired_artifacts, list):
|
|
952
|
+
repaired_artifacts = []
|
|
953
|
+
replay_metadata = {
|
|
954
|
+
"resume_from_stage": failed_stage,
|
|
955
|
+
"valid_prior_stages": valid_prior,
|
|
956
|
+
"invalidated_stages": [stage_id for stage_id in canonical_order if stage_id in invalidated],
|
|
957
|
+
"repaired_artifacts": repaired_artifacts,
|
|
958
|
+
"prior_run_id": prior_run_id,
|
|
959
|
+
"strategy": None if diagnosis is None else diagnosis.strategy,
|
|
960
|
+
}
|
|
961
|
+
return replay_metadata
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
_TEST_RUNTIME_STAGE_IDS = {"red", "redreview", "green", "verifygreen", "gitcommit(refactor)"}
|
|
965
|
+
_TEST_RUNTIME_FAILURE_PATTERNS = (
|
|
966
|
+
"story test validation failed",
|
|
967
|
+
"story-scoped test sufficiency failed",
|
|
968
|
+
"insufficient_story_tests",
|
|
969
|
+
"test validation",
|
|
970
|
+
"no story-scoped tests discovered",
|
|
971
|
+
"no tests collected",
|
|
972
|
+
"collected 0 items",
|
|
973
|
+
"pytest",
|
|
974
|
+
"vitest",
|
|
975
|
+
)
|
|
976
|
+
|
|
977
|
+
_PYTESTMARK_BLOCK_START_RE = re.compile(r"^\s*pytestmark\s*=\s*\[")
|
|
978
|
+
_OPEN_FROM_IMPORT_RE = re.compile(r"^\s*from\b.+\bimport\s*\(\s*$")
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
def _count_parens(text: str) -> int:
|
|
982
|
+
return text.count("(") - text.count(")")
|
|
983
|
+
|
|
984
|
+
|
|
985
|
+
def _find_pytestmark_block(lines: list[str], *, start_index: int) -> tuple[int, int] | None:
    """Locate the first ``pytestmark = [`` list at or after *start_index*.

    Only the first matching start line is considered. Square brackets are
    counted line by line until they balance.

    Returns:
        ``(start, end)`` with ``end`` exclusive when the brackets balance to
        exactly zero, otherwise ``None`` (no start line, an unterminated list,
        or more closing than opening brackets).
    """
    total = len(lines)
    first = next(
        (pos for pos in range(start_index, total) if _PYTESTMARK_BLOCK_START_RE.match(lines[pos])),
        None,
    )
    if first is None:
        return None

    depth = 0
    stop = first
    while stop < total:
        current = lines[stop]
        depth += current.count("[") - current.count("]")
        stop += 1
        if depth <= 0:
            break
    return (first, stop) if depth == 0 else None
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
def _repair_malformed_story_pytestmark_import_block(*, text: str) -> str | None:
    """Move a ``pytestmark = [...]`` block out of an open ``from ... import (`` statement.

    For every line that opens a parenthesised ``from`` import, the first
    ``pytestmark`` list found after it is treated as the candidate block. The
    closing line of the import is found by balancing parentheses while skipping
    the candidate block. The block is then re-inserted directly after that
    closing line, padded with blank lines where the neighbours are not blank.

    Args:
        text: Source text of the test module.

    Returns:
        The repaired, newline-terminated source, or ``None`` when no candidate
        relocation both changes the text and compiles.
    """
    lines = text.splitlines()
    for import_start, line in enumerate(lines):
        if not _OPEN_FROM_IMPORT_RE.match(line):
            continue
        block = _find_pytestmark_block(lines, start_index=import_start + 1)
        if block is None:
            continue
        pytestmark_start, pytestmark_end = block

        # Find the line that closes the import, ignoring the pytestmark lines.
        paren_balance = _count_parens(line)
        closing_index: int | None = None
        cursor = import_start + 1
        while cursor < len(lines):
            if pytestmark_start <= cursor < pytestmark_end:
                cursor = pytestmark_end
                continue
            paren_balance += _count_parens(lines[cursor])
            if paren_balance <= 0:
                closing_index = cursor
                break
            cursor += 1
        if closing_index is None:
            continue
        if closing_index < pytestmark_start:
            # The import closed before the pytestmark block, so the block is not
            # inside this import. Shifting the closing index by the block length
            # would give a wrong (possibly negative) insertion point.
            continue

        pytestmark_lines = lines[pytestmark_start:pytestmark_end]
        repaired_lines = lines[:pytestmark_start] + lines[pytestmark_end:]
        # The closing line sits after the removed block, so it moved up by the block length.
        adjusted_closing_index = closing_index - (pytestmark_end - pytestmark_start)
        insert_at = adjusted_closing_index + 1
        prefix = repaired_lines[:insert_at]
        suffix = repaired_lines[insert_at:]
        updated_lines = [
            *prefix,
            *([""] if prefix and prefix[-1].strip() else []),
            *pytestmark_lines,
            *([""] if suffix and suffix[0].strip() else []),
            *suffix,
        ]
        updated_text = "\n".join(updated_lines).rstrip() + "\n"
        if updated_text == text:
            continue
        # Accept the relocation only if the result is syntactically valid Python.
        try:
            compile(updated_text, "<recovered_story_test>", "exec")
        except SyntaxError:
            continue
        return updated_text
    return None
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
def _repair_story_test_files_for_runtime_failure(*, repo_root: Path, test_paths: list[str]) -> list[str]:
|
|
1048
|
+
changed: list[str] = []
|
|
1049
|
+
for raw_path in test_paths:
|
|
1050
|
+
relative = str(raw_path or "").strip()
|
|
1051
|
+
if not relative or not relative.endswith(".py"):
|
|
1052
|
+
continue
|
|
1053
|
+
candidate = repo_root / relative
|
|
1054
|
+
if not candidate.exists() or not candidate.is_file():
|
|
1055
|
+
continue
|
|
1056
|
+
try:
|
|
1057
|
+
original = candidate.read_text(encoding="utf-8")
|
|
1058
|
+
compile(original, str(candidate), "exec")
|
|
1059
|
+
continue
|
|
1060
|
+
except SyntaxError:
|
|
1061
|
+
pass
|
|
1062
|
+
except Exception:
|
|
1063
|
+
continue
|
|
1064
|
+
repaired = _repair_malformed_story_pytestmark_import_block(text=original)
|
|
1065
|
+
if repaired is None:
|
|
1066
|
+
continue
|
|
1067
|
+
candidate.write_text(repaired, encoding="utf-8")
|
|
1068
|
+
changed.append(relative)
|
|
1069
|
+
return changed
|
|
1070
|
+
|
|
1071
|
+
|
|
1072
|
+
def _maybe_repair_story_test_runtime_contract(*, repo_root: Path, item: FailedQueueItemArtifact) -> dict[str, Any] | None:
|
|
1073
|
+
if item.queue_type != "story":
|
|
1074
|
+
return None
|
|
1075
|
+
story_id = str(item.story_id or "").strip()
|
|
1076
|
+
if not story_id:
|
|
1077
|
+
return None
|
|
1078
|
+
failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
|
|
1079
|
+
failed_stage = str(failure_context.get("failed_stage") or "").strip().lower()
|
|
1080
|
+
if failed_stage not in _TEST_RUNTIME_STAGE_IDS:
|
|
1081
|
+
return None
|
|
1082
|
+
current_contract = load_story_test_runtime_contract(repo_root=repo_root, story_id=story_id) or {}
|
|
1083
|
+
current_story_uuid = str(current_contract.get("story_uuid") or "").strip() or None
|
|
1084
|
+
failure_blob = " ".join(
|
|
1085
|
+
str(part or "")
|
|
1086
|
+
for part in (
|
|
1087
|
+
item.failure_message,
|
|
1088
|
+
failure_context.get("error"),
|
|
1089
|
+
failure_context.get("error_type"),
|
|
1090
|
+
)
|
|
1091
|
+
).lower()
|
|
1092
|
+
fresh_test_paths = [str(path) for path in (current_contract.get("test_paths") or []) if str(path).strip()]
|
|
1093
|
+
if not fresh_test_paths:
|
|
1094
|
+
fresh_test_paths = discover_story_scoped_test_paths(
|
|
1095
|
+
repo_root=repo_root,
|
|
1096
|
+
story_id=story_id,
|
|
1097
|
+
story_uuid=current_story_uuid,
|
|
1098
|
+
)
|
|
1099
|
+
if current_contract:
|
|
1100
|
+
fresh_contract = dict(current_contract)
|
|
1101
|
+
if fresh_test_paths:
|
|
1102
|
+
fresh_contract["test_paths"] = fresh_test_paths
|
|
1103
|
+
else:
|
|
1104
|
+
fresh_contract = resolve_story_runtime_contract(
|
|
1105
|
+
repo_root=repo_root,
|
|
1106
|
+
story_id=story_id,
|
|
1107
|
+
story_uuid=current_story_uuid,
|
|
1108
|
+
test_paths=fresh_test_paths,
|
|
1109
|
+
prefer_story_contract=False,
|
|
1110
|
+
)
|
|
1111
|
+
fresh_contract = normalize_recovery_story_runtime_contract(
|
|
1112
|
+
repo_root=repo_root,
|
|
1113
|
+
contract=fresh_contract,
|
|
1114
|
+
)
|
|
1115
|
+
fresh_contract["source"] = "recovery_repair"
|
|
1116
|
+
repaired_test_files = _repair_story_test_files_for_runtime_failure(
|
|
1117
|
+
repo_root=repo_root,
|
|
1118
|
+
test_paths=fresh_test_paths,
|
|
1119
|
+
)
|
|
1120
|
+
|
|
1121
|
+
comparable_keys = ("framework", "cwd", "run_cmd", "env", "setup_cmd", "test_paths")
|
|
1122
|
+
current_fingerprint = {key: current_contract.get(key) for key in comparable_keys}
|
|
1123
|
+
fresh_fingerprint = {key: fresh_contract.get(key) for key in comparable_keys}
|
|
1124
|
+
looks_like_runtime_failure = any(pattern in failure_blob for pattern in _TEST_RUNTIME_FAILURE_PATTERNS)
|
|
1125
|
+
if not looks_like_runtime_failure and current_fingerprint == fresh_fingerprint and current_contract and not repaired_test_files:
|
|
1126
|
+
return None
|
|
1127
|
+
|
|
1128
|
+
path = story_test_runtime_contract_path(repo_root=repo_root, story_id=story_id)
|
|
1129
|
+
updated = current_fingerprint != fresh_fingerprint or not current_contract or bool(repaired_test_files)
|
|
1130
|
+
if updated:
|
|
1131
|
+
if current_fingerprint != fresh_fingerprint or not current_contract:
|
|
1132
|
+
path = persist_story_runtime_contract(repo_root=repo_root, contract=fresh_contract)
|
|
1133
|
+
return {
|
|
1134
|
+
"story_id": story_id,
|
|
1135
|
+
"path": str(path),
|
|
1136
|
+
"updated": updated,
|
|
1137
|
+
"previous_contract": current_contract or None,
|
|
1138
|
+
"runtime_contract": fresh_contract,
|
|
1139
|
+
"files_changed": repaired_test_files,
|
|
1140
|
+
"reason": "test_runtime_boundary_repaired",
|
|
1141
|
+
}
|
|
1142
|
+
|
|
1143
|
+
|
|
1144
|
+
def _load_local_setup_contract(repo_root: Path) -> tuple[Path, LocalSetupContract] | None:
    """Load and validate ``.devflow/local_setup.json`` under *repo_root*.

    Returns:
        ``(path, contract)`` on success; ``None`` when the JSON loader returns
        ``None`` or the payload fails ``LocalSetupContract`` validation.
    """
    contract_path = repo_root / ".devflow" / "local_setup.json"
    raw = _load_json_file(contract_path)
    if raw is None:
        return None
    try:
        parsed = LocalSetupContract.model_validate(raw)
    except Exception:
        # Any validation failure is reported as "no usable contract".
        return None
    return contract_path, parsed
|
|
1153
|
+
|
|
1154
|
+
|
|
1155
|
+
def _check_health_endpoint(url: str, expected_status: int) -> bool:
|
|
1156
|
+
try:
|
|
1157
|
+
with urllib_request.urlopen(url, timeout=5) as response:
|
|
1158
|
+
return int(response.status) == int(expected_status)
|
|
1159
|
+
except urllib_error.HTTPError as exc:
|
|
1160
|
+
return int(exc.code) == int(expected_status)
|
|
1161
|
+
except Exception:
|
|
1162
|
+
return False
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
def _collect_preflight_repair_files(*, repo_root: Path, service_hints: list[str]) -> dict[str, str]:
|
|
1166
|
+
candidates = [
|
|
1167
|
+
".devflow/local_setup.json",
|
|
1168
|
+
"docker-compose.yml",
|
|
1169
|
+
"docker-compose.yaml",
|
|
1170
|
+
"compose.yml",
|
|
1171
|
+
"compose.yaml",
|
|
1172
|
+
"package.json",
|
|
1173
|
+
"pnpm-workspace.yaml",
|
|
1174
|
+
"pnpm-workspace.yml",
|
|
1175
|
+
"pnpm-lock.yaml",
|
|
1176
|
+
"package-lock.json",
|
|
1177
|
+
"yarn.lock",
|
|
1178
|
+
"pyproject.toml",
|
|
1179
|
+
"uv.lock",
|
|
1180
|
+
"requirements.txt",
|
|
1181
|
+
"requirements-dev.txt",
|
|
1182
|
+
"vite.config.ts",
|
|
1183
|
+
"vite.config.js",
|
|
1184
|
+
"vite.config.mjs",
|
|
1185
|
+
"next.config.js",
|
|
1186
|
+
"next.config.mjs",
|
|
1187
|
+
"next.config.ts",
|
|
1188
|
+
]
|
|
1189
|
+
files: dict[str, str] = {}
|
|
1190
|
+
for relative in candidates:
|
|
1191
|
+
path = repo_root / relative
|
|
1192
|
+
if not path.exists() or not path.is_file():
|
|
1193
|
+
continue
|
|
1194
|
+
try:
|
|
1195
|
+
files[relative] = path.read_text(encoding="utf-8")
|
|
1196
|
+
except Exception:
|
|
1197
|
+
continue
|
|
1198
|
+
hints = {hint for hint in service_hints if hint}
|
|
1199
|
+
for child in sorted(repo_root.iterdir()):
|
|
1200
|
+
if not child.is_dir() or child.name.startswith("."):
|
|
1201
|
+
continue
|
|
1202
|
+
child_name = child.name.lower()
|
|
1203
|
+
if hints and not any(hint in child_name or child_name in hint for hint in hints):
|
|
1204
|
+
continue
|
|
1205
|
+
for name in ("package.json", "pyproject.toml", "vite.config.ts", "vite.config.js", "next.config.js", "next.config.ts"):
|
|
1206
|
+
path = child / name
|
|
1207
|
+
if not path.exists() or not path.is_file():
|
|
1208
|
+
continue
|
|
1209
|
+
rel = str(path.relative_to(repo_root))
|
|
1210
|
+
if rel in files:
|
|
1211
|
+
continue
|
|
1212
|
+
try:
|
|
1213
|
+
files[rel] = path.read_text(encoding="utf-8")
|
|
1214
|
+
except Exception:
|
|
1215
|
+
continue
|
|
1216
|
+
return files
|
|
1217
|
+
|
|
1218
|
+
|
|
1219
|
+
def _select_recovery_health_checks(*, contract: LocalSetupContract, failing_urls: set[str]) -> list[HealthCheckEntry]:
|
|
1220
|
+
checks = [hc for hc in contract.health_checks if hc.url in failing_urls] if failing_urls else list(contract.health_checks)
|
|
1221
|
+
return checks or list(contract.health_checks)
|
|
1222
|
+
|
|
1223
|
+
|
|
1224
|
+
def _verify_recovery_health_checks(*, checks: list[HealthCheckEntry]) -> tuple[list[str], list[str]]:
|
|
1225
|
+
verification_checks: list[str] = []
|
|
1226
|
+
blocking_reasons: list[str] = []
|
|
1227
|
+
for health_check in checks:
|
|
1228
|
+
if _check_health_endpoint(health_check.url, health_check.expected_status):
|
|
1229
|
+
verification_checks.append(f"{health_check.name} returned {health_check.expected_status} at {health_check.url}")
|
|
1230
|
+
else:
|
|
1231
|
+
blocking_reasons.append(f"{health_check.name} still failed health verification at {health_check.url}")
|
|
1232
|
+
return verification_checks, blocking_reasons
|
|
1233
|
+
|
|
1234
|
+
|
|
1235
|
+
def _maybe_repair_story_preflight_health_boundary(*, repo_root: Path, item: FailedQueueItemArtifact) -> dict[str, Any] | None:
    """Try to repair a story preflight health failure and verify the affected health checks.

    Flow:
      1. Load the preflight health failure details; return ``None`` if there are none.
      2. Load the local setup contract; report "not ready" if it is missing or invalid.
      3. Re-probe the failing health checks; if they already pass, return "ready".
      4. Otherwise run the repair agent step and write its file patches. Patches
         whose target resolves outside *repo_root* are skipped.
      5. Re-verify; if checks still fail, run the contract's start command once
         and verify again.

    Returns:
        ``None`` or a dict with ``ready``, ``blocking_reasons``, ``files_changed``,
        ``artifact_path``, ``verification_checks``, ``repair_result`` and
        ``start_command_executed``.

    Raises:
        subprocess.TimeoutExpired: if the start command runs longer than 600 seconds.
    """
    details = _load_preflight_health_failure_details(repo_root=repo_root, item=item)
    if details is None:
        return None
    setup = _load_local_setup_contract(repo_root)
    if setup is None:
        return {
            "ready": False,
            "blocking_reasons": ["Cannot repair preflight health failure because .devflow/local_setup.json is missing or invalid."],
            "files_changed": [],
            "artifact_path": str(details["artifact_path"]),
            "verification_checks": [],
            "repair_result": None,
            "start_command_executed": False,
        }
    setup_path, contract = setup

    # Health issue messages are parsed as "<prefix>: <url> did not return <...>".
    failing_urls: set[str] = set()
    for issue in details["health_issues"]:
        message = str(issue.get("message") or "")
        if ": " in message and " did not return " in message:
            failing_urls.add(message.split(": ", 1)[1].split(" did not return ", 1)[0].strip())

    failing_checks = _select_recovery_health_checks(contract=contract, failing_urls=failing_urls)
    initial_verification_checks, initial_blocking_reasons = _verify_recovery_health_checks(checks=failing_checks)
    if not initial_blocking_reasons:
        return {
            "ready": True,
            "blocking_reasons": [],
            "files_changed": [],
            "artifact_path": str(details["artifact_path"]),
            "verification_checks": initial_verification_checks
            + [
                "Skipped local setup start command because the previously failing health checks are already healthy."
            ],
            "repair_result": None,
            "start_command_executed": False,
        }

    # The text before the first "-" in a health check name is used as the service hint.
    service_hints = [str(hc.name or "").split("-", 1)[0].strip().lower() for hc in failing_checks]
    logs_by_service = {
        hint: _get_docker_service_logs(hint, repo_root)
        for hint in service_hints
        if hint
    }
    repair_result, _envelope = run_agent_step(
        repo_root=repo_root,
        stage_name="recovery_preflight_health_repo_repair",
        output_model=RemediationResultArtifact,
        context_payload={
            "failed_item": item.model_dump(),
            "preflight_report_path": str(details["artifact_path"]),
            "preflight_report": details["report"],
            "health_issues": details["health_issues"],
            "local_setup_path": str(setup_path),
            "local_setup": contract.model_dump(),
            "service_logs": logs_by_service,
            "files_to_change": _collect_preflight_repair_files(repo_root=repo_root, service_hints=service_hints),
        },
        guidance=load_agentic_prompt_lines("recovery_preflight_health_repo_repair"),
        timeout_seconds=600,
        strength=_CURRENT_STRENGTH,
    )

    files_written: list[str] = []
    repo_boundary = repo_root.resolve()
    for patch in repair_result.file_patches or []:
        # Patch paths come from agent output: reject absolute paths, ".." escapes
        # and symlinks that resolve outside the repository.
        try:
            target = (repo_root / patch.path).resolve()
            target.relative_to(repo_boundary)
        except (OSError, RuntimeError, TypeError, ValueError):
            continue
        try:
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(patch.content, encoding="utf-8")
            files_written.append(patch.path)
        except Exception:
            # Best effort: a patch that cannot be written is not counted as applied.
            continue
    repair_result = repair_result.model_copy(update={"fix_applied": bool(files_written), "files_changed": files_written or repair_result.files_changed})
    if not repair_result.fix_applied:
        return {
            "ready": False,
            "blocking_reasons": ["Preflight health repair produced no file changes, so recovery cannot verify a real repo/config fix."],
            "files_changed": [],
            "artifact_path": str(details["artifact_path"]),
            "verification_checks": [],
            "repair_result": repair_result,
            "start_command_executed": False,
        }

    # The repair may have rewritten the local setup contract, so reload it if possible.
    refreshed_setup = _load_local_setup_contract(repo_root)
    if refreshed_setup is not None:
        _setup_path, contract = refreshed_setup
        failing_checks = _select_recovery_health_checks(contract=contract, failing_urls=failing_urls)
    verification_checks, blocking_reasons = _verify_recovery_health_checks(checks=failing_checks)
    start_command_executed = False
    if blocking_reasons:
        # check=False: a non-zero exit is not raised; the health re-check below decides readiness.
        subprocess.run(
            ["/bin/sh", "-lc", contract.start_command],
            cwd=str(repo_root),
            capture_output=True,
            text=True,
            check=False,
            timeout=600,
        )
        start_command_executed = True
        verification_checks, blocking_reasons = _verify_recovery_health_checks(checks=failing_checks)
    else:
        verification_checks.append(
            "Skipped local setup start command because the previously failing health checks are already healthy after the repo/config repair."
        )
    return {
        "ready": not blocking_reasons,
        "blocking_reasons": blocking_reasons,
        "files_changed": files_written,
        "artifact_path": str(details["artifact_path"]),
        "verification_checks": verification_checks,
        "repair_result": repair_result,
        "start_command_executed": start_command_executed,
    }
|
|
1345
|
+
|
|
1346
|
+
|
|
1347
|
+
class LoadFailedQueueItemNode(Node):
    """Node that loads the failed queue row named by the event into a ``FailedQueueItemArtifact``."""

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Look up the queue row, store the artifact in ``metadata["failed_item"]`` and publish progress.

        Raises (inside the persisted run):
            ValueError: for an unsupported ``queue_type`` or a missing queue row.
        """

        def _text_or_none(mapping, key):
            # Missing mapping or empty/falsy value -> None, otherwise the value as text.
            if mapping is None:
                return None
            return str(mapping.get(key) or "") or None

        def _run(_node_exec_id: str):
            event = task_context.event
            store, run_id = _store_run()
            row = None
            payload_ref = None
            story_id = None
            repo_root = Path(str(event.repo_root))  # not used further in this node
            queue_type = str(event.queue_type)
            item_id = str(event.item_id)

            if queue_type == "story":
                row = store.get_story_queue_item(story_queue_id=item_id)
                if row is not None:
                    # Story payloads are referenced through the artifact table.
                    story_artifact = store.get_artifact(artifact_id=str(row.get("story_artifact_id") or ""))
                    payload_ref = _text_or_none(story_artifact, "uri")
                    story_id = _text_or_none(row, "story_id")
            elif queue_type == "scope":
                row = store.get_scope_queue_item(scope_queue_id=item_id)
                payload_ref = _text_or_none(row, "scope_payload_path")
            elif queue_type == "idea_creation":
                row = store.get_idea_creation_queue_item(idea_creation_queue_id=item_id)
                payload_ref = _text_or_none(row, "idea_payload_path")
            elif queue_type == "idea":
                row = store.get_idea_queue_item(idea_queue_id=item_id)
                payload_ref = _text_or_none(row, "idea_payload_path")
            elif queue_type == "integration":
                row = store.get_integration_queue_item(integration_queue_id=item_id)
                payload_ref = _text_or_none(row, "integration_payload_path")
            else:
                raise ValueError(f"unsupported queue_type={queue_type}")

            if row is None:
                raise ValueError(f"queue item not found: {queue_type}:{item_id}")

            payload_exists = bool(payload_ref and Path(payload_ref).exists())
            artifact = FailedQueueItemArtifact(
                queue_type=queue_type,  # type: ignore[arg-type]
                item_id=item_id,
                project_id=row.get("project_id"),
                dfs_project_id=row.get("dfs_project_id"),
                enqueue_run_id=str(row.get("enqueue_run_id") or ""),
                status=str(row.get("status") or ""),
                title=str(row.get("title") or item_id),
                payload_ref=payload_ref,
                payload_exists=payload_exists,
                story_id=story_id,
                failure_message=row.get("failure_message"),
                failure_context=dict(row.get("failure_context") or {}),
                raw_row=dict(row),
            )
            task_context.metadata["failed_item"] = artifact

            publish_project = artifact.dfs_project_id or event.project_id
            _publish_node(publish_project, run_id, "Loading failure", recovery_id=item_id)
            _publish(
                publish_project,
                run_id,
                "running",
                "processing",
                f"Loading failed {queue_type} queue item",
                recovery_id=item_id,
            )
            self.save_output(artifact)
            return artifact.model_dump(), task_context

        return _persist_node(node_id="load_failed_item", node_name="LoadFailedQueueItem", fn=_run)
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
class SystemicPatternAnalysisNode(Node):
    """Deterministic node: queries DB for sibling failures sharing the same error signature.

    Produces SystemicPatternArtifact stored in task_context.metadata["systemic_pattern"].
    Also sets metadata["churn_detected"] when the durable recovery churn gate is already
    at or above threshold for the same durable identity + normalized failure signature.
    """

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Collect sibling failures for the loaded failed item and record the pattern.

        Siblings are rows whose ``failure_message`` contains the first 60
        characters of this item's message (``%`` removed). They are searched in
        the item's source queue (failed rows only) and in ``recovery_queue``,
        at most 100 rows each. A failure counts as systemic when three or more
        distinct items are affected. Without a failure message nothing is
        queried and only the current item is reported.
        """

        def _run(_node_exec_id: str):
            store, run_id = _store_run()
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
            project_id = item.dfs_project_id or task_context.event.project_id
            _publish_node(project_id, run_id, "Analysing failure pattern", recovery_id=item.item_id)

            failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
            failure_message = str(item.failure_message or "").strip()
            failed_stage = str(failure_context.get("failed_stage") or "").strip()

            failure_signature = _normalized_failure_signature(
                failure_message=failure_message,
                failure_context=failure_context,
            )

            queue_type = item.queue_type
            item_id = item.item_id

            affected_item_ids: list[str] = []
            sample_failure_messages: list[str] = []
            failed_stages: list[str] = []
            churn_state = _load_recovery_churn_gate_state(
                store=store,
                project_id=project_id,
                item=item,
                failure_signature=failure_signature,
            )
            churn_detected = bool(churn_state["threshold_met"])

            # Fixed whitelist: only these identifiers are ever interpolated into SQL.
            source_tables = {
                "story": ("story_queue", "story_queue_id"),
                "idea_creation": ("idea_creation_queue", "idea_creation_queue_id"),
                "idea": ("idea_queue", "idea_queue_id"),
                "scope": ("scope_queue", "scope_queue_id"),
                "integration": ("integration_queue", "integration_queue_id"),
            }
            table, id_col = source_tables.get(queue_type, (None, None))

            if failure_message:
                # Without a message there is nothing to match. A bare "%" pattern
                # would count unrelated recovery rows as affected.
                # NOTE: "_" is not stripped and still acts as a LIKE wildcard.
                like_pattern = f"%{failure_message[:60].replace('%', '')}%"
                with store._connect() as conn:
                    # --- Find sibling failed items in the source queue with matching failure_message ---
                    if table:
                        sibling_rows = conn.execute(
                            f"SELECT {id_col} as item_id, failure_message, failure_context_json FROM {table} "
                            f"WHERE status='failed' AND failure_message LIKE ? LIMIT 100",
                            (like_pattern,),
                        ).fetchall()
                        for r in sibling_rows:
                            sid = str(r["item_id"] or "")
                            if sid and sid not in affected_item_ids:
                                affected_item_ids.append(sid)
                            msg = str(r["failure_message"] or "")
                            if msg and msg not in sample_failure_messages:
                                sample_failure_messages.append(msg)
                            # Extract failed_stage from failure_context_json (best effort).
                            try:
                                fc = json.loads(str(r["failure_context_json"] or "{}") or "{}")
                            except (TypeError, ValueError):
                                fc = None
                            if isinstance(fc, dict):
                                fs = str(fc.get("failed_stage") or "").strip()
                                if fs and fs not in failed_stages:
                                    failed_stages.append(fs)

                    # --- Also search recovery_queue for matching failure signature ---
                    rq_rows = conn.execute(
                        "SELECT source_item_id, failure_message FROM recovery_queue "
                        "WHERE failure_message LIKE ? LIMIT 100",
                        (like_pattern,),
                    ).fetchall()
                    for r in rq_rows:
                        sid = str(r["source_item_id"] or "")
                        if sid and sid not in affected_item_ids:
                            affected_item_ids.append(sid)

            # Include current item if not already in list
            if item_id not in affected_item_ids:
                affected_item_ids.insert(0, item_id)
            if failure_message and failure_message not in sample_failure_messages:
                sample_failure_messages.insert(0, failure_message)
            if failed_stage and failed_stage not in failed_stages:
                failed_stages.insert(0, failed_stage)

            total_affected = len(affected_item_ids)
            is_systemic = total_affected >= 3

            pattern = SystemicPatternArtifact(
                failure_signature=failure_signature,
                is_systemic=is_systemic,
                affected_queue_type=queue_type,
                affected_item_ids=affected_item_ids,
                sample_failure_messages=sample_failure_messages[:5],
                failed_stages=failed_stages[:10],
                total_affected=total_affected,
                pattern_summary=(
                    f"{'Systemic' if is_systemic else 'Isolated'} failure: {failure_signature}. "
                    f"{total_affected} affected item(s) in {queue_type} queue."
                ),
            )

            task_context.metadata["systemic_pattern"] = pattern
            if churn_detected:
                task_context.metadata["churn_detected"] = True
                task_context.metadata["churn"] = {
                    "detected": True,
                    "failure_signature": failure_signature,
                    "occurrence_count": int(churn_state["occurrence_count"]),
                    "threshold": int(churn_state["threshold"]),
                    "error_task_id": churn_state["error_task_id"],
                    "churn_key": churn_state["churn_key"],
                    "story_id": item.story_id,
                    "item_id": item.item_id,
                }

            self.save_output(pattern)
            return pattern.model_dump(), task_context

        return _persist_node(node_id="systemic_pattern_analysis", node_name="SystemicPatternAnalysis", fn=_run)
|
|
1541
|
+
|
|
1542
|
+
|
|
1543
|
+
class AgenticFailureInvestigationNode(AgentNode):
    """Agent node that produces the investigation artifact for a failed queue item.

    If ``metadata["churn_detected"]`` is set, the investigation is built locally
    by ``_build_churn_gate_investigation`` from the ``metadata["churn"]`` record
    and no agent step runs. Otherwise log evidence is gathered first, handed to
    the agent step, and merged back into the agent's answer.
    """

    def get_agent_config(self) -> AgentConfig:
        """Return the instructions and output model used for this agent."""
        instructions = "Investigate the nature of a process failure, identify the affected boundary, and define what successful recovery would look like."
        return AgentConfig(instructions=instructions, output_type=RecoveryInvestigationArtifact)

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the investigation via ``_persist_node`` and return whatever it returns."""

        def _run(_node_exec_id: str):
            metadata = task_context.metadata
            failed_item: FailedQueueItemArtifact = metadata["failed_item"]

            # Churn gate already tripped: build the investigation locally.
            if metadata.get("churn_detected"):
                churn_info = metadata.get("churn") or {}
                signature = str(churn_info.get("failure_signature") or "")
                occurrences = int(churn_info.get("occurrence_count") or 0)
                gate_threshold = int(churn_info.get("threshold") or _RECOVERY_CHURN_GATE_THRESHOLD)
                gate_key = str(churn_info.get("churn_key") or "") or None
                gated = _build_churn_gate_investigation(
                    item=failed_item,
                    failure_signature=signature,
                    occurrence_count=occurrences,
                    threshold=gate_threshold,
                    churn_key=gate_key,
                )
                metadata["investigation"] = gated
                self.save_output(gated)
                return gated.model_dump(), task_context

            event = task_context.event
            publish_project = failed_item.dfs_project_id or event.project_id
            _publish_node(publish_project, _store_run()[1], "Investigating failure", recovery_id=failed_item.item_id)

            store, _unused_run_id = _store_run()
            repo_root = Path(str(event.repo_root))
            evidence = _gather_log_first_recovery_evidence(store=store, repo_root=repo_root, item=failed_item)

            payload = {"failed_item": failed_item.model_dump(), "log_evidence": evidence}
            agent_result, _envelope = run_agent_step(
                repo_root=repo_root,
                stage_name="recovery_failure_investigation",
                output_model=RecoveryInvestigationArtifact,
                context_payload=payload,
                guidance=load_agentic_prompt_lines("recovery_failure_investigation"),
                timeout_seconds=300,
                strength=_CURRENT_STRENGTH,
            )
            enriched = _enrich_investigation_with_log_evidence(investigation=agent_result, log_evidence=evidence)

            metadata["investigation"] = enriched
            metadata["log_evidence"] = evidence
            self.save_output(enriched)
            return enriched.model_dump(), task_context

        return _persist_node(node_id="failure_investigation", node_name="AgenticFailureInvestigation", fn=_run)
|
|
1581
|
+
|
|
1582
|
+
|
|
1583
|
+
def _capture_from_first_marker(source_text: str, markers: tuple[str, ...], max_lines: int = 200) -> str:
    """Return an excerpt of *source_text* starting at the first line containing a marker.

    The excerpt is one contiguous run of at most *max_lines* lines. When more
    lines follow, a final ``"... [truncated]"`` line is appended so the reader
    can tell the excerpt is partial. Returns ``""`` when no line contains any
    of *markers*.
    """
    lines = source_text.splitlines()
    start = next(
        (index for index, line in enumerate(lines) if any(marker in line for marker in markers)),
        None,
    )
    if start is None:
        return ""
    excerpt = lines[start:start + max_lines]
    if len(lines) - start > max_lines:
        excerpt.append("... [truncated]")
    return "\n".join(excerpt)


class RootCauseCodeInvestigationNode(AgentNode):
    """Agent node that asks the LLM for the root cause of a systemic failure pattern.

    Before the agent step runs, a fixed set of context is read from disk and
    placed in the payload: ``agentic_runtime.py``, an excerpt of
    ``implementation/dag.py``, a size listing of ``.claude/agents/`` and the
    user's ``~/.devflow/config.toml``. All of these reads are best-effort.

    Output: CodeRootCauseArtifact (also stored in ``metadata["root_cause"]``).
    """

    def get_agent_config(self) -> AgentConfig:
        """Return the instructions and output model used for this agent."""
        return AgentConfig(
            instructions="Investigate the root cause of a systemic failure pattern in the DevFlow codebase. Identify the exact code location, describe the problem, and propose a specific fix plan.",
            output_type=CodeRootCauseArtifact,
        )

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the root-cause investigation via ``_persist_node`` and return its result."""

        def _run(_node_exec_id: str):
            event = task_context.event
            _store, run_id = _store_run()
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
            pattern: SystemicPatternArtifact = task_context.metadata["systemic_pattern"]
            repo_root = Path(str(event.repo_root))
            project_id = item.dfs_project_id or event.project_id

            _publish_node(project_id, run_id, "Investigating root cause", recovery_id=item.item_id)

            # Deterministically read key source files before calling the agent.
            key_source_files: dict[str, str] = {}

            # 1. agentic_runtime.py: always include.
            _read_file_into(key_source_files, repo_root / "src" / "devflow_engine" / "agentic_runtime.py")

            # 2. implementation/dag.py: excerpt starting at the first Red/Green
            #    context-builder reference, capped at 200 lines.
            impl_dag = repo_root / "src" / "devflow_engine" / "implementation" / "dag.py"
            if impl_dag.exists():
                try:
                    dag_text = impl_dag.read_text(encoding="utf-8")
                except (OSError, ValueError):
                    # Best-effort context: an unreadable or non-UTF-8 file is omitted.
                    pass
                else:
                    key_source_files["implementation/dag.py (red/green context sections)"] = _capture_from_first_marker(
                        dag_text,
                        ("_build_red_generation_context", "_build_green_generation_context"),
                        max_lines=200,
                    )

            # 3. List .claude/agents/ file sizes if the directory is present.
            agents_dir = repo_root / ".claude" / "agents"
            if agents_dir.exists():
                sizes: list[str] = []
                try:
                    for entry in sorted(agents_dir.iterdir()):
                        if entry.is_file():
                            sizes.append(f"{entry.name}: {entry.stat().st_size} bytes")
                except OSError:
                    # Keep whatever was collected before the listing failed.
                    pass
                key_source_files[".claude/agents/ directory"] = "\n".join(sizes) if sizes else "(empty)"

            # 4. CLI config.
            # NOTE(review): the raw file text is sent to the agent; if config.toml
            # can hold credentials it should be redacted first. Confirm.
            cli_config = ""
            config_path = Path.home() / ".devflow" / "config.toml"
            if config_path.exists():
                try:
                    cli_config = config_path.read_text(encoding="utf-8")
                except (OSError, ValueError):
                    cli_config = "(unreadable)"

            artifact, _envelope = run_agent_step(
                repo_root=repo_root,
                stage_name="recovery_root_cause_investigation",
                output_model=CodeRootCauseArtifact,
                context_payload={
                    "systemic_pattern": pattern.model_dump(),
                    "failed_item": item.model_dump(),
                    "key_source_files": key_source_files,
                    "cli_config": cli_config,
                },
                guidance=load_agentic_prompt_lines("recovery_root_cause_investigation"),
                timeout_seconds=600,
                strength=_CURRENT_STRENGTH,
            )
            task_context.metadata["root_cause"] = artifact
            self.save_output(artifact)
            return artifact.model_dump(), task_context

        return _persist_node(node_id="root_cause_investigation", node_name="RootCauseCodeInvestigation", fn=_run)
|
|
1675
|
+
|
|
1676
|
+
|
|
1677
|
+
def _read_file_into(dest: dict[str, str], path: Path) -> None:
    """Best-effort helper: read *path* as UTF-8 and store the text in *dest*.

    The entry is keyed by ``path.name`` (the final path component, not a
    relative path), so two files sharing a basename overwrite one another.

    Args:
        dest: Mapping that receives ``{file name: file text}``; mutated in place.
        path: File to read.

    A missing, unreadable, or non-UTF-8 file (or a directory) is skipped and
    *dest* is left untouched; no exception is raised for those cases.
    """
    try:
        # Read directly rather than checking exists() first: one filesystem
        # round-trip, and no window for the file to vanish in between.
        dest[path.name] = path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        # OSError covers missing/permission/is-a-directory; ValueError covers
        # UnicodeDecodeError and paths containing NUL bytes.
        return
|
|
1684
|
+
|
|
1685
|
+
|
|
1686
|
+
def _resolve_patch_target(repo_root: Path, patch_path: str) -> Path | None:
    """Return the resolved write target for *patch_path*, or ``None`` if it is not inside *repo_root*.

    Patch paths come from agent output, so absolute paths and ``..`` segments
    must not be allowed to escape the repository. The repo root itself is also
    rejected, since it is a directory and not a writable file target.
    """
    root = repo_root.resolve()
    candidate = (root / patch_path).resolve()
    if root not in candidate.parents:
        return None
    return candidate


class RemediationExecutionNode(AgentNode):
    """Agent node that turns a ``CodeRootCauseArtifact`` into applied file patches.

    After the agent step returns:
    - each entry of ``file_patches`` is written under ``repo_root``; a patch
      whose path resolves outside ``repo_root`` is skipped and logged,
    - a patch that fails to write is logged and does not fail the node,
    - ``fix_applied=True`` and ``files_changed`` are set if any file was written.

    Output: RemediationResultArtifact (also stored in ``metadata["remediation_result"]``).
    """

    def get_agent_config(self) -> AgentConfig:
        """Return the instructions and output model used for this agent."""
        return AgentConfig(
            instructions="Apply a diagnosed code/config fix to resolve a systemic DevFlow failure. Produce file patches with complete corrected file contents.",
            output_type=RemediationResultArtifact,
        )

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the remediation via ``_persist_node`` and return its result."""

        def _run(_node_exec_id: str):
            import logging

            log = logging.getLogger(__name__)
            event = task_context.event
            _store, run_id = _store_run()
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
            root_cause: CodeRootCauseArtifact = task_context.metadata["root_cause"]
            pattern: SystemicPatternArtifact = task_context.metadata["systemic_pattern"]
            repo_root = Path(str(event.repo_root))
            project_id = item.dfs_project_id or event.project_id

            _publish_node(project_id, run_id, "Applying remediation", recovery_id=item.item_id)

            # Deterministically read current content of files the fix will touch.
            files_to_change: dict[str, str] = {}
            if root_cause.root_cause_location:
                # root_cause_location is like "src/devflow_engine/agentic_runtime.py:run_agent_step"
                location_path = root_cause.root_cause_location.split(":")[0].strip()
                _read_file_into(files_to_change, repo_root / location_path)

            # Also read files listed in files_inspected (capped at 3).
            for inspected in (root_cause.files_inspected or [])[:3]:
                _read_file_into(files_to_change, repo_root / inspected)

            artifact, _envelope = run_agent_step(
                repo_root=repo_root,
                stage_name="recovery_remediation_execution",
                output_model=RemediationResultArtifact,
                context_payload={
                    "root_cause": root_cause.model_dump(),
                    "systemic_pattern": pattern.model_dump(),
                    "files_to_change": files_to_change,
                    "affected_items": pattern.affected_item_ids,
                },
                guidance=load_agentic_prompt_lines("recovery_remediation_execution"),
                timeout_seconds=900,
                strength=_CURRENT_STRENGTH,
            )

            # Write file patches to disk.
            files_written: list[str] = []
            for patch in (artifact.file_patches or []):
                try:
                    target = _resolve_patch_target(repo_root, patch.path)
                    if target is None:
                        log.warning("Skipping remediation patch outside repo root: %r", patch.path)
                        continue
                    target.parent.mkdir(parents=True, exist_ok=True)
                    target.write_text(patch.content, encoding="utf-8")
                    files_written.append(patch.path)
                except Exception as exc:
                    # Patch data is agent-generated, so any failure here is
                    # logged and the remaining patches are still attempted.
                    log.warning("Remediation patch %r was not written: %s", getattr(patch, "path", None), exc)

            if files_written:
                artifact = artifact.model_copy(update={"fix_applied": True, "files_changed": files_written})

            task_context.metadata["remediation_result"] = artifact
            self.save_output(artifact)
            return artifact.model_dump(), task_context

        return _persist_node(node_id="remediation_execution", node_name="RemediationExecution", fn=_run)
|
|
1761
|
+
|
|
1762
|
+
|
|
1763
|
+
# Queue types BulkReenqueueNode can act on, mapped to their table name. The
# id column is "<table>_id" and the store retry method is "retry_<table>_item".
_BULK_REENQUEUE_TABLES: dict[str, str] = {
    "story": "story_queue",
    "idea_creation": "idea_creation_queue",
    "idea": "idea_queue",
    "scope": "scope_queue",
    "integration": "integration_queue",
}


class BulkReenqueueNode(Node):
    """Deterministic node run after remediation.

    Re-enqueues every id in ``RemediationResultArtifact.items_to_requeue`` and
    marks every id in ``items_to_dead_letter`` as failed with the message
    ``dead_lettered_by_recovery``. Failures are collected per item and do not
    stop the loop. An item is reported as re-queued or dead-lettered only when
    the store call was made and did not raise; an unsupported queue type is
    recorded as an error for each item.

    Sets ``metadata["outcome"]`` to ``"reenqueued"`` if at least one item was
    re-queued, otherwise ``"blocked"``.
    """

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the bulk re-enqueue via ``_persist_node`` and return its result."""

        def _run(_node_exec_id: str):
            import time

            event = task_context.event
            store, run_id = _store_run()
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
            remediation: RemediationResultArtifact = task_context.metadata["remediation_result"]
            pattern: SystemicPatternArtifact = task_context.metadata["systemic_pattern"]
            project_id = item.dfs_project_id or event.project_id

            _publish_node(project_id, run_id, "Re-enqueuing fixed items", recovery_id=item.item_id)

            queue_type = pattern.affected_queue_type
            table = _BULK_REENQUEUE_TABLES.get(queue_type)
            requeued: list[str] = []
            dead_lettered: list[str] = []
            errors: list[str] = []

            for qid in (remediation.items_to_requeue or []):
                if table is None:
                    # Nothing can be retried for an unknown queue; do not report success.
                    errors.append(f"{qid}: unsupported queue type {queue_type!r}")
                    continue
                try:
                    retry = getattr(store, f"retry_{table}_item")
                    retry(project_id=project_id, preserve_failure_context=False, **{f"{table}_id": qid})
                    requeued.append(qid)
                except Exception as exc:
                    errors.append(f"{qid}: {exc}")

            # Mark dead-letter items as failed in queue (failure_message indicates dead-lettered).
            # NOTE(review): unlike the retry calls, this UPDATE is not scoped by
            # project_id. Presumably queue ids are globally unique; confirm.
            for qid in (remediation.items_to_dead_letter or []):
                if table is None:
                    errors.append(f"dead_letter {qid}: unsupported queue type {queue_type!r}")
                    continue
                try:
                    with store._connect() as conn:
                        now_ts = int(time.time())
                        # `table` comes from the fixed mapping above, never from
                        # input; the values themselves are bound parameters.
                        conn.execute(
                            f"UPDATE {table} SET status='failed', failure_message=?, updated_at=? WHERE {table}_id=?",
                            ("dead_lettered_by_recovery", now_ts, qid),
                        )
                    dead_lettered.append(qid)
                except Exception as exc:
                    errors.append(f"dead_letter {qid}: {exc}")

            outcome = "reenqueued" if requeued else "blocked"
            task_context.metadata["outcome"] = outcome
            task_context.metadata["bulk_reenqueue_result"] = {
                "requeued": requeued,
                "dead_lettered": dead_lettered,
                "errors": errors,
            }

            summary = f"Bulk re-enqueue: {len(requeued)} re-queued, {len(dead_lettered)} dead-lettered, {len(errors)} errors."
            result = {"requeued": requeued, "dead_lettered": dead_lettered, "errors": errors, "outcome": outcome, "summary": summary}
            self.save_output(result)
            return result, task_context

        return _persist_node(node_id="bulk_reenqueue", node_name="BulkReenqueue", fn=_run)
|
|
1848
|
+
|
|
1849
|
+
|
|
1850
|
+
# ---------------------------------------------------------------------------
|
|
1851
|
+
# Router redesign
|
|
1852
|
+
# ---------------------------------------------------------------------------
|
|
1853
|
+
|
|
1854
|
+
class _RouteLoopGuard(RouterNode):
    """Route that short-circuits to publishing when recovery churn was detected.

    When ``metadata["churn_detected"]`` is truthy, the outcome is set to
    ``"blocked"``, a not-ready ``PreReplayCheckArtifact`` carrying a ``CHURN:``
    blocking reason is stored under ``metadata["pre_replay"]``, and the next
    node is ``PublishRecoveryStateNode``. Otherwise the route declines.
    """

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Return the publish node if churn was detected, else ``None``."""
        metadata = task_context.metadata
        if not metadata.get("churn_detected"):
            return None

        metadata["outcome"] = "blocked"
        metadata["delegation_summary"] = None

        churn_info = metadata.get("churn") or {}
        failure_signature = str(churn_info.get("failure_signature") or "same normalized failure")
        churn_key = str(churn_info.get("churn_key") or "")
        occurrence_count = int(churn_info.get("occurrence_count") or 0)
        threshold = int(churn_info.get("threshold") or _RECOVERY_CHURN_GATE_THRESHOLD)

        # The churn key is appended in parentheses only when one was recorded.
        key_suffix = f" ({churn_key})" if churn_key else ""
        reason = (
            f"CHURN: durable recovery gate blocked {failure_signature} "
            f"after {occurrence_count}/{threshold} no-material-change strikes"
        ) + key_suffix

        metadata["pre_replay"] = PreReplayCheckArtifact(
            queue_type=metadata["failed_item"].queue_type,
            ready=False,
            checks=[],
            blocking_reasons=[reason],
        )
        return PublishRecoveryStateNode(task_context=task_context)
|
|
1877
|
+
|
|
1878
|
+
|
|
1879
|
+
class _RouteSystemic(RouterNode):
    """Route that sends systemic failure patterns to root-cause investigation."""

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Return the root-cause node when ``metadata["systemic_pattern"]`` is systemic, else ``None``."""
        found: SystemicPatternArtifact | None = task_context.metadata.get("systemic_pattern")
        if not found or not found.is_systemic:
            return None
        return RootCauseCodeInvestigationNode(task_context=task_context)
|
|
1885
|
+
|
|
1886
|
+
|
|
1887
|
+
class _RouteIsolatedProcess(RouterNode):
    """Unconditional route: always hands the task to the recovery diagnosis node."""

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Return a new ``AgenticRecoveryDiagnosisNode`` for *task_context*."""
        next_node = AgenticRecoveryDiagnosisNode(task_context=task_context)
        return next_node
|
|
1890
|
+
|
|
1891
|
+
|
|
1892
|
+
class SystemicVsIsolatedRouter(BaseRouter):
    """Router listing the churn guard, systemic and isolated routes, in that order."""

    def __init__(self) -> None:
        """Set ``routes`` and the ``PublishRecoveryStateNode`` fallback."""
        loop_guard = _RouteLoopGuard()
        systemic = _RouteSystemic()
        isolated = _RouteIsolatedProcess()
        self.routes = [loop_guard, systemic, isolated]
        self.fallback = PublishRecoveryStateNode()
|
|
1896
|
+
|
|
1897
|
+
|
|
1898
|
+
# ---------------------------------------------------------------------------
|
|
1899
|
+
# Legacy router (kept for backwards-compat in case referenced externally)
|
|
1900
|
+
# ---------------------------------------------------------------------------
|
|
1901
|
+
|
|
1902
|
+
class _RouteCodeError(RouterNode):
    """Legacy route that never matches.

    The branch that returned ``AgenticRecoveryExecutionNode`` was guarded by a
    constant ``if False:`` and could not run, so it has been removed; this
    route always declines.
    """

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Always return ``None`` (route disabled)."""
        return None
|
|
1907
|
+
|
|
1908
|
+
|
|
1909
|
+
class _RouteProcessError(RouterNode):
    """Legacy route that always matches and selects recovery diagnosis.

    The original guard was a constant ``if True:`` followed by an unreachable
    ``return None``; both have been removed without changing the result.
    """

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Return a new ``AgenticRecoveryDiagnosisNode`` for *task_context*."""
        return AgenticRecoveryDiagnosisNode(task_context=task_context)
|
|
1914
|
+
|
|
1915
|
+
|
|
1916
|
+
class FailureTypeRouter(BaseRouter):
    """Legacy router listing the code-error route followed by the process-error route."""

    def __init__(self) -> None:
        """Set ``routes`` and the ``PublishRecoveryStateNode`` fallback."""
        code_route = _RouteCodeError()
        process_route = _RouteProcessError()
        self.routes = [code_route, process_route]
        self.fallback = PublishRecoveryStateNode()
|
|
1920
|
+
|
|
1921
|
+
|
|
1922
|
+
class AgenticRecoveryDiagnosisNode(AgentNode):
    """Node that derives a recovery diagnosis and remediation plan for a failed item.

    Both artifacts come from ``_build_diagnosis`` and are stored under
    ``metadata["diagnosis"]`` and ``metadata["plan"]``; only the diagnosis is
    passed to ``save_output``.
    """

    def get_agent_config(self) -> AgentConfig:
        """Return the instructions and output model used for this agent."""
        instructions = "Diagnose the best recovery strategy for a failed queue item."
        return AgentConfig(instructions=instructions, output_type=RecoveryDiagnosisArtifact)

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the diagnosis via ``_persist_node`` and return its result."""

        def _run(_node_exec_id: str):
            metadata = task_context.metadata
            failed_item: FailedQueueItemArtifact = metadata["failed_item"]
            prior_investigation: RecoveryInvestigationArtifact | None = metadata.get("investigation")

            publish_project = failed_item.dfs_project_id or task_context.event.project_id
            _publish_node(publish_project, _store_run()[1], "Diagnosing recovery", recovery_id=failed_item.item_id)

            diagnosis, plan = _build_diagnosis(item=failed_item, investigation=prior_investigation)
            metadata["diagnosis"] = diagnosis
            metadata["plan"] = plan
            self.save_output(diagnosis)

            payload = {"diagnosis": diagnosis.model_dump(), "plan": plan.model_dump()}
            return payload, task_context

        return _persist_node(node_id="recovery_diagnosis", node_name="AgenticRecoveryDiagnosis", fn=_run)
|
|
1937
|
+
|
|
1938
|
+
|
|
1939
|
+
class AgenticRecoveryExecutionNode(AgentNode):
|
|
1940
|
+
def get_agent_config(self) -> AgentConfig:
|
|
1941
|
+
return AgentConfig(instructions="Execute recovery and verify the outcome against agent-defined success criteria inside the same node.", output_type=RecoveryExecutionArtifact)
|
|
1942
|
+
|
|
1943
|
+
async def process(self, task_context: TaskContext) -> TaskContext:
|
|
1944
|
+
def _run(_node_exec_id: str):
|
|
1945
|
+
event = task_context.event
|
|
1946
|
+
store, run_id = _store_run()
|
|
1947
|
+
item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
|
|
1948
|
+
diagnosis: RecoveryDiagnosisArtifact | None = task_context.metadata.get("diagnosis")
|
|
1949
|
+
plan: RemediationPlanArtifact | None = task_context.metadata.get("plan")
|
|
1950
|
+
investigation: RecoveryInvestigationArtifact | None = task_context.metadata.get("investigation")
|
|
1951
|
+
repo_root = Path(str(event.repo_root))
|
|
1952
|
+
_publish_node(item.dfs_project_id or event.project_id, run_id, "Executing recovery", recovery_id=item.item_id)
|
|
1953
|
+
last_reenqueue: ReenqueueArtifact | None = None
|
|
1954
|
+
last_execution: RecoveryExecutionArtifact | None = None
|
|
1955
|
+
last_verified: PreReplayCheckArtifact | None = None
|
|
1956
|
+
|
|
1957
|
+
preflight_health_repair = _maybe_repair_story_preflight_health_boundary(repo_root=repo_root, item=item)
|
|
1958
|
+
if preflight_health_repair is not None:
|
|
1959
|
+
failure_signature = _normalized_failure_signature(
|
|
1960
|
+
failure_message=item.failure_message,
|
|
1961
|
+
failure_context=item.failure_context if isinstance(item.failure_context, dict) else {},
|
|
1962
|
+
)
|
|
1963
|
+
health_repair_diagnosis = RecoveryDiagnosisArtifact(
|
|
1964
|
+
queue_type=item.queue_type,
|
|
1965
|
+
item_id=item.item_id,
|
|
1966
|
+
strategy="preflight_health_repair_recovery",
|
|
1967
|
+
summary="Repair repo/config causing preflight health failure.",
|
|
1968
|
+
rationale="Preflight evidence contains health_check_failed blockers that require a bounded repo/config repair before replay.",
|
|
1969
|
+
verification_targets=[
|
|
1970
|
+
{
|
|
1971
|
+
"criterion": "Failing health endpoint returns the expected status",
|
|
1972
|
+
"oracle": "The previously failing preflight boundary is reachable and returns the expected HTTP status after the repair.",
|
|
1973
|
+
"evidence_ref": preflight_health_repair["artifact_path"],
|
|
1974
|
+
}
|
|
1975
|
+
],
|
|
1976
|
+
suggested_action="repair_artifact_then_requeue",
|
|
1977
|
+
)
|
|
1978
|
+
plan = RemediationPlanArtifact(
|
|
1979
|
+
queue_type=item.queue_type,
|
|
1980
|
+
action="repair_artifact_then_requeue",
|
|
1981
|
+
summary="Repair repo/config causing preflight health failure, verify the boundary, then requeue.",
|
|
1982
|
+
steps=[criterion.criterion for criterion in health_repair_diagnosis.verification_targets],
|
|
1983
|
+
remediation_artifact=preflight_health_repair["artifact_path"],
|
|
1984
|
+
)
|
|
1985
|
+
task_context.metadata["diagnosis"] = health_repair_diagnosis
|
|
1986
|
+
task_context.metadata["plan"] = plan
|
|
1987
|
+
repair_result: RemediationResultArtifact | None = preflight_health_repair.get("repair_result")
|
|
1988
|
+
if not preflight_health_repair["ready"]:
|
|
1989
|
+
execution = RecoveryExecutionArtifact(
|
|
1990
|
+
queue_type=item.queue_type,
|
|
1991
|
+
item_id=item.item_id,
|
|
1992
|
+
outcome="blocked",
|
|
1993
|
+
execution_summary=(
|
|
1994
|
+
"Preflight health repo/config repair did not restore the failing boundary."
|
|
1995
|
+
if repair_result is not None and repair_result.fix_applied
|
|
1996
|
+
else "Preflight health repo/config repair could not produce a bounded fix."
|
|
1997
|
+
),
|
|
1998
|
+
preserve_failure_context=True,
|
|
1999
|
+
attempts_used=1,
|
|
2000
|
+
success_criteria=health_repair_diagnosis.verification_targets,
|
|
2001
|
+
verification_summary="; ".join(preflight_health_repair["blocking_reasons"]),
|
|
2002
|
+
)
|
|
2003
|
+
last_verified = PreReplayCheckArtifact(
|
|
2004
|
+
queue_type=item.queue_type,
|
|
2005
|
+
ready=False,
|
|
2006
|
+
checks=preflight_health_repair["verification_checks"],
|
|
2007
|
+
blocking_reasons=preflight_health_repair["blocking_reasons"],
|
|
2008
|
+
)
|
|
2009
|
+
updated_context = _record_recovery_attempt(
|
|
2010
|
+
item=item,
|
|
2011
|
+
diagnosis=health_repair_diagnosis,
|
|
2012
|
+
success=False,
|
|
2013
|
+
failure_signature=failure_signature,
|
|
2014
|
+
material_change=bool(preflight_health_repair["files_changed"]),
|
|
2015
|
+
remediation_artifact=preflight_health_repair["artifact_path"],
|
|
2016
|
+
)
|
|
2017
|
+
item.failure_context = updated_context
|
|
2018
|
+
_persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
|
|
2019
|
+
task_context.metadata["preflight_health_repair"] = preflight_health_repair
|
|
2020
|
+
task_context.metadata["pre_replay"] = last_verified
|
|
2021
|
+
task_context.metadata["outcome"] = "blocked"
|
|
2022
|
+
task_context.metadata["recovery_execution"] = execution
|
|
2023
|
+
self.save_output(execution)
|
|
2024
|
+
return {
|
|
2025
|
+
"outcome": "blocked",
|
|
2026
|
+
"recovery_execution": execution.model_dump(),
|
|
2027
|
+
"pre_replay": last_verified.model_dump(),
|
|
2028
|
+
"preflight_health_repair": {
|
|
2029
|
+
"files_changed": preflight_health_repair["files_changed"],
|
|
2030
|
+
"artifact_path": preflight_health_repair["artifact_path"],
|
|
2031
|
+
},
|
|
2032
|
+
}, task_context
|
|
2033
|
+
health_repair_changed_files = bool(preflight_health_repair["files_changed"])
|
|
2034
|
+
execution = RecoveryExecutionArtifact(
|
|
2035
|
+
queue_type=item.queue_type,
|
|
2036
|
+
item_id=item.item_id,
|
|
2037
|
+
outcome="reenqueued",
|
|
2038
|
+
execution_summary=(
|
|
2039
|
+
"Repaired repo/config for preflight health failure and re-verified the boundary."
|
|
2040
|
+
if health_repair_changed_files
|
|
2041
|
+
else "Re-verified the preflight health boundary and skipped local setup bootstrap because the stack was already healthy."
|
|
2042
|
+
),
|
|
2043
|
+
preserve_failure_context=True,
|
|
2044
|
+
attempts_used=1,
|
|
2045
|
+
success_criteria=health_repair_diagnosis.verification_targets,
|
|
2046
|
+
verification_summary="; ".join(preflight_health_repair["verification_checks"]),
|
|
2047
|
+
)
|
|
2048
|
+
updated_context = _record_recovery_attempt(
|
|
2049
|
+
item=item,
|
|
2050
|
+
diagnosis=health_repair_diagnosis,
|
|
2051
|
+
success=True,
|
|
2052
|
+
failure_signature=failure_signature,
|
|
2053
|
+
material_change=health_repair_changed_files,
|
|
2054
|
+
remediation_artifact=preflight_health_repair["artifact_path"],
|
|
2055
|
+
)
|
|
2056
|
+
updated_context["preflight_health_repair"] = {
|
|
2057
|
+
"artifact_path": preflight_health_repair["artifact_path"],
|
|
2058
|
+
"files_changed": preflight_health_repair["files_changed"],
|
|
2059
|
+
}
|
|
2060
|
+
item.failure_context = updated_context
|
|
2061
|
+
_persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
|
|
2062
|
+
replay_metadata = _build_story_replay_metadata(item=item, diagnosis=health_repair_diagnosis, execution=execution)
|
|
2063
|
+
row = store.retry_story_queue_item(
|
|
2064
|
+
project_id=event.project_id,
|
|
2065
|
+
story_queue_id=item.item_id,
|
|
2066
|
+
preserve_failure_context=True,
|
|
2067
|
+
replay_metadata=replay_metadata,
|
|
2068
|
+
)
|
|
2069
|
+
failure_context = dict(item.failure_context if isinstance(item.failure_context, dict) else {})
|
|
2070
|
+
failure_context["preflight_health_repair"] = {
|
|
2071
|
+
"artifact_path": preflight_health_repair["artifact_path"],
|
|
2072
|
+
"files_changed": preflight_health_repair["files_changed"],
|
|
2073
|
+
}
|
|
2074
|
+
if replay_metadata is not None:
|
|
2075
|
+
failure_context["replay"] = replay_metadata
|
|
2076
|
+
last_reenqueue = ReenqueueArtifact(
|
|
2077
|
+
queue_type=item.queue_type,
|
|
2078
|
+
item_id=item.item_id,
|
|
2079
|
+
status=str(row.get("status") or "queued"),
|
|
2080
|
+
failure_context=failure_context,
|
|
2081
|
+
replay_metadata=replay_metadata,
|
|
2082
|
+
)
|
|
2083
|
+
last_verified = PreReplayCheckArtifact(
|
|
2084
|
+
queue_type=item.queue_type,
|
|
2085
|
+
ready=True,
|
|
2086
|
+
checks=preflight_health_repair["verification_checks"],
|
|
2087
|
+
blocking_reasons=[],
|
|
2088
|
+
)
|
|
2089
|
+
task_context.metadata["preflight_health_repair"] = preflight_health_repair
|
|
2090
|
+
task_context.metadata["reenqueue"] = last_reenqueue
|
|
2091
|
+
task_context.metadata["pre_replay"] = last_verified
|
|
2092
|
+
task_context.metadata["outcome"] = "reenqueued"
|
|
2093
|
+
task_context.metadata["recovery_execution"] = execution
|
|
2094
|
+
self.save_output(execution)
|
|
2095
|
+
return {
|
|
2096
|
+
"outcome": "reenqueued",
|
|
2097
|
+
"recovery_execution": execution.model_dump(),
|
|
2098
|
+
"pre_replay": last_verified.model_dump(),
|
|
2099
|
+
"preflight_health_repair": {
|
|
2100
|
+
"files_changed": preflight_health_repair["files_changed"],
|
|
2101
|
+
"artifact_path": preflight_health_repair["artifact_path"],
|
|
2102
|
+
},
|
|
2103
|
+
}, task_context
|
|
2104
|
+
|
|
2105
|
+
runtime_contract_repair = _maybe_repair_story_test_runtime_contract(repo_root=repo_root, item=item)
|
|
2106
|
+
if runtime_contract_repair is not None:
|
|
2107
|
+
failure_signature = _normalized_failure_signature(
|
|
2108
|
+
failure_message=item.failure_message,
|
|
2109
|
+
failure_context=item.failure_context if isinstance(item.failure_context, dict) else {},
|
|
2110
|
+
)
|
|
2111
|
+
repaired_test_files = [str(path) for path in (runtime_contract_repair.get("files_changed") or []) if str(path).strip()]
|
|
2112
|
+
contract_updated = bool(
|
|
2113
|
+
(runtime_contract_repair.get("previous_contract") or None) is None
|
|
2114
|
+
or any(
|
|
2115
|
+
(runtime_contract_repair.get("previous_contract") or {}).get(key) != runtime_contract_repair["runtime_contract"].get(key)
|
|
2116
|
+
for key in ("framework", "cwd", "run_cmd", "env", "setup_cmd", "test_paths")
|
|
2117
|
+
)
|
|
2118
|
+
)
|
|
2119
|
+
runtime_repair_diagnosis = diagnosis or RecoveryDiagnosisArtifact(
|
|
2120
|
+
queue_type=item.queue_type,
|
|
2121
|
+
item_id=item.item_id,
|
|
2122
|
+
strategy="artifact_regeneration_recovery",
|
|
2123
|
+
summary="Repair runtime contract",
|
|
2124
|
+
rationale="Story runtime contract repair is the bounded automated fix path.",
|
|
2125
|
+
suggested_action="repair_artifact_then_requeue",
|
|
2126
|
+
)
|
|
2127
|
+
runtime_repair_diagnosis = runtime_repair_diagnosis.model_copy(update={"strategy": "artifact_regeneration_recovery"})
|
|
2128
|
+
if plan is None:
|
|
2129
|
+
plan = RemediationPlanArtifact(
|
|
2130
|
+
queue_type=item.queue_type,
|
|
2131
|
+
action="repair_artifact_then_requeue",
|
|
2132
|
+
summary="Repair story test runtime contract and requeue.",
|
|
2133
|
+
)
|
|
2134
|
+
if not runtime_contract_repair["updated"]:
|
|
2135
|
+
churn_message = (
|
|
2136
|
+
"Recovery runtime repair produced no material runtime-contract delta "
|
|
2137
|
+
f"for {_durable_recovery_identity(item)} on {failure_signature}."
|
|
2138
|
+
)
|
|
2139
|
+
churn_state = _record_recovery_churn_strike(
|
|
2140
|
+
store=store,
|
|
2141
|
+
project_id=event.project_id,
|
|
2142
|
+
run_id=run_id,
|
|
2143
|
+
item=item,
|
|
2144
|
+
failure_signature=failure_signature,
|
|
2145
|
+
message=churn_message,
|
|
2146
|
+
)
|
|
2147
|
+
noop_cycles = int(churn_state["occurrence_count"])
|
|
2148
|
+
updated_context = _record_recovery_attempt(
|
|
2149
|
+
item=item,
|
|
2150
|
+
diagnosis=runtime_repair_diagnosis,
|
|
2151
|
+
success=False,
|
|
2152
|
+
failure_signature=failure_signature,
|
|
2153
|
+
material_change=False,
|
|
2154
|
+
remediation_artifact=runtime_contract_repair["path"],
|
|
2155
|
+
)
|
|
2156
|
+
item.failure_context = updated_context
|
|
2157
|
+
task_context.metadata["failed_item"] = item
|
|
2158
|
+
_persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
|
|
2159
|
+
task_context.metadata["durable_churn_key"] = churn_state["churn_key"]
|
|
2160
|
+
task_context.metadata["durable_churn_error_task_id"] = churn_state["error_task_id"]
|
|
2161
|
+
if churn_state["threshold_met"]:
|
|
2162
|
+
investigation = _build_churn_gate_investigation(
|
|
2163
|
+
item=item,
|
|
2164
|
+
failure_signature=failure_signature,
|
|
2165
|
+
occurrence_count=noop_cycles,
|
|
2166
|
+
threshold=_RECOVERY_CHURN_GATE_THRESHOLD,
|
|
2167
|
+
churn_key=str(churn_state.get("churn_key") or "") or None,
|
|
2168
|
+
remediation_artifact=str(runtime_contract_repair["path"]),
|
|
2169
|
+
)
|
|
2170
|
+
task_context.metadata["investigation"] = investigation
|
|
2171
|
+
plan = plan.model_copy(
|
|
2172
|
+
update={
|
|
2173
|
+
"action": "manual_review_required",
|
|
2174
|
+
"summary": "Durable recovery churn gate blocked repeated no-op runtime repair.",
|
|
2175
|
+
"remediation_artifact": runtime_contract_repair["path"],
|
|
2176
|
+
}
|
|
2177
|
+
)
|
|
2178
|
+
task_context.metadata["plan"] = plan
|
|
2179
|
+
execution = RecoveryExecutionArtifact(
|
|
2180
|
+
queue_type=item.queue_type,
|
|
2181
|
+
item_id=item.item_id,
|
|
2182
|
+
outcome="blocked",
|
|
2183
|
+
execution_summary=(
|
|
2184
|
+
"Blocked repeated story runtime repair because the same normalized failure produced no material runtime-contract delta."
|
|
2185
|
+
),
|
|
2186
|
+
preserve_failure_context=True,
|
|
2187
|
+
attempts_used=noop_cycles,
|
|
2188
|
+
success_criteria=[
|
|
2189
|
+
{
|
|
2190
|
+
"criterion": "Durable churn gate blocks repeated no-op runtime repair",
|
|
2191
|
+
"oracle": "Recovery hard-blocks after 3 strikes on the same durable identity with no material runtime-contract delta.",
|
|
2192
|
+
"evidence_ref": runtime_contract_repair["path"],
|
|
2193
|
+
}
|
|
2194
|
+
],
|
|
2195
|
+
verification_summary=(
|
|
2196
|
+
f"CHURN: durable recovery gate blocked {failure_signature} after {noop_cycles}/{_RECOVERY_CHURN_GATE_THRESHOLD} no-material-change strikes"
|
|
2197
|
+
f" ({churn_state['churn_key']})."
|
|
2198
|
+
),
|
|
2199
|
+
)
|
|
2200
|
+
last_verified = PreReplayCheckArtifact(
|
|
2201
|
+
queue_type=item.queue_type,
|
|
2202
|
+
ready=False,
|
|
2203
|
+
checks=[],
|
|
2204
|
+
blocking_reasons=[
|
|
2205
|
+
f"CHURN: durable recovery gate blocked {failure_signature} after {noop_cycles}/{_RECOVERY_CHURN_GATE_THRESHOLD} no-material-change strikes"
|
|
2206
|
+
f" ({churn_state['churn_key']})."
|
|
2207
|
+
],
|
|
2208
|
+
)
|
|
2209
|
+
task_context.metadata["runtime_contract_update"] = runtime_contract_repair
|
|
2210
|
+
task_context.metadata["pre_replay"] = last_verified
|
|
2211
|
+
task_context.metadata["outcome"] = "blocked"
|
|
2212
|
+
task_context.metadata["recovery_execution"] = execution
|
|
2213
|
+
task_context.metadata["churn_detected"] = True
|
|
2214
|
+
task_context.metadata["churn"] = {
|
|
2215
|
+
"detected": True,
|
|
2216
|
+
"failure_signature": failure_signature,
|
|
2217
|
+
"noop_cycles": noop_cycles,
|
|
2218
|
+
"occurrence_count": noop_cycles,
|
|
2219
|
+
"threshold": _RECOVERY_CHURN_GATE_THRESHOLD,
|
|
2220
|
+
"error_task_id": churn_state["error_task_id"],
|
|
2221
|
+
"churn_key": churn_state["churn_key"],
|
|
2222
|
+
"reason": "no_material_runtime_contract_delta",
|
|
2223
|
+
}
|
|
2224
|
+
self.save_output(execution)
|
|
2225
|
+
return {
|
|
2226
|
+
"outcome": "blocked",
|
|
2227
|
+
"recovery_execution": execution.model_dump(),
|
|
2228
|
+
"pre_replay": last_verified.model_dump(),
|
|
2229
|
+
"runtime_contract_update": runtime_contract_repair,
|
|
2230
|
+
}, task_context
|
|
2231
|
+
execution_summary_parts: list[str] = []
|
|
2232
|
+
success_criteria = []
|
|
2233
|
+
verification_checks = []
|
|
2234
|
+
remediation_artifact = str(runtime_contract_repair["path"])
|
|
2235
|
+
if contract_updated:
|
|
2236
|
+
execution_summary_parts.append(f"Updated story test runtime contract at {runtime_contract_repair['path']}.")
|
|
2237
|
+
success_criteria.append(
|
|
2238
|
+
{
|
|
2239
|
+
"criterion": "Story runtime contract updated",
|
|
2240
|
+
"oracle": "The canonical story test_runtime.json matches the corrected run pattern.",
|
|
2241
|
+
"evidence_ref": runtime_contract_repair["path"],
|
|
2242
|
+
}
|
|
2243
|
+
)
|
|
2244
|
+
verification_checks.append("story runtime contract updated")
|
|
2245
|
+
if repaired_test_files:
|
|
2246
|
+
execution_summary_parts.append(
|
|
2247
|
+
"Repaired malformed story-scoped pytest file(s): " + ", ".join(repaired_test_files) + "."
|
|
2248
|
+
)
|
|
2249
|
+
success_criteria.append(
|
|
2250
|
+
{
|
|
2251
|
+
"criterion": "Malformed story-scoped pytest file repaired",
|
|
2252
|
+
"oracle": "Injected pytestmark blocks no longer sit inside an open Python import list and the repaired file compiles.",
|
|
2253
|
+
"evidence_ref": repaired_test_files[0],
|
|
2254
|
+
}
|
|
2255
|
+
)
|
|
2256
|
+
verification_checks.append("malformed story-scoped pytest file repaired")
|
|
2257
|
+
remediation_artifact = repaired_test_files[0]
|
|
2258
|
+
plan = plan.model_copy(
|
|
2259
|
+
update={
|
|
2260
|
+
"action": "repair_artifact_then_requeue",
|
|
2261
|
+
"summary": " ".join(execution_summary_parts).strip(),
|
|
2262
|
+
"remediation_artifact": remediation_artifact,
|
|
2263
|
+
}
|
|
2264
|
+
)
|
|
2265
|
+
task_context.metadata["plan"] = plan
|
|
2266
|
+
execution = RecoveryExecutionArtifact(
|
|
2267
|
+
queue_type=item.queue_type,
|
|
2268
|
+
item_id=item.item_id,
|
|
2269
|
+
outcome="reenqueued",
|
|
2270
|
+
execution_summary=" ".join(execution_summary_parts).strip(),
|
|
2271
|
+
preserve_failure_context=True,
|
|
2272
|
+
attempts_used=1,
|
|
2273
|
+
success_criteria=[
|
|
2274
|
+
*success_criteria,
|
|
2275
|
+
{
|
|
2276
|
+
"criterion": "Story re-enqueued",
|
|
2277
|
+
"oracle": "The story queue row returns to queued so the next run uses the repaired runtime boundary.",
|
|
2278
|
+
},
|
|
2279
|
+
],
|
|
2280
|
+
verification_summary="; ".join([*verification_checks, "story re-enqueued"]),
|
|
2281
|
+
)
|
|
2282
|
+
updated_context = _record_recovery_attempt(
|
|
2283
|
+
item=item,
|
|
2284
|
+
diagnosis=runtime_repair_diagnosis,
|
|
2285
|
+
success=True,
|
|
2286
|
+
failure_signature=failure_signature,
|
|
2287
|
+
material_change=True,
|
|
2288
|
+
remediation_artifact=runtime_contract_repair["path"],
|
|
2289
|
+
)
|
|
2290
|
+
updated_context["runtime_contract_update"] = {
|
|
2291
|
+
"path": runtime_contract_repair["path"],
|
|
2292
|
+
"source": runtime_contract_repair["runtime_contract"].get("source"),
|
|
2293
|
+
"files_changed": repaired_test_files,
|
|
2294
|
+
}
|
|
2295
|
+
item.failure_context = updated_context
|
|
2296
|
+
_persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
|
|
2297
|
+
replay_metadata = _build_story_replay_metadata(item=item, diagnosis=diagnosis, execution=execution)
|
|
2298
|
+
row = store.retry_story_queue_item(
|
|
2299
|
+
project_id=event.project_id,
|
|
2300
|
+
story_queue_id=item.item_id,
|
|
2301
|
+
preserve_failure_context=True,
|
|
2302
|
+
replay_metadata=replay_metadata,
|
|
2303
|
+
)
|
|
2304
|
+
failure_context = dict(item.failure_context if isinstance(item.failure_context, dict) else {})
|
|
2305
|
+
failure_context["runtime_contract_update"] = {
|
|
2306
|
+
"path": runtime_contract_repair["path"],
|
|
2307
|
+
"source": runtime_contract_repair["runtime_contract"].get("source"),
|
|
2308
|
+
"files_changed": repaired_test_files,
|
|
2309
|
+
}
|
|
2310
|
+
if replay_metadata is not None:
|
|
2311
|
+
failure_context["replay"] = replay_metadata
|
|
2312
|
+
last_reenqueue = ReenqueueArtifact(
|
|
2313
|
+
queue_type=item.queue_type,
|
|
2314
|
+
item_id=item.item_id,
|
|
2315
|
+
status=str(row.get("status") or "queued"),
|
|
2316
|
+
failure_context=failure_context,
|
|
2317
|
+
replay_metadata=replay_metadata,
|
|
2318
|
+
)
|
|
2319
|
+
last_verified = PreReplayCheckArtifact(
|
|
2320
|
+
queue_type=item.queue_type,
|
|
2321
|
+
ready=True,
|
|
2322
|
+
checks=[*verification_checks, "story queue item reset to queued"],
|
|
2323
|
+
blocking_reasons=[],
|
|
2324
|
+
)
|
|
2325
|
+
task_context.metadata["runtime_contract_update"] = runtime_contract_repair
|
|
2326
|
+
task_context.metadata["reenqueue"] = last_reenqueue
|
|
2327
|
+
task_context.metadata["pre_replay"] = last_verified
|
|
2328
|
+
task_context.metadata["outcome"] = "reenqueued"
|
|
2329
|
+
task_context.metadata["recovery_execution"] = execution
|
|
2330
|
+
self.save_output(execution)
|
|
2331
|
+
return {
|
|
2332
|
+
"outcome": "reenqueued",
|
|
2333
|
+
"recovery_execution": execution.model_dump(),
|
|
2334
|
+
"pre_replay": last_verified.model_dump(),
|
|
2335
|
+
"runtime_contract_update": runtime_contract_repair,
|
|
2336
|
+
}, task_context
|
|
2337
|
+
|
|
2338
|
+
for attempt in range(1, 4):
|
|
2339
|
+
if diagnosis is None or plan is None:
|
|
2340
|
+
diagnosis, plan = _build_diagnosis(item=item, investigation=investigation, attempt=attempt)
|
|
2341
|
+
task_context.metadata["diagnosis"] = diagnosis
|
|
2342
|
+
task_context.metadata["plan"] = plan
|
|
2343
|
+
try:
|
|
2344
|
+
execution, _envelope = run_agent_step(
|
|
2345
|
+
repo_root=repo_root,
|
|
2346
|
+
stage_name="recovery_execution",
|
|
2347
|
+
output_model=RecoveryExecutionArtifact,
|
|
2348
|
+
context_payload={
|
|
2349
|
+
"failed_item": item.model_dump(),
|
|
2350
|
+
"diagnosis": None if diagnosis is None else diagnosis.model_dump(),
|
|
2351
|
+
"success_criteria": [] if diagnosis is None else [c.model_dump() for c in diagnosis.verification_targets],
|
|
2352
|
+
"plan": None if plan is None else plan.model_dump(),
|
|
2353
|
+
"attempt": attempt,
|
|
2354
|
+
"previous_reenqueue": None if last_reenqueue is None else last_reenqueue.model_dump(),
|
|
2355
|
+
"previous_execution": None if last_execution is None else last_execution.model_dump(),
|
|
2356
|
+
},
|
|
2357
|
+
guidance=load_agentic_prompt_lines("recovery_execution"),
|
|
2358
|
+
timeout_seconds=300,
|
|
2359
|
+
strength=_CURRENT_STRENGTH,
|
|
2360
|
+
)
|
|
2361
|
+
except Exception as exc:
|
|
2362
|
+
error_summary = f"recovery_execution agent step failed (attempt {attempt}): {exc}"
|
|
2363
|
+
task_context.metadata["outcome"] = "reenqueued"
|
|
2364
|
+
task_context.metadata["recovery_execution_error"] = error_summary
|
|
2365
|
+
if item.queue_type == "scope":
|
|
2366
|
+
store.retry_scope_queue_item(project_id=event.project_id, scope_queue_id=item.item_id, preserve_failure_context=False)
|
|
2367
|
+
elif item.queue_type == "idea_creation":
|
|
2368
|
+
store.retry_idea_creation_queue_item(project_id=event.project_id, idea_creation_queue_id=item.item_id, preserve_failure_context=False)
|
|
2369
|
+
elif item.queue_type == "idea":
|
|
2370
|
+
store.retry_idea_queue_item(project_id=event.project_id, idea_queue_id=item.item_id, preserve_failure_context=False)
|
|
2371
|
+
elif item.queue_type == "integration":
|
|
2372
|
+
store.retry_integration_queue_item(project_id=event.project_id, integration_queue_id=item.item_id, preserve_failure_context=False)
|
|
2373
|
+
else:
|
|
2374
|
+
store.retry_story_queue_item(project_id=event.project_id, story_queue_id=item.item_id, preserve_failure_context=False, replay_metadata=None)
|
|
2375
|
+
fallback_artifact = RecoveryExecutionArtifact(
|
|
2376
|
+
queue_type=item.queue_type,
|
|
2377
|
+
item_id=item.item_id,
|
|
2378
|
+
outcome="reenqueued",
|
|
2379
|
+
execution_summary=error_summary,
|
|
2380
|
+
attempts_used=attempt,
|
|
2381
|
+
)
|
|
2382
|
+
self.save_output(fallback_artifact)
|
|
2383
|
+
return {"outcome": "reenqueued", "recovery_execution": fallback_artifact.model_dump(), "recovery_execution_error": error_summary}, task_context
|
|
2384
|
+
execution.attempts_used = attempt
|
|
2385
|
+
last_execution = execution
|
|
2386
|
+
if execution.outcome == "delegated":
|
|
2387
|
+
updated_context = _record_recovery_attempt(item=item, diagnosis=diagnosis, success=False)
|
|
2388
|
+
item.failure_context = updated_context
|
|
2389
|
+
_persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
|
|
2390
|
+
task_context.metadata["outcome"] = "delegated"
|
|
2391
|
+
task_context.metadata["delegation_summary"] = execution.delegation_summary or execution.execution_summary
|
|
2392
|
+
task_context.metadata["recovery_execution"] = execution
|
|
2393
|
+
self.save_output(execution)
|
|
2394
|
+
return {"outcome": "delegated", "recovery_execution": execution.model_dump()}, task_context
|
|
2395
|
+
if execution.outcome == "blocked":
|
|
2396
|
+
if attempt < 3:
|
|
2397
|
+
diagnosis, plan = _build_diagnosis(
|
|
2398
|
+
item=item,
|
|
2399
|
+
investigation=investigation,
|
|
2400
|
+
prior_execution=execution,
|
|
2401
|
+
attempt=attempt + 1,
|
|
2402
|
+
)
|
|
2403
|
+
task_context.metadata["diagnosis"] = diagnosis
|
|
2404
|
+
task_context.metadata["plan"] = plan
|
|
2405
|
+
continue
|
|
2406
|
+
task_context.metadata["outcome"] = "blocked"
|
|
2407
|
+
task_context.metadata["recovery_execution"] = execution
|
|
2408
|
+
self.save_output(execution)
|
|
2409
|
+
return {"outcome": "blocked", "recovery_execution": execution.model_dump()}, task_context
|
|
2410
|
+
preserve = execution.preserve_failure_context if plan is None else plan.preserve_failure_context
|
|
2411
|
+
if preserve:
|
|
2412
|
+
failure_context = _record_recovery_attempt(item=item, diagnosis=diagnosis, success=False)
|
|
2413
|
+
item.failure_context = failure_context
|
|
2414
|
+
_persist_queue_failure_context(store=store, item=item, failure_context=failure_context)
|
|
2415
|
+
replay_metadata = _build_story_replay_metadata(item=item, diagnosis=diagnosis, execution=execution)
|
|
2416
|
+
if item.queue_type == "scope":
|
|
2417
|
+
row = store.retry_scope_queue_item(project_id=event.project_id, scope_queue_id=item.item_id, preserve_failure_context=preserve)
|
|
2418
|
+
elif item.queue_type == "idea_creation":
|
|
2419
|
+
row = store.retry_idea_creation_queue_item(project_id=event.project_id, idea_creation_queue_id=item.item_id, preserve_failure_context=preserve)
|
|
2420
|
+
elif item.queue_type == "idea":
|
|
2421
|
+
row = store.retry_idea_queue_item(project_id=event.project_id, idea_queue_id=item.item_id, preserve_failure_context=preserve)
|
|
2422
|
+
elif item.queue_type == "integration":
|
|
2423
|
+
row = store.retry_integration_queue_item(project_id=event.project_id, integration_queue_id=item.item_id, preserve_failure_context=preserve)
|
|
2424
|
+
else:
|
|
2425
|
+
row = store.retry_story_queue_item(
|
|
2426
|
+
project_id=event.project_id,
|
|
2427
|
+
story_queue_id=item.item_id,
|
|
2428
|
+
preserve_failure_context=preserve,
|
|
2429
|
+
replay_metadata=replay_metadata,
|
|
2430
|
+
)
|
|
2431
|
+
failure_context = item.failure_context if preserve else None
|
|
2432
|
+
if isinstance(failure_context, dict) and replay_metadata is not None:
|
|
2433
|
+
failure_context = dict(failure_context)
|
|
2434
|
+
failure_context["replay"] = replay_metadata
|
|
2435
|
+
last_reenqueue = ReenqueueArtifact(
|
|
2436
|
+
queue_type=item.queue_type,
|
|
2437
|
+
item_id=item.item_id,
|
|
2438
|
+
status=str(row.get("status") or "queued"),
|
|
2439
|
+
failure_context=failure_context,
|
|
2440
|
+
replay_metadata=replay_metadata,
|
|
2441
|
+
)
|
|
2442
|
+
task_context.metadata["reenqueue"] = last_reenqueue
|
|
2443
|
+
_publish_node(item.dfs_project_id or event.project_id, run_id, "Verifying recovery", recovery_id=item.item_id)
|
|
2444
|
+
verified, _verify_env = run_agent_step(
|
|
2445
|
+
repo_root=repo_root,
|
|
2446
|
+
stage_name="recovery_execution_verification",
|
|
2447
|
+
output_model=PreReplayCheckArtifact,
|
|
2448
|
+
context_payload={
|
|
2449
|
+
"failed_item": item.model_dump(),
|
|
2450
|
+
"diagnosis": None if diagnosis is None else diagnosis.model_dump(),
|
|
2451
|
+
"success_criteria": [] if diagnosis is None else [c.model_dump() for c in diagnosis.verification_targets],
|
|
2452
|
+
"execution": execution.model_dump(),
|
|
2453
|
+
"reenqueue": last_reenqueue.model_dump(),
|
|
2454
|
+
"attempt": attempt,
|
|
2455
|
+
},
|
|
2456
|
+
guidance=load_agentic_prompt_lines("recovery_execution_verification"),
|
|
2457
|
+
timeout_seconds=300,
|
|
2458
|
+
strength=_CURRENT_STRENGTH,
|
|
2459
|
+
)
|
|
2460
|
+
last_verified = verified
|
|
2461
|
+
task_context.metadata["pre_replay"] = verified
|
|
2462
|
+
if _verification_allows_reenqueue(execution=execution, verified=verified, diagnosis=diagnosis):
|
|
2463
|
+
if preserve:
|
|
2464
|
+
failure_context = _record_recovery_attempt(item=item, diagnosis=diagnosis, success=True)
|
|
2465
|
+
item.failure_context = failure_context
|
|
2466
|
+
_persist_queue_failure_context(store=store, item=item, failure_context=failure_context)
|
|
2467
|
+
task_context.metadata["outcome"] = "reenqueued"
|
|
2468
|
+
task_context.metadata["recovery_execution"] = execution
|
|
2469
|
+
self.save_output(execution)
|
|
2470
|
+
return {"outcome": "reenqueued", "recovery_execution": execution.model_dump(), "pre_replay": verified.model_dump()}, task_context
|
|
2471
|
+
if attempt < 3:
|
|
2472
|
+
diagnosis, plan = _build_diagnosis(
|
|
2473
|
+
item=item,
|
|
2474
|
+
investigation=investigation,
|
|
2475
|
+
prior_execution=execution,
|
|
2476
|
+
prior_verification=verified,
|
|
2477
|
+
attempt=attempt + 1,
|
|
2478
|
+
)
|
|
2479
|
+
task_context.metadata["diagnosis"] = diagnosis
|
|
2480
|
+
task_context.metadata["plan"] = plan
|
|
2481
|
+
task_context.metadata["outcome"] = "blocked"
|
|
2482
|
+
artifact = last_execution or RecoveryExecutionArtifact(queue_type=item.queue_type, item_id=item.item_id, outcome="blocked", execution_summary="Recovery attempts exhausted", attempts_used=3)
|
|
2483
|
+
task_context.metadata["recovery_execution"] = artifact
|
|
2484
|
+
if last_verified is not None:
|
|
2485
|
+
task_context.metadata["pre_replay"] = last_verified
|
|
2486
|
+
self.save_output(artifact)
|
|
2487
|
+
return {"outcome": "blocked", "recovery_execution": artifact.model_dump(), "pre_replay": None if last_verified is None else last_verified.model_dump()}, task_context
|
|
2488
|
+
return _persist_node(node_id="recovery_execution", node_name="AgenticRecoveryExecution", fn=_run)
|
|
2489
|
+
|
|
2490
|
+
|
|
2491
|
+
class PublishRecoveryStateNode(Node):
    """Last node of the recovery DAG: publish the outcome and record result artifacts.

    Reads the artifacts that earlier nodes stored in ``task_context.metadata``,
    publishes a status for the outcome, persists the recovery handoff artifact
    and writes the ``result``, ``message`` and ``exit_code`` metadata entries.
    """

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the publishing step through ``_persist_node``."""

        def _run(_node_exec_id: str):
            event = task_context.event
            _store, run_id = _store_run()
            meta = task_context.metadata
            item: FailedQueueItemArtifact = meta["failed_item"]

            # Every publish call targets the item's DFS project when it is set,
            # otherwise the project id carried by the event.
            publish_target = item.dfs_project_id or event.project_id
            _publish_node(publish_target, run_id, "Publishing outcome", recovery_id=item.item_id)

            investigation: RecoveryInvestigationArtifact | None = meta.get("investigation")
            plan: RemediationPlanArtifact | None = meta.get("plan")
            reenqueue: ReenqueueArtifact | None = meta.get("reenqueue")
            recovery_execution: RecoveryExecutionArtifact | None = meta.get("recovery_execution")
            # A missing or empty outcome is treated as "blocked".
            outcome = str(meta.get("outcome") or "blocked")
            pre_replay: PreReplayCheckArtifact | None = meta.get("pre_replay")

            if outcome in ("reenqueued", "delegated"):
                # Both successful outcomes publish idle/completed and exit with 0;
                # only the summary text differs.
                if outcome == "reenqueued":
                    summary = f"Recovery re-enqueued failed {item.queue_type} item {item.item_id}."
                else:
                    summary = str(
                        meta.get("delegation_summary")
                        or f"Delegated {item.queue_type} item {item.item_id} to code error recovery."
                    )
                _publish(publish_target, run_id, "idle", "completed", summary, recovery_id=item.item_id)
                exit_code = 0
            else:
                # Use the first available explanation, in priority order:
                # pre-replay blocking reasons, execution verification summary,
                # investigation summary, plan summary, then a generic fallback.
                if pre_replay is not None and pre_replay.blocking_reasons:
                    reason = "; ".join(pre_replay.blocking_reasons)
                elif recovery_execution is not None and recovery_execution.verification_summary:
                    reason = recovery_execution.verification_summary
                elif investigation is not None and investigation.summary:
                    reason = investigation.summary
                elif plan is not None and plan.summary:
                    reason = plan.summary
                else:
                    reason = "recovery blocked"

                # Append the non-convergence note when the investigation has one.
                stall = None if investigation is None else investigation.non_convergence
                if stall is not None:
                    if reason:
                        reason = f"{reason}; non-convergence: {stall.reason}"
                    else:
                        reason = stall.reason

                summary = f"Recovery blocked for failed {item.queue_type} item {item.item_id}: {reason}"
                _publish(publish_target, run_id, "failed", "blocked", summary, reason, recovery_id=item.item_id)
                exit_code = 2

            diagnosis: RecoveryDiagnosisArtifact | None = meta.get("diagnosis")
            handoff_path = _persist_recovery_handoff_artifact(
                repo_root=Path(str(event.repo_root)),
                recovery_run_id=run_id,
                item=item,
                investigation=investigation,
                diagnosis=diagnosis,
                execution=recovery_execution,
                pre_replay=pre_replay,
            )
            # The helper's result is checked against None before it is used.
            handoff_ref = None if handoff_path is None else str(handoff_path)

            if recovery_execution is not None and handoff_path is not None:
                # Attach the handoff location to the execution artifact and
                # write the updated copy back into the metadata.
                recovery_execution = recovery_execution.model_copy(
                    update={
                        "recovery_handoff_artifact_path": str(handoff_path),
                        "recovery_handoff_summary": f"Persisted compact recovery handoff at {handoff_path}",
                    }
                )
                meta["recovery_execution"] = recovery_execution

            artifact = RecoveryOutcomeArtifact(
                queue_type=item.queue_type,
                item_id=item.item_id,
                project_id=item.project_id,
                outcome=outcome,  # type: ignore[arg-type]
                summary=summary,
                investigation=investigation,
                plan=plan,
                reenqueue=reenqueue,
                recovery_handoff_artifact_path=handoff_ref,
            )
            meta["result"] = artifact.model_dump()
            meta["message"] = json.dumps(artifact.model_dump(), sort_keys=True)
            meta["exit_code"] = exit_code
            self.save_output(artifact)
            return {"outcome": outcome, "summary": summary, "exit_code": exit_code}, task_context

        return _persist_node(node_id="publish_recovery_state", node_name="PublishRecoveryState", fn=_run)
|
|
2560
|
+
|
|
2561
|
+
|
|
2562
|
+
class FailureRecoveryWorkflow(Workflow):
    """Workflow definition for the failure-recovery DAG."""

    # Graph shape, as declared below:
    #   load -> systemic analysis -> investigation -> router
    #   router -> publish | root-cause investigation | per-item diagnosis
    #   root-cause investigation -> remediation -> bulk re-enqueue -> publish
    #   per-item diagnosis -> recovery execution -> publish
    workflow_schema = WorkflowSchema(
        description="Post-queue-drain failure recovery DAG — systemic-aware",
        event_schema=FailureRecoveryDagEvent,
        start=LoadFailedQueueItemNode,
        nodes=[
            NodeConfig(
                node=LoadFailedQueueItemNode,
                connections=[SystemicPatternAnalysisNode],
            ),
            NodeConfig(
                node=SystemicPatternAnalysisNode,
                connections=[AgenticFailureInvestigationNode],
            ),
            NodeConfig(
                node=AgenticFailureInvestigationNode,
                connections=[SystemicVsIsolatedRouter],
            ),
            # The only router node: it has three possible successors.
            NodeConfig(
                node=SystemicVsIsolatedRouter,
                connections=[
                    PublishRecoveryStateNode,
                    RootCauseCodeInvestigationNode,
                    AgenticRecoveryDiagnosisNode,
                ],
                is_router=True,
            ),
            NodeConfig(
                node=RootCauseCodeInvestigationNode,
                connections=[RemediationExecutionNode],
            ),
            NodeConfig(
                node=RemediationExecutionNode,
                connections=[BulkReenqueueNode],
            ),
            NodeConfig(
                node=BulkReenqueueNode,
                connections=[PublishRecoveryStateNode],
            ),
            NodeConfig(
                node=AgenticRecoveryDiagnosisNode,
                connections=[AgenticRecoveryExecutionNode],
            ),
            NodeConfig(
                node=AgenticRecoveryExecutionNode,
                connections=[PublishRecoveryStateNode],
            ),
            # Terminal node: no outgoing connections.
            NodeConfig(
                node=PublishRecoveryStateNode,
                connections=[],
            ),
        ],
    )
|
|
2580
|
+
|
|
2581
|
+
|
|
2582
|
+
def run_failure_recovery_dag(*, repo_root: Path, store: ExecutionStore, project_id: str, queue_type: str, item_id: str, run_id: str | None = None, strength: str | None = None) -> FailureRecoveryDagResult:
    """Run the failure-recovery workflow for one failed queue item.

    When ``run_id`` is not supplied, a run is created on ``store`` and this
    function marks it started and, at the end, finished. A caller-supplied
    ``run_id`` is used as-is and is never marked started or finished here.

    The store, run id, strength and repo root are placed in module-level
    globals while the workflow runs and are reset to ``None`` afterwards,
    whether the workflow succeeds or raises.

    Args:
        repo_root: Repository root; passed to the workflow as a string.
        store: Execution store used to create and finish the run.
        project_id: Project the failed item belongs to.
        queue_type: Queue type of the failed item.
        item_id: Identifier of the failed queue item.
        run_id: Existing run to attach to, or ``None`` to create one.
        strength: Stored in ``_CURRENT_STRENGTH`` for the duration of the run.

    Returns:
        A ``FailureRecoveryDagResult`` built from the ``exit_code``, ``result``
        and ``message`` entries of the workflow context metadata.

    Raises:
        Exception: Any exception raised by the workflow is re-raised after the
            failure has been published.
    """
    global _CURRENT_STORE, _CURRENT_RUN_ID, _CURRENT_STRENGTH, _CURRENT_REPO_ROOT

    owns_run = run_id is None
    if owns_run:
        run_id = store.create_run(
            dag_id=DAG_ID,
            dag_version="v2",
            root_correlation_id=f"corr_recovery_{queue_type}_{item_id}",
            config={"project_id": project_id, "queue_type": queue_type, "item_id": item_id},
        )
        store.mark_run_started(run_id=run_id)
    _publish(project_id, run_id, "running", "processing", f"Recovering failed {queue_type} queue item", recovery_id=item_id)

    workflow = FailureRecoveryWorkflow()
    # Set the module-level context for the duration of the workflow run.
    _CURRENT_STORE, _CURRENT_RUN_ID, _CURRENT_STRENGTH, _CURRENT_REPO_ROOT = store, run_id, strength, repo_root
    try:
        ctx = workflow.run(
            {
                "repo_root": str(repo_root),
                "project_id": project_id,
                "queue_type": queue_type,
                "item_id": item_id,
            }
        )
    except Exception as exc:
        # Only a run created by this call is marked finished here.
        if owns_run:
            store.mark_run_finished(run_id=run_id, status="failed")
        _publish(project_id, run_id, "failed", "failed", f"Recovery failed for {queue_type}:{item_id}", str(exc), recovery_id=item_id)
        raise
    finally:
        # Always clear the module-level context, on success and on failure.
        _CURRENT_STORE = _CURRENT_RUN_ID = _CURRENT_STRENGTH = _CURRENT_REPO_ROOT = None

    run_meta = ctx.metadata
    # A missing or falsy exit code counts as 0.
    exit_code = int(run_meta.get("exit_code") or 0)
    if owns_run:
        final_status = "succeeded" if exit_code == 0 else "failed"
        store.mark_run_finished(run_id=run_id, status=final_status)
    return FailureRecoveryDagResult(
        exit_code=exit_code,
        run_id=run_id,
        outcome=dict(run_meta.get("result") or {}),
        message=str(run_meta.get("message") or ""),
    )
|