devflow-engine 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393) hide show
  1. devflow_engine/__init__.py +3 -0
  2. devflow_engine/agentic_prompts.py +100 -0
  3. devflow_engine/agentic_runtime.py +398 -0
  4. devflow_engine/api_key_flow_harness.py +539 -0
  5. devflow_engine/api_keys.py +357 -0
  6. devflow_engine/bootstrap/__init__.py +2 -0
  7. devflow_engine/bootstrap/provision_from_template.py +84 -0
  8. devflow_engine/cli/__init__.py +0 -0
  9. devflow_engine/cli/app.py +7270 -0
  10. devflow_engine/core/__init__.py +0 -0
  11. devflow_engine/core/config.py +86 -0
  12. devflow_engine/core/logging.py +29 -0
  13. devflow_engine/core/paths.py +45 -0
  14. devflow_engine/core/toml_kv.py +33 -0
  15. devflow_engine/devflow_event_worker.py +1292 -0
  16. devflow_engine/devflow_state.py +201 -0
  17. devflow_engine/devin2/__init__.py +9 -0
  18. devflow_engine/devin2/agent_definition.py +120 -0
  19. devflow_engine/devin2/pi_runner.py +204 -0
  20. devflow_engine/devin_orchestration.py +69 -0
  21. devflow_engine/docs/prompts/anti-patterns.md +42 -0
  22. devflow_engine/docs/prompts/devin-agent-prompt.md +55 -0
  23. devflow_engine/docs/prompts/devin2-agent-prompt.md +81 -0
  24. devflow_engine/docs/prompts/examples/devin-vapi-clone-reference-exchange.json +85 -0
  25. devflow_engine/doctor/__init__.py +2 -0
  26. devflow_engine/doctor/triage.py +140 -0
  27. devflow_engine/error/__init__.py +0 -0
  28. devflow_engine/error/remediation.py +21 -0
  29. devflow_engine/errors/error_solver_dag.py +522 -0
  30. devflow_engine/errors/runtime_observability.py +67 -0
  31. devflow_engine/idea/__init__.py +4 -0
  32. devflow_engine/idea/actors.py +481 -0
  33. devflow_engine/idea/agentic.py +465 -0
  34. devflow_engine/idea/analyze.py +93 -0
  35. devflow_engine/idea/devin_chat_dag.py +1 -0
  36. devflow_engine/idea/diff.py +99 -0
  37. devflow_engine/idea/drafts.py +446 -0
  38. devflow_engine/idea/idea_creation_dag.py +643 -0
  39. devflow_engine/idea/ideation_enrichment.py +355 -0
  40. devflow_engine/idea/ideation_enrichment_worker.py +19 -0
  41. devflow_engine/idea/paths.py +28 -0
  42. devflow_engine/idea/promote.py +53 -0
  43. devflow_engine/idea/redaction.py +27 -0
  44. devflow_engine/idea/repo_tools.py +1277 -0
  45. devflow_engine/idea/response_mode.py +30 -0
  46. devflow_engine/idea/story_pipeline.py +1585 -0
  47. devflow_engine/idea/sufficiency.py +376 -0
  48. devflow_engine/idea/traditional_stories.py +1257 -0
  49. devflow_engine/implementation/__init__.py +0 -0
  50. devflow_engine/implementation/alembic_preflight.py +700 -0
  51. devflow_engine/implementation/dag.py +8450 -0
  52. devflow_engine/implementation/green_gate.py +93 -0
  53. devflow_engine/implementation/prompts.py +108 -0
  54. devflow_engine/implementation/test_runtime.py +623 -0
  55. devflow_engine/integration/__init__.py +19 -0
  56. devflow_engine/integration/agentic.py +66 -0
  57. devflow_engine/integration/dag.py +3539 -0
  58. devflow_engine/integration/prompts.py +114 -0
  59. devflow_engine/integration/supabase_schema.sql +31 -0
  60. devflow_engine/integration/supabase_sync.py +177 -0
  61. devflow_engine/llm/__init__.py +1 -0
  62. devflow_engine/llm/cli_one_shot.py +84 -0
  63. devflow_engine/llm/cli_stream.py +371 -0
  64. devflow_engine/llm/execution_context.py +26 -0
  65. devflow_engine/llm/invoke.py +1322 -0
  66. devflow_engine/llm/provider_api.py +304 -0
  67. devflow_engine/llm/repo_knowledge.py +588 -0
  68. devflow_engine/llm_primitives.py +315 -0
  69. devflow_engine/orchestration.py +62 -0
  70. devflow_engine/planning/__init__.py +0 -0
  71. devflow_engine/planning/analyze_repo.py +92 -0
  72. devflow_engine/planning/render_drafts.py +133 -0
  73. devflow_engine/playground/__init__.py +0 -0
  74. devflow_engine/playground/hooks.py +26 -0
  75. devflow_engine/playwright_workflow/__init__.py +5 -0
  76. devflow_engine/playwright_workflow/dag.py +1317 -0
  77. devflow_engine/process/__init__.py +5 -0
  78. devflow_engine/process/dag.py +59 -0
  79. devflow_engine/project_registration/__init__.py +3 -0
  80. devflow_engine/project_registration/dag.py +1581 -0
  81. devflow_engine/project_registry.py +109 -0
  82. devflow_engine/prompts/devin/generic/prompt.md +6 -0
  83. devflow_engine/prompts/devin/ideation/prompt.md +263 -0
  84. devflow_engine/prompts/devin/ideation/scenarios.md +5 -0
  85. devflow_engine/prompts/devin/ideation_loop/prompt.md +6 -0
  86. devflow_engine/prompts/devin/insight/prompt.md +11 -0
  87. devflow_engine/prompts/devin/insight/scenarios.md +5 -0
  88. devflow_engine/prompts/devin/intake/prompt.md +15 -0
  89. devflow_engine/prompts/devin/iterate/prompt.md +12 -0
  90. devflow_engine/prompts/devin/shared/eval_doctrine.md +9 -0
  91. devflow_engine/prompts/devin/shared/principles.md +246 -0
  92. devflow_engine/prompts/devin_eval/assessment/prompt.md +18 -0
  93. devflow_engine/prompts/idea/api_ideation_agent/prompt.md +8 -0
  94. devflow_engine/prompts/idea/api_insight_agent/prompt.md +8 -0
  95. devflow_engine/prompts/idea/response_doctrine/prompt.md +18 -0
  96. devflow_engine/prompts/implementation/dependency_assessment/prompt.md +12 -0
  97. devflow_engine/prompts/implementation/green/green/prompt.md +11 -0
  98. devflow_engine/prompts/implementation/green/node_config/prompt.md +3 -0
  99. devflow_engine/prompts/implementation/green_review/outcome_review/prompt.md +5 -0
  100. devflow_engine/prompts/implementation/green_review/prior_run_review/prompt.md +5 -0
  101. devflow_engine/prompts/implementation/red/prompt.md +27 -0
  102. devflow_engine/prompts/implementation/redreview/prompt.md +23 -0
  103. devflow_engine/prompts/implementation/redreview_repair/prompt.md +16 -0
  104. devflow_engine/prompts/implementation/setupdoc/prompt.md +10 -0
  105. devflow_engine/prompts/implementation/story_planning/prompt.md +13 -0
  106. devflow_engine/prompts/implementation/test_design/prompt.md +27 -0
  107. devflow_engine/prompts/integration/README.md +185 -0
  108. devflow_engine/prompts/integration/green/example.md +67 -0
  109. devflow_engine/prompts/integration/green/green/prompt.md +10 -0
  110. devflow_engine/prompts/integration/green/node_config/prompt.md +42 -0
  111. devflow_engine/prompts/integration/green/past_prompts/20260417T212300/green/prompt.md +15 -0
  112. devflow_engine/prompts/integration/green/past_prompts/20260417T212300/node_config/prompt.md +42 -0
  113. devflow_engine/prompts/integration/green_enrich/example.md +79 -0
  114. devflow_engine/prompts/integration/green_enrich/green_enrich/prompt.md +9 -0
  115. devflow_engine/prompts/integration/green_enrich/node_config/prompt.md +41 -0
  116. devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/green_enrich/prompt.md +14 -0
  117. devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/node_config/prompt.md +41 -0
  118. devflow_engine/prompts/integration/red/code_repair/prompt.md +12 -0
  119. devflow_engine/prompts/integration/red/example.md +152 -0
  120. devflow_engine/prompts/integration/red/node_config/prompt.md +86 -0
  121. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
  122. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/node_config/prompt.md +84 -0
  123. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red/prompt.md +16 -0
  124. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red_repair/prompt.md +15 -0
  125. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
  126. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/node_config/prompt.md +84 -0
  127. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/red_repair/prompt.md +11 -0
  128. devflow_engine/prompts/integration/red/red/prompt.md +11 -0
  129. devflow_engine/prompts/integration/red/red_repair/prompt.md +12 -0
  130. devflow_engine/prompts/integration/red_review/example.md +71 -0
  131. devflow_engine/prompts/integration/red_review/node_config/prompt.md +41 -0
  132. devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/node_config/prompt.md +41 -0
  133. devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/red_review/prompt.md +15 -0
  134. devflow_engine/prompts/integration/red_review/red_review/prompt.md +9 -0
  135. devflow_engine/prompts/integration/resolve/example.md +111 -0
  136. devflow_engine/prompts/integration/resolve/node_config/prompt.md +64 -0
  137. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/node_config/prompt.md +64 -0
  138. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_implicated_users/prompt.md +15 -0
  139. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_side_effects/prompt.md +15 -0
  140. devflow_engine/prompts/integration/resolve/resolve_implicated_users/prompt.md +10 -0
  141. devflow_engine/prompts/integration/resolve/resolve_side_effects/prompt.md +10 -0
  142. devflow_engine/prompts/integration/validate/build_idea_acceptance_coverage/prompt.md +12 -0
  143. devflow_engine/prompts/integration/validate/code_repair/prompt.md +13 -0
  144. devflow_engine/prompts/integration/validate/example.md +143 -0
  145. devflow_engine/prompts/integration/validate/node_config/prompt.md +87 -0
  146. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
  147. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/node_config/prompt.md +67 -0
  148. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_enrich_gate/prompt.md +17 -0
  149. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_repair/prompt.md +16 -0
  150. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
  151. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/node_config/prompt.md +67 -0
  152. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/validate_repair/prompt.md +9 -0
  153. devflow_engine/prompts/integration/validate/validate_enrich_gate/prompt.md +10 -0
  154. devflow_engine/prompts/integration/validate/validate_repair/prompt.md +20 -0
  155. devflow_engine/prompts/integration/write_workflows/example.md +100 -0
  156. devflow_engine/prompts/integration/write_workflows/node_config/prompt.md +44 -0
  157. devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/node_config/prompt.md +44 -0
  158. devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/write_workflows/prompt.md +17 -0
  159. devflow_engine/prompts/integration/write_workflows/write_workflows/prompt.md +11 -0
  160. devflow_engine/prompts/iterate/README.md +7 -0
  161. devflow_engine/prompts/iterate/coder/prompt.md +11 -0
  162. devflow_engine/prompts/iterate/framer/prompt.md +11 -0
  163. devflow_engine/prompts/iterate/iterator/prompt.md +13 -0
  164. devflow_engine/prompts/iterate/observer/prompt.md +11 -0
  165. devflow_engine/prompts/recovery/diagnosis/prompt.md +7 -0
  166. devflow_engine/prompts/recovery/execution/prompt.md +8 -0
  167. devflow_engine/prompts/recovery/execution_verification/prompt.md +7 -0
  168. devflow_engine/prompts/recovery/failure_investigation/prompt.md +10 -0
  169. devflow_engine/prompts/recovery/preflight_health_repo_repair/prompt.md +8 -0
  170. devflow_engine/prompts/recovery/remediation_execution/prompt.md +11 -0
  171. devflow_engine/prompts/recovery/root_cause_investigation/prompt.md +12 -0
  172. devflow_engine/prompts/scope_idea/doctrine/prompt.md +7 -0
  173. devflow_engine/prompts/source_doc_eval/document/prompt.md +6 -0
  174. devflow_engine/prompts/source_doc_eval/targeted_mutation/prompt.md +9 -0
  175. devflow_engine/prompts/source_doc_mutation/domain_entities/prompt.md +6 -0
  176. devflow_engine/prompts/source_doc_mutation/product_brief/prompt.md +6 -0
  177. devflow_engine/prompts/source_doc_mutation/project_doc_coherence/prompt.md +7 -0
  178. devflow_engine/prompts/source_doc_mutation/project_doc_render/prompt.md +9 -0
  179. devflow_engine/prompts/source_doc_mutation/source_doc_coherence/prompt.md +5 -0
  180. devflow_engine/prompts/source_doc_mutation/source_doc_enrichment_coherence/prompt.md +6 -0
  181. devflow_engine/prompts/source_doc_mutation/user_workflows/prompt.md +6 -0
  182. devflow_engine/prompts/source_scope/doctrine/prompt.md +10 -0
  183. devflow_engine/prompts/ui_grounding/doctrine/prompt.md +7 -0
  184. devflow_engine/recovery/__init__.py +3 -0
  185. devflow_engine/recovery/dag.py +2609 -0
  186. devflow_engine/recovery/models.py +220 -0
  187. devflow_engine/refactor.py +93 -0
  188. devflow_engine/registry/__init__.py +1 -0
  189. devflow_engine/registry/cards.py +238 -0
  190. devflow_engine/registry/domain_normalize.py +60 -0
  191. devflow_engine/registry/effects.py +65 -0
  192. devflow_engine/registry/enforce_report.py +150 -0
  193. devflow_engine/registry/module_cards_classify.py +164 -0
  194. devflow_engine/registry/module_cards_draft.py +184 -0
  195. devflow_engine/registry/module_cards_gate.py +59 -0
  196. devflow_engine/registry/packages.py +347 -0
  197. devflow_engine/registry/pathways.py +323 -0
  198. devflow_engine/review/__init__.py +11 -0
  199. devflow_engine/review/dag.py +588 -0
  200. devflow_engine/review/review_story.py +67 -0
  201. devflow_engine/scope_idea/__init__.py +3 -0
  202. devflow_engine/scope_idea/agentic.py +39 -0
  203. devflow_engine/scope_idea/dag.py +1069 -0
  204. devflow_engine/scope_idea/models.py +175 -0
  205. devflow_engine/skills/builtins/devflow/queue_failure_investigation/SKILL.md +112 -0
  206. devflow_engine/skills/builtins/devflow/queue_idea_to_story/SKILL.md +120 -0
  207. devflow_engine/skills/builtins/devflow/queue_integration/SKILL.md +105 -0
  208. devflow_engine/skills/builtins/devflow/queue_recovery/SKILL.md +108 -0
  209. devflow_engine/skills/builtins/devflow/queue_runtime_core/SKILL.md +155 -0
  210. devflow_engine/skills/builtins/devflow/queue_story_implementation/SKILL.md +122 -0
  211. devflow_engine/skills/builtins/devin/idea_to_story_handoff/SKILL.md +120 -0
  212. devflow_engine/skills/builtins/devin/ideation/SKILL.md +168 -0
  213. devflow_engine/skills/builtins/devin/ideation/state-and-phrasing-reference.md +18 -0
  214. devflow_engine/skills/builtins/devin/insight/SKILL.md +22 -0
  215. devflow_engine/skills/registry.example.yaml +42 -0
  216. devflow_engine/source_doc_assumptions.py +291 -0
  217. devflow_engine/source_doc_mutation_dag.py +1606 -0
  218. devflow_engine/source_doc_mutation_eval.py +417 -0
  219. devflow_engine/source_doc_mutation_worker.py +25 -0
  220. devflow_engine/source_docs_schema.py +207 -0
  221. devflow_engine/source_docs_updater.py +309 -0
  222. devflow_engine/source_scope/__init__.py +15 -0
  223. devflow_engine/source_scope/agentic.py +45 -0
  224. devflow_engine/source_scope/dag.py +1626 -0
  225. devflow_engine/source_scope/models.py +177 -0
  226. devflow_engine/stores/__init__.py +0 -0
  227. devflow_engine/stores/execution_store.py +3534 -0
  228. devflow_engine/story/__init__.py +0 -0
  229. devflow_engine/story/contracts.py +160 -0
  230. devflow_engine/story/discovery.py +47 -0
  231. devflow_engine/story/evidence.py +118 -0
  232. devflow_engine/story/hashing.py +27 -0
  233. devflow_engine/story/implemented_queue_purge.py +148 -0
  234. devflow_engine/story/indexer.py +105 -0
  235. devflow_engine/story/io.py +20 -0
  236. devflow_engine/story/markdown_contracts.py +298 -0
  237. devflow_engine/story/reconciliation.py +408 -0
  238. devflow_engine/story/validate_stories.py +149 -0
  239. devflow_engine/story/validate_tests_story.py +512 -0
  240. devflow_engine/story/validation.py +133 -0
  241. devflow_engine/ui_grounding/__init__.py +11 -0
  242. devflow_engine/ui_grounding/agentic.py +31 -0
  243. devflow_engine/ui_grounding/dag.py +874 -0
  244. devflow_engine/ui_grounding/models.py +224 -0
  245. devflow_engine/ui_grounding/pencil_bridge.py +247 -0
  246. devflow_engine/vendor/__init__.py +0 -0
  247. devflow_engine/vendor/datalumina_genai/__init__.py +11 -0
  248. devflow_engine/vendor/datalumina_genai/core/__init__.py +0 -0
  249. devflow_engine/vendor/datalumina_genai/core/exceptions.py +9 -0
  250. devflow_engine/vendor/datalumina_genai/core/nodes/__init__.py +0 -0
  251. devflow_engine/vendor/datalumina_genai/core/nodes/agent.py +48 -0
  252. devflow_engine/vendor/datalumina_genai/core/nodes/agent_streaming_node.py +26 -0
  253. devflow_engine/vendor/datalumina_genai/core/nodes/base.py +89 -0
  254. devflow_engine/vendor/datalumina_genai/core/nodes/concurrent.py +30 -0
  255. devflow_engine/vendor/datalumina_genai/core/nodes/router.py +69 -0
  256. devflow_engine/vendor/datalumina_genai/core/schema.py +72 -0
  257. devflow_engine/vendor/datalumina_genai/core/task.py +52 -0
  258. devflow_engine/vendor/datalumina_genai/core/validate.py +139 -0
  259. devflow_engine/vendor/datalumina_genai/core/workflow.py +200 -0
  260. devflow_engine/worker.py +1086 -0
  261. devflow_engine/worker_guard.py +233 -0
  262. devflow_engine-1.0.0.dist-info/METADATA +235 -0
  263. devflow_engine-1.0.0.dist-info/RECORD +393 -0
  264. devflow_engine-1.0.0.dist-info/WHEEL +4 -0
  265. devflow_engine-1.0.0.dist-info/entry_points.txt +3 -0
  266. devin/__init__.py +6 -0
  267. devin/dag.py +58 -0
  268. devin/dag_two_arm.py +138 -0
  269. devin/devin_chat_scenario_catalog.json +588 -0
  270. devin/devin_eval.py +677 -0
  271. devin/nodes/__init__.py +0 -0
  272. devin/nodes/ideation/__init__.py +0 -0
  273. devin/nodes/ideation/node.py +195 -0
  274. devin/nodes/ideation/playground.py +267 -0
  275. devin/nodes/ideation/prompt.md +65 -0
  276. devin/nodes/ideation/scenarios/continue_refinement.py +13 -0
  277. devin/nodes/ideation/scenarios/continue_refinement_evals.py +18 -0
  278. devin/nodes/ideation/scenarios/idea_fits_existing_patterns.py +17 -0
  279. devin/nodes/ideation/scenarios/idea_fits_existing_patterns_evals.py +16 -0
  280. devin/nodes/ideation/scenarios/large_idea_split.py +4 -0
  281. devin/nodes/ideation/scenarios/large_idea_split_evals.py +17 -0
  282. devin/nodes/ideation/scenarios/source_documentation_added.py +4 -0
  283. devin/nodes/ideation/scenarios/source_documentation_added_evals.py +16 -0
  284. devin/nodes/ideation/scenarios/user_says_create_it.py +30 -0
  285. devin/nodes/ideation/scenarios/user_says_create_it_evals.py +23 -0
  286. devin/nodes/ideation/scenarios/vague_idea.py +16 -0
  287. devin/nodes/ideation/scenarios/vague_idea_evals.py +47 -0
  288. devin/nodes/ideation/tools.json +312 -0
  289. devin/nodes/insight/__init__.py +0 -0
  290. devin/nodes/insight/node.py +49 -0
  291. devin/nodes/insight/playground.py +154 -0
  292. devin/nodes/insight/prompt.md +61 -0
  293. devin/nodes/insight/scenarios/architecture_pattern_query.py +15 -0
  294. devin/nodes/insight/scenarios/architecture_pattern_query_evals.py +25 -0
  295. devin/nodes/insight/scenarios/codebase_exploration.py +15 -0
  296. devin/nodes/insight/scenarios/codebase_exploration_evals.py +23 -0
  297. devin/nodes/insight/scenarios/devin_ideation_routing.py +19 -0
  298. devin/nodes/insight/scenarios/devin_ideation_routing_evals.py +39 -0
  299. devin/nodes/insight/scenarios/devin_insight_routing.py +20 -0
  300. devin/nodes/insight/scenarios/devin_insight_routing_evals.py +40 -0
  301. devin/nodes/insight/scenarios/operational_debugging.py +15 -0
  302. devin/nodes/insight/scenarios/operational_debugging_evals.py +23 -0
  303. devin/nodes/insight/scenarios/operational_question.py +9 -0
  304. devin/nodes/insight/scenarios/operational_question_evals.py +8 -0
  305. devin/nodes/insight/scenarios/queue_status.py +15 -0
  306. devin/nodes/insight/scenarios/queue_status_evals.py +23 -0
  307. devin/nodes/insight/scenarios/source_doc_explanation.py +14 -0
  308. devin/nodes/insight/scenarios/source_doc_explanation_evals.py +21 -0
  309. devin/nodes/insight/scenarios/worker_state_check.py +15 -0
  310. devin/nodes/insight/scenarios/worker_state_check_evals.py +22 -0
  311. devin/nodes/insight/tools.json +126 -0
  312. devin/nodes/intake/__init__.py +0 -0
  313. devin/nodes/intake/node.py +27 -0
  314. devin/nodes/intake/playground.py +47 -0
  315. devin/nodes/intake/prompt.md +12 -0
  316. devin/nodes/intake/scenarios/ideation_routing.py +4 -0
  317. devin/nodes/intake/scenarios/ideation_routing_evals.py +5 -0
  318. devin/nodes/intake/scenarios/insight_routing.py +4 -0
  319. devin/nodes/intake/scenarios/insight_routing_evals.py +5 -0
  320. devin/nodes/iterate/README.md +44 -0
  321. devin/nodes/iterate/__init__.py +1 -0
  322. devin/nodes/iterate/_archived_design_stages/01-objectives-requirements.md +112 -0
  323. devin/nodes/iterate/_archived_design_stages/02-evals.md +131 -0
  324. devin/nodes/iterate/_archived_design_stages/03-tools-and-boundaries.md +110 -0
  325. devin/nodes/iterate/_archived_design_stages/04-harness-and-playground.md +32 -0
  326. devin/nodes/iterate/_archived_design_stages/05-prompt-deferred.md +11 -0
  327. devin/nodes/iterate/_archived_design_stages/coder_agent_design/01-objectives-requirements.md +20 -0
  328. devin/nodes/iterate/_archived_design_stages/coder_agent_design/02-evals.md +8 -0
  329. devin/nodes/iterate/_archived_design_stages/coder_agent_design/03-tools-and-boundaries.md +14 -0
  330. devin/nodes/iterate/_archived_design_stages/coder_agent_design/04-harness-and-playground.md +12 -0
  331. devin/nodes/iterate/_archived_design_stages/framer_agent_design/01-objectives-requirements.md +20 -0
  332. devin/nodes/iterate/_archived_design_stages/framer_agent_design/02-evals.md +8 -0
  333. devin/nodes/iterate/_archived_design_stages/framer_agent_design/03-tools-and-boundaries.md +13 -0
  334. devin/nodes/iterate/_archived_design_stages/framer_agent_design/04-harness-and-playground.md +12 -0
  335. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/01-objectives-requirements.md +25 -0
  336. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/02-evals.md +9 -0
  337. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/03-tools-and-boundaries.md +14 -0
  338. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/04-harness-and-playground.md +12 -0
  339. devin/nodes/iterate/_archived_design_stages/observer_agent_design/01-objectives-requirements.md +20 -0
  340. devin/nodes/iterate/_archived_design_stages/observer_agent_design/02-evals.md +8 -0
  341. devin/nodes/iterate/_archived_design_stages/observer_agent_design/03-tools-and-boundaries.md +14 -0
  342. devin/nodes/iterate/_archived_design_stages/observer_agent_design/04-harness-and-playground.md +13 -0
  343. devin/nodes/iterate/agent-roles.md +89 -0
  344. devin/nodes/iterate/agents/README.md +10 -0
  345. devin/nodes/iterate/artifacts.md +504 -0
  346. devin/nodes/iterate/contract.md +100 -0
  347. devin/nodes/iterate/eval-plan.md +74 -0
  348. devin/nodes/iterate/node.py +100 -0
  349. devin/nodes/iterate/pipeline/README.md +13 -0
  350. devin/nodes/iterate/playground-contract.md +76 -0
  351. devin/nodes/iterate/prompt.md +11 -0
  352. devin/nodes/iterate/scenarios/README.md +38 -0
  353. devin/nodes/iterate/scenarios/artifact-and-loop-scenarios.md +101 -0
  354. devin/nodes/iterate/scenarios/coder_artifact_alignment.py +32 -0
  355. devin/nodes/iterate/scenarios/coder_artifact_alignment_evals.py +45 -0
  356. devin/nodes/iterate/scenarios/coder_bounded_fix.py +27 -0
  357. devin/nodes/iterate/scenarios/coder_bounded_fix_evals.py +45 -0
  358. devin/nodes/iterate/scenarios/devin_iterate_routing.py +21 -0
  359. devin/nodes/iterate/scenarios/devin_iterate_routing_evals.py +36 -0
  360. devin/nodes/iterate/scenarios/framer_scope_boundary.py +25 -0
  361. devin/nodes/iterate/scenarios/framer_scope_boundary_evals.py +57 -0
  362. devin/nodes/iterate/scenarios/framer_task_framing.py +25 -0
  363. devin/nodes/iterate/scenarios/framer_task_framing_evals.py +58 -0
  364. devin/nodes/iterate/scenarios/iterate_error_fix.py +21 -0
  365. devin/nodes/iterate/scenarios/iterate_error_fix_evals.py +39 -0
  366. devin/nodes/iterate/scenarios/iterate_quick_change.py +21 -0
  367. devin/nodes/iterate/scenarios/iterate_quick_change_evals.py +35 -0
  368. devin/nodes/iterate/scenarios/iterate_to_idea_promotion.py +23 -0
  369. devin/nodes/iterate/scenarios/iterate_to_idea_promotion_evals.py +53 -0
  370. devin/nodes/iterate/scenarios/iterate_to_insight_reroute.py +23 -0
  371. devin/nodes/iterate/scenarios/iterate_to_insight_reroute_evals.py +53 -0
  372. devin/nodes/iterate/scenarios/observer_evidence_seam.py +28 -0
  373. devin/nodes/iterate/scenarios/observer_evidence_seam_evals.py +55 -0
  374. devin/nodes/iterate/scenarios/observer_repro_creation.py +28 -0
  375. devin/nodes/iterate/scenarios/observer_repro_creation_evals.py +45 -0
  376. devin/nodes/iterate/scenarios/routing-matrix.md +45 -0
  377. devin/nodes/shared/__init__.py +0 -0
  378. devin/nodes/shared/filemaker_expert.md +80 -0
  379. devin/nodes/shared/filemaker_expert.py +354 -0
  380. devin/nodes/shared/filemaker_expert_eval/runner.py +176 -0
  381. devin/nodes/shared/filemaker_expert_eval/scenarios.json +65 -0
  382. devin/nodes/shared/goldilocks_advisor_eval/runner.py +214 -0
  383. devin/nodes/shared/goldilocks_advisor_eval/scenarios.json +58 -0
  384. devin/nodes/shared/helpers.py +156 -0
  385. devin/nodes/shared/idea_compliance_advisor_eval/runner.py +252 -0
  386. devin/nodes/shared/idea_compliance_advisor_eval/scenarios.json +75 -0
  387. devin/nodes/shared/models.py +44 -0
  388. devin/nodes/shared/post.py +40 -0
  389. devin/nodes/shared/router.py +107 -0
  390. devin/nodes/shared/tools.py +191 -0
  391. devin/shared/devin-chat-rubric.md +237 -0
  392. devin/shared/devin-chat-scenario-suite.md +90 -0
  393. devin/shared/eval_doctrine.md +9 -0
@@ -0,0 +1,2609 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import sqlite3
7
+ import subprocess
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Any
11
+ from urllib import error as urllib_error
12
+ from urllib import request as urllib_request
13
+
14
+ from pydantic import BaseModel
15
+
16
+ from ..agentic_prompts import load_agentic_prompt_lines
17
+ from ..agentic_runtime import run_agent_step
18
+ from ..devflow_state import publish_devflow_state
19
+ from ..implementation.dag import LocalSetupContract, _get_docker_service_logs
20
+ from ..llm.cli_stream import llm_sessions_db
21
+ from ..implementation.test_runtime import (
22
+ discover_story_scoped_test_paths,
23
+ load_story_test_runtime_contract,
24
+ normalize_recovery_story_runtime_contract,
25
+ persist_story_runtime_contract,
26
+ resolve_story_runtime_contract,
27
+ story_test_runtime_contract_path,
28
+ )
29
+ from ..stores.execution_store import ExecutionStore
30
+ from ..vendor.datalumina_genai.core.nodes.agent import AgentConfig, AgentNode
31
+ from ..vendor.datalumina_genai.core.nodes.base import Node
32
+ from ..vendor.datalumina_genai.core.nodes.router import BaseRouter, RouterNode
33
+ from ..vendor.datalumina_genai.core.schema import NodeConfig, WorkflowSchema
34
+ from ..vendor.datalumina_genai.core.task import TaskContext
35
+ from ..vendor.datalumina_genai.core.workflow import Workflow
36
+ from .models import (
37
+ FailedQueueItemArtifact,
38
+ RecoveryInvestigationArtifact,
39
+ RecoveryNonConvergenceArtifact,
40
+ RecoveryDiagnosisArtifact,
41
+ RecoverySuccessCriterion,
42
+ RecoveryExecutionArtifact,
43
+ PreReplayCheckArtifact,
44
+ RecoveryOutcomeArtifact,
45
+ RemediationPlanArtifact,
46
+ ReenqueueArtifact,
47
+ RecoveryHandoffArtifact,
48
+ SystemicPatternArtifact,
49
+ CodeRootCauseArtifact,
50
+ RemediationResultArtifact,
51
+ )
52
+
53
# Identifier of this workflow.
DAG_ID = "post_queue_failure_recovery_dag"

# Per-run module bindings. All start unset; `_store_run()` and `_repo_root()`
# raise RuntimeError while the bindings they read are still None.
# NOTE(review): the code that assigns these is outside this view.
_CURRENT_STORE: ExecutionStore | None = None
_CURRENT_RUN_ID: str | None = None
_CURRENT_STRENGTH: str | None = None
_CURRENT_REPO_ROOT: Path | None = None

# Maps a queue kind to its ``(table_name, id_column)`` pair. Every entry
# follows the same convention: ``<kind>_queue`` / ``<kind>_queue_id``.
_QUEUE_TABLE_BY_KIND = {
    kind: (f"{kind}_queue", f"{kind}_queue_id")
    for kind in (
        "scope",
        "idea_creation",
        "idea",
        "story",
        "integration",
        "recovery",
    )
}
67
+
68
+
69
class FailureRecoveryDagEvent(BaseModel):
    """Input event for the failure-recovery DAG; all fields are required strings."""

    repo_root: str
    project_id: str
    # NOTE(review): presumably one of the keys of _QUEUE_TABLE_BY_KIND — confirm against callers.
    queue_type: str
    item_id: str
74
+
75
+
76
@dataclass(frozen=True)
class FailureRecoveryDagResult:
    """Immutable (frozen) result record for a failure-recovery DAG run."""

    # NOTE(review): presumably 0 means success, process-exit style — confirm against the producer.
    exit_code: int
    run_id: str
    outcome: dict[str, Any]
    message: str
82
+
83
+
84
def _store_run() -> tuple[ExecutionStore, str]:
    """Return the ``(store, run_id)`` pair bound for the current recovery run.

    Raises:
        RuntimeError: if either module-level binding is still ``None``.
    """
    store, run_id = _CURRENT_STORE, _CURRENT_RUN_ID
    if store is not None and run_id is not None:
        return store, run_id
    raise RuntimeError("recovery workflow missing runtime bindings")
88
+
89
+
90
def _repo_root() -> Path:
    """Return the repository root bound for the current recovery run.

    Raises:
        RuntimeError: if the module-level repo-root binding is still ``None``.
    """
    root = _CURRENT_REPO_ROOT
    if root is not None:
        return root
    raise RuntimeError("recovery workflow missing repo root binding")
94
+
95
+
96
def _persist_node(*, node_id: str, node_name: str, fn):
    """Run ``fn`` as one recorded node attempt and return its task context.

    A node attempt (always ``attempt=1``) is created in the bound execution
    store, then ``fn(node_exec_id)`` is called; it must return an
    ``(output, task_context)`` pair.

    Args:
        node_id: Identifier recorded for the node attempt.
        node_name: Name recorded for the node attempt.
        fn: Callable taking the node execution id.

    Returns:
        The ``task_context`` element of ``fn``'s result.

    Raises:
        RuntimeError: if the store/run bindings are missing (from ``_store_run``).
        Exception: anything raised while calling ``fn`` or unpacking its
            result is re-raised after the attempt is marked ``"failed"``.
    """
    execution_store, active_run_id = _store_run()
    attempt_id = execution_store.create_node_attempt(
        run_id=active_run_id,
        node_id=node_id,
        node_name=node_name,
        attempt=1,
    )
    try:
        node_output, next_context = fn(attempt_id)
    except Exception as failure:
        # Record the failure before propagating so the attempt is never left open.
        execution_store.mark_node_finished(
            node_exec_id=attempt_id,
            status="failed",
            error={"message": str(failure)},
        )
        raise
    else:
        execution_store.mark_node_finished(
            node_exec_id=attempt_id,
            status="succeeded",
            output=node_output,
        )
        return next_context
106
+
107
+
108
def _recovery_display_path(recovery_id: str) -> str:
    """Build the display path string used when publishing state for a recovery."""
    display_path = f"recovery:recovery_{recovery_id}"
    return display_path
110
+
111
+
112
def _publish(
    project_id: str,
    run_id: str,
    state: str,
    status: str,
    summary: str,
    error: str | None = None,
    recovery_id: str | None = None,
) -> None:
    """Publish the run's state, status and summary via ``publish_devflow_state``.

    The update is always published with ``display="project"``. The display
    path is derived from ``recovery_id`` and falls back to ``run_id`` when
    ``recovery_id`` is empty or ``None``.
    """
    display_path = _recovery_display_path(recovery_id or run_id)
    publish_devflow_state(
        project_id=project_id,
        run_id=run_id,
        current_state=state,
        current_status=status,
        run_summary=summary,
        error_message=error,
        display="project",
        display_path=display_path,
    )
123
+
124
+
125
def _publish_node(
    project_id: str,
    run_id: str,
    summary: str,
    recovery_id: str | None = None,
) -> None:
    """Publish an in-progress update (state ``running`` / status ``processing``).

    No ``error_message`` is passed to ``publish_devflow_state``. The display
    path is derived from ``recovery_id`` and falls back to ``run_id`` when
    ``recovery_id`` is empty or ``None``.
    """
    display_path = _recovery_display_path(recovery_id or run_id)
    publish_devflow_state(
        project_id=project_id,
        run_id=run_id,
        current_state="running",
        current_status="processing",
        run_summary=summary,
        display="project",
        display_path=display_path,
    )
135
+
136
+
137
def _normalize_text(value: Any) -> str:
    """Return ``value`` as a stripped, lower-cased string.

    Any falsy value (``None``, ``""``, ``0``, ``False``, empty containers)
    normalizes to the empty string.
    """
    text = str(value) if value else ""
    return text.strip().lower()
139
+
140
+
141
# Substrings (matched against normalized, lower-cased text) that mark a
# blocking reason as being about provenance / identity certainty.
_SOFT_PROVENANCE_REASON_MARKERS = (
    "provenance",
    "byte identity",
    "byte-ident",
    "byte-for-byte",
    "forensic",
    "checksum",
    "digest",
    "hash certainty",
    "identity certainty",
    "cannot prove byte identity",
    "cannot prove identity",
    "cannot prove provenance",
    "cannot verify provenance",
    "audit trail",
    "metadata certainty",
)

# Substrings that veto the "soft" classification: if any of these appears in
# a reason, the reason is not treated as soft even when a provenance marker
# is also present.
_HARD_BLOCK_REASON_MARKERS = (
    "manual review",
    "human",
    "operator",
    "approval",
    "contradict",
    "conflict",
    "inconsistent",
    "mismatch",
    "failed",
    "failure",
    "not viable",
    "unsafe",
    "missing artifact",
    "missing runtime",
    "unauthorized",
    "forbidden",
    "denied",
    "exhausted",
)


def _is_soft_provenance_reason(reason: str) -> bool:
    """Tell whether ``reason`` is a provenance-only ("soft") blocking reason.

    The reason is normalized with ``_normalize_text`` and then matched by
    substring. It is soft only when it is non-empty, contains at least one
    soft provenance marker, and contains no hard-block marker.
    """
    text = _normalize_text(reason)
    if not text:
        return False
    if not any(marker in text for marker in _SOFT_PROVENANCE_REASON_MARKERS):
        return False
    # A single hard-block marker overrides any provenance wording.
    return not any(marker in text for marker in _HARD_BLOCK_REASON_MARKERS)
186
+
187
+
188
def _verification_allows_reenqueue(
    *,
    execution: RecoveryExecutionArtifact,
    verified: PreReplayCheckArtifact,
    diagnosis: RecoveryDiagnosisArtifact | None,
) -> bool:
    """Decide whether a re-enqueue outcome may stand after pre-replay verification.

    Returns True when the execution outcome is "reenqueued" and either the
    verification is ready, or every non-blank blocking reason is a soft
    provenance reason (provided checks were run, at least one reason exists,
    and the diagnosis did not ask for manual review).
    """
    if execution.outcome != "reenqueued":
        return False
    if verified.ready:
        return True
    manual_review = diagnosis is not None and diagnosis.suggested_action == "manual_review_required"
    if manual_review or not verified.checks:
        return False
    stripped_reasons = (str(reason).strip() for reason in verified.blocking_reasons)
    reasons = [text for text in stripped_reasons if text]
    return bool(reasons) and all(_is_soft_provenance_reason(text) for text in reasons)
206
+
207
+
208
def _normalized_failure_signature(*, failure_message: str | None, failure_context: dict[str, Any] | None) -> str:
    """Build a compact signature string for a failure.

    The base label is, in priority order: ``prompt_too_long`` when the message
    contains "Prompt is too long"; ``name_error:<error_type>`` when it contains
    "NameError"; the lower-cased, underscore-joined error type (max 80 chars);
    or the first 80 characters of the message, lower-cased and underscore-joined.
    When a failed node/stage is known, the result is ``<node>:<label>``.
    """
    context = failure_context if isinstance(failure_context, dict) else {}

    def _field(name: str) -> str:
        return str(context.get(name) or "").strip()

    text = str(failure_message or "").strip()
    kind = _field("error_type")
    stage = _field("failed_stage").lower()
    node = _field("actual_failed_node").lower() or stage

    if "Prompt is too long" in text:
        label = "prompt_too_long"
    elif "NameError" in text:
        label = "name_error:" + (kind or "unknown")
    elif kind:
        label = kind.lower().replace(" ", "_")[:80]
    else:
        label = text[:80].lower().replace(" ", "_")

    if not node:
        return label
    return f"{node}:{label}"
223
+
224
+
225
# Occurrence count at or above which the churn gate state reports ``threshold_met``.
_RECOVERY_CHURN_GATE_THRESHOLD = 3
# Version tag embedded in the churn key and in the churn fingerprint inputs.
_RECOVERY_CHURN_GATE_VERSION = 1
227
+
228
+
229
def _durable_recovery_identity(item: FailedQueueItemArtifact) -> str:
    """Return an identity string for ``item`` of the form ``<prefix>:<id>``.

    Story items with a non-blank story id yield ``story:<story_id>``. Otherwise
    the first non-blank of ``scope_id``, ``idea_id``, ``integration_id`` in the
    raw row is used, falling back to the queue item id.
    """
    story_id = str(item.story_id or "").strip()
    if item.queue_type == "story" and story_id:
        return f"story:{story_id}"

    raw_row = item.raw_row if isinstance(item.raw_row, dict) else {}
    candidates = (str(raw_row.get(key) or "").strip() for key in ("scope_id", "idea_id", "integration_id"))
    durable_id = next((value for value in candidates if value), None)
    if durable_id is None:
        durable_id = item.item_id
    return f"{item.queue_type}:{durable_id}"
238
+
239
+
240
def _recovery_churn_key(*, item: FailedQueueItemArtifact, failure_signature: str) -> str:
    """Compose the churn gate key from the gate version, durable identity and failure signature."""
    segments = (
        "recovery_churn_gate",
        f"v{_RECOVERY_CHURN_GATE_VERSION}",
        f"{_durable_recovery_identity(item)}",
        f"{failure_signature}",
        "no_material_change",
    )
    return ":".join(segments)
245
+
246
+
247
def _recovery_churn_fingerprint_inputs(*, item: FailedQueueItemArtifact, failure_signature: str) -> dict[str, Any]:
    """Return the dict of inputs used to fingerprint a churn gate error task."""
    durable_identity = _durable_recovery_identity(item)
    return dict(
        surface="recovery_churn_gate",
        version=_RECOVERY_CHURN_GATE_VERSION,
        queue_type=item.queue_type,
        durable_identity=durable_identity,
        failure_signature=failure_signature,
        no_material_change=True,
    )
256
+
257
+
258
def _load_recovery_churn_gate_state(
    *,
    store: ExecutionStore,
    project_id: str,
    item: FailedQueueItemArtifact,
    failure_signature: str,
) -> dict[str, Any]:
    """Read the current churn gate state for ``item`` and ``failure_signature``.

    Looks up the most recently created ``error_tasks`` row matching the project
    and the churn fingerprint, and returns its id, status and occurrence count
    together with the fingerprint, churn key, threshold and whether the
    threshold has been reached. With no matching row the id and status are
    None and the count is 0.
    """
    fingerprint_inputs = _recovery_churn_fingerprint_inputs(item=item, failure_signature=failure_signature)
    fingerprint = store._fingerprint_from_inputs(fingerprint_inputs)  # type: ignore[attr-defined]
    lookup_sql = (
        "SELECT error_task_id, status, occurrence_count "
        "FROM error_tasks WHERE project_id=? AND fingerprint=? "
        "ORDER BY created_at DESC LIMIT 1"
    )
    with store._connect() as conn:
        row = conn.execute(lookup_sql, (project_id, fingerprint)).fetchone()

    if row is None:
        task_id = None
        task_status = None
        occurrences = 0
    else:
        task_id = str(row["error_task_id"] or "")
        task_status = str(row["status"] or "")
        occurrences = int(row["occurrence_count"] or 0)

    return {
        "fingerprint": fingerprint,
        "churn_key": _recovery_churn_key(item=item, failure_signature=failure_signature),
        "error_task_id": task_id,
        "status": task_status,
        "occurrence_count": occurrences,
        "threshold": _RECOVERY_CHURN_GATE_THRESHOLD,
        "threshold_met": row is not None and occurrences >= _RECOVERY_CHURN_GATE_THRESHOLD,
    }
286
+
287
+
288
def _record_recovery_churn_strike(
    *,
    store: ExecutionStore,
    project_id: str,
    run_id: str,
    item: FailedQueueItemArtifact,
    failure_signature: str,
    message: str,
) -> dict[str, Any]:
    """Record one churn strike as an error task and return the refreshed gate state.

    The error task is created through ``store.create_error_task_from_failure``
    using the churn fingerprint inputs; the gate state is then reloaded and its
    ``error_task_id`` is overwritten with the id returned by the store.
    """
    identity = _durable_recovery_identity(item)
    fingerprint_inputs = _recovery_churn_fingerprint_inputs(item=item, failure_signature=failure_signature)
    guidance_steps = [
        "Review repeated recovery churn on the same durable identity.",
        "Apply a material fix before retrying recovery again.",
    ]
    created_task_id = store.create_error_task_from_failure(
        project_id=project_id,
        run_id=run_id,
        plane="process_error",
        source_kind="recovery",
        source_ref=item.item_id,
        title=f"Recovery churn gate: {identity}",
        severity="high",
        error_type="recovery_churn_no_material_change",
        message=message,
        stacktrace=None,
        next_steps=guidance_steps,
        fingerprint_inputs=fingerprint_inputs,
    )
    gate_state = _load_recovery_churn_gate_state(
        store=store,
        project_id=project_id,
        item=item,
        failure_signature=failure_signature,
    )
    gate_state["error_task_id"] = created_task_id
    return gate_state
322
+
323
+
324
def _decode_failure_context_blob(raw: Any) -> dict[str, Any]:
    """Decode a stored failure context into a dict.

    A dict input is shallow-copied. Anything else is stringified and parsed as
    JSON; the result is returned only when it is a dict. Any error, or a
    non-dict JSON value, yields an empty dict.
    """
    if isinstance(raw, dict):
        return dict(raw)
    try:
        text = str(raw or "{}") or "{}"
        decoded = json.loads(text)
    except Exception:
        return {}
    if isinstance(decoded, dict):
        return decoded
    return {}
332
+
333
+
334
def _story_churn_source_item_ids(*, conn, item: FailedQueueItemArtifact) -> list[str]:
    """Return the queue item ids that belong to the same story as ``item``.

    For a story item with a non-blank story id, every non-blank
    ``story_queue_id`` in ``story_queue`` for that story is returned. When the
    item is not a story, or no ids are found, the result is ``[item.item_id]``.
    """
    story_id = str(item.story_id or "").strip()
    if item.queue_type == "story" and story_id:
        cursor = conn.execute(
            "SELECT story_queue_id FROM story_queue WHERE story_id=?",
            (story_id,),
        )
        queue_ids: list[str] = []
        for row in cursor.fetchall():
            queue_id = str(row["story_queue_id"] or "")
            if queue_id.strip():
                queue_ids.append(queue_id)
        if queue_ids:
            return queue_ids
    return [item.item_id]
344
+
345
+
346
def _collect_failure_evidence(
    *,
    item: FailedQueueItemArtifact,
    investigation: RecoveryInvestigationArtifact | None = None,
    extra_evidence: list[str] | None = None,
) -> list[str]:
    """Gather non-blank evidence strings describing a failed queue item.

    Order: failure message, context ``error``/``error_type``/``failed_stage``,
    the whole context as sorted-key JSON, the investigation summary and failure
    nature (when given), then any extra evidence. Each entry is stringified and
    stripped; blank entries are dropped.
    """
    context = item.failure_context if isinstance(item.failure_context, dict) else {}
    candidates: list[Any] = [
        item.failure_message,
        context.get("error"),
        context.get("error_type"),
        context.get("failed_stage"),
    ]
    if context:
        candidates.append(json.dumps(context, sort_keys=True))
    if investigation is not None:
        candidates.append(investigation.summary)
        candidates.append(investigation.failure_nature)
    candidates.extend(extra_evidence or [])

    stripped = (str(candidate or "").strip() for candidate in candidates)
    return [text for text in stripped if text]
368
+
369
+
370
def _load_json_file(path: Path) -> dict[str, Any] | None:
    """Read ``path`` as UTF-8 JSON and return it when it decodes to a dict.

    Returns None when the file cannot be read or parsed, or when the decoded
    value is not a dict.
    """
    try:
        raw_text = path.read_text(encoding="utf-8")
        decoded = json.loads(raw_text)
    except Exception:
        return None
    if isinstance(decoded, dict):
        return decoded
    return None
376
+
377
+
378
def _preflight_artifact_paths(*, repo_root: Path, item: FailedQueueItemArtifact) -> list[Path]:
    """List candidate preflight artifact paths for ``item``, de-duplicated in order.

    Candidates are the ``artifact_paths`` entries of the failure context
    (relative entries are resolved against ``repo_root``), followed by
    ``<repo_root>/.devflow/stories/<story_id>/preflight.json`` when the item
    has a story id.

    ``artifact_paths`` is normally a list. A bare string is treated as a single
    path rather than being iterated character by character, and any other
    non-list/tuple value is ignored instead of raising.
    """
    failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
    raw_paths = failure_context.get("artifact_paths") or []
    if isinstance(raw_paths, str):
        raw_paths = [raw_paths]
    elif not isinstance(raw_paths, (list, tuple)):
        raw_paths = []

    candidates: list[Path] = []
    for raw_path in raw_paths:
        text = str(raw_path or "").strip()
        if not text:
            continue
        path = Path(text)
        candidates.append(path if path.is_absolute() else repo_root / path)

    story_id = str(item.story_id or "").strip()
    if story_id:
        candidates.append(repo_root / ".devflow" / "stories" / story_id / "preflight.json")

    # De-duplicate on the string form while keeping first-seen order.
    unique: dict[str, Path] = {}
    for candidate in candidates:
        unique.setdefault(str(candidate), candidate)
    return list(unique.values())
399
+
400
+
401
def _load_preflight_health_failure_details(*, repo_root: Path, item: FailedQueueItemArtifact) -> dict[str, Any] | None:
    """Find the first preflight report that lists a failed health check.

    Only items whose ``failed_stage`` is "preflight" are considered. Each
    candidate artifact is loaded in order; the first whose ``blocking_issues``
    contains dict entries of kind ``health_check_failed`` is returned as a dict
    with ``artifact_path``, ``report`` and ``health_issues``. Returns None
    otherwise.
    """
    context = item.failure_context if isinstance(item.failure_context, dict) else {}
    stage = str(context.get("failed_stage") or "").strip().lower()
    if stage != "preflight":
        return None

    for artifact_path in _preflight_artifact_paths(repo_root=repo_root, item=item):
        report = _load_json_file(artifact_path)
        if report is None:
            continue
        issues = report.get("blocking_issues")
        if not isinstance(issues, list):
            issues = []
        health_issues = [
            issue
            for issue in issues
            if isinstance(issue, dict) and str(issue.get("kind") or "").strip() == "health_check_failed"
        ]
        if health_issues:
            return {
                "artifact_path": artifact_path,
                "report": report,
                "health_issues": health_issues,
            }
    return None
419
+
420
+
421
def _get_recovery_runtime(failure_context: dict[str, Any] | None) -> dict[str, Any]:
    """Extract the ``recovery_runtime`` section of a failure context.

    Always returns a dict with ``strategy_history`` (a fresh list) and
    ``last_success``; both default to empty/None when the context or its
    ``recovery_runtime`` entry is not a dict.
    """
    stored = failure_context.get("recovery_runtime") if isinstance(failure_context, dict) else None
    if not isinstance(stored, dict):
        return {"strategy_history": [], "last_success": None}
    history = stored.get("strategy_history") or []
    return {
        "strategy_history": list(history),
        "last_success": stored.get("last_success"),
    }
431
+
432
+
433
def _write_recovery_runtime(failure_context: dict[str, Any] | None, runtime: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``failure_context`` with its ``recovery_runtime`` replaced.

    The input context is not mutated. Only ``strategy_history`` (copied into a
    new list) and ``last_success`` are taken from ``runtime``.
    """
    history = runtime.get("strategy_history") or []
    updated = {**(failure_context or {})}
    updated["recovery_runtime"] = {
        "strategy_history": list(history),
        "last_success": runtime.get("last_success"),
    }
    return updated
440
+
441
+
442
def _persist_queue_failure_context(*, store: ExecutionStore, item: FailedQueueItemArtifact, failure_context: dict[str, Any]) -> None:
    """Write ``failure_context`` back to the queue row that ``item`` came from.

    The target table and id column are looked up in ``_QUEUE_TABLE_BY_KIND`` by
    ``item.queue_type``; unknown queue types are silently ignored. The context
    is stored as sorted-key JSON and ``updated_at`` is set to the current Unix
    time in whole seconds.
    """
    # Function-scope import instead of the ``__import__("time")`` expression.
    import time

    mapping = _QUEUE_TABLE_BY_KIND.get(item.queue_type)
    if mapping is None:
        return
    table, id_col = mapping
    now = int(time.time())
    # SQL identifiers cannot be bound as parameters; ``table`` and ``id_col``
    # come from the module-level mapping above, while all values are bound.
    with store._connect() as conn:
        conn.execute(
            f"UPDATE {table} SET failure_context_json=?, updated_at=? WHERE {id_col}=?",
            (json.dumps(failure_context, sort_keys=True), now, item.item_id),
        )
453
+
454
+
455
def _dedupe_strings(values: list[Any]) -> list[str]:
    """Stringify and strip each value, dropping blanks and later duplicates.

    Falsy values become blank and are dropped. First-seen order is preserved.
    """
    cleaned = (str(raw or "").strip() for raw in values)
    # dict keys keep insertion order, so this retains the first occurrence.
    return list(dict.fromkeys(text for text in cleaned if text))
465
+
466
+
467
def _normalize_success_criteria(
    *,
    verification_targets: list[RecoverySuccessCriterion] | list[dict[str, Any]] | None,
    fallback_targets: list[RecoverySuccessCriterion] | list[dict[str, Any]] | None,
) -> list[RecoverySuccessCriterion]:
    """Validate success criteria, preferring explicit targets over fallbacks.

    Uses ``verification_targets`` when non-empty, else ``fallback_targets``,
    else a single built-in default criterion. Every entry is validated into a
    ``RecoverySuccessCriterion``.

    Raises:
        ValueError: if any criterion has a blank oracle.
    """
    default_targets = [
        {
            "criterion": "Recovery action completed",
            "oracle": "The queue item state matches the selected recovery strategy family.",
        }
    ]
    selected = verification_targets or fallback_targets or default_targets

    def _validated(target: Any) -> RecoverySuccessCriterion:
        criterion = RecoverySuccessCriterion.model_validate(target)
        if str(criterion.oracle or "").strip():
            return criterion
        raise ValueError("Recovery success criteria must include a non-empty oracle.")

    return [_validated(target) for target in selected]
485
+
486
+
487
def _recoverable_story_state_root(*, repo_root: Path) -> Path:
    """Return the ``.devflow/stories`` directory under ``repo_root``."""
    return repo_root.joinpath(".devflow", "stories")
489
+
490
+
491
def _recovery_handoff_artifact_path(*, repo_root: Path, item: FailedQueueItemArtifact) -> Path:
    """Return the file path where the recovery handoff artifact for ``item`` is stored.

    Items with a story id map to
    ``<story state root>/<safe_story_id>/recovery_handoff.json``; all others map to
    ``<repo_root>/.devflow/recovery_handoffs/<queue_type>/<safe_item_id>.json``.

    Ids are sanitised by replacing every run of characters outside
    ``[A-Za-z0-9_.-]`` with ``_``. An id that is empty or made only of dots
    (``.``, ``..``) is replaced by a fixed placeholder, because the sanitising
    regex keeps dots and such a component would otherwise resolve to the
    current or parent directory.
    """

    def _safe_component(raw: str, fallback: str) -> str:
        cleaned = re.sub(r"[^A-Za-z0-9_.-]+", "_", raw)
        # ``strip(".")`` is empty exactly for "" and dot-only names.
        return cleaned if cleaned.strip(".") else fallback

    story_id = str(item.story_id or "").strip()
    if story_id:
        safe_story_id = _safe_component(story_id, "unknown_story")
        return _recoverable_story_state_root(repo_root=repo_root) / safe_story_id / "recovery_handoff.json"
    safe_item_id = _safe_component(item.item_id, "unknown_item")
    return repo_root / ".devflow" / "recovery_handoffs" / item.queue_type / f"{safe_item_id}.json"
498
+
499
+
500
def _extract_log_refs(payload: Any) -> tuple[list[str], list[str]]:
    """Collect log paths and session ids from a nested dict/list payload.

    String values are classified by the (lower-cased) key they sit under; list
    elements inherit the key of the list. Keys ``log_path``, ``logfile``,
    ``log_file`` and ``journal_path`` yield log paths; ``session_id`` yields
    session ids. Values are stripped, blanks are skipped, and each result list
    is de-duplicated in first-seen order.
    """
    log_keys = {"log_path", "logfile", "log_file", "journal_path"}

    def _string_leaves(node: Any, key: str | None):
        # Depth-first walk yielding (lower-cased key, stripped text) per string leaf.
        if isinstance(node, dict):
            for child_key, child in node.items():
                yield from _string_leaves(child, str(child_key))
        elif isinstance(node, list):
            for element in node:
                yield from _string_leaves(element, key)
        elif isinstance(node, str):
            yield str(key or "").lower(), node.strip()

    log_paths: list[str] = []
    session_ids: list[str] = []
    for lowered_key, text in _string_leaves(payload, None):
        if not text:
            continue
        if lowered_key in log_keys:
            log_paths.append(text)
        elif lowered_key == "session_id":
            session_ids.append(text)

    # Entries are already stripped and non-blank, so an ordered de-duplication
    # gives the same result as ``_dedupe_strings``.
    return list(dict.fromkeys(log_paths)), list(dict.fromkeys(session_ids))
523
+
524
+
525
def _read_log_excerpt(*, path: Path, max_lines: int = 60, max_chars: int = 6000) -> str:
    """Return the tail of a log file as text, or ``""`` when it cannot be read.

    Plain files yield their last ``max_chars`` characters. ``.jsonl`` files are
    rendered from their last ``max_lines`` lines: each JSON object with a
    non-blank ``line`` field becomes ``[<stream>] <line>`` (``stream`` defaults
    to "log"); lines that are not valid JSON, or are JSON but not an object,
    are kept verbatim. The rendered lines are joined with real newlines and
    trimmed to the last ``max_chars`` characters.

    Args:
        path: Log file to read (UTF-8).
        max_lines: Number of trailing lines considered for ``.jsonl`` files.
        max_chars: Maximum number of trailing characters returned.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeError):
        # Best effort: unreadable or undecodable logs simply yield no excerpt.
        return ""

    if path.suffix != ".jsonl":
        return text[-max_chars:]

    rendered: list[str] = []
    for raw in text.splitlines()[-max_lines:]:
        try:
            record = json.loads(raw)
        except (ValueError, RecursionError):
            rendered.append(raw)
            continue
        if not isinstance(record, dict):
            # A scalar/array JSON line has no ``stream``/``line`` fields; keep
            # it verbatim instead of letting it abort the whole excerpt.
            rendered.append(raw)
            continue
        stream = str(record.get("stream") or "log").strip()
        line = str(record.get("line") or "").rstrip()
        if line:
            rendered.append(f"[{stream}] {line}")
    # Join with an actual newline (previously a literal backslash-n sequence).
    return "\n".join(rendered)[-max_chars:]
544
+
545
+
546
def _load_llm_session_log_refs(*, run_id: str | None, node_exec_id: str | None) -> tuple[list[str], list[str]]:
    """Look up log paths and session ids in the LLM sessions journal database.

    Queries ``dev_journal_entries`` for up to six most recently started rows
    matching ``node_exec_id`` and/or ``run_id`` (combined with OR). Returns two
    de-duplicated lists ``(log_paths, session_ids)``; both are empty when the
    database file is missing, no filter is given, or the query fails.
    """
    db_path = llm_sessions_db()
    has_filter = bool(run_id) or bool(node_exec_id)
    if not (db_path.exists() and has_filter):
        return [], []

    filters = [
        (column, value)
        for column, value in (("node_exec_id", node_exec_id), ("run_id", run_id))
        if value
    ]
    where_clause = " OR ".join(f"{column}=?" for column, _ in filters)
    query = (
        "SELECT session_id, log_path FROM dev_journal_entries WHERE "
        + where_clause
        + " ORDER BY started_at DESC LIMIT 6"
    )
    bound_values = tuple(value for _, value in filters)

    try:
        conn = sqlite3.connect(str(db_path))
        conn.row_factory = sqlite3.Row
        try:
            rows = conn.execute(query, bound_values).fetchall()
        finally:
            conn.close()
    except Exception:
        return [], []

    log_paths: list[str] = []
    session_ids: list[str] = []
    for row in rows:
        log_path = str(row["log_path"] or "").strip()
        if log_path:
            log_paths.append(log_path)
        session_id = str(row["session_id"] or "").strip()
        if session_id:
            session_ids.append(session_id)
    return _dedupe_strings(log_paths), _dedupe_strings(session_ids)
572
+
573
+
574
def _gather_log_first_recovery_evidence(*, store: ExecutionStore, repo_root: Path, item: FailedQueueItemArtifact) -> dict[str, Any]:
    """Collect log-based evidence for the failed node of an implementation run.

    Log paths and session ids come from the latest node attempt's output and
    from the LLM session journal. The first existing log file that yields a
    non-empty excerpt is used. The returned dict reports ``available``,
    ``source`` ("streamed_agent_logs", "llm_session_journal" or None), ``refs``,
    ``session_ids``, ``excerpt`` and ``stage``; when a run id and stage are
    known it also carries ``implementation_run_id``.
    """
    context = item.failure_context if isinstance(item.failure_context, dict) else {}
    run_id = str(context.get("implementation_run_id") or "").strip()
    failed_stage = str(context.get("actual_failed_node") or context.get("failed_stage") or "").strip()
    if not (run_id and failed_stage):
        return {"available": False, "source": None, "refs": [], "session_ids": [], "excerpt": "", "stage": failed_stage or None}

    node = store.get_latest_node_attempt(run_id=run_id, node_id=failed_stage)
    node_output: Any = {}
    node_exec_id = None
    if isinstance(node, dict):
        if isinstance(node.get("output"), dict):
            node_output = node.get("output")
        node_exec_id = str(node.get("node_exec_id") or "") or None

    output_paths, output_sessions = _extract_log_refs(node_output)
    journal_paths, journal_sessions = _load_llm_session_log_refs(run_id=run_id, node_exec_id=node_exec_id)
    log_paths = _dedupe_strings([*output_paths, *journal_paths])
    session_ids = _dedupe_strings([*output_sessions, *journal_sessions])

    excerpt = ""
    used_refs: list[str] = []
    for raw_path in log_paths:
        candidate = Path(raw_path).expanduser()
        if not candidate.is_absolute():
            candidate = repo_root / raw_path
        if not (candidate.exists() and candidate.is_file()):
            continue
        excerpt = _read_log_excerpt(path=candidate)
        if excerpt:
            # Stop at the first log that actually produced text.
            used_refs.append(str(candidate))
            break

    if used_refs:
        source = "streamed_agent_logs"
    elif session_ids:
        source = "llm_session_journal"
    else:
        source = None

    return {
        "available": bool(source),
        "source": source,
        "refs": _dedupe_strings([*used_refs, *session_ids]),
        "session_ids": session_ids,
        "excerpt": excerpt,
        "stage": failed_stage,
        "implementation_run_id": run_id,
    }
609
+
610
+
611
def _enrich_investigation_with_log_evidence(*, investigation: RecoveryInvestigationArtifact, log_evidence: dict[str, Any]) -> RecoveryInvestigationArtifact:
    """Return a copy of ``investigation`` annotated with its primary log evidence.

    When ``log_evidence`` is not available the investigation is returned
    unchanged. Otherwise a ``primary_evidence_source=...`` line is put at the
    front of the evidence (capped at eight entries) and the primary evidence
    source, refs and log insight are filled in where the investigation left
    them empty.
    """
    if not log_evidence.get("available"):
        return investigation

    source = str(log_evidence.get("source") or "streamed_agent_logs")
    refs = [str(ref) for ref in (log_evidence.get("refs") or []) if str(ref).strip()]
    joined_refs = ", ".join(refs)
    source_line = f"primary_evidence_source={source}"
    if joined_refs:
        source_line += f" ({joined_refs})"

    evidence = list(investigation.evidence or [])
    if source_line not in evidence:
        evidence.insert(0, source_line)

    changes = {
        "evidence": evidence[:8],
        "primary_evidence_source": investigation.primary_evidence_source or source,
        "primary_evidence_refs": list(investigation.primary_evidence_refs or []) or refs,
        "primary_log_insight": investigation.primary_log_insight or investigation.summary,
    }
    return investigation.model_copy(update=changes)
626
+
627
+
628
def _persist_recovery_handoff_artifact(
    *,
    repo_root: Path,
    recovery_run_id: str,
    item: FailedQueueItemArtifact,
    investigation: RecoveryInvestigationArtifact | None,
    diagnosis: RecoveryDiagnosisArtifact | None,
    execution: RecoveryExecutionArtifact | None,
    pre_replay: PreReplayCheckArtifact | None,
) -> Path | None:
    """Write a recovery handoff JSON artifact for ``item`` and return its path.

    Nothing is written (and None is returned) unless there is something useful
    to hand off: an investigation, a diagnosis, an execution with a non-blank
    verification/execution summary, or a pre-replay check with blocking reasons.

    Field fallbacks:
      * ``failing_surface_summary``: investigation summary, then the item's
        failure message, then the execution summary, then the item id.
      * ``likely_seam``: investigation affected boundary, then the failed
        node/stage from the failure context, then the diagnosis strategy.

    Both fallbacks are spelled out step by step: a conditional expression binds
    more loosely than ``or``, so writing ``a or b if cond else c`` would drop
    ``a`` whenever ``cond`` is false.
    """
    useful = any(
        [
            investigation is not None,
            diagnosis is not None,
            execution is not None and bool(str(execution.verification_summary or execution.execution_summary or "").strip()),
            pre_replay is not None and bool(pre_replay.blocking_reasons),
        ]
    )
    if not useful:
        return None

    failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
    non_convergence = None if investigation is None else investigation.non_convergence

    disproven_dead_ends: list[str] = []
    if non_convergence is not None and non_convergence.unchanged_test_surface:
        disproven_dead_ends.append("Repeated retries did not materially change the failing surface.")
    if non_convergence is not None and non_convergence.wrong_seam:
        disproven_dead_ends.append("Prior recovery attempts stayed on the wrong seam.")

    failed_stage = str(failure_context.get("actual_failed_node") or failure_context.get("failed_stage") or "").strip()

    execution_summary = None if execution is None else execution.execution_summary
    failing_surface_summary = (None if investigation is None else investigation.summary) or str(
        item.failure_message or execution_summary or item.item_id
    )

    diagnosis_strategy = "" if diagnosis is None else diagnosis.strategy
    likely_seam = (
        (None if investigation is None else investigation.affected_boundary)
        or failed_stage
        or str(diagnosis_strategy or "").strip()
        or None
    )

    payload = RecoveryHandoffArtifact(
        queue_type=item.queue_type,
        item_id=item.item_id,
        story_id=item.story_id,
        implementation_run_id=str(failure_context.get("implementation_run_id") or "").strip() or None,
        failed_stage=failed_stage or None,
        primary_evidence_source=(None if investigation is None else investigation.primary_evidence_source) or (None if execution is None else execution.primary_evidence_source),
        primary_evidence_refs=([] if investigation is None else investigation.primary_evidence_refs) or ([] if execution is None else execution.primary_evidence_refs),
        key_log_insight=(None if investigation is None else investigation.primary_log_insight) or (None if execution is None else execution.primary_log_insight),
        failing_surface_summary=failing_surface_summary,
        likely_seam=likely_seam,
        disproven_dead_ends=disproven_dead_ends[:4],
        verification_blockers=[] if pre_replay is None else [str(reason) for reason in pre_replay.blocking_reasons if str(reason).strip()][:4],
        non_convergence_insight=None if non_convergence is None else non_convergence.reason,
        produced_by_recovery_run_id=recovery_run_id,
    )
    path = _recovery_handoff_artifact_path(repo_root=repo_root, item=item)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload.model_dump(), indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return path
674
+
675
+
676
def _build_diagnosis(
    *,
    item: FailedQueueItemArtifact,
    investigation: RecoveryInvestigationArtifact | None,
    prior_execution: RecoveryExecutionArtifact | None = None,
    prior_verification: PreReplayCheckArtifact | None = None,
    attempt: int = 1,
) -> tuple[RecoveryDiagnosisArtifact, RemediationPlanArtifact]:
    """Run the ``recovery_diagnosis`` agent step and derive a remediation plan.

    The agent's diagnosis is copied with the queue type and item id forced to
    those of ``item``, and with summary, rationale, replay path and evidence
    falling back to the investigation's values when the agent left them empty.
    Verification targets are normalized (the investigation's success criteria
    are the fallback). The plan's steps are the criterion texts of those targets.
    """
    repo_root = _repo_root()
    context_payload = {
        "failed_item": item.model_dump(),
        "investigation": investigation.model_dump() if investigation is not None else None,
        "prior_execution": prior_execution.model_dump() if prior_execution is not None else None,
        "prior_verification": prior_verification.model_dump() if prior_verification is not None else None,
        "attempt": attempt,
    }
    diagnosis, _ = run_agent_step(
        repo_root=repo_root,
        stage_name="recovery_diagnosis",
        output_model=RecoveryDiagnosisArtifact,
        context_payload=context_payload,
        guidance=load_agentic_prompt_lines("recovery_diagnosis"),
        timeout_seconds=300,
        strength=_CURRENT_STRENGTH,
    )

    # Values borrowed from the investigation when the diagnosis omits them.
    if investigation is None:
        inv_summary = inv_nature = inv_replay = inv_evidence = inv_criteria = None
    else:
        inv_summary = investigation.summary
        inv_nature = investigation.failure_nature
        inv_replay = investigation.replay_path
        inv_evidence = investigation.evidence
        inv_criteria = investigation.success_criteria

    targets = _normalize_success_criteria(
        verification_targets=diagnosis.verification_targets,
        fallback_targets=inv_criteria,
    )
    evidence_entries = diagnosis.evidence or inv_evidence or []
    overrides = {
        "queue_type": item.queue_type,
        "item_id": item.item_id,
        "summary": str(diagnosis.summary or inv_summary or "Recovery diagnosis").strip(),
        "rationale": str(diagnosis.rationale or inv_nature or "Recovery diagnosis").strip(),
        "verification_targets": targets,
        "replay_path": str(diagnosis.replay_path or inv_replay or "") or None,
        "evidence": [str(entry) for entry in evidence_entries if str(entry).strip()][:8],
    }
    enforced = diagnosis.model_copy(update=overrides)

    plan = RemediationPlanArtifact(
        queue_type=item.queue_type,
        action=enforced.suggested_action,
        summary=enforced.summary,
        steps=[target.criterion for target in enforced.verification_targets],
        replay_path=enforced.replay_path,
    )
    return enforced, plan
722
+
723
+
724
def _record_recovery_attempt(
    *,
    item: FailedQueueItemArtifact,
    diagnosis: RecoveryDiagnosisArtifact | None = None,
    success: bool = False,
    failure_signature: str | None = None,
    material_change: bool | None = None,
    remediation_artifact: str | None = None,
) -> dict[str, Any]:
    """Return the item's failure context with this recovery attempt recorded.

    With a diagnosis, an entry (strategy, summary, success, plus any of the
    optional fields that are not None) is appended to the strategy history, and
    on success ``last_success`` is replaced. Without a diagnosis the runtime is
    written back unchanged. The item itself is not modified here.
    """
    context = item.failure_context if isinstance(item.failure_context, dict) else {}
    runtime = _get_recovery_runtime(context)

    if diagnosis is not None:
        entry: dict[str, Any] = {
            "strategy": diagnosis.strategy,
            "summary": diagnosis.summary,
            "success": success,
        }
        optional_fields = {
            "failure_signature": failure_signature,
            "material_change": material_change,
            "remediation_artifact": remediation_artifact,
        }
        entry.update({name: value for name, value in optional_fields.items() if value is not None})
        runtime["strategy_history"].append(entry)

        if success:
            runtime["last_success"] = {
                "strategy": diagnosis.strategy,
                "failure_signature": failure_signature,
                "material_change": material_change,
            }

    return _write_recovery_runtime(context, runtime)
754
+
755
+
756
# Lower-cased failed-stage names that the non-convergence analysis reports as a
# downstream blocker.
_DOWNSTREAM_RECOVERY_STAGES = {"redreview", "security", "verifygreen", "gitcommit(refactor)"}

# Lower-case substrings searched for in the combined failure text; any hit sets
# ``schema_or_runtime_drift`` in the non-convergence analysis.
_SCHEMA_RUNTIME_DRIFT_MARKERS = (
    "schema drift",
    "runtime drift",
    "contract drift",
    "migration mismatch",
    "serialization",
    "deserial",
    "no tests collected",
    "collected 0 items",
    "pytest import error",
    "vitest",
)
770
+
771
+
772
def _build_non_convergence_analysis(
    *,
    item: FailedQueueItemArtifact,
    failure_signature: str,
    occurrence_count: int,
    remediation_artifact: str | None = None,
) -> RecoveryNonConvergenceArtifact:
    """Summarise why repeated recovery attempts on ``item`` did not converge.

    Reads the strategy history stored in the item's failure context and derives
    a set of boolean findings (same failure surface, unchanged test surface,
    wrong seam, downstream blocker, schema/runtime drift, measurable progress)
    together with a human-readable reason and up to eight evidence lines.

    Args:
        item: The failed queue item whose recovery history is analysed.
        failure_signature: Signature of the current failure; history entries
            with this signature, or with no signature at all, are reviewed.
        occurrence_count: Externally tracked number of occurrences; the number
            of attempts reviewed is the larger of this and the matching history.
        remediation_artifact: Artifact of the current attempt, if any; it feeds
            the wrong-seam heuristic and is added to the evidence when not
            already present in the history.
    """
    failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
    runtime = _get_recovery_runtime(failure_context)
    history = [entry for entry in (runtime.get("strategy_history") or []) if isinstance(entry, dict)]
    # Entries without a recorded signature are treated as matching.
    matching_history = [
        entry
        for entry in history
        if str(entry.get("failure_signature") or "").strip() == failure_signature or not str(entry.get("failure_signature") or "").strip()
    ]
    attempts_reviewed = max(int(occurrence_count or 0), len(matching_history))
    # Review window: the last max(attempts_reviewed, gate threshold) matching entries.
    recent_attempts = matching_history[-max(attempts_reviewed, _RECOVERY_CHURN_GATE_THRESHOLD) :]
    recent_failure_signatures = {str(entry.get("failure_signature") or "").strip() for entry in recent_attempts if str(entry.get("failure_signature") or "").strip()}
    same_failure_surface = bool(failure_signature) and (not recent_failure_signatures or recent_failure_signatures == {failure_signature})
    # NOTE(review): ``all(...)`` is True for an empty ``recent_attempts``, so with
    # no recorded history this is True whenever ``same_failure_surface`` is — confirm intended.
    unchanged_test_surface = same_failure_surface and all(entry.get("material_change") is False for entry in recent_attempts)
    measurable_progress = any(bool(entry.get("success")) or entry.get("material_change") is True for entry in recent_attempts)
    failed_stage = str(failure_context.get("failed_stage") or "").strip().lower() or None
    downstream_blocker = bool(failed_stage and failed_stage in _DOWNSTREAM_RECOVERY_STAGES)
    strategy_set = {str(entry.get("strategy") or "").strip() for entry in recent_attempts if str(entry.get("strategy") or "").strip()}
    artifact_set = {
        str(entry.get("remediation_artifact") or "").strip()
        for entry in recent_attempts
        if str(entry.get("remediation_artifact") or "").strip()
    }
    # Lower-cased text searched for the schema/runtime drift markers.
    combined_text = "\n".join(
        [
            str(item.failure_message or ""),
            str(failure_context.get("error") or ""),
            str(failure_context.get("error_type") or ""),
            str(failure_context.get("failed_stage") or ""),
            *(str(entry.get("summary") or "") for entry in recent_attempts),
            *(str(entry.get("remediation_artifact") or "") for entry in recent_attempts),
        ]
    ).lower()
    schema_or_runtime_drift = any(marker in combined_text for marker in _SCHEMA_RUNTIME_DRIFT_MARKERS)
    # Wrong seam: same surface, no progress, no other explanation (downstream
    # blocker or drift), and the attempts share a single strategy or artifact
    # (or a current remediation artifact was supplied).
    wrong_seam = bool(
        same_failure_surface
        and not measurable_progress
        and not downstream_blocker
        and not schema_or_runtime_drift
        and (len(strategy_set) == 1 or len(artifact_set) == 1 or remediation_artifact)
    )
    # NOTE(review): the messages below say "three attempts" literally, independent
    # of ``attempts_reviewed`` and of the gate threshold constant.
    summary_bits: list[str] = []
    if unchanged_test_surface:
        summary_bits.append("the same failure surface stayed unchanged across attempts")
    if wrong_seam:
        summary_bits.append("recovery kept editing the wrong seam")
    if downstream_blocker and failed_stage:
        summary_bits.append(f"the blocking node stayed downstream at {failed_stage} rather than the green boundary")
    if schema_or_runtime_drift:
        summary_bits.append("schema/runtime drift kept invalidating the attempted fixes")
    if not measurable_progress:
        summary_bits.append("there was no measurable improvement across the three attempts")
    if not summary_bits:
        summary_bits.append("three attempts did not materially change the failing boundary")
    evidence: list[str] = [
        f"failure_signature={failure_signature}",
        f"attempts_reviewed={attempts_reviewed}",
    ]
    if failed_stage:
        evidence.append(f"failed_stage={failed_stage}")
    if strategy_set:
        evidence.append("strategies=" + ", ".join(sorted(strategy_set)))
    if artifact_set:
        evidence.append("remediation_artifacts=" + ", ".join(sorted(artifact_set)))
    if unchanged_test_surface:
        evidence.append("all reviewed attempts recorded material_change=false")
    if remediation_artifact and remediation_artifact not in artifact_set:
        evidence.append(f"current_remediation_artifact={remediation_artifact}")
    return RecoveryNonConvergenceArtifact(
        summary="Non-convergence analysis for repeated recovery churn.",
        reason="; ".join(summary_bits),
        attempts_reviewed=attempts_reviewed,
        same_failure_surface=same_failure_surface,
        unchanged_test_surface=unchanged_test_surface,
        wrong_seam=wrong_seam,
        downstream_blocker=downstream_blocker,
        downstream_blocker_stage=failed_stage if downstream_blocker else None,
        schema_or_runtime_drift=schema_or_runtime_drift,
        measurable_progress=measurable_progress,
        evidence=evidence[:8],
    )
859
+
860
+
861
def _build_churn_gate_investigation(
    *,
    item: FailedQueueItemArtifact,
    failure_signature: str,
    occurrence_count: int,
    threshold: int,
    churn_key: str | None = None,
    remediation_artifact: str | None = None,
) -> RecoveryInvestigationArtifact:
    """Build the investigation artifact emitted when the recovery churn gate trips.

    Combines the non-convergence analysis with the collected failure evidence
    and always routes the item to ``manual_review_required``. Confidence is
    "high" once ``occurrence_count`` reaches ``threshold`` and "medium" below it.
    """
    context = item.failure_context if isinstance(item.failure_context, dict) else {}
    failed_stage = str(context.get("failed_stage") or "").strip().lower() or None
    non_convergence = _build_non_convergence_analysis(
        item=item,
        failure_signature=failure_signature,
        occurrence_count=occurrence_count,
        remediation_artifact=remediation_artifact,
    )
    underlying_issue = str(item.failure_message or context.get("error") or failure_signature or "Repeated recovery failure").strip()
    evidence = _collect_failure_evidence(item=item, extra_evidence=[*non_convergence.evidence, churn_key or ""])

    gate_target = failed_stage or failure_signature or item.item_id
    summary = (
        f"Recovery churn gate hit after {occurrence_count}/{threshold} no-progress attempts on "
        f"{gate_target}."
    )
    if occurrence_count >= threshold:
        confidence = "high"
    else:
        confidence = "medium"
    success_criteria = [
        {
            "criterion": "Underlying failure boundary identified",
            "oracle": "The investigation names the concrete failing node/stage and the defect surface that stayed broken.",
        },
        {
            "criterion": "Non-convergence reason identified",
            "oracle": "The investigation explains why three attempts made no measurable progress and whether the repeated work stayed on the wrong seam, on an unchanged test surface, or behind downstream/runtime drift.",
        },
    ]
    verification_evidence = [
        f"Churn gate occurrence count reached {occurrence_count}/{threshold}.",
        non_convergence.reason,
    ]
    escalation_conditions = [
        "Do not requeue again until the non-convergence reason has a concrete fix plan.",
        "Require a materially different intervention when the same story/node has failed three times with no measurable improvement.",
    ]

    return RecoveryInvestigationArtifact(
        queue_type=item.queue_type,
        item_id=item.item_id,
        summary=summary,
        failure_nature=underlying_issue,
        evidence=evidence[:8],
        primary_evidence_source="recovery_churn_history",
        primary_log_insight=non_convergence.reason,
        affected_boundary=failed_stage or failure_signature or item.queue_type,
        likely_failed_stage=failed_stage,
        confidence=confidence,
        recovery_goal="Explain both the underlying failure and why implementation did not converge before any further replay.",
        success_criteria=success_criteria,
        verification_evidence=verification_evidence,
        replay_path="manual_review_required",
        escalation_conditions=escalation_conditions,
        non_convergence=non_convergence,
    )
916
+
917
+
918
+ def _build_story_replay_metadata(*, item: FailedQueueItemArtifact, diagnosis: RecoveryDiagnosisArtifact | None, execution: RecoveryExecutionArtifact | None) -> dict[str, Any] | None:
919
+ if item.queue_type != "story":
920
+ return None
921
+ failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
922
+ prior_run_id = str(failure_context.get("implementation_run_id") or "").strip()
923
+ failed_stage = str(failure_context.get("failed_stage") or "").strip().lower()
924
+ if not prior_run_id or not failed_stage:
925
+ return None
926
+ canonical_order = [
927
+ "normalize",
928
+ "preflight",
929
+ "dependencyassessment",
930
+ "storyimplementationplanning",
931
+ "testdesign",
932
+ "red",
933
+ "redreview",
934
+ "storysufficiencyreconciliation",
935
+ "green",
936
+ "refactor",
937
+ "security",
938
+ "gitcommit(refactor)",
939
+ ]
940
+ if failed_stage not in canonical_order:
941
+ return None
942
+ invalidated = {failed_stage}
943
+ invalidated.update(str(item).strip().lower() for item in (failure_context.get("invalidated_stages") or []) if str(item).strip())
944
+ for criterion in ((execution.success_criteria if execution is not None else None) or []):
945
+ criterion_text = f"{criterion.criterion} {criterion.oracle}".lower()
946
+ for stage_id in canonical_order:
947
+ if stage_id in criterion_text and stage_id != failed_stage:
948
+ invalidated.add(stage_id)
949
+ valid_prior = [stage_id for stage_id in canonical_order if stage_id not in invalidated and canonical_order.index(stage_id) < canonical_order.index(failed_stage)]
950
+ repaired_artifacts = failure_context.get("repaired_artifacts")
951
+ if not isinstance(repaired_artifacts, list):
952
+ repaired_artifacts = []
953
+ replay_metadata = {
954
+ "resume_from_stage": failed_stage,
955
+ "valid_prior_stages": valid_prior,
956
+ "invalidated_stages": [stage_id for stage_id in canonical_order if stage_id in invalidated],
957
+ "repaired_artifacts": repaired_artifacts,
958
+ "prior_run_id": prior_run_id,
959
+ "strategy": None if diagnosis is None else diagnosis.strategy,
960
+ }
961
+ return replay_metadata
962
+
963
+
964
+ _TEST_RUNTIME_STAGE_IDS = {"red", "redreview", "green", "verifygreen", "gitcommit(refactor)"}
965
+ _TEST_RUNTIME_FAILURE_PATTERNS = (
966
+ "story test validation failed",
967
+ "story-scoped test sufficiency failed",
968
+ "insufficient_story_tests",
969
+ "test validation",
970
+ "no story-scoped tests discovered",
971
+ "no tests collected",
972
+ "collected 0 items",
973
+ "pytest",
974
+ "vitest",
975
+ )
976
+
977
+ _PYTESTMARK_BLOCK_START_RE = re.compile(r"^\s*pytestmark\s*=\s*\[")
978
+ _OPEN_FROM_IMPORT_RE = re.compile(r"^\s*from\b.+\bimport\s*\(\s*$")
979
+
980
+
981
+ def _count_parens(text: str) -> int:
982
+ return text.count("(") - text.count(")")
983
+
984
+
985
+ def _find_pytestmark_block(lines: list[str], *, start_index: int) -> tuple[int, int] | None:
986
+ for index in range(start_index, len(lines)):
987
+ if not _PYTESTMARK_BLOCK_START_RE.match(lines[index]):
988
+ continue
989
+ balance = lines[index].count("[") - lines[index].count("]")
990
+ end_index = index + 1
991
+ while balance > 0 and end_index < len(lines):
992
+ balance += lines[end_index].count("[") - lines[end_index].count("]")
993
+ end_index += 1
994
+ if balance == 0:
995
+ return index, end_index
996
+ return None
997
+ return None
998
+
999
+
1000
+ def _repair_malformed_story_pytestmark_import_block(*, text: str) -> str | None:
1001
+ lines = text.splitlines()
1002
+ for import_start, line in enumerate(lines):
1003
+ if not _OPEN_FROM_IMPORT_RE.match(line):
1004
+ continue
1005
+ block = _find_pytestmark_block(lines, start_index=import_start + 1)
1006
+ if block is None:
1007
+ continue
1008
+ pytestmark_start, pytestmark_end = block
1009
+ paren_balance = _count_parens(line)
1010
+ closing_index: int | None = None
1011
+ cursor = import_start + 1
1012
+ while cursor < len(lines):
1013
+ if pytestmark_start <= cursor < pytestmark_end:
1014
+ cursor = pytestmark_end
1015
+ continue
1016
+ paren_balance += _count_parens(lines[cursor])
1017
+ if paren_balance <= 0:
1018
+ closing_index = cursor
1019
+ break
1020
+ cursor += 1
1021
+ if closing_index is None:
1022
+ continue
1023
+ repaired_lines = lines[:pytestmark_start] + lines[pytestmark_end:]
1024
+ adjusted_closing_index = closing_index - (pytestmark_end - pytestmark_start)
1025
+ insert_at = adjusted_closing_index + 1
1026
+ pytestmark_lines = lines[pytestmark_start:pytestmark_end]
1027
+ prefix = repaired_lines[:insert_at]
1028
+ suffix = repaired_lines[insert_at:]
1029
+ updated_lines = [
1030
+ *prefix,
1031
+ *([""] if prefix and prefix[-1].strip() else []),
1032
+ *pytestmark_lines,
1033
+ *([""] if suffix and suffix[0].strip() else []),
1034
+ *suffix,
1035
+ ]
1036
+ updated_text = "\n".join(updated_lines).rstrip() + "\n"
1037
+ if updated_text == text:
1038
+ continue
1039
+ try:
1040
+ compile(updated_text, "<recovered_story_test>", "exec")
1041
+ except SyntaxError:
1042
+ continue
1043
+ return updated_text
1044
+ return None
1045
+
1046
+
1047
+ def _repair_story_test_files_for_runtime_failure(*, repo_root: Path, test_paths: list[str]) -> list[str]:
1048
+ changed: list[str] = []
1049
+ for raw_path in test_paths:
1050
+ relative = str(raw_path or "").strip()
1051
+ if not relative or not relative.endswith(".py"):
1052
+ continue
1053
+ candidate = repo_root / relative
1054
+ if not candidate.exists() or not candidate.is_file():
1055
+ continue
1056
+ try:
1057
+ original = candidate.read_text(encoding="utf-8")
1058
+ compile(original, str(candidate), "exec")
1059
+ continue
1060
+ except SyntaxError:
1061
+ pass
1062
+ except Exception:
1063
+ continue
1064
+ repaired = _repair_malformed_story_pytestmark_import_block(text=original)
1065
+ if repaired is None:
1066
+ continue
1067
+ candidate.write_text(repaired, encoding="utf-8")
1068
+ changed.append(relative)
1069
+ return changed
1070
+
1071
+
1072
def _maybe_repair_story_test_runtime_contract(*, repo_root: Path, item: FailedQueueItemArtifact) -> dict[str, Any] | None:
    """Refresh a story's test runtime contract after a test-stage failure.

    Applies only to story items that have a story id and whose failed stage is
    in ``_TEST_RUNTIME_STAGE_IDS``. A fresh contract is derived from the stored
    one (or resolved when none is stored), the story's test files are passed to
    the syntax repair, and the contract is persisted when its comparable fields
    changed or no contract existed.

    Returns:
        ``None`` when the item is out of scope, or when the failure text does not
        look like a runtime failure and nothing changed. Otherwise a dict with
        ``story_id``, ``path``, ``updated``, ``previous_contract``,
        ``runtime_contract``, ``files_changed`` and ``reason``.
    """
    if item.queue_type != "story":
        return None
    story_id = str(item.story_id or "").strip()
    if not story_id:
        return None
    context = item.failure_context if isinstance(item.failure_context, dict) else {}
    stage = str(context.get("failed_stage") or "").strip().lower()
    if stage not in _TEST_RUNTIME_STAGE_IDS:
        return None

    existing = load_story_test_runtime_contract(repo_root=repo_root, story_id=story_id) or {}
    story_uuid = str(existing.get("story_uuid") or "").strip() or None

    failure_parts = (
        item.failure_message,
        context.get("error"),
        context.get("error_type"),
    )
    failure_blob = " ".join(str(part or "") for part in failure_parts).lower()

    # Prefer the test paths recorded in the stored contract; discover otherwise.
    test_paths = [str(entry) for entry in (existing.get("test_paths") or []) if str(entry).strip()]
    if not test_paths:
        test_paths = discover_story_scoped_test_paths(
            repo_root=repo_root,
            story_id=story_id,
            story_uuid=story_uuid,
        )

    if existing:
        candidate = dict(existing)
        if test_paths:
            candidate["test_paths"] = test_paths
    else:
        candidate = resolve_story_runtime_contract(
            repo_root=repo_root,
            story_id=story_id,
            story_uuid=story_uuid,
            test_paths=test_paths,
            prefer_story_contract=False,
        )
    candidate = normalize_recovery_story_runtime_contract(
        repo_root=repo_root,
        contract=candidate,
    )
    candidate["source"] = "recovery_repair"

    repaired_files = _repair_story_test_files_for_runtime_failure(
        repo_root=repo_root,
        test_paths=test_paths,
    )

    # Only these fields decide whether the contract counts as changed.
    compared_fields = ("framework", "cwd", "run_cmd", "env", "setup_cmd", "test_paths")
    before = {field: existing.get(field) for field in compared_fields}
    after = {field: candidate.get(field) for field in compared_fields}
    matches_runtime_failure = any(pattern in failure_blob for pattern in _TEST_RUNTIME_FAILURE_PATTERNS)
    contract_needs_write = before != after or not existing

    if not matches_runtime_failure and not contract_needs_write and not repaired_files:
        return None

    contract_path = story_test_runtime_contract_path(repo_root=repo_root, story_id=story_id)
    if contract_needs_write:
        contract_path = persist_story_runtime_contract(repo_root=repo_root, contract=candidate)
    return {
        "story_id": story_id,
        "path": str(contract_path),
        "updated": contract_needs_write or bool(repaired_files),
        "previous_contract": existing or None,
        "runtime_contract": candidate,
        "files_changed": repaired_files,
        "reason": "test_runtime_boundary_repaired",
    }
1142
+
1143
+
1144
def _load_local_setup_contract(repo_root: Path) -> tuple[Path, LocalSetupContract] | None:
    """Load and validate ``.devflow/local_setup.json`` under ``repo_root``.

    Returns:
        ``(path, contract)`` on success, or ``None`` when the JSON loader yields
        no payload or the payload fails model validation.
    """
    setup_path = repo_root / ".devflow" / "local_setup.json"
    raw_payload = _load_json_file(setup_path)
    if raw_payload is None:
        return None
    try:
        parsed = LocalSetupContract.model_validate(raw_payload)
    except Exception:
        return None
    return setup_path, parsed
1153
+
1154
+
1155
+ def _check_health_endpoint(url: str, expected_status: int) -> bool:
1156
+ try:
1157
+ with urllib_request.urlopen(url, timeout=5) as response:
1158
+ return int(response.status) == int(expected_status)
1159
+ except urllib_error.HTTPError as exc:
1160
+ return int(exc.code) == int(expected_status)
1161
+ except Exception:
1162
+ return False
1163
+
1164
+
1165
+ def _collect_preflight_repair_files(*, repo_root: Path, service_hints: list[str]) -> dict[str, str]:
1166
+ candidates = [
1167
+ ".devflow/local_setup.json",
1168
+ "docker-compose.yml",
1169
+ "docker-compose.yaml",
1170
+ "compose.yml",
1171
+ "compose.yaml",
1172
+ "package.json",
1173
+ "pnpm-workspace.yaml",
1174
+ "pnpm-workspace.yml",
1175
+ "pnpm-lock.yaml",
1176
+ "package-lock.json",
1177
+ "yarn.lock",
1178
+ "pyproject.toml",
1179
+ "uv.lock",
1180
+ "requirements.txt",
1181
+ "requirements-dev.txt",
1182
+ "vite.config.ts",
1183
+ "vite.config.js",
1184
+ "vite.config.mjs",
1185
+ "next.config.js",
1186
+ "next.config.mjs",
1187
+ "next.config.ts",
1188
+ ]
1189
+ files: dict[str, str] = {}
1190
+ for relative in candidates:
1191
+ path = repo_root / relative
1192
+ if not path.exists() or not path.is_file():
1193
+ continue
1194
+ try:
1195
+ files[relative] = path.read_text(encoding="utf-8")
1196
+ except Exception:
1197
+ continue
1198
+ hints = {hint for hint in service_hints if hint}
1199
+ for child in sorted(repo_root.iterdir()):
1200
+ if not child.is_dir() or child.name.startswith("."):
1201
+ continue
1202
+ child_name = child.name.lower()
1203
+ if hints and not any(hint in child_name or child_name in hint for hint in hints):
1204
+ continue
1205
+ for name in ("package.json", "pyproject.toml", "vite.config.ts", "vite.config.js", "next.config.js", "next.config.ts"):
1206
+ path = child / name
1207
+ if not path.exists() or not path.is_file():
1208
+ continue
1209
+ rel = str(path.relative_to(repo_root))
1210
+ if rel in files:
1211
+ continue
1212
+ try:
1213
+ files[rel] = path.read_text(encoding="utf-8")
1214
+ except Exception:
1215
+ continue
1216
+ return files
1217
+
1218
+
1219
+ def _select_recovery_health_checks(*, contract: LocalSetupContract, failing_urls: set[str]) -> list[HealthCheckEntry]:
1220
+ checks = [hc for hc in contract.health_checks if hc.url in failing_urls] if failing_urls else list(contract.health_checks)
1221
+ return checks or list(contract.health_checks)
1222
+
1223
+
1224
+ def _verify_recovery_health_checks(*, checks: list[HealthCheckEntry]) -> tuple[list[str], list[str]]:
1225
+ verification_checks: list[str] = []
1226
+ blocking_reasons: list[str] = []
1227
+ for health_check in checks:
1228
+ if _check_health_endpoint(health_check.url, health_check.expected_status):
1229
+ verification_checks.append(f"{health_check.name} returned {health_check.expected_status} at {health_check.url}")
1230
+ else:
1231
+ blocking_reasons.append(f"{health_check.name} still failed health verification at {health_check.url}")
1232
+ return verification_checks, blocking_reasons
1233
+
1234
+
1235
def _resolve_preflight_patch_target(*, repo_root: Path, patch_path: str) -> Path | None:
    """Return the resolved write target for ``patch_path``, or ``None`` if it is not a path strictly inside ``repo_root``."""
    try:
        root = repo_root.resolve()
        target = (root / str(patch_path)).resolve()
        # relative_to raises ValueError when target is outside root.
        target.relative_to(root)
    except (OSError, RuntimeError, ValueError):
        return None
    if target == root:
        return None
    return target


def _maybe_repair_story_preflight_health_boundary(*, repo_root: Path, item: FailedQueueItemArtifact) -> dict[str, Any] | None:
    """Attempt a repo/config repair for a story that failed preflight health checks.

    Steps: load the preflight failure details and the local setup contract;
    re-probe the failing health checks and stop early when they already pass;
    otherwise ask the repair agent for file patches, write them, re-probe, and
    run the local setup start command only if checks still fail.

    Patches are agent output, so a patch whose path resolves outside
    ``repo_root`` is skipped rather than written. A start command that exceeds
    its timeout does not abort the repair; verification still runs and the
    timeout is reported when checks remain failing.

    Returns:
        ``None`` when there are no preflight health failure details. Otherwise a
        dict with ``ready``, ``blocking_reasons``, ``files_changed``,
        ``artifact_path``, ``verification_checks``, ``repair_result`` and
        ``start_command_executed``.
    """
    details = _load_preflight_health_failure_details(repo_root=repo_root, item=item)
    if details is None:
        return None

    def _outcome(
        *,
        ready: bool,
        blocking_reasons: list[str],
        files_changed: list[str],
        verification_checks: list[str],
        repair_result: Any,
        start_command_executed: bool,
    ) -> dict[str, Any]:
        # Single place that fixes the shape of every result returned below.
        return {
            "ready": ready,
            "blocking_reasons": blocking_reasons,
            "files_changed": files_changed,
            "artifact_path": str(details["artifact_path"]),
            "verification_checks": verification_checks,
            "repair_result": repair_result,
            "start_command_executed": start_command_executed,
        }

    setup = _load_local_setup_contract(repo_root)
    if setup is None:
        return _outcome(
            ready=False,
            blocking_reasons=["Cannot repair preflight health failure because .devflow/local_setup.json is missing or invalid."],
            files_changed=[],
            verification_checks=[],
            repair_result=None,
            start_command_executed=False,
        )
    setup_path, contract = setup

    # Issue messages are parsed as "<prefix>: <url> did not return <status>".
    failing_urls: set[str] = set()
    for issue in details["health_issues"]:
        message = str(issue.get("message") or "")
        if ": " in message and " did not return " in message:
            failing_urls.add(message.split(": ", 1)[1].split(" did not return ", 1)[0].strip())

    failing_checks = _select_recovery_health_checks(contract=contract, failing_urls=failing_urls)
    initial_verification_checks, initial_blocking_reasons = _verify_recovery_health_checks(checks=failing_checks)
    if not initial_blocking_reasons:
        return _outcome(
            ready=True,
            blocking_reasons=[],
            files_changed=[],
            verification_checks=initial_verification_checks
            + [
                "Skipped local setup start command because the previously failing health checks are already healthy."
            ],
            repair_result=None,
            start_command_executed=False,
        )

    # The service hint is the health-check name up to its first "-".
    service_hints = [str(hc.name or "").split("-", 1)[0].strip().lower() for hc in failing_checks]
    logs_by_service = {
        hint: _get_docker_service_logs(hint, repo_root)
        for hint in service_hints
        if hint
    }
    repair_result, _envelope = run_agent_step(
        repo_root=repo_root,
        stage_name="recovery_preflight_health_repo_repair",
        output_model=RemediationResultArtifact,
        context_payload={
            "failed_item": item.model_dump(),
            "preflight_report_path": str(details["artifact_path"]),
            "preflight_report": details["report"],
            "health_issues": details["health_issues"],
            "local_setup_path": str(setup_path),
            "local_setup": contract.model_dump(),
            "service_logs": logs_by_service,
            "files_to_change": _collect_preflight_repair_files(repo_root=repo_root, service_hints=service_hints),
        },
        guidance=load_agentic_prompt_lines("recovery_preflight_health_repo_repair"),
        timeout_seconds=600,
        strength=_CURRENT_STRENGTH,
    )

    files_written: list[str] = []
    for patch in repair_result.file_patches or []:
        target = _resolve_preflight_patch_target(repo_root=repo_root, patch_path=patch.path)
        if target is None:
            # Absolute or ".."-escaping path: never write outside the repository.
            continue
        try:
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(patch.content, encoding="utf-8")
            files_written.append(patch.path)
        except Exception:
            # Best-effort: a patch that cannot be written is skipped.
            continue
    repair_result = repair_result.model_copy(update={"fix_applied": bool(files_written), "files_changed": files_written or repair_result.files_changed})
    if not repair_result.fix_applied:
        return _outcome(
            ready=False,
            blocking_reasons=["Preflight health repair produced no file changes, so recovery cannot verify a real repo/config fix."],
            files_changed=[],
            verification_checks=[],
            repair_result=repair_result,
            start_command_executed=False,
        )

    # The repair may have rewritten local_setup.json; prefer the refreshed contract.
    refreshed_setup = _load_local_setup_contract(repo_root)
    if refreshed_setup is not None:
        _setup_path, contract = refreshed_setup
        failing_checks = _select_recovery_health_checks(contract=contract, failing_urls=failing_urls)
    verification_checks, blocking_reasons = _verify_recovery_health_checks(checks=failing_checks)
    start_command_executed = False
    if blocking_reasons:
        start_timed_out = False
        try:
            subprocess.run(
                ["/bin/sh", "-lc", contract.start_command],
                cwd=str(repo_root),
                capture_output=True,
                text=True,
                check=False,
                timeout=600,
            )
        except subprocess.TimeoutExpired:
            # Still verify below; the checks decide whether the repair worked.
            start_timed_out = True
        start_command_executed = True
        verification_checks, blocking_reasons = _verify_recovery_health_checks(checks=failing_checks)
        if start_timed_out and blocking_reasons:
            blocking_reasons.append("Local setup start command timed out after 600 seconds.")
    else:
        verification_checks.append(
            "Skipped local setup start command because the previously failing health checks are already healthy after the repo/config repair."
        )
    return _outcome(
        ready=not blocking_reasons,
        blocking_reasons=blocking_reasons,
        files_changed=files_written,
        verification_checks=verification_checks,
        repair_result=repair_result,
        start_command_executed=start_command_executed,
    )
1345
+
1346
+
1347
class LoadFailedQueueItemNode(Node):
    """Load the failed queue row named by the event and store it as ``metadata["failed_item"]``."""

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Look up the queue item for the event's queue type and item id.

        Raises (inside the persisted node function):
            ValueError: For an unsupported queue type or a missing queue item.
        """
        # Queue types that keep their payload path directly on the queue row:
        # queue_type -> (store getter name, id keyword, payload path column).
        direct_payload_queues = {
            "scope": ("get_scope_queue_item", "scope_queue_id", "scope_payload_path"),
            "idea_creation": ("get_idea_creation_queue_item", "idea_creation_queue_id", "idea_payload_path"),
            "idea": ("get_idea_queue_item", "idea_queue_id", "idea_payload_path"),
            "integration": ("get_integration_queue_item", "integration_queue_id", "integration_payload_path"),
        }

        def _run(_node_exec_id: str):
            event = task_context.event
            store, run_id = _store_run()
            # NOTE(review): repo_root is not used below — candidate for removal.
            repo_root = Path(str(event.repo_root))
            queue_type = str(event.queue_type)
            item_id = str(event.item_id)

            row = None
            payload_ref = None
            story_id = None
            if queue_type == "story":
                # Story payloads are referenced through a separate artifact record.
                row = store.get_story_queue_item(story_queue_id=item_id)
                if row is not None:
                    story_artifact = store.get_artifact(artifact_id=str(row.get("story_artifact_id") or ""))
                    if story_artifact is not None:
                        payload_ref = str(story_artifact.get("uri") or "") or None
                    story_id = str(row.get("story_id") or "") or None
            elif queue_type in direct_payload_queues:
                getter_name, id_keyword, payload_column = direct_payload_queues[queue_type]
                row = getattr(store, getter_name)(**{id_keyword: item_id})
                if row is not None:
                    payload_ref = str(row.get(payload_column) or "") or None
            else:
                raise ValueError(f"unsupported queue_type={queue_type}")
            if row is None:
                raise ValueError(f"queue item not found: {queue_type}:{item_id}")

            artifact = FailedQueueItemArtifact(
                queue_type=queue_type,  # type: ignore[arg-type]
                item_id=item_id,
                project_id=row.get("project_id"),
                dfs_project_id=row.get("dfs_project_id"),
                enqueue_run_id=str(row.get("enqueue_run_id") or ""),
                status=str(row.get("status") or ""),
                title=str(row.get("title") or item_id),
                payload_ref=payload_ref,
                payload_exists=bool(payload_ref and Path(payload_ref).exists()),
                story_id=story_id,
                failure_message=row.get("failure_message"),
                failure_context=dict(row.get("failure_context") or {}),
                raw_row=dict(row),
            )
            task_context.metadata["failed_item"] = artifact

            publish_project = artifact.dfs_project_id or event.project_id
            _publish_node(publish_project, run_id, "Loading failure", recovery_id=item_id)
            _publish(
                publish_project,
                run_id,
                "running",
                "processing",
                f"Loading failed {queue_type} queue item",
                recovery_id=item_id,
            )
            self.save_output(artifact)
            return artifact.model_dump(), task_context

        return _persist_node(node_id="load_failed_item", node_name="LoadFailedQueueItem", fn=_run)
1401
+
1402
+
1403
class SystemicPatternAnalysisNode(Node):
    """Deterministic node: queries DB for sibling failures sharing the same error signature.

    Produces SystemicPatternArtifact stored in task_context.metadata["systemic_pattern"].
    Also sets metadata["churn_detected"] when the durable recovery churn gate is already
    at or above threshold for the same durable identity + normalized failure signature.
    """

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Collect sibling failures for the current item and record the pattern.

        Siblings are found by a LIKE search on the first 60 characters of the
        failure message, in the item's source queue table and in
        ``recovery_queue``. Three or more affected items mark the failure as
        systemic.
        """
        # queue_type -> (source table, id column). Fixed identifiers only, so
        # interpolating them into the SQL text below is safe.
        source_tables = {
            "story": ("story_queue", "story_queue_id"),
            "idea_creation": ("idea_creation_queue", "idea_creation_queue_id"),
            "idea": ("idea_queue", "idea_queue_id"),
            "scope": ("scope_queue", "scope_queue_id"),
            "integration": ("integration_queue", "integration_queue_id"),
        }

        def _run(_node_exec_id: str):
            store, run_id = _store_run()
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
            project_id = item.dfs_project_id or task_context.event.project_id
            _publish_node(project_id, run_id, "Analysing failure pattern", recovery_id=item.item_id)

            failure_context = item.failure_context if isinstance(item.failure_context, dict) else {}
            failure_message = str(item.failure_message or "").strip()
            failed_stage = str(failure_context.get("failed_stage") or "").strip()

            failure_signature = _normalized_failure_signature(
                failure_message=failure_message,
                failure_context=failure_context,
            )

            queue_type = item.queue_type
            item_id = item.item_id

            affected_item_ids: list[str] = []
            sample_failure_messages: list[str] = []
            failed_stages: list[str] = []
            churn_state = _load_recovery_churn_gate_state(
                store=store,
                project_id=project_id,
                item=item,
                failure_signature=failure_signature,
            )
            churn_detected = bool(churn_state["threshold_met"])

            # "%" is stripped so the fragment cannot widen the LIKE pattern.
            # NOTE(review): "_" is also a LIKE wildcard and is left as-is here.
            sig_fragment = failure_message[:60].replace("%", "")
            table, id_col = source_tables.get(queue_type, (None, None))

            # Without a fragment the pattern would be "%%" (or "%"), which matches
            # every row and would make an isolated failure look systemic, so the
            # sibling searches are skipped entirely.
            if sig_fragment:
                like_pattern = f"%{sig_fragment}%"
                with store._connect() as conn:
                    # --- Find sibling failed items in the source queue with matching failure_message ---
                    if table:
                        sibling_rows = conn.execute(
                            f"SELECT {id_col} as item_id, failure_message, failure_context_json FROM {table} "
                            f"WHERE status='failed' AND failure_message LIKE ? LIMIT 100",
                            (like_pattern,),
                        ).fetchall()
                        for r in sibling_rows:
                            sid = str(r["item_id"] or "")
                            if sid and sid not in affected_item_ids:
                                affected_item_ids.append(sid)
                            msg = str(r["failure_message"] or "")
                            if msg and msg not in sample_failure_messages:
                                sample_failure_messages.append(msg)
                            # Extract failed_stage from failure_context_json
                            try:
                                fc = json.loads(str(r["failure_context_json"] or "{}") or "{}")
                            except ValueError:
                                # Unparseable context: the row still counts, without a stage.
                                fc = None
                            if isinstance(fc, dict):
                                fs = str(fc.get("failed_stage") or "").strip()
                                if fs and fs not in failed_stages:
                                    failed_stages.append(fs)

                    # --- Also search recovery_queue for matching failure signature ---
                    rq_rows = conn.execute(
                        "SELECT source_item_id, failure_message FROM recovery_queue "
                        "WHERE failure_message LIKE ? LIMIT 100",
                        (like_pattern,),
                    ).fetchall()
                    for r in rq_rows:
                        sid = str(r["source_item_id"] or "")
                        if sid and sid not in affected_item_ids:
                            affected_item_ids.append(sid)

            # Include current item if not already in list
            if item_id not in affected_item_ids:
                affected_item_ids.insert(0, item_id)
            if failure_message and failure_message not in sample_failure_messages:
                sample_failure_messages.insert(0, failure_message)
            if failed_stage and failed_stage not in failed_stages:
                failed_stages.insert(0, failed_stage)

            total_affected = len(affected_item_ids)
            is_systemic = total_affected >= 3

            pattern = SystemicPatternArtifact(
                failure_signature=failure_signature,
                is_systemic=is_systemic,
                affected_queue_type=queue_type,
                affected_item_ids=affected_item_ids,
                sample_failure_messages=sample_failure_messages[:5],
                failed_stages=failed_stages[:10],
                total_affected=total_affected,
                pattern_summary=(
                    f"{'Systemic' if is_systemic else 'Isolated'} failure: {failure_signature}. "
                    f"{total_affected} affected item(s) in {queue_type} queue."
                ),
            )

            task_context.metadata["systemic_pattern"] = pattern
            if churn_detected:
                task_context.metadata["churn_detected"] = True
                task_context.metadata["churn"] = {
                    "detected": True,
                    "failure_signature": failure_signature,
                    "occurrence_count": int(churn_state["occurrence_count"]),
                    "threshold": int(churn_state["threshold"]),
                    "error_task_id": churn_state["error_task_id"],
                    "churn_key": churn_state["churn_key"],
                    "story_id": item.story_id,
                    "item_id": item.item_id,
                }

            self.save_output(pattern)
            return pattern.model_dump(), task_context

        return _persist_node(node_id="systemic_pattern_analysis", node_name="SystemicPatternAnalysis", fn=_run)
1541
+
1542
+
1543
class AgenticFailureInvestigationNode(AgentNode):
    """Produce the investigation artifact for a failed item.

    When churn was flagged upstream the artifact is built directly from the churn
    metadata; otherwise log evidence is gathered and the investigation agent step
    is run.
    """

    def get_agent_config(self) -> AgentConfig:
        """Return the agent configuration for the investigation step."""
        return AgentConfig(
            instructions="Investigate the nature of a process failure, identify the affected boundary, and define what successful recovery would look like.",
            output_type=RecoveryInvestigationArtifact,
        )

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Store the investigation under ``metadata["investigation"]`` and save it as node output."""

        def _run(_node_exec_id: str):
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]

            if not task_context.metadata.get("churn_detected"):
                publish_project = item.dfs_project_id or task_context.event.project_id
                publish_run_id = _store_run()[1]
                _publish_node(publish_project, publish_run_id, "Investigating failure", recovery_id=item.item_id)
                store, _run_id = _store_run()
                repo_root = Path(str(task_context.event.repo_root))
                log_evidence = _gather_log_first_recovery_evidence(store=store, repo_root=repo_root, item=item)
                investigation, _envelope = run_agent_step(
                    repo_root=repo_root,
                    stage_name="recovery_failure_investigation",
                    output_model=RecoveryInvestigationArtifact,
                    context_payload={"failed_item": item.model_dump(), "log_evidence": log_evidence},
                    guidance=load_agentic_prompt_lines("recovery_failure_investigation"),
                    timeout_seconds=300,
                    strength=_CURRENT_STRENGTH,
                )
                investigation = _enrich_investigation_with_log_evidence(
                    investigation=investigation,
                    log_evidence=log_evidence,
                )
                task_context.metadata["investigation"] = investigation
                task_context.metadata["log_evidence"] = log_evidence
                self.save_output(investigation)
                return investigation.model_dump(), task_context

            # Churn path: build the artifact from the recorded churn metadata.
            churn = task_context.metadata.get("churn") or {}
            investigation = _build_churn_gate_investigation(
                item=item,
                failure_signature=str(churn.get("failure_signature") or ""),
                occurrence_count=int(churn.get("occurrence_count") or 0),
                threshold=int(churn.get("threshold") or _RECOVERY_CHURN_GATE_THRESHOLD),
                churn_key=str(churn.get("churn_key") or "") or None,
            )
            task_context.metadata["investigation"] = investigation
            self.save_output(investigation)
            return investigation.model_dump(), task_context

        return _persist_node(node_id="failure_investigation", node_name="AgenticFailureInvestigation", fn=_run)
1581
+
1582
+
1583
class RootCauseCodeInvestigationNode(AgentNode):
    """AgentNode: given a systemic pattern, reads key source files and asks the LLM to
    identify the root cause and propose a concrete fix plan.

    Output: CodeRootCauseArtifact
    """

    def get_agent_config(self) -> AgentConfig:
        """Return the agent configuration for the root-cause investigation step."""
        return AgentConfig(
            instructions="Investigate the root cause of a systemic failure pattern in the DevFlow codebase. Identify the exact code location, describe the problem, and propose a specific fix plan.",
            output_type=CodeRootCauseArtifact,
        )

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Gather source context, run the agent step, and store the result as ``metadata["root_cause"]``."""
        # Upper bound on the number of dag.py lines passed to the agent.
        max_dag_lines = 200

        def _run(_node_exec_id: str):
            event = task_context.event
            _store, run_id = _store_run()
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
            pattern: SystemicPatternArtifact = task_context.metadata["systemic_pattern"]
            repo_root = Path(str(event.repo_root))
            project_id = item.dfs_project_id or event.project_id

            _publish_node(project_id, run_id, "Investigating root cause", recovery_id=item.item_id)

            # Deterministically read key source files before calling agent
            key_source_files: dict[str, str] = {}

            # 1. agentic_runtime.py — always include
            _read_file_into(key_source_files, repo_root / "src" / "devflow_engine" / "agentic_runtime.py")

            # 2. implementation/dag.py — Red/Green context-building sections
            impl_dag = repo_root / "src" / "devflow_engine" / "implementation" / "dag.py"
            if impl_dag.exists():
                try:
                    lines = impl_dag.read_text(encoding="utf-8").splitlines()
                except (OSError, ValueError):
                    # Best-effort: an unreadable dag.py contributes nothing.
                    lines = None
                if lines is not None:
                    # Capture runs from the first line that mentions either context
                    # builder and is capped at max_dag_lines lines.
                    first_hit = next(
                        (
                            index
                            for index, line in enumerate(lines)
                            if "_build_red_generation_context" in line or "_build_green_generation_context" in line
                        ),
                        None,
                    )
                    relevant: list[str] = []
                    if first_hit is not None:
                        relevant = lines[first_hit:first_hit + max_dag_lines]
                        if len(lines) - first_hit > max_dag_lines:
                            # Added after slicing so the marker is actually kept.
                            relevant.append("... [truncated]")
                    key_source_files["implementation/dag.py (red/green context sections)"] = "\n".join(relevant)

            # 3. List .claude/agents/ directory sizes if present
            agents_dir = repo_root / ".claude" / "agents"
            if agents_dir.exists():
                sizes: list[str] = []
                try:
                    for p in sorted(agents_dir.iterdir()):
                        if p.is_file():
                            sizes.append(f"{p.name}: {p.stat().st_size} bytes")
                except OSError:
                    # Keep whatever was listed before the failure.
                    pass
                key_source_files[".claude/agents/ directory"] = "\n".join(sizes) if sizes else "(empty)"

            # 4. CLI config
            # NOTE(review): the whole user config file is forwarded to the agent;
            # confirm it cannot contain credentials.
            cli_config = ""
            config_path = Path.home() / ".devflow" / "config.toml"
            if config_path.exists():
                try:
                    cli_config = config_path.read_text(encoding="utf-8")
                except (OSError, ValueError):
                    cli_config = "(unreadable)"

            artifact, _envelope = run_agent_step(
                repo_root=repo_root,
                stage_name="recovery_root_cause_investigation",
                output_model=CodeRootCauseArtifact,
                context_payload={
                    "systemic_pattern": pattern.model_dump(),
                    "failed_item": item.model_dump(),
                    "key_source_files": key_source_files,
                    "cli_config": cli_config,
                },
                guidance=load_agentic_prompt_lines("recovery_root_cause_investigation"),
                timeout_seconds=600,
                strength=_CURRENT_STRENGTH,
            )
            task_context.metadata["root_cause"] = artifact
            self.save_output(artifact)
            return artifact.model_dump(), task_context

        return _persist_node(node_id="root_cause_investigation", node_name="RootCauseCodeInvestigation", fn=_run)
1675
+
1676
+
1677
+ def _read_file_into(dest: dict[str, str], path: Path) -> None:
1678
+ """Helper: read a file and store its content in dest under its relative path."""
1679
+ try:
1680
+ if path.exists():
1681
+ dest[str(path.name)] = path.read_text(encoding="utf-8")
1682
+ except Exception:
1683
+ pass
1684
+
1685
+
1686
class RemediationExecutionNode(AgentNode):
    """AgentNode: given CodeRootCauseArtifact, applies the fix.

    After parsing the agent response:
    - Writes file_patches to disk at repo_root / patch.path; a patch whose
      path resolves outside repo_root is rejected instead of written
    - Sets fix_applied=True if files were written
    - Records failed or rejected writes under
      task_context.metadata["remediation_write_errors"]

    Output: RemediationResultArtifact
    """

    def get_agent_config(self) -> AgentConfig:
        """Return the agent configuration (instructions and output type) for this node."""
        return AgentConfig(
            instructions="Apply a diagnosed code/config fix to resolve a systemic DevFlow failure. Produce file patches with complete corrected file contents.",
            output_type=RemediationResultArtifact,
        )

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the remediation agent step and write its file patches to disk.

        Reads ``failed_item``, ``root_cause`` and ``systemic_pattern`` from
        ``task_context.metadata`` and stores the resulting artifact under
        ``metadata["remediation_result"]``.
        """
        import logging  # local import: keeps this block self-contained

        log = logging.getLogger(__name__)

        def _run(_node_exec_id: str):
            event = task_context.event
            _store, run_id = _store_run()  # only the run id is needed here
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
            root_cause: CodeRootCauseArtifact = task_context.metadata["root_cause"]
            pattern: SystemicPatternArtifact = task_context.metadata["systemic_pattern"]
            repo_root = Path(str(event.repo_root))
            project_id = item.dfs_project_id or event.project_id

            _publish_node(project_id, run_id, "Applying remediation", recovery_id=item.item_id)

            # Deterministically read current content of files the fix will touch
            files_to_change: dict[str, str] = {}
            if root_cause.root_cause_location:
                # root_cause_location is like "src/devflow_engine/agentic_runtime.py:run_agent_step"
                location_path = root_cause.root_cause_location.split(":")[0].strip()
                _read_file_into(files_to_change, repo_root / location_path)

            # Also read files listed in files_inspected (capped at 3)
            for inspected in (root_cause.files_inspected or [])[:3]:
                _read_file_into(files_to_change, repo_root / inspected)

            artifact, _envelope = run_agent_step(
                repo_root=repo_root,
                stage_name="recovery_remediation_execution",
                output_model=RemediationResultArtifact,
                context_payload={
                    "root_cause": root_cause.model_dump(),
                    "systemic_pattern": pattern.model_dump(),
                    "files_to_change": files_to_change,
                    "affected_items": pattern.affected_item_ids,
                },
                guidance=load_agentic_prompt_lines("recovery_remediation_execution"),
                timeout_seconds=900,
                strength=_CURRENT_STRENGTH,
            )

            # Write file patches to disk. Patch paths come from the agent, so
            # each one is resolved and checked against repo_root first: an
            # absolute path or a ".." segment must not write outside the repo.
            resolved_root = repo_root.resolve()
            files_written: list[str] = []
            write_errors: list[str] = []
            for patch in (artifact.file_patches or []):
                try:
                    target = (repo_root / patch.path).resolve()
                    if resolved_root not in target.parents:
                        raise ValueError("patch path resolves outside repo_root")
                    target.parent.mkdir(parents=True, exist_ok=True)
                    target.write_text(patch.content, encoding="utf-8")
                    files_written.append(patch.path)
                except Exception as exc:
                    # Best-effort: a failed patch must not fail the node, but
                    # it is logged and surfaced for later verification.
                    log.warning("remediation patch %r not written: %s", patch.path, exc)
                    write_errors.append(f"{patch.path}: {exc}")

            if write_errors:
                task_context.metadata["remediation_write_errors"] = write_errors

            if files_written:
                artifact = artifact.model_copy(update={"fix_applied": True, "files_changed": files_written})

            task_context.metadata["remediation_result"] = artifact
            self.save_output(artifact)
            return artifact.model_dump(), task_context

        return _persist_node(node_id="remediation_execution", node_name="RemediationExecution", fn=_run)
1761
+
1762
+
1763
class BulkReenqueueNode(Node):
    """Deterministic node: after remediation, bulk-re-enqueues all items listed in
    RemediationResultArtifact.items_to_requeue and dead-letters items in items_to_dead_letter.

    An item is only counted as re-queued or dead-lettered when its queue type
    is one this node knows how to handle; otherwise it is reported in
    ``errors``. The outcome is ``"reenqueued"`` when at least one item was
    re-queued and ``"blocked"`` otherwise.
    """

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Re-enqueue and dead-letter the items named by the remediation result.

        Reads ``failed_item``, ``remediation_result`` and ``systemic_pattern``
        from ``task_context.metadata``; writes ``outcome`` and
        ``bulk_reenqueue_result`` back to it.
        """
        import time  # local import: replaces the inline __import__("time")

        # Queue types with a retry_<type>_queue_item store method and a
        # <type>_queue table keyed by <type>_queue_id.
        supported_queue_types = ("story", "idea_creation", "idea", "scope", "integration")

        def _run(_node_exec_id: str):
            event = task_context.event
            store, run_id = _store_run()
            item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
            remediation: RemediationResultArtifact = task_context.metadata["remediation_result"]
            pattern: SystemicPatternArtifact = task_context.metadata["systemic_pattern"]
            project_id = item.dfs_project_id or event.project_id

            _publish_node(project_id, run_id, "Re-enqueuing fixed items", recovery_id=item.item_id)

            queue_type = pattern.affected_queue_type
            is_supported = queue_type in supported_queue_types
            requeued: list[str] = []
            dead_lettered: list[str] = []
            errors: list[str] = []

            for qid in (remediation.items_to_requeue or []):
                try:
                    if not is_supported:
                        # Previously such items were silently reported as
                        # re-queued although no retry call was made.
                        raise ValueError(f"unsupported queue type {queue_type!r}")
                    retry = getattr(store, f"retry_{queue_type}_queue_item")
                    retry_kwargs = {
                        "project_id": project_id,
                        f"{queue_type}_queue_id": qid,
                        "preserve_failure_context": False,
                    }
                    retry(**retry_kwargs)
                    requeued.append(qid)
                except Exception as exc:
                    errors.append(f"{qid}: {exc}")

            # Mark dead-letter items as failed in queue (set failure_message to indicate dead-lettered)
            for qid in (remediation.items_to_dead_letter or []):
                try:
                    if not is_supported:
                        raise ValueError(f"unsupported queue type {queue_type!r}")
                    with store._connect() as conn:
                        now_ts = int(time.time())
                        # Table and column names come from the whitelist above,
                        # never from external input; values stay parameterized.
                        conn.execute(
                            f"UPDATE {queue_type}_queue SET status='failed', failure_message=?, updated_at=? WHERE {queue_type}_queue_id=?",
                            ("dead_lettered_by_recovery", now_ts, qid),
                        )
                    dead_lettered.append(qid)
                except Exception as exc:
                    errors.append(f"dead_letter {qid}: {exc}")

            outcome = "reenqueued" if requeued else "blocked"
            task_context.metadata["outcome"] = outcome
            task_context.metadata["bulk_reenqueue_result"] = {
                "requeued": requeued,
                "dead_lettered": dead_lettered,
                "errors": errors,
            }

            summary = f"Bulk re-enqueue: {len(requeued)} re-queued, {len(dead_lettered)} dead-lettered, {len(errors)} errors."
            result = {"requeued": requeued, "dead_lettered": dead_lettered, "errors": errors, "outcome": outcome, "summary": summary}
            self.save_output(result)
            return result, task_context

        return _persist_node(node_id="bulk_reenqueue", node_name="BulkReenqueue", fn=_run)
1848
+
1849
+
1850
+ # ---------------------------------------------------------------------------
1851
+ # Router redesign
1852
+ # ---------------------------------------------------------------------------
1853
+
1854
class _RouteLoopGuard(RouterNode):
    """Route to state publication when ``metadata["churn_detected"]`` is set."""

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Return a ``PublishRecoveryStateNode`` if churn was detected, else ``None``.

        When churn is flagged, the metadata is marked ``"blocked"``, the
        delegation summary is cleared, and a not-ready ``PreReplayCheckArtifact``
        carrying a single CHURN blocking reason is stored under ``"pre_replay"``.
        """
        meta = task_context.metadata
        if not meta.get("churn_detected"):
            return None

        meta["outcome"] = "blocked"
        meta["delegation_summary"] = None

        # Missing or empty churn details fall back to neutral defaults.
        churn = meta.get("churn") or {}
        failure_signature = str(churn.get("failure_signature") or "same normalized failure")
        churn_key = str(churn.get("churn_key") or "")
        occurrence_count = int(churn.get("occurrence_count") or 0)
        threshold = int(churn.get("threshold") or _RECOVERY_CHURN_GATE_THRESHOLD)

        key_suffix = f" ({churn_key})" if churn_key else ""
        blocking_reason = (
            f"CHURN: durable recovery gate blocked {failure_signature} "
            f"after {occurrence_count}/{threshold} no-material-change strikes"
            f"{key_suffix}"
        )
        meta["pre_replay"] = PreReplayCheckArtifact(
            queue_type=meta["failed_item"].queue_type,
            ready=False,
            checks=[],
            blocking_reasons=[blocking_reason],
        )
        return PublishRecoveryStateNode(task_context=task_context)
1877
+
1878
+
1879
class _RouteSystemic(RouterNode):
    """Route systemic failure patterns to root-cause code investigation."""

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Return a ``RootCauseCodeInvestigationNode`` when the stored pattern is systemic.

        Returns ``None`` when no ``"systemic_pattern"`` is present in the
        metadata or when the pattern is not flagged ``is_systemic``.
        """
        pattern: SystemicPatternArtifact | None = task_context.metadata.get("systemic_pattern")
        if not pattern or not pattern.is_systemic:
            return None
        return RootCauseCodeInvestigationNode(task_context=task_context)
1885
+
1886
+
1887
class _RouteIsolatedProcess(RouterNode):
    """Unconditional route to recovery diagnosis."""

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Always return an ``AgenticRecoveryDiagnosisNode`` for this task context."""
        diagnosis_node = AgenticRecoveryDiagnosisNode(task_context=task_context)
        return diagnosis_node
1890
+
1891
+
1892
class SystemicVsIsolatedRouter(BaseRouter):
    """Router listing the loop-guard, systemic and isolated-process routes, in that order."""

    def __init__(self) -> None:
        """Set up the ordered route list and the fallback node."""
        loop_guard = _RouteLoopGuard()
        systemic = _RouteSystemic()
        isolated = _RouteIsolatedProcess()
        self.routes = [loop_guard, systemic, isolated]
        self.fallback = PublishRecoveryStateNode()
1896
+
1897
+
1898
+ # ---------------------------------------------------------------------------
1899
+ # Legacy router (kept for backwards-compat in case referenced externally)
1900
+ # ---------------------------------------------------------------------------
1901
+
1902
class _RouteCodeError(RouterNode):
    """Legacy code-error route; it never matches."""

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Always return ``None``.

        The original body guarded its only non-``None`` return (an
        ``AgenticRecoveryExecutionNode``) behind ``if False:``, so that branch
        could never run. The unreachable branch has been removed; behaviour is
        unchanged.
        """
        return None
1907
+
1908
+
1909
class _RouteProcessError(RouterNode):
    """Legacy process-error route; it always matches."""

    def determine_next_node(self, task_context: TaskContext) -> Node | None:
        """Always return an ``AgenticRecoveryDiagnosisNode`` for this task context.

        The original body wrapped this return in ``if True:`` and followed it
        with an unreachable ``return None``; both have been removed. Behaviour
        is unchanged.
        """
        return AgenticRecoveryDiagnosisNode(task_context=task_context)
1914
+
1915
+
1916
class FailureTypeRouter(BaseRouter):
    """Legacy router listing the code-error and process-error routes, in that order."""

    def __init__(self) -> None:
        """Set up the ordered route list and the fallback node."""
        code_error_route = _RouteCodeError()
        process_error_route = _RouteProcessError()
        self.routes = [code_error_route, process_error_route]
        self.fallback = PublishRecoveryStateNode()
1920
+
1921
+
1922
class AgenticRecoveryDiagnosisNode(AgentNode):
    """Node that builds a recovery diagnosis and remediation plan for a failed queue item."""

    def get_agent_config(self) -> AgentConfig:
        """Return the agent configuration (instructions and output type) for this node."""
        return AgentConfig(
            instructions="Diagnose the best recovery strategy for a failed queue item.",
            output_type=RecoveryDiagnosisArtifact,
        )

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Build the diagnosis and plan and store both in ``task_context.metadata``.

        Reads ``"failed_item"`` (required) and ``"investigation"`` (optional)
        from the metadata, calls ``_build_diagnosis``, and writes the results
        under ``"diagnosis"`` and ``"plan"``. The diagnosis is also saved as
        the node output.
        """

        def _run(_node_exec_id: str):
            metadata = task_context.metadata
            item: FailedQueueItemArtifact = metadata["failed_item"]
            investigation: RecoveryInvestigationArtifact | None = metadata.get("investigation")

            project_id = item.dfs_project_id or task_context.event.project_id
            run_id = _store_run()[1]
            _publish_node(project_id, run_id, "Diagnosing recovery", recovery_id=item.item_id)

            diagnosis, plan = _build_diagnosis(item=item, investigation=investigation)
            metadata["diagnosis"] = diagnosis
            metadata["plan"] = plan
            self.save_output(diagnosis)

            payload = {"diagnosis": diagnosis.model_dump(), "plan": plan.model_dump()}
            return payload, task_context

        return _persist_node(node_id="recovery_diagnosis", node_name="AgenticRecoveryDiagnosis", fn=_run)
1937
+
1938
+
1939
+ class AgenticRecoveryExecutionNode(AgentNode):
1940
+ def get_agent_config(self) -> AgentConfig:
1941
+ return AgentConfig(instructions="Execute recovery and verify the outcome against agent-defined success criteria inside the same node.", output_type=RecoveryExecutionArtifact)
1942
+
1943
+ async def process(self, task_context: TaskContext) -> TaskContext:
1944
+ def _run(_node_exec_id: str):
1945
+ event = task_context.event
1946
+ store, run_id = _store_run()
1947
+ item: FailedQueueItemArtifact = task_context.metadata["failed_item"]
1948
+ diagnosis: RecoveryDiagnosisArtifact | None = task_context.metadata.get("diagnosis")
1949
+ plan: RemediationPlanArtifact | None = task_context.metadata.get("plan")
1950
+ investigation: RecoveryInvestigationArtifact | None = task_context.metadata.get("investigation")
1951
+ repo_root = Path(str(event.repo_root))
1952
+ _publish_node(item.dfs_project_id or event.project_id, run_id, "Executing recovery", recovery_id=item.item_id)
1953
+ last_reenqueue: ReenqueueArtifact | None = None
1954
+ last_execution: RecoveryExecutionArtifact | None = None
1955
+ last_verified: PreReplayCheckArtifact | None = None
1956
+
1957
+ preflight_health_repair = _maybe_repair_story_preflight_health_boundary(repo_root=repo_root, item=item)
1958
+ if preflight_health_repair is not None:
1959
+ failure_signature = _normalized_failure_signature(
1960
+ failure_message=item.failure_message,
1961
+ failure_context=item.failure_context if isinstance(item.failure_context, dict) else {},
1962
+ )
1963
+ health_repair_diagnosis = RecoveryDiagnosisArtifact(
1964
+ queue_type=item.queue_type,
1965
+ item_id=item.item_id,
1966
+ strategy="preflight_health_repair_recovery",
1967
+ summary="Repair repo/config causing preflight health failure.",
1968
+ rationale="Preflight evidence contains health_check_failed blockers that require a bounded repo/config repair before replay.",
1969
+ verification_targets=[
1970
+ {
1971
+ "criterion": "Failing health endpoint returns the expected status",
1972
+ "oracle": "The previously failing preflight boundary is reachable and returns the expected HTTP status after the repair.",
1973
+ "evidence_ref": preflight_health_repair["artifact_path"],
1974
+ }
1975
+ ],
1976
+ suggested_action="repair_artifact_then_requeue",
1977
+ )
1978
+ plan = RemediationPlanArtifact(
1979
+ queue_type=item.queue_type,
1980
+ action="repair_artifact_then_requeue",
1981
+ summary="Repair repo/config causing preflight health failure, verify the boundary, then requeue.",
1982
+ steps=[criterion.criterion for criterion in health_repair_diagnosis.verification_targets],
1983
+ remediation_artifact=preflight_health_repair["artifact_path"],
1984
+ )
1985
+ task_context.metadata["diagnosis"] = health_repair_diagnosis
1986
+ task_context.metadata["plan"] = plan
1987
+ repair_result: RemediationResultArtifact | None = preflight_health_repair.get("repair_result")
1988
+ if not preflight_health_repair["ready"]:
1989
+ execution = RecoveryExecutionArtifact(
1990
+ queue_type=item.queue_type,
1991
+ item_id=item.item_id,
1992
+ outcome="blocked",
1993
+ execution_summary=(
1994
+ "Preflight health repo/config repair did not restore the failing boundary."
1995
+ if repair_result is not None and repair_result.fix_applied
1996
+ else "Preflight health repo/config repair could not produce a bounded fix."
1997
+ ),
1998
+ preserve_failure_context=True,
1999
+ attempts_used=1,
2000
+ success_criteria=health_repair_diagnosis.verification_targets,
2001
+ verification_summary="; ".join(preflight_health_repair["blocking_reasons"]),
2002
+ )
2003
+ last_verified = PreReplayCheckArtifact(
2004
+ queue_type=item.queue_type,
2005
+ ready=False,
2006
+ checks=preflight_health_repair["verification_checks"],
2007
+ blocking_reasons=preflight_health_repair["blocking_reasons"],
2008
+ )
2009
+ updated_context = _record_recovery_attempt(
2010
+ item=item,
2011
+ diagnosis=health_repair_diagnosis,
2012
+ success=False,
2013
+ failure_signature=failure_signature,
2014
+ material_change=bool(preflight_health_repair["files_changed"]),
2015
+ remediation_artifact=preflight_health_repair["artifact_path"],
2016
+ )
2017
+ item.failure_context = updated_context
2018
+ _persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
2019
+ task_context.metadata["preflight_health_repair"] = preflight_health_repair
2020
+ task_context.metadata["pre_replay"] = last_verified
2021
+ task_context.metadata["outcome"] = "blocked"
2022
+ task_context.metadata["recovery_execution"] = execution
2023
+ self.save_output(execution)
2024
+ return {
2025
+ "outcome": "blocked",
2026
+ "recovery_execution": execution.model_dump(),
2027
+ "pre_replay": last_verified.model_dump(),
2028
+ "preflight_health_repair": {
2029
+ "files_changed": preflight_health_repair["files_changed"],
2030
+ "artifact_path": preflight_health_repair["artifact_path"],
2031
+ },
2032
+ }, task_context
2033
+ health_repair_changed_files = bool(preflight_health_repair["files_changed"])
2034
+ execution = RecoveryExecutionArtifact(
2035
+ queue_type=item.queue_type,
2036
+ item_id=item.item_id,
2037
+ outcome="reenqueued",
2038
+ execution_summary=(
2039
+ "Repaired repo/config for preflight health failure and re-verified the boundary."
2040
+ if health_repair_changed_files
2041
+ else "Re-verified the preflight health boundary and skipped local setup bootstrap because the stack was already healthy."
2042
+ ),
2043
+ preserve_failure_context=True,
2044
+ attempts_used=1,
2045
+ success_criteria=health_repair_diagnosis.verification_targets,
2046
+ verification_summary="; ".join(preflight_health_repair["verification_checks"]),
2047
+ )
2048
+ updated_context = _record_recovery_attempt(
2049
+ item=item,
2050
+ diagnosis=health_repair_diagnosis,
2051
+ success=True,
2052
+ failure_signature=failure_signature,
2053
+ material_change=health_repair_changed_files,
2054
+ remediation_artifact=preflight_health_repair["artifact_path"],
2055
+ )
2056
+ updated_context["preflight_health_repair"] = {
2057
+ "artifact_path": preflight_health_repair["artifact_path"],
2058
+ "files_changed": preflight_health_repair["files_changed"],
2059
+ }
2060
+ item.failure_context = updated_context
2061
+ _persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
2062
+ replay_metadata = _build_story_replay_metadata(item=item, diagnosis=health_repair_diagnosis, execution=execution)
2063
+ row = store.retry_story_queue_item(
2064
+ project_id=event.project_id,
2065
+ story_queue_id=item.item_id,
2066
+ preserve_failure_context=True,
2067
+ replay_metadata=replay_metadata,
2068
+ )
2069
+ failure_context = dict(item.failure_context if isinstance(item.failure_context, dict) else {})
2070
+ failure_context["preflight_health_repair"] = {
2071
+ "artifact_path": preflight_health_repair["artifact_path"],
2072
+ "files_changed": preflight_health_repair["files_changed"],
2073
+ }
2074
+ if replay_metadata is not None:
2075
+ failure_context["replay"] = replay_metadata
2076
+ last_reenqueue = ReenqueueArtifact(
2077
+ queue_type=item.queue_type,
2078
+ item_id=item.item_id,
2079
+ status=str(row.get("status") or "queued"),
2080
+ failure_context=failure_context,
2081
+ replay_metadata=replay_metadata,
2082
+ )
2083
+ last_verified = PreReplayCheckArtifact(
2084
+ queue_type=item.queue_type,
2085
+ ready=True,
2086
+ checks=preflight_health_repair["verification_checks"],
2087
+ blocking_reasons=[],
2088
+ )
2089
+ task_context.metadata["preflight_health_repair"] = preflight_health_repair
2090
+ task_context.metadata["reenqueue"] = last_reenqueue
2091
+ task_context.metadata["pre_replay"] = last_verified
2092
+ task_context.metadata["outcome"] = "reenqueued"
2093
+ task_context.metadata["recovery_execution"] = execution
2094
+ self.save_output(execution)
2095
+ return {
2096
+ "outcome": "reenqueued",
2097
+ "recovery_execution": execution.model_dump(),
2098
+ "pre_replay": last_verified.model_dump(),
2099
+ "preflight_health_repair": {
2100
+ "files_changed": preflight_health_repair["files_changed"],
2101
+ "artifact_path": preflight_health_repair["artifact_path"],
2102
+ },
2103
+ }, task_context
2104
+
2105
+ runtime_contract_repair = _maybe_repair_story_test_runtime_contract(repo_root=repo_root, item=item)
2106
+ if runtime_contract_repair is not None:
2107
+ failure_signature = _normalized_failure_signature(
2108
+ failure_message=item.failure_message,
2109
+ failure_context=item.failure_context if isinstance(item.failure_context, dict) else {},
2110
+ )
2111
+ repaired_test_files = [str(path) for path in (runtime_contract_repair.get("files_changed") or []) if str(path).strip()]
2112
+ contract_updated = bool(
2113
+ (runtime_contract_repair.get("previous_contract") or None) is None
2114
+ or any(
2115
+ (runtime_contract_repair.get("previous_contract") or {}).get(key) != runtime_contract_repair["runtime_contract"].get(key)
2116
+ for key in ("framework", "cwd", "run_cmd", "env", "setup_cmd", "test_paths")
2117
+ )
2118
+ )
2119
+ runtime_repair_diagnosis = diagnosis or RecoveryDiagnosisArtifact(
2120
+ queue_type=item.queue_type,
2121
+ item_id=item.item_id,
2122
+ strategy="artifact_regeneration_recovery",
2123
+ summary="Repair runtime contract",
2124
+ rationale="Story runtime contract repair is the bounded automated fix path.",
2125
+ suggested_action="repair_artifact_then_requeue",
2126
+ )
2127
+ runtime_repair_diagnosis = runtime_repair_diagnosis.model_copy(update={"strategy": "artifact_regeneration_recovery"})
2128
+ if plan is None:
2129
+ plan = RemediationPlanArtifact(
2130
+ queue_type=item.queue_type,
2131
+ action="repair_artifact_then_requeue",
2132
+ summary="Repair story test runtime contract and requeue.",
2133
+ )
2134
+ if not runtime_contract_repair["updated"]:
2135
+ churn_message = (
2136
+ "Recovery runtime repair produced no material runtime-contract delta "
2137
+ f"for {_durable_recovery_identity(item)} on {failure_signature}."
2138
+ )
2139
+ churn_state = _record_recovery_churn_strike(
2140
+ store=store,
2141
+ project_id=event.project_id,
2142
+ run_id=run_id,
2143
+ item=item,
2144
+ failure_signature=failure_signature,
2145
+ message=churn_message,
2146
+ )
2147
+ noop_cycles = int(churn_state["occurrence_count"])
2148
+ updated_context = _record_recovery_attempt(
2149
+ item=item,
2150
+ diagnosis=runtime_repair_diagnosis,
2151
+ success=False,
2152
+ failure_signature=failure_signature,
2153
+ material_change=False,
2154
+ remediation_artifact=runtime_contract_repair["path"],
2155
+ )
2156
+ item.failure_context = updated_context
2157
+ task_context.metadata["failed_item"] = item
2158
+ _persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
2159
+ task_context.metadata["durable_churn_key"] = churn_state["churn_key"]
2160
+ task_context.metadata["durable_churn_error_task_id"] = churn_state["error_task_id"]
2161
+ if churn_state["threshold_met"]:
2162
+ investigation = _build_churn_gate_investigation(
2163
+ item=item,
2164
+ failure_signature=failure_signature,
2165
+ occurrence_count=noop_cycles,
2166
+ threshold=_RECOVERY_CHURN_GATE_THRESHOLD,
2167
+ churn_key=str(churn_state.get("churn_key") or "") or None,
2168
+ remediation_artifact=str(runtime_contract_repair["path"]),
2169
+ )
2170
+ task_context.metadata["investigation"] = investigation
2171
+ plan = plan.model_copy(
2172
+ update={
2173
+ "action": "manual_review_required",
2174
+ "summary": "Durable recovery churn gate blocked repeated no-op runtime repair.",
2175
+ "remediation_artifact": runtime_contract_repair["path"],
2176
+ }
2177
+ )
2178
+ task_context.metadata["plan"] = plan
2179
+ execution = RecoveryExecutionArtifact(
2180
+ queue_type=item.queue_type,
2181
+ item_id=item.item_id,
2182
+ outcome="blocked",
2183
+ execution_summary=(
2184
+ "Blocked repeated story runtime repair because the same normalized failure produced no material runtime-contract delta."
2185
+ ),
2186
+ preserve_failure_context=True,
2187
+ attempts_used=noop_cycles,
2188
+ success_criteria=[
2189
+ {
2190
+ "criterion": "Durable churn gate blocks repeated no-op runtime repair",
2191
+ "oracle": "Recovery hard-blocks after 3 strikes on the same durable identity with no material runtime-contract delta.",
2192
+ "evidence_ref": runtime_contract_repair["path"],
2193
+ }
2194
+ ],
2195
+ verification_summary=(
2196
+ f"CHURN: durable recovery gate blocked {failure_signature} after {noop_cycles}/{_RECOVERY_CHURN_GATE_THRESHOLD} no-material-change strikes"
2197
+ f" ({churn_state['churn_key']})."
2198
+ ),
2199
+ )
2200
+ last_verified = PreReplayCheckArtifact(
2201
+ queue_type=item.queue_type,
2202
+ ready=False,
2203
+ checks=[],
2204
+ blocking_reasons=[
2205
+ f"CHURN: durable recovery gate blocked {failure_signature} after {noop_cycles}/{_RECOVERY_CHURN_GATE_THRESHOLD} no-material-change strikes"
2206
+ f" ({churn_state['churn_key']})."
2207
+ ],
2208
+ )
2209
+ task_context.metadata["runtime_contract_update"] = runtime_contract_repair
2210
+ task_context.metadata["pre_replay"] = last_verified
2211
+ task_context.metadata["outcome"] = "blocked"
2212
+ task_context.metadata["recovery_execution"] = execution
2213
+ task_context.metadata["churn_detected"] = True
2214
+ task_context.metadata["churn"] = {
2215
+ "detected": True,
2216
+ "failure_signature": failure_signature,
2217
+ "noop_cycles": noop_cycles,
2218
+ "occurrence_count": noop_cycles,
2219
+ "threshold": _RECOVERY_CHURN_GATE_THRESHOLD,
2220
+ "error_task_id": churn_state["error_task_id"],
2221
+ "churn_key": churn_state["churn_key"],
2222
+ "reason": "no_material_runtime_contract_delta",
2223
+ }
2224
+ self.save_output(execution)
2225
+ return {
2226
+ "outcome": "blocked",
2227
+ "recovery_execution": execution.model_dump(),
2228
+ "pre_replay": last_verified.model_dump(),
2229
+ "runtime_contract_update": runtime_contract_repair,
2230
+ }, task_context
2231
+ execution_summary_parts: list[str] = []
2232
+ success_criteria = []
2233
+ verification_checks = []
2234
+ remediation_artifact = str(runtime_contract_repair["path"])
2235
+ if contract_updated:
2236
+ execution_summary_parts.append(f"Updated story test runtime contract at {runtime_contract_repair['path']}.")
2237
+ success_criteria.append(
2238
+ {
2239
+ "criterion": "Story runtime contract updated",
2240
+ "oracle": "The canonical story test_runtime.json matches the corrected run pattern.",
2241
+ "evidence_ref": runtime_contract_repair["path"],
2242
+ }
2243
+ )
2244
+ verification_checks.append("story runtime contract updated")
2245
+ if repaired_test_files:
2246
+ execution_summary_parts.append(
2247
+ "Repaired malformed story-scoped pytest file(s): " + ", ".join(repaired_test_files) + "."
2248
+ )
2249
+ success_criteria.append(
2250
+ {
2251
+ "criterion": "Malformed story-scoped pytest file repaired",
2252
+ "oracle": "Injected pytestmark blocks no longer sit inside an open Python import list and the repaired file compiles.",
2253
+ "evidence_ref": repaired_test_files[0],
2254
+ }
2255
+ )
2256
+ verification_checks.append("malformed story-scoped pytest file repaired")
2257
+ remediation_artifact = repaired_test_files[0]
2258
+ plan = plan.model_copy(
2259
+ update={
2260
+ "action": "repair_artifact_then_requeue",
2261
+ "summary": " ".join(execution_summary_parts).strip(),
2262
+ "remediation_artifact": remediation_artifact,
2263
+ }
2264
+ )
2265
+ task_context.metadata["plan"] = plan
2266
+ execution = RecoveryExecutionArtifact(
2267
+ queue_type=item.queue_type,
2268
+ item_id=item.item_id,
2269
+ outcome="reenqueued",
2270
+ execution_summary=" ".join(execution_summary_parts).strip(),
2271
+ preserve_failure_context=True,
2272
+ attempts_used=1,
2273
+ success_criteria=[
2274
+ *success_criteria,
2275
+ {
2276
+ "criterion": "Story re-enqueued",
2277
+ "oracle": "The story queue row returns to queued so the next run uses the repaired runtime boundary.",
2278
+ },
2279
+ ],
2280
+ verification_summary="; ".join([*verification_checks, "story re-enqueued"]),
2281
+ )
2282
+ updated_context = _record_recovery_attempt(
2283
+ item=item,
2284
+ diagnosis=runtime_repair_diagnosis,
2285
+ success=True,
2286
+ failure_signature=failure_signature,
2287
+ material_change=True,
2288
+ remediation_artifact=runtime_contract_repair["path"],
2289
+ )
2290
+ updated_context["runtime_contract_update"] = {
2291
+ "path": runtime_contract_repair["path"],
2292
+ "source": runtime_contract_repair["runtime_contract"].get("source"),
2293
+ "files_changed": repaired_test_files,
2294
+ }
2295
+ item.failure_context = updated_context
2296
+ _persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
2297
+ replay_metadata = _build_story_replay_metadata(item=item, diagnosis=diagnosis, execution=execution)
2298
+ row = store.retry_story_queue_item(
2299
+ project_id=event.project_id,
2300
+ story_queue_id=item.item_id,
2301
+ preserve_failure_context=True,
2302
+ replay_metadata=replay_metadata,
2303
+ )
2304
+ failure_context = dict(item.failure_context if isinstance(item.failure_context, dict) else {})
2305
+ failure_context["runtime_contract_update"] = {
2306
+ "path": runtime_contract_repair["path"],
2307
+ "source": runtime_contract_repair["runtime_contract"].get("source"),
2308
+ "files_changed": repaired_test_files,
2309
+ }
2310
+ if replay_metadata is not None:
2311
+ failure_context["replay"] = replay_metadata
2312
+ last_reenqueue = ReenqueueArtifact(
2313
+ queue_type=item.queue_type,
2314
+ item_id=item.item_id,
2315
+ status=str(row.get("status") or "queued"),
2316
+ failure_context=failure_context,
2317
+ replay_metadata=replay_metadata,
2318
+ )
2319
+ last_verified = PreReplayCheckArtifact(
2320
+ queue_type=item.queue_type,
2321
+ ready=True,
2322
+ checks=[*verification_checks, "story queue item reset to queued"],
2323
+ blocking_reasons=[],
2324
+ )
2325
+ task_context.metadata["runtime_contract_update"] = runtime_contract_repair
2326
+ task_context.metadata["reenqueue"] = last_reenqueue
2327
+ task_context.metadata["pre_replay"] = last_verified
2328
+ task_context.metadata["outcome"] = "reenqueued"
2329
+ task_context.metadata["recovery_execution"] = execution
2330
+ self.save_output(execution)
2331
+ return {
2332
+ "outcome": "reenqueued",
2333
+ "recovery_execution": execution.model_dump(),
2334
+ "pre_replay": last_verified.model_dump(),
2335
+ "runtime_contract_update": runtime_contract_repair,
2336
+ }, task_context
2337
+
2338
+ for attempt in range(1, 4):
2339
+ if diagnosis is None or plan is None:
2340
+ diagnosis, plan = _build_diagnosis(item=item, investigation=investigation, attempt=attempt)
2341
+ task_context.metadata["diagnosis"] = diagnosis
2342
+ task_context.metadata["plan"] = plan
2343
+ try:
2344
+ execution, _envelope = run_agent_step(
2345
+ repo_root=repo_root,
2346
+ stage_name="recovery_execution",
2347
+ output_model=RecoveryExecutionArtifact,
2348
+ context_payload={
2349
+ "failed_item": item.model_dump(),
2350
+ "diagnosis": None if diagnosis is None else diagnosis.model_dump(),
2351
+ "success_criteria": [] if diagnosis is None else [c.model_dump() for c in diagnosis.verification_targets],
2352
+ "plan": None if plan is None else plan.model_dump(),
2353
+ "attempt": attempt,
2354
+ "previous_reenqueue": None if last_reenqueue is None else last_reenqueue.model_dump(),
2355
+ "previous_execution": None if last_execution is None else last_execution.model_dump(),
2356
+ },
2357
+ guidance=load_agentic_prompt_lines("recovery_execution"),
2358
+ timeout_seconds=300,
2359
+ strength=_CURRENT_STRENGTH,
2360
+ )
2361
+ except Exception as exc:
2362
+ error_summary = f"recovery_execution agent step failed (attempt {attempt}): {exc}"
2363
+ task_context.metadata["outcome"] = "reenqueued"
2364
+ task_context.metadata["recovery_execution_error"] = error_summary
2365
+ if item.queue_type == "scope":
2366
+ store.retry_scope_queue_item(project_id=event.project_id, scope_queue_id=item.item_id, preserve_failure_context=False)
2367
+ elif item.queue_type == "idea_creation":
2368
+ store.retry_idea_creation_queue_item(project_id=event.project_id, idea_creation_queue_id=item.item_id, preserve_failure_context=False)
2369
+ elif item.queue_type == "idea":
2370
+ store.retry_idea_queue_item(project_id=event.project_id, idea_queue_id=item.item_id, preserve_failure_context=False)
2371
+ elif item.queue_type == "integration":
2372
+ store.retry_integration_queue_item(project_id=event.project_id, integration_queue_id=item.item_id, preserve_failure_context=False)
2373
+ else:
2374
+ store.retry_story_queue_item(project_id=event.project_id, story_queue_id=item.item_id, preserve_failure_context=False, replay_metadata=None)
2375
+ fallback_artifact = RecoveryExecutionArtifact(
2376
+ queue_type=item.queue_type,
2377
+ item_id=item.item_id,
2378
+ outcome="reenqueued",
2379
+ execution_summary=error_summary,
2380
+ attempts_used=attempt,
2381
+ )
2382
+ self.save_output(fallback_artifact)
2383
+ return {"outcome": "reenqueued", "recovery_execution": fallback_artifact.model_dump(), "recovery_execution_error": error_summary}, task_context
2384
+ execution.attempts_used = attempt
2385
+ last_execution = execution
2386
+ if execution.outcome == "delegated":
2387
+ updated_context = _record_recovery_attempt(item=item, diagnosis=diagnosis, success=False)
2388
+ item.failure_context = updated_context
2389
+ _persist_queue_failure_context(store=store, item=item, failure_context=updated_context)
2390
+ task_context.metadata["outcome"] = "delegated"
2391
+ task_context.metadata["delegation_summary"] = execution.delegation_summary or execution.execution_summary
2392
+ task_context.metadata["recovery_execution"] = execution
2393
+ self.save_output(execution)
2394
+ return {"outcome": "delegated", "recovery_execution": execution.model_dump()}, task_context
2395
+ if execution.outcome == "blocked":
2396
+ if attempt < 3:
2397
+ diagnosis, plan = _build_diagnosis(
2398
+ item=item,
2399
+ investigation=investigation,
2400
+ prior_execution=execution,
2401
+ attempt=attempt + 1,
2402
+ )
2403
+ task_context.metadata["diagnosis"] = diagnosis
2404
+ task_context.metadata["plan"] = plan
2405
+ continue
2406
+ task_context.metadata["outcome"] = "blocked"
2407
+ task_context.metadata["recovery_execution"] = execution
2408
+ self.save_output(execution)
2409
+ return {"outcome": "blocked", "recovery_execution": execution.model_dump()}, task_context
2410
+ preserve = execution.preserve_failure_context if plan is None else plan.preserve_failure_context
2411
+ if preserve:
2412
+ failure_context = _record_recovery_attempt(item=item, diagnosis=diagnosis, success=False)
2413
+ item.failure_context = failure_context
2414
+ _persist_queue_failure_context(store=store, item=item, failure_context=failure_context)
2415
+ replay_metadata = _build_story_replay_metadata(item=item, diagnosis=diagnosis, execution=execution)
2416
+ if item.queue_type == "scope":
2417
+ row = store.retry_scope_queue_item(project_id=event.project_id, scope_queue_id=item.item_id, preserve_failure_context=preserve)
2418
+ elif item.queue_type == "idea_creation":
2419
+ row = store.retry_idea_creation_queue_item(project_id=event.project_id, idea_creation_queue_id=item.item_id, preserve_failure_context=preserve)
2420
+ elif item.queue_type == "idea":
2421
+ row = store.retry_idea_queue_item(project_id=event.project_id, idea_queue_id=item.item_id, preserve_failure_context=preserve)
2422
+ elif item.queue_type == "integration":
2423
+ row = store.retry_integration_queue_item(project_id=event.project_id, integration_queue_id=item.item_id, preserve_failure_context=preserve)
2424
+ else:
2425
+ row = store.retry_story_queue_item(
2426
+ project_id=event.project_id,
2427
+ story_queue_id=item.item_id,
2428
+ preserve_failure_context=preserve,
2429
+ replay_metadata=replay_metadata,
2430
+ )
2431
+ failure_context = item.failure_context if preserve else None
2432
+ if isinstance(failure_context, dict) and replay_metadata is not None:
2433
+ failure_context = dict(failure_context)
2434
+ failure_context["replay"] = replay_metadata
2435
+ last_reenqueue = ReenqueueArtifact(
2436
+ queue_type=item.queue_type,
2437
+ item_id=item.item_id,
2438
+ status=str(row.get("status") or "queued"),
2439
+ failure_context=failure_context,
2440
+ replay_metadata=replay_metadata,
2441
+ )
2442
+ task_context.metadata["reenqueue"] = last_reenqueue
2443
+ _publish_node(item.dfs_project_id or event.project_id, run_id, "Verifying recovery", recovery_id=item.item_id)
2444
+ verified, _verify_env = run_agent_step(
2445
+ repo_root=repo_root,
2446
+ stage_name="recovery_execution_verification",
2447
+ output_model=PreReplayCheckArtifact,
2448
+ context_payload={
2449
+ "failed_item": item.model_dump(),
2450
+ "diagnosis": None if diagnosis is None else diagnosis.model_dump(),
2451
+ "success_criteria": [] if diagnosis is None else [c.model_dump() for c in diagnosis.verification_targets],
2452
+ "execution": execution.model_dump(),
2453
+ "reenqueue": last_reenqueue.model_dump(),
2454
+ "attempt": attempt,
2455
+ },
2456
+ guidance=load_agentic_prompt_lines("recovery_execution_verification"),
2457
+ timeout_seconds=300,
2458
+ strength=_CURRENT_STRENGTH,
2459
+ )
2460
+ last_verified = verified
2461
+ task_context.metadata["pre_replay"] = verified
2462
+ if _verification_allows_reenqueue(execution=execution, verified=verified, diagnosis=diagnosis):
2463
+ if preserve:
2464
+ failure_context = _record_recovery_attempt(item=item, diagnosis=diagnosis, success=True)
2465
+ item.failure_context = failure_context
2466
+ _persist_queue_failure_context(store=store, item=item, failure_context=failure_context)
2467
+ task_context.metadata["outcome"] = "reenqueued"
2468
+ task_context.metadata["recovery_execution"] = execution
2469
+ self.save_output(execution)
2470
+ return {"outcome": "reenqueued", "recovery_execution": execution.model_dump(), "pre_replay": verified.model_dump()}, task_context
2471
+ if attempt < 3:
2472
+ diagnosis, plan = _build_diagnosis(
2473
+ item=item,
2474
+ investigation=investigation,
2475
+ prior_execution=execution,
2476
+ prior_verification=verified,
2477
+ attempt=attempt + 1,
2478
+ )
2479
+ task_context.metadata["diagnosis"] = diagnosis
2480
+ task_context.metadata["plan"] = plan
2481
+ task_context.metadata["outcome"] = "blocked"
2482
+ artifact = last_execution or RecoveryExecutionArtifact(queue_type=item.queue_type, item_id=item.item_id, outcome="blocked", execution_summary="Recovery attempts exhausted", attempts_used=3)
2483
+ task_context.metadata["recovery_execution"] = artifact
2484
+ if last_verified is not None:
2485
+ task_context.metadata["pre_replay"] = last_verified
2486
+ self.save_output(artifact)
2487
+ return {"outcome": "blocked", "recovery_execution": artifact.model_dump(), "pre_replay": None if last_verified is None else last_verified.model_dump()}, task_context
2488
+ return _persist_node(node_id="recovery_execution", node_name="AgenticRecoveryExecution", fn=_run)
2489
+
2490
+
2491
class PublishRecoveryStateNode(Node):
    """Final workflow node: publish the recovery outcome and record the result artifact."""

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Publish the outcome for the failed item and store the result in ``task_context.metadata``.

        Reads ``failed_item``, ``investigation``, ``plan``, ``reenqueue``,
        ``recovery_execution``, ``outcome``, ``pre_replay`` and ``diagnosis``
        from the metadata, and writes ``result``, ``message`` and ``exit_code``
        (plus an updated ``recovery_execution`` when a handoff artifact path
        was returned).
        """

        def _primary_block_reason(pre_replay, recovery_execution, investigation, plan) -> str:
            # First matching source wins, in this fixed precedence order.
            if pre_replay is not None and pre_replay.blocking_reasons:
                return "; ".join(pre_replay.blocking_reasons)
            if recovery_execution is not None and recovery_execution.verification_summary:
                return recovery_execution.verification_summary
            if investigation is not None and investigation.summary:
                return investigation.summary
            if plan is not None and plan.summary:
                return plan.summary
            return "recovery blocked"

        def _run(_node_exec_id: str):
            event = task_context.event
            _store, run_id = _store_run()
            meta = task_context.metadata
            item: FailedQueueItemArtifact = meta["failed_item"]
            # Status updates go to the item's dfs project when set, else the event's project.
            target_project = item.dfs_project_id or event.project_id
            _publish_node(target_project, run_id, "Publishing outcome", recovery_id=item.item_id)

            investigation: RecoveryInvestigationArtifact | None = meta.get("investigation")
            plan: RemediationPlanArtifact | None = meta.get("plan")
            reenqueue: ReenqueueArtifact | None = meta.get("reenqueue")
            recovery_execution: RecoveryExecutionArtifact | None = meta.get("recovery_execution")
            outcome = str(meta.get("outcome") or "blocked")
            pre_replay: PreReplayCheckArtifact | None = meta.get("pre_replay")

            if outcome in ("reenqueued", "delegated"):
                # Both success-like outcomes publish the same idle/completed status.
                if outcome == "reenqueued":
                    summary = f"Recovery re-enqueued failed {item.queue_type} item {item.item_id}."
                else:
                    summary = str(
                        meta.get("delegation_summary")
                        or f"Delegated {item.queue_type} item {item.item_id} to code error recovery."
                    )
                _publish(target_project, run_id, "idle", "completed", summary, recovery_id=item.item_id)
                exit_code = 0
            else:
                # Any other outcome value is reported as blocked.
                reason = _primary_block_reason(pre_replay, recovery_execution, investigation, plan)
                if investigation is not None and investigation.non_convergence is not None:
                    stall_reason = investigation.non_convergence.reason
                    reason = f"{reason}; non-convergence: {stall_reason}" if reason else stall_reason
                summary = f"Recovery blocked for failed {item.queue_type} item {item.item_id}: {reason}"
                _publish(target_project, run_id, "failed", "blocked", summary, reason, recovery_id=item.item_id)
                exit_code = 2

            diagnosis: RecoveryDiagnosisArtifact | None = meta.get("diagnosis")
            handoff_path = _persist_recovery_handoff_artifact(
                repo_root=Path(str(event.repo_root)),
                recovery_run_id=run_id,
                item=item,
                investigation=investigation,
                diagnosis=diagnosis,
                execution=recovery_execution,
                pre_replay=pre_replay,
            )
            handoff_location = None if handoff_path is None else str(handoff_path)
            if recovery_execution is not None and handoff_location is not None:
                # Attach the handoff location to a copy of the execution artifact.
                recovery_execution = recovery_execution.model_copy(
                    update={
                        "recovery_handoff_artifact_path": handoff_location,
                        "recovery_handoff_summary": f"Persisted compact recovery handoff at {handoff_path}",
                    }
                )
                meta["recovery_execution"] = recovery_execution

            artifact = RecoveryOutcomeArtifact(
                queue_type=item.queue_type,
                item_id=item.item_id,
                project_id=item.project_id,
                outcome=outcome,  # type: ignore[arg-type]
                summary=summary,
                investigation=investigation,
                plan=plan,
                reenqueue=reenqueue,
                recovery_handoff_artifact_path=handoff_location,
            )
            dumped = artifact.model_dump()
            meta["result"] = dumped
            meta["message"] = json.dumps(dumped, sort_keys=True)
            meta["exit_code"] = exit_code
            self.save_output(artifact)
            return {"outcome": outcome, "summary": summary, "exit_code": exit_code}, task_context

        return _persist_node(node_id="publish_recovery_state", node_name="PublishRecoveryState", fn=_run)
2560
+
2561
+
2562
class FailureRecoveryWorkflow(Workflow):
    """Workflow definition wiring the failure-recovery nodes into a DAG.

    Starts at ``LoadFailedQueueItemNode`` and always terminates at
    ``PublishRecoveryStateNode``, which has no outgoing connections.
    """

    workflow_schema = WorkflowSchema(
        description="Post-queue-drain failure recovery DAG — systemic-aware",
        event_schema=FailureRecoveryDagEvent,
        start=LoadFailedQueueItemNode,
        nodes=[
            # Shared prefix: load -> pattern analysis -> investigation -> router.
            NodeConfig(node=LoadFailedQueueItemNode, connections=[SystemicPatternAnalysisNode]),
            NodeConfig(node=SystemicPatternAnalysisNode, connections=[AgenticFailureInvestigationNode]),
            NodeConfig(node=AgenticFailureInvestigationNode, connections=[SystemicVsIsolatedRouter]),
            # The router may go straight to publishing or pick one of the two chains below.
            NodeConfig(
                node=SystemicVsIsolatedRouter,
                connections=[
                    PublishRecoveryStateNode,
                    RootCauseCodeInvestigationNode,
                    AgenticRecoveryDiagnosisNode,
                ],
                is_router=True,
            ),
            # Chain 1: root-cause investigation -> remediation -> bulk re-enqueue -> publish.
            NodeConfig(node=RootCauseCodeInvestigationNode, connections=[RemediationExecutionNode]),
            NodeConfig(node=RemediationExecutionNode, connections=[BulkReenqueueNode]),
            NodeConfig(node=BulkReenqueueNode, connections=[PublishRecoveryStateNode]),
            # Chain 2: per-item diagnosis -> recovery execution -> publish.
            NodeConfig(node=AgenticRecoveryDiagnosisNode, connections=[AgenticRecoveryExecutionNode]),
            NodeConfig(node=AgenticRecoveryExecutionNode, connections=[PublishRecoveryStateNode]),
            # Terminal node.
            NodeConfig(node=PublishRecoveryStateNode, connections=[]),
        ],
    )
2580
+
2581
+
2582
def run_failure_recovery_dag(
    *,
    repo_root: Path,
    store: ExecutionStore,
    project_id: str,
    queue_type: str,
    item_id: str,
    run_id: str | None = None,
    strength: str | None = None,
) -> FailureRecoveryDagResult:
    """Run ``FailureRecoveryWorkflow`` for a single failed queue item.

    Args:
        repo_root: Repository root handed to the workflow event and exposed to
            the nodes through ``_CURRENT_REPO_ROOT``.
        store: Execution store used to create/finish the run and exposed to
            the nodes through ``_CURRENT_STORE``.
        project_id: Project the failed item belongs to.
        queue_type: Queue the failed item came from.
        item_id: Identifier of the failed queue item.
        run_id: Existing run to attach to. When ``None`` a new run is created
            and started here, and this function also marks it finished.
        strength: Value exposed to the nodes through ``_CURRENT_STRENGTH``.

    Returns:
        A ``FailureRecoveryDagResult`` carrying the exit code, run id, outcome
        dict and message taken from the workflow context metadata.

    Raises:
        Exception: Any exception raised by the workflow is re-raised after the
            failure has been published (and the run marked failed when owned).
    """
    global _CURRENT_STORE, _CURRENT_RUN_ID, _CURRENT_STRENGTH, _CURRENT_REPO_ROOT

    owns_run = run_id is None
    if run_id is None:
        run_id = store.create_run(
            dag_id=DAG_ID,
            dag_version="v2",
            root_correlation_id=f"corr_recovery_{queue_type}_{item_id}",
            config={"project_id": project_id, "queue_type": queue_type, "item_id": item_id},
        )
        store.mark_run_started(run_id=run_id)
    _publish(project_id, run_id, "running", "processing", f"Recovering failed {queue_type} queue item", recovery_id=item_id)
    wf = FailureRecoveryWorkflow()

    # Snapshot whatever context is currently installed so a nested or
    # re-entrant invocation restores the outer run's context instead of
    # wiping it. ``globals().get`` keeps this safe if a name is not bound yet.
    module_globals = globals()
    previous_context = (
        module_globals.get("_CURRENT_STORE"),
        module_globals.get("_CURRENT_RUN_ID"),
        module_globals.get("_CURRENT_STRENGTH"),
        module_globals.get("_CURRENT_REPO_ROOT"),
    )
    _CURRENT_STORE = store
    _CURRENT_RUN_ID = run_id
    _CURRENT_STRENGTH = strength
    _CURRENT_REPO_ROOT = repo_root
    try:
        ctx = wf.run({"repo_root": str(repo_root), "project_id": project_id, "queue_type": queue_type, "item_id": item_id})
    except Exception as exc:
        # Only finish runs this function created; callers own the ones they pass in.
        if owns_run:
            store.mark_run_finished(run_id=run_id, status="failed")
        _publish(project_id, run_id, "failed", "failed", f"Recovery failed for {queue_type}:{item_id}", str(exc), recovery_id=item_id)
        raise
    finally:
        # Restore the previous context (all ``None`` for a top-level call).
        _CURRENT_STORE, _CURRENT_RUN_ID, _CURRENT_STRENGTH, _CURRENT_REPO_ROOT = previous_context

    exit_code = int(ctx.metadata.get("exit_code") or 0)
    if owns_run:
        store.mark_run_finished(run_id=run_id, status="succeeded" if exit_code == 0 else "failed")
    return FailureRecoveryDagResult(
        exit_code=exit_code,
        run_id=run_id,
        outcome=dict(ctx.metadata.get("result") or {}),
        message=str(ctx.metadata.get("message") or ""),
    )