devflow-engine 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393)
  1. devflow_engine/__init__.py +3 -0
  2. devflow_engine/agentic_prompts.py +100 -0
  3. devflow_engine/agentic_runtime.py +398 -0
  4. devflow_engine/api_key_flow_harness.py +539 -0
  5. devflow_engine/api_keys.py +357 -0
  6. devflow_engine/bootstrap/__init__.py +2 -0
  7. devflow_engine/bootstrap/provision_from_template.py +84 -0
  8. devflow_engine/cli/__init__.py +0 -0
  9. devflow_engine/cli/app.py +7270 -0
  10. devflow_engine/core/__init__.py +0 -0
  11. devflow_engine/core/config.py +86 -0
  12. devflow_engine/core/logging.py +29 -0
  13. devflow_engine/core/paths.py +45 -0
  14. devflow_engine/core/toml_kv.py +33 -0
  15. devflow_engine/devflow_event_worker.py +1292 -0
  16. devflow_engine/devflow_state.py +201 -0
  17. devflow_engine/devin2/__init__.py +9 -0
  18. devflow_engine/devin2/agent_definition.py +120 -0
  19. devflow_engine/devin2/pi_runner.py +204 -0
  20. devflow_engine/devin_orchestration.py +69 -0
  21. devflow_engine/docs/prompts/anti-patterns.md +42 -0
  22. devflow_engine/docs/prompts/devin-agent-prompt.md +55 -0
  23. devflow_engine/docs/prompts/devin2-agent-prompt.md +81 -0
  24. devflow_engine/docs/prompts/examples/devin-vapi-clone-reference-exchange.json +85 -0
  25. devflow_engine/doctor/__init__.py +2 -0
  26. devflow_engine/doctor/triage.py +140 -0
  27. devflow_engine/error/__init__.py +0 -0
  28. devflow_engine/error/remediation.py +21 -0
  29. devflow_engine/errors/error_solver_dag.py +522 -0
  30. devflow_engine/errors/runtime_observability.py +67 -0
  31. devflow_engine/idea/__init__.py +4 -0
  32. devflow_engine/idea/actors.py +481 -0
  33. devflow_engine/idea/agentic.py +465 -0
  34. devflow_engine/idea/analyze.py +93 -0
  35. devflow_engine/idea/devin_chat_dag.py +1 -0
  36. devflow_engine/idea/diff.py +99 -0
  37. devflow_engine/idea/drafts.py +446 -0
  38. devflow_engine/idea/idea_creation_dag.py +643 -0
  39. devflow_engine/idea/ideation_enrichment.py +355 -0
  40. devflow_engine/idea/ideation_enrichment_worker.py +19 -0
  41. devflow_engine/idea/paths.py +28 -0
  42. devflow_engine/idea/promote.py +53 -0
  43. devflow_engine/idea/redaction.py +27 -0
  44. devflow_engine/idea/repo_tools.py +1277 -0
  45. devflow_engine/idea/response_mode.py +30 -0
  46. devflow_engine/idea/story_pipeline.py +1585 -0
  47. devflow_engine/idea/sufficiency.py +376 -0
  48. devflow_engine/idea/traditional_stories.py +1257 -0
  49. devflow_engine/implementation/__init__.py +0 -0
  50. devflow_engine/implementation/alembic_preflight.py +700 -0
  51. devflow_engine/implementation/dag.py +8450 -0
  52. devflow_engine/implementation/green_gate.py +93 -0
  53. devflow_engine/implementation/prompts.py +108 -0
  54. devflow_engine/implementation/test_runtime.py +623 -0
  55. devflow_engine/integration/__init__.py +19 -0
  56. devflow_engine/integration/agentic.py +66 -0
  57. devflow_engine/integration/dag.py +3539 -0
  58. devflow_engine/integration/prompts.py +114 -0
  59. devflow_engine/integration/supabase_schema.sql +31 -0
  60. devflow_engine/integration/supabase_sync.py +177 -0
  61. devflow_engine/llm/__init__.py +1 -0
  62. devflow_engine/llm/cli_one_shot.py +84 -0
  63. devflow_engine/llm/cli_stream.py +371 -0
  64. devflow_engine/llm/execution_context.py +26 -0
  65. devflow_engine/llm/invoke.py +1322 -0
  66. devflow_engine/llm/provider_api.py +304 -0
  67. devflow_engine/llm/repo_knowledge.py +588 -0
  68. devflow_engine/llm_primitives.py +315 -0
  69. devflow_engine/orchestration.py +62 -0
  70. devflow_engine/planning/__init__.py +0 -0
  71. devflow_engine/planning/analyze_repo.py +92 -0
  72. devflow_engine/planning/render_drafts.py +133 -0
  73. devflow_engine/playground/__init__.py +0 -0
  74. devflow_engine/playground/hooks.py +26 -0
  75. devflow_engine/playwright_workflow/__init__.py +5 -0
  76. devflow_engine/playwright_workflow/dag.py +1317 -0
  77. devflow_engine/process/__init__.py +5 -0
  78. devflow_engine/process/dag.py +59 -0
  79. devflow_engine/project_registration/__init__.py +3 -0
  80. devflow_engine/project_registration/dag.py +1581 -0
  81. devflow_engine/project_registry.py +109 -0
  82. devflow_engine/prompts/devin/generic/prompt.md +6 -0
  83. devflow_engine/prompts/devin/ideation/prompt.md +263 -0
  84. devflow_engine/prompts/devin/ideation/scenarios.md +5 -0
  85. devflow_engine/prompts/devin/ideation_loop/prompt.md +6 -0
  86. devflow_engine/prompts/devin/insight/prompt.md +11 -0
  87. devflow_engine/prompts/devin/insight/scenarios.md +5 -0
  88. devflow_engine/prompts/devin/intake/prompt.md +15 -0
  89. devflow_engine/prompts/devin/iterate/prompt.md +12 -0
  90. devflow_engine/prompts/devin/shared/eval_doctrine.md +9 -0
  91. devflow_engine/prompts/devin/shared/principles.md +246 -0
  92. devflow_engine/prompts/devin_eval/assessment/prompt.md +18 -0
  93. devflow_engine/prompts/idea/api_ideation_agent/prompt.md +8 -0
  94. devflow_engine/prompts/idea/api_insight_agent/prompt.md +8 -0
  95. devflow_engine/prompts/idea/response_doctrine/prompt.md +18 -0
  96. devflow_engine/prompts/implementation/dependency_assessment/prompt.md +12 -0
  97. devflow_engine/prompts/implementation/green/green/prompt.md +11 -0
  98. devflow_engine/prompts/implementation/green/node_config/prompt.md +3 -0
  99. devflow_engine/prompts/implementation/green_review/outcome_review/prompt.md +5 -0
  100. devflow_engine/prompts/implementation/green_review/prior_run_review/prompt.md +5 -0
  101. devflow_engine/prompts/implementation/red/prompt.md +27 -0
  102. devflow_engine/prompts/implementation/redreview/prompt.md +23 -0
  103. devflow_engine/prompts/implementation/redreview_repair/prompt.md +16 -0
  104. devflow_engine/prompts/implementation/setupdoc/prompt.md +10 -0
  105. devflow_engine/prompts/implementation/story_planning/prompt.md +13 -0
  106. devflow_engine/prompts/implementation/test_design/prompt.md +27 -0
  107. devflow_engine/prompts/integration/README.md +185 -0
  108. devflow_engine/prompts/integration/green/example.md +67 -0
  109. devflow_engine/prompts/integration/green/green/prompt.md +10 -0
  110. devflow_engine/prompts/integration/green/node_config/prompt.md +42 -0
  111. devflow_engine/prompts/integration/green/past_prompts/20260417T212300/green/prompt.md +15 -0
  112. devflow_engine/prompts/integration/green/past_prompts/20260417T212300/node_config/prompt.md +42 -0
  113. devflow_engine/prompts/integration/green_enrich/example.md +79 -0
  114. devflow_engine/prompts/integration/green_enrich/green_enrich/prompt.md +9 -0
  115. devflow_engine/prompts/integration/green_enrich/node_config/prompt.md +41 -0
  116. devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/green_enrich/prompt.md +14 -0
  117. devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/node_config/prompt.md +41 -0
  118. devflow_engine/prompts/integration/red/code_repair/prompt.md +12 -0
  119. devflow_engine/prompts/integration/red/example.md +152 -0
  120. devflow_engine/prompts/integration/red/node_config/prompt.md +86 -0
  121. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
  122. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/node_config/prompt.md +84 -0
  123. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red/prompt.md +16 -0
  124. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red_repair/prompt.md +15 -0
  125. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
  126. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/node_config/prompt.md +84 -0
  127. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/red_repair/prompt.md +11 -0
  128. devflow_engine/prompts/integration/red/red/prompt.md +11 -0
  129. devflow_engine/prompts/integration/red/red_repair/prompt.md +12 -0
  130. devflow_engine/prompts/integration/red_review/example.md +71 -0
  131. devflow_engine/prompts/integration/red_review/node_config/prompt.md +41 -0
  132. devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/node_config/prompt.md +41 -0
  133. devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/red_review/prompt.md +15 -0
  134. devflow_engine/prompts/integration/red_review/red_review/prompt.md +9 -0
  135. devflow_engine/prompts/integration/resolve/example.md +111 -0
  136. devflow_engine/prompts/integration/resolve/node_config/prompt.md +64 -0
  137. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/node_config/prompt.md +64 -0
  138. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_implicated_users/prompt.md +15 -0
  139. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_side_effects/prompt.md +15 -0
  140. devflow_engine/prompts/integration/resolve/resolve_implicated_users/prompt.md +10 -0
  141. devflow_engine/prompts/integration/resolve/resolve_side_effects/prompt.md +10 -0
  142. devflow_engine/prompts/integration/validate/build_idea_acceptance_coverage/prompt.md +12 -0
  143. devflow_engine/prompts/integration/validate/code_repair/prompt.md +13 -0
  144. devflow_engine/prompts/integration/validate/example.md +143 -0
  145. devflow_engine/prompts/integration/validate/node_config/prompt.md +87 -0
  146. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
  147. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/node_config/prompt.md +67 -0
  148. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_enrich_gate/prompt.md +17 -0
  149. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_repair/prompt.md +16 -0
  150. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
  151. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/node_config/prompt.md +67 -0
  152. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/validate_repair/prompt.md +9 -0
  153. devflow_engine/prompts/integration/validate/validate_enrich_gate/prompt.md +10 -0
  154. devflow_engine/prompts/integration/validate/validate_repair/prompt.md +20 -0
  155. devflow_engine/prompts/integration/write_workflows/example.md +100 -0
  156. devflow_engine/prompts/integration/write_workflows/node_config/prompt.md +44 -0
  157. devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/node_config/prompt.md +44 -0
  158. devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/write_workflows/prompt.md +17 -0
  159. devflow_engine/prompts/integration/write_workflows/write_workflows/prompt.md +11 -0
  160. devflow_engine/prompts/iterate/README.md +7 -0
  161. devflow_engine/prompts/iterate/coder/prompt.md +11 -0
  162. devflow_engine/prompts/iterate/framer/prompt.md +11 -0
  163. devflow_engine/prompts/iterate/iterator/prompt.md +13 -0
  164. devflow_engine/prompts/iterate/observer/prompt.md +11 -0
  165. devflow_engine/prompts/recovery/diagnosis/prompt.md +7 -0
  166. devflow_engine/prompts/recovery/execution/prompt.md +8 -0
  167. devflow_engine/prompts/recovery/execution_verification/prompt.md +7 -0
  168. devflow_engine/prompts/recovery/failure_investigation/prompt.md +10 -0
  169. devflow_engine/prompts/recovery/preflight_health_repo_repair/prompt.md +8 -0
  170. devflow_engine/prompts/recovery/remediation_execution/prompt.md +11 -0
  171. devflow_engine/prompts/recovery/root_cause_investigation/prompt.md +12 -0
  172. devflow_engine/prompts/scope_idea/doctrine/prompt.md +7 -0
  173. devflow_engine/prompts/source_doc_eval/document/prompt.md +6 -0
  174. devflow_engine/prompts/source_doc_eval/targeted_mutation/prompt.md +9 -0
  175. devflow_engine/prompts/source_doc_mutation/domain_entities/prompt.md +6 -0
  176. devflow_engine/prompts/source_doc_mutation/product_brief/prompt.md +6 -0
  177. devflow_engine/prompts/source_doc_mutation/project_doc_coherence/prompt.md +7 -0
  178. devflow_engine/prompts/source_doc_mutation/project_doc_render/prompt.md +9 -0
  179. devflow_engine/prompts/source_doc_mutation/source_doc_coherence/prompt.md +5 -0
  180. devflow_engine/prompts/source_doc_mutation/source_doc_enrichment_coherence/prompt.md +6 -0
  181. devflow_engine/prompts/source_doc_mutation/user_workflows/prompt.md +6 -0
  182. devflow_engine/prompts/source_scope/doctrine/prompt.md +10 -0
  183. devflow_engine/prompts/ui_grounding/doctrine/prompt.md +7 -0
  184. devflow_engine/recovery/__init__.py +3 -0
  185. devflow_engine/recovery/dag.py +2609 -0
  186. devflow_engine/recovery/models.py +220 -0
  187. devflow_engine/refactor.py +93 -0
  188. devflow_engine/registry/__init__.py +1 -0
  189. devflow_engine/registry/cards.py +238 -0
  190. devflow_engine/registry/domain_normalize.py +60 -0
  191. devflow_engine/registry/effects.py +65 -0
  192. devflow_engine/registry/enforce_report.py +150 -0
  193. devflow_engine/registry/module_cards_classify.py +164 -0
  194. devflow_engine/registry/module_cards_draft.py +184 -0
  195. devflow_engine/registry/module_cards_gate.py +59 -0
  196. devflow_engine/registry/packages.py +347 -0
  197. devflow_engine/registry/pathways.py +323 -0
  198. devflow_engine/review/__init__.py +11 -0
  199. devflow_engine/review/dag.py +588 -0
  200. devflow_engine/review/review_story.py +67 -0
  201. devflow_engine/scope_idea/__init__.py +3 -0
  202. devflow_engine/scope_idea/agentic.py +39 -0
  203. devflow_engine/scope_idea/dag.py +1069 -0
  204. devflow_engine/scope_idea/models.py +175 -0
  205. devflow_engine/skills/builtins/devflow/queue_failure_investigation/SKILL.md +112 -0
  206. devflow_engine/skills/builtins/devflow/queue_idea_to_story/SKILL.md +120 -0
  207. devflow_engine/skills/builtins/devflow/queue_integration/SKILL.md +105 -0
  208. devflow_engine/skills/builtins/devflow/queue_recovery/SKILL.md +108 -0
  209. devflow_engine/skills/builtins/devflow/queue_runtime_core/SKILL.md +155 -0
  210. devflow_engine/skills/builtins/devflow/queue_story_implementation/SKILL.md +122 -0
  211. devflow_engine/skills/builtins/devin/idea_to_story_handoff/SKILL.md +120 -0
  212. devflow_engine/skills/builtins/devin/ideation/SKILL.md +168 -0
  213. devflow_engine/skills/builtins/devin/ideation/state-and-phrasing-reference.md +18 -0
  214. devflow_engine/skills/builtins/devin/insight/SKILL.md +22 -0
  215. devflow_engine/skills/registry.example.yaml +42 -0
  216. devflow_engine/source_doc_assumptions.py +291 -0
  217. devflow_engine/source_doc_mutation_dag.py +1606 -0
  218. devflow_engine/source_doc_mutation_eval.py +417 -0
  219. devflow_engine/source_doc_mutation_worker.py +25 -0
  220. devflow_engine/source_docs_schema.py +207 -0
  221. devflow_engine/source_docs_updater.py +309 -0
  222. devflow_engine/source_scope/__init__.py +15 -0
  223. devflow_engine/source_scope/agentic.py +45 -0
  224. devflow_engine/source_scope/dag.py +1626 -0
  225. devflow_engine/source_scope/models.py +177 -0
  226. devflow_engine/stores/__init__.py +0 -0
  227. devflow_engine/stores/execution_store.py +3534 -0
  228. devflow_engine/story/__init__.py +0 -0
  229. devflow_engine/story/contracts.py +160 -0
  230. devflow_engine/story/discovery.py +47 -0
  231. devflow_engine/story/evidence.py +118 -0
  232. devflow_engine/story/hashing.py +27 -0
  233. devflow_engine/story/implemented_queue_purge.py +148 -0
  234. devflow_engine/story/indexer.py +105 -0
  235. devflow_engine/story/io.py +20 -0
  236. devflow_engine/story/markdown_contracts.py +298 -0
  237. devflow_engine/story/reconciliation.py +408 -0
  238. devflow_engine/story/validate_stories.py +149 -0
  239. devflow_engine/story/validate_tests_story.py +512 -0
  240. devflow_engine/story/validation.py +133 -0
  241. devflow_engine/ui_grounding/__init__.py +11 -0
  242. devflow_engine/ui_grounding/agentic.py +31 -0
  243. devflow_engine/ui_grounding/dag.py +874 -0
  244. devflow_engine/ui_grounding/models.py +224 -0
  245. devflow_engine/ui_grounding/pencil_bridge.py +247 -0
  246. devflow_engine/vendor/__init__.py +0 -0
  247. devflow_engine/vendor/datalumina_genai/__init__.py +11 -0
  248. devflow_engine/vendor/datalumina_genai/core/__init__.py +0 -0
  249. devflow_engine/vendor/datalumina_genai/core/exceptions.py +9 -0
  250. devflow_engine/vendor/datalumina_genai/core/nodes/__init__.py +0 -0
  251. devflow_engine/vendor/datalumina_genai/core/nodes/agent.py +48 -0
  252. devflow_engine/vendor/datalumina_genai/core/nodes/agent_streaming_node.py +26 -0
  253. devflow_engine/vendor/datalumina_genai/core/nodes/base.py +89 -0
  254. devflow_engine/vendor/datalumina_genai/core/nodes/concurrent.py +30 -0
  255. devflow_engine/vendor/datalumina_genai/core/nodes/router.py +69 -0
  256. devflow_engine/vendor/datalumina_genai/core/schema.py +72 -0
  257. devflow_engine/vendor/datalumina_genai/core/task.py +52 -0
  258. devflow_engine/vendor/datalumina_genai/core/validate.py +139 -0
  259. devflow_engine/vendor/datalumina_genai/core/workflow.py +200 -0
  260. devflow_engine/worker.py +1086 -0
  261. devflow_engine/worker_guard.py +233 -0
  262. devflow_engine-1.0.0.dist-info/METADATA +235 -0
  263. devflow_engine-1.0.0.dist-info/RECORD +393 -0
  264. devflow_engine-1.0.0.dist-info/WHEEL +4 -0
  265. devflow_engine-1.0.0.dist-info/entry_points.txt +3 -0
  266. devin/__init__.py +6 -0
  267. devin/dag.py +58 -0
  268. devin/dag_two_arm.py +138 -0
  269. devin/devin_chat_scenario_catalog.json +588 -0
  270. devin/devin_eval.py +677 -0
  271. devin/nodes/__init__.py +0 -0
  272. devin/nodes/ideation/__init__.py +0 -0
  273. devin/nodes/ideation/node.py +195 -0
  274. devin/nodes/ideation/playground.py +267 -0
  275. devin/nodes/ideation/prompt.md +65 -0
  276. devin/nodes/ideation/scenarios/continue_refinement.py +13 -0
  277. devin/nodes/ideation/scenarios/continue_refinement_evals.py +18 -0
  278. devin/nodes/ideation/scenarios/idea_fits_existing_patterns.py +17 -0
  279. devin/nodes/ideation/scenarios/idea_fits_existing_patterns_evals.py +16 -0
  280. devin/nodes/ideation/scenarios/large_idea_split.py +4 -0
  281. devin/nodes/ideation/scenarios/large_idea_split_evals.py +17 -0
  282. devin/nodes/ideation/scenarios/source_documentation_added.py +4 -0
  283. devin/nodes/ideation/scenarios/source_documentation_added_evals.py +16 -0
  284. devin/nodes/ideation/scenarios/user_says_create_it.py +30 -0
  285. devin/nodes/ideation/scenarios/user_says_create_it_evals.py +23 -0
  286. devin/nodes/ideation/scenarios/vague_idea.py +16 -0
  287. devin/nodes/ideation/scenarios/vague_idea_evals.py +47 -0
  288. devin/nodes/ideation/tools.json +312 -0
  289. devin/nodes/insight/__init__.py +0 -0
  290. devin/nodes/insight/node.py +49 -0
  291. devin/nodes/insight/playground.py +154 -0
  292. devin/nodes/insight/prompt.md +61 -0
  293. devin/nodes/insight/scenarios/architecture_pattern_query.py +15 -0
  294. devin/nodes/insight/scenarios/architecture_pattern_query_evals.py +25 -0
  295. devin/nodes/insight/scenarios/codebase_exploration.py +15 -0
  296. devin/nodes/insight/scenarios/codebase_exploration_evals.py +23 -0
  297. devin/nodes/insight/scenarios/devin_ideation_routing.py +19 -0
  298. devin/nodes/insight/scenarios/devin_ideation_routing_evals.py +39 -0
  299. devin/nodes/insight/scenarios/devin_insight_routing.py +20 -0
  300. devin/nodes/insight/scenarios/devin_insight_routing_evals.py +40 -0
  301. devin/nodes/insight/scenarios/operational_debugging.py +15 -0
  302. devin/nodes/insight/scenarios/operational_debugging_evals.py +23 -0
  303. devin/nodes/insight/scenarios/operational_question.py +9 -0
  304. devin/nodes/insight/scenarios/operational_question_evals.py +8 -0
  305. devin/nodes/insight/scenarios/queue_status.py +15 -0
  306. devin/nodes/insight/scenarios/queue_status_evals.py +23 -0
  307. devin/nodes/insight/scenarios/source_doc_explanation.py +14 -0
  308. devin/nodes/insight/scenarios/source_doc_explanation_evals.py +21 -0
  309. devin/nodes/insight/scenarios/worker_state_check.py +15 -0
  310. devin/nodes/insight/scenarios/worker_state_check_evals.py +22 -0
  311. devin/nodes/insight/tools.json +126 -0
  312. devin/nodes/intake/__init__.py +0 -0
  313. devin/nodes/intake/node.py +27 -0
  314. devin/nodes/intake/playground.py +47 -0
  315. devin/nodes/intake/prompt.md +12 -0
  316. devin/nodes/intake/scenarios/ideation_routing.py +4 -0
  317. devin/nodes/intake/scenarios/ideation_routing_evals.py +5 -0
  318. devin/nodes/intake/scenarios/insight_routing.py +4 -0
  319. devin/nodes/intake/scenarios/insight_routing_evals.py +5 -0
  320. devin/nodes/iterate/README.md +44 -0
  321. devin/nodes/iterate/__init__.py +1 -0
  322. devin/nodes/iterate/_archived_design_stages/01-objectives-requirements.md +112 -0
  323. devin/nodes/iterate/_archived_design_stages/02-evals.md +131 -0
  324. devin/nodes/iterate/_archived_design_stages/03-tools-and-boundaries.md +110 -0
  325. devin/nodes/iterate/_archived_design_stages/04-harness-and-playground.md +32 -0
  326. devin/nodes/iterate/_archived_design_stages/05-prompt-deferred.md +11 -0
  327. devin/nodes/iterate/_archived_design_stages/coder_agent_design/01-objectives-requirements.md +20 -0
  328. devin/nodes/iterate/_archived_design_stages/coder_agent_design/02-evals.md +8 -0
  329. devin/nodes/iterate/_archived_design_stages/coder_agent_design/03-tools-and-boundaries.md +14 -0
  330. devin/nodes/iterate/_archived_design_stages/coder_agent_design/04-harness-and-playground.md +12 -0
  331. devin/nodes/iterate/_archived_design_stages/framer_agent_design/01-objectives-requirements.md +20 -0
  332. devin/nodes/iterate/_archived_design_stages/framer_agent_design/02-evals.md +8 -0
  333. devin/nodes/iterate/_archived_design_stages/framer_agent_design/03-tools-and-boundaries.md +13 -0
  334. devin/nodes/iterate/_archived_design_stages/framer_agent_design/04-harness-and-playground.md +12 -0
  335. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/01-objectives-requirements.md +25 -0
  336. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/02-evals.md +9 -0
  337. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/03-tools-and-boundaries.md +14 -0
  338. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/04-harness-and-playground.md +12 -0
  339. devin/nodes/iterate/_archived_design_stages/observer_agent_design/01-objectives-requirements.md +20 -0
  340. devin/nodes/iterate/_archived_design_stages/observer_agent_design/02-evals.md +8 -0
  341. devin/nodes/iterate/_archived_design_stages/observer_agent_design/03-tools-and-boundaries.md +14 -0
  342. devin/nodes/iterate/_archived_design_stages/observer_agent_design/04-harness-and-playground.md +13 -0
  343. devin/nodes/iterate/agent-roles.md +89 -0
  344. devin/nodes/iterate/agents/README.md +10 -0
  345. devin/nodes/iterate/artifacts.md +504 -0
  346. devin/nodes/iterate/contract.md +100 -0
  347. devin/nodes/iterate/eval-plan.md +74 -0
  348. devin/nodes/iterate/node.py +100 -0
  349. devin/nodes/iterate/pipeline/README.md +13 -0
  350. devin/nodes/iterate/playground-contract.md +76 -0
  351. devin/nodes/iterate/prompt.md +11 -0
  352. devin/nodes/iterate/scenarios/README.md +38 -0
  353. devin/nodes/iterate/scenarios/artifact-and-loop-scenarios.md +101 -0
  354. devin/nodes/iterate/scenarios/coder_artifact_alignment.py +32 -0
  355. devin/nodes/iterate/scenarios/coder_artifact_alignment_evals.py +45 -0
  356. devin/nodes/iterate/scenarios/coder_bounded_fix.py +27 -0
  357. devin/nodes/iterate/scenarios/coder_bounded_fix_evals.py +45 -0
  358. devin/nodes/iterate/scenarios/devin_iterate_routing.py +21 -0
  359. devin/nodes/iterate/scenarios/devin_iterate_routing_evals.py +36 -0
  360. devin/nodes/iterate/scenarios/framer_scope_boundary.py +25 -0
  361. devin/nodes/iterate/scenarios/framer_scope_boundary_evals.py +57 -0
  362. devin/nodes/iterate/scenarios/framer_task_framing.py +25 -0
  363. devin/nodes/iterate/scenarios/framer_task_framing_evals.py +58 -0
  364. devin/nodes/iterate/scenarios/iterate_error_fix.py +21 -0
  365. devin/nodes/iterate/scenarios/iterate_error_fix_evals.py +39 -0
  366. devin/nodes/iterate/scenarios/iterate_quick_change.py +21 -0
  367. devin/nodes/iterate/scenarios/iterate_quick_change_evals.py +35 -0
  368. devin/nodes/iterate/scenarios/iterate_to_idea_promotion.py +23 -0
  369. devin/nodes/iterate/scenarios/iterate_to_idea_promotion_evals.py +53 -0
  370. devin/nodes/iterate/scenarios/iterate_to_insight_reroute.py +23 -0
  371. devin/nodes/iterate/scenarios/iterate_to_insight_reroute_evals.py +53 -0
  372. devin/nodes/iterate/scenarios/observer_evidence_seam.py +28 -0
  373. devin/nodes/iterate/scenarios/observer_evidence_seam_evals.py +55 -0
  374. devin/nodes/iterate/scenarios/observer_repro_creation.py +28 -0
  375. devin/nodes/iterate/scenarios/observer_repro_creation_evals.py +45 -0
  376. devin/nodes/iterate/scenarios/routing-matrix.md +45 -0
  377. devin/nodes/shared/__init__.py +0 -0
  378. devin/nodes/shared/filemaker_expert.md +80 -0
  379. devin/nodes/shared/filemaker_expert.py +354 -0
  380. devin/nodes/shared/filemaker_expert_eval/runner.py +176 -0
  381. devin/nodes/shared/filemaker_expert_eval/scenarios.json +65 -0
  382. devin/nodes/shared/goldilocks_advisor_eval/runner.py +214 -0
  383. devin/nodes/shared/goldilocks_advisor_eval/scenarios.json +58 -0
  384. devin/nodes/shared/helpers.py +156 -0
  385. devin/nodes/shared/idea_compliance_advisor_eval/runner.py +252 -0
  386. devin/nodes/shared/idea_compliance_advisor_eval/scenarios.json +75 -0
  387. devin/nodes/shared/models.py +44 -0
  388. devin/nodes/shared/post.py +40 -0
  389. devin/nodes/shared/router.py +107 -0
  390. devin/nodes/shared/tools.py +191 -0
  391. devin/shared/devin-chat-rubric.md +237 -0
  392. devin/shared/devin-chat-scenario-suite.md +90 -0
  393. devin/shared/eval_doctrine.md +9 -0
@@ -0,0 +1,100 @@
1
+ # Iterate arm contract
2
+
3
+ ## Purpose
4
+
5
+ `Iterate` is the Devin chat arm for hyper-specific fixes, quick changes, and targeted improvements against an existing surface.
6
+
7
+ It fills the gap between:
8
+ - `Idea`, which shapes planning truth for broader downstream work
9
+ - `Insight`, which stays read-only and explanatory
10
+
11
+ ## Recommended route set
12
+
13
+ Top-level intake routing should become:
14
+ - `idea`
15
+ - `insight`
16
+ - `iterate`
17
+ - `neither`
18
+
19
+ ## Route into Iterate when
20
+
21
+ The user is asking for a bounded change against an existing surface, for example:
22
+ - fixing a concrete error
23
+ - making a small behavior change
24
+ - tweaking a page, component, route, or flow
25
+ - improving a narrow interaction without opening broad product planning
26
+
27
+ ## Route away from Iterate when
28
+
29
+ ### To `insight`
30
+ - the user only wants explanation, diagnosis, or investigation
31
+ - no implementation path is being requested yet
32
+
33
+ ### To `idea`
34
+ - the request is broader feature or workflow planning
35
+ - the change is no longer task-scale
36
+ - success depends on new planning truth rather than a targeted delta
37
+
38
+ ## Core objective
39
+
40
+ Own a targeted request from framing through verified completion, or stop with an honest blocked or promotion verdict.
41
+
42
+ ## Primary objectives
43
+
44
+ 1. Convert the request into a precise task artifact.
45
+ 2. Ground the task in evidence, repro, or a red verification seam.
46
+ 3. Supervise implementation through a bounded coding loop.
47
+ 4. Keep scope tight and resist turning task work into ideation theater.
48
+ 5. Preserve ownership boundaries: Iterator owns truth and validation, Coder owns implementation attempts.
49
+
50
+ ## Derived non-goals
51
+
52
+ - Do not behave like broad ideation intake.
53
+ - Do not stay read-only when the user clearly wants a fix or change.
54
+ - Do not claim reproducibility without evidence.
55
+ - Do not claim completion without scoped verification.
56
+ - Do not broaden scope without explicit approval.
57
+ - Do not let the implementation worker redefine the task contract.
58
+
59
+ ## Orchestration pattern
60
+
61
+ Preferred pattern: advisor-primary with one supervised coding subagent.
62
+
63
+ - Primary: `Iterator`
64
+ - Advisors: `Framer`, `Observer`
65
+ - Worker subagent: `Coder`
66
+
67
+ This should remain a simple accountable structure, not a peer swarm.
68
+
69
+ ## Pipeline order for this node
70
+
71
+ The iterate lane should be designed in this order:
72
+ 1. objectives and requirements
73
+ 2. evals
74
+ 3. tools and boundaries
75
+ 4. harness and playground
76
+ 5. prompt content only after the first four are stable
77
+
78
+ Cross-agent stage docs live in `pipeline/`.
79
+ Per-agent stage docs live in `agents/<agent>/`.
80
+
81
+ ## Verification loop
82
+
83
+ 1. Framer produces the task artifact.
84
+ 2. Observer produces the observation artifact.
85
+ 3. Iterator decides whether truth is sufficient to proceed.
86
+ 4. Iterator spawns Coder.
87
+ 5. Coder attempts the scoped delta.
88
+ 6. Iterator validates against the observation seam, success criteria, and scope boundary.
89
+ 7. If the result is not aligned but is repairable, Iterator respawns Coder with repair-specific context.
90
+ 8. If aligned, Iterator returns completion.
91
+ 9. If the work is no longer task-scale, or required truth is missing, Iterator blocks or escalates clearly.
92
+
93
+ ## Completion gate
94
+
95
+ Iterator may return completion only when all of the following are true:
96
+ 1. the confirmed repro no longer reproduces, or the scoped failing seam is green
97
+ 2. success criteria are satisfied
98
+ 3. the implemented change stayed within task scope
99
+ 4. observation evidence and final behavior agree
100
+ 5. no blocker remains that invalidates the claim
@@ -0,0 +1,74 @@
1
+ # Iterate eval plan
2
+
3
+ This file captures the pre-prompt eval targets that should exist before prompt writing or runtime wiring.
4
+
5
+ ## Pipeline stance
6
+
7
+ The eval layer comes after objectives and requirements, but before tool affordances or playground implementation details.
8
+
9
+ For the iterate lane, evals should be readable at three levels:
10
+ - route and lane level
11
+ - per-agent accountability level
12
+ - end-to-end truthful completion level
13
+
14
+ ## Route evals
15
+
16
+ 1. route a concrete fix request into `iterate`
17
+ 2. route an investigation-only request into `insight`
18
+ 3. promote a broader feature or workflow request into `idea`
19
+ 4. keep a small request in `iterate` instead of inflating it
20
+
21
+ ## Framing evals
22
+
23
+ 1. turn messy conversational input into a bounded task artifact
24
+ 2. extract route, page, component, file, or function hints when present
25
+ 3. distinguish current behavior from desired behavior clearly
26
+ 4. write observable success criteria rather than vague aspirations
27
+ 5. surface blocking unknowns when the request is underspecified
28
+
29
+ ## Observation evals
30
+
31
+ 1. confirm a reported error when evidence exists
32
+ 2. report `not_confirmed` honestly when the error cannot be reproduced
33
+ 3. create a bounded red seam for a targeted improvement
34
+ 4. provide repro steps another agent can run
35
+ 5. avoid inventing evidence when logs are silent
36
+ 6. ask for more context when truth is genuinely missing
37
+
38
+ ## Iterator supervision evals
39
+
40
+ 1. refuse to spawn Coder before framing and observation are sufficient
41
+ 2. respawn Coder after a near miss with tighter repair context
42
+ 3. refuse completion when the observation seam is still red
43
+ 4. refuse completion when success criteria are unmet
44
+ 5. block honestly when missing truth prevents safe implementation
45
+ 6. escalate when the task grows past iterate scale
46
+
47
+ ## Coder evals
48
+
49
+ 1. fix a reproducible error without unrelated drift
50
+ 2. satisfy a targeted improvement seam
51
+ 3. report partial progress honestly when the first attempt fails
52
+ 4. stay within the scoped files and surfaces when the task is narrow
53
+ 5. return actionable blocker detail when safe completion is impossible
54
+ 6. improve on a second attempt when respawned with repair context
55
+
56
+ ## Completion truth evals
57
+
58
+ 1. no green claim without a green seam
59
+ 2. no success claim without the requested user-visible outcome
60
+ 3. no completion claim when repro truth was required but never established
61
+ 4. no completion claim after unauthorized scope expansion
62
+
63
+ ## Detailed stage mapping
64
+
65
+ - cross-agent eval design: `pipeline/02-evals.md`
66
+ - agent-specific eval design: `agents/*/02-evals.md`
67
+ - scenario planning inputs: `scenarios/`
68
+
69
+ ## Expected future repo mapping
70
+
71
+ When implementation begins, these eval categories should map cleanly into:
72
+ - intake routing scenarios for the new `iterate` arm
73
+ - iterate scenario fixtures under `src/devin/nodes/iterate/scenarios/`
74
+ - eventual `_evals.py` checks that mirror the existing ideation and insight harness style
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from devflow_engine.devin2.pi_runner import run_devin2_pi_agent
6
+ from devflow_engine.vendor.datalumina_genai.core.nodes.base import Node
7
+ from devflow_engine.vendor.datalumina_genai.core.task import TaskContext
8
+ from devin.nodes.shared.helpers import (
9
+ dfs_node_running,
10
+ load_node_prompt_lines,
11
+ pipeline_root,
12
+ resolve_project_id,
13
+ store_run,
14
+ write_json,
15
+ )
16
+ from devin.nodes.shared.models import DevinAgentResponse
17
+
18
+
19
class IterateAgentNode(Node):
    """Pipeline node that runs the Devin iterate agent and records its response.

    The node registers a node attempt in the run store, invokes the iterate
    agent through ``run_devin2_pi_agent``, writes the structured response to
    ``iterate_response.json`` under the pipeline root, registers that file as
    an artifact, and publishes the response on ``task_context.metadata``.
    """

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the iterate agent for the current event and persist its outcome.

        Args:
            task_context: Carries the triggering ``event`` (read for
                ``repo_root``, ``idea_id``, ``pipeline_key`` and ``raw_text``)
                and a mutable ``metadata`` mapping (read for ``project_id``,
                ``raw_text`` and ``route``).

        Returns:
            The same ``task_context`` with ``metadata['response_guidance']``
            and ``metadata['agent_loop_terminal']`` populated.
        """
        node_id = 'iterate_agent'
        event = task_context.event
        repo_root = Path(event.repo_root)

        store, run_id = store_run()
        node_exec_id = store.create_node_attempt(
            run_id=run_id,
            node_id=node_id,
            node_name='IterateAgent',
            attempt=1,
        )

        # Prefer an explicitly supplied project id; otherwise resolve it from the repo.
        project_id = str(
            task_context.metadata.get('project_id')
            or resolve_project_id(repo_root, idea_id=event.idea_id)
        )
        dfs_node_running(
            project_id=project_id,
            run_id=run_id,
            node_id=node_id,
            summary='Running Devin iterate agent',
            idea_id=event.idea_id,
        )

        # Prompt lines loaded for this node, followed by fixed operational rules.
        guidance = load_node_prompt_lines(__file__) + [
            'Return exactly one truthful outcome kind.',
            'Keep the work task-scale and tied to the existing surface described by the user.',
            'Use available context and artifacts as the only truth source.',
        ]

        session_id = f"iterate:{project_id}:{event.idea_id}"

        context_payload = {
            'idea_id': event.idea_id,
            # Metadata text wins over the event text; fall back to an empty string.
            'current_user_message': str(task_context.metadata.get('raw_text') or event.raw_text or ''),
            'route': task_context.metadata.get('route') or {},
            'project_id': project_id,
            'repo_root': str(repo_root),
            'session_id': session_id,
        }

        # NOTE(review): this call is synchronous inside an async method, and if it
        # raises, the node attempt created above is never marked finished.
        # Confirm whether callers handle failure bookkeeping.
        result = run_devin2_pi_agent(
            repo_root=repo_root,
            stage_name='devin_iterate_response',
            route_arm='iterate',
            context_payload=context_payload,
            operational_guidance=guidance,
            output_model=DevinAgentResponse,
            timeout_seconds=90,
        )
        # Re-validate the dumped response so the fields below come from a
        # DevinAgentResponse instance.
        model = DevinAgentResponse.model_validate(result.response_model.model_dump())

        # Resolve the pipeline directory once; it feeds both the payload and the output path.
        pipeline_dir = pipeline_root(
            repo_root,
            idea_id=event.idea_id,
            pipeline_key=event.pipeline_key,
        )
        response_payload = {
            'idea_id': event.idea_id,
            'pipeline_dir': str(pipeline_dir),
            'response_message': model.response_message,
            'response_kind': model.response_kind,
            'suggested_next_step': model.suggested_next_step,
            'follow_up_questions': model.follow_up_questions,
            'response_style_notes': model.style_notes,
        }

        out_path = pipeline_dir / 'iterate_response.json'
        write_json(out_path, response_payload)
        store.add_artifact(
            run_id=run_id,
            node_exec_id=node_exec_id,
            kind='devin_iterate_response',
            uri=str(out_path),
            metadata={'response_kind': model.response_kind},
        )
        store.mark_node_finished(
            node_exec_id=node_exec_id,
            status='succeeded',
            output=response_payload,
        )

        task_context.metadata['response_guidance'] = response_payload
        # The terminal record is the response payload plus a status mirroring the response kind.
        task_context.metadata['agent_loop_terminal'] = {
            'status': model.response_kind,
            **response_payload,
        }
        return task_context
@@ -0,0 +1,13 @@
1
+ # Iterate design pipeline
2
+
3
+ This folder makes the development order explicit for the iterate lane.
4
+
5
+ Read and review in order:
6
+ 1. `01-objectives-requirements.md`
7
+ 2. `02-evals.md`
8
+ 3. `03-tools-and-boundaries.md`
9
+ 4. `04-harness-and-playground.md`
10
+ 5. `05-prompt-deferred.md`
11
+
12
+ The first four stages are design inputs.
13
+ The fifth is a reminder that prompt authoring is intentionally delayed.
@@ -0,0 +1,76 @@
1
+ # Iterate playground contract
2
+
3
+ A future Iterate playground should validate the arm as a whole, not just isolated prompt wording.
4
+
5
+ ## Placement in the development pipeline
6
+
7
+ Harness and playground design is the fourth stage, after:
8
+ 1. objectives and requirements
9
+ 2. evals
10
+ 3. tools and boundaries
11
+
12
+ This doc stays downstream of those decisions and should not invent them.
13
+
14
+ ## Required scenario coverage
15
+
16
+ - reproducible error with logs
17
+ - non-reproducible reported error
18
+ - quick UI or behavior tweak
19
+ - targeted improvement with explicit success criteria
20
+ - coder first pass fails, second pass succeeds
21
+ - request that should route to `idea`
22
+ - request that should route to `insight`
23
+
24
+ ## What the playground should verify
25
+
26
+ - routing correctness
27
+ - task artifact completeness
28
+ - observation honesty
29
+ - iterator supervision behavior
30
+ - coder respawn behavior
31
+ - completion truthfulness
32
+ - scope discipline
33
+ - readiness state transitions in `iterator_run`
34
+ - monotonic top-level artifact revisions
35
+ - promotion linkage through `promotion_handoff.json` when the lane exits to `idea` or `insight`
36
+ - attempt-scoped verifier artifact references and summaries
37
+ - exact ordinal attempt ids of the form `attempt-001`, `attempt-002`, and so on
38
+ - normalized verifier envelopes that stay parseable across verifier types
39
+
40
+ ## Harness expectations
41
+
42
+ The eventual harness should be able to:
43
+ - load prior conversation turns
44
+ - provide repo root and project context
45
+ - feed a route payload into the iterate arm
46
+ - inspect generated artifacts under `.devflow/iterate/<task_id>/`
47
+ - inspect `iterator_run.run_state` and `iterator_run.readiness`
48
+ - inspect top-level artifact revisions and the exact revisions cited by readiness or promotion decisions
49
+ - inspect `promotion_handoff.json` and its source refs when work leaves iterate
50
+ - inspect attempt-scoped verification summaries and raw verifier outputs separately
51
+ - assert that attempt directories sort in execution order via `attempt-<NNN...>` ids
52
+ - read shared verifier-envelope fields such as `overall_result`, `green_condition_alignment`, and `evidence_refs` without depending on verifier-specific payload structure
53
+ - inspect tool-call or worker-attempt traces
54
+ - evaluate terminal response guidance and disposition
55
+
56
+ ## Fixture expectations
57
+
58
+ Each fixture should make clear:
59
+ - user request
60
+ - relevant prior turns
61
+ - project and repo context
62
+ - expected route outcome
63
+ - expected artifact shape
64
+ - expected readiness or blocker verdict
65
+ - expected completion or escalation behavior
66
+
67
+ ## Per-agent harness hooks
68
+
69
+ - `Iterator` needs supervision and terminal-verdict inspection
70
+ - `Framer` needs task-artifact inspection
71
+ - `Observer` needs evidence and green-condition inspection
72
+ - `Coder` needs attempt-report and narrow-verification inspection
73
+
74
+ ## Guardrail
75
+
76
+ The playground should not treat superficial harness success as proof of real coding quality. It should only verify the bounded contract the Iterate arm claims to satisfy.
@@ -0,0 +1,11 @@
1
+ # Iterator
2
+
3
+ Own the iterate task end to end using only the provided request, artifacts, and evidence, then return one truthful lane outcome.
4
+
5
+ - Synthesize framing, observation, and attempt evidence without collapsing their roles.
6
+ - Decide readiness before coding starts.
7
+ - Keep scope aligned to the task contract.
8
+ - Respawn only with repair-specific guidance when the task is still viable.
9
+ - Block, reroute, or promote when truth or scope is insufficient.
10
+ - Do not rewrite advisor conclusions casually.
11
+ - Do not claim success without verification against the stated green condition.
@@ -0,0 +1,38 @@
1
+ # Iterate scenarios
2
+
3
+ Runnable scenario modules and evals for the Iterate arm.
4
+
5
+ ## Scenario files
6
+
7
+ Each scenario has a `<name>.py` file (the scenario definition) and a matching `<name>_evals.py` file (evaluation criteria and an `evaluate()` function).
8
+
9
+ ### Arm-level scenarios
10
+ - `devin_iterate_routing.py` — routes to iterate correctly
11
+ - `iterate_error_fix.py` — reproducible error fix
12
+ - `iterate_quick_change.py` — targeted behavior change
13
+ - `iterate_to_insight_reroute.py` — reroute to insight via tool call
14
+ - `iterate_to_idea_promotion.py` — promote to idea via tool call
15
+
16
+ ### Subagent scenarios
17
+ - `framer_task_framing.py` — vague request → well-formed task_artifact
18
+ - `framer_scope_boundary.py` — framer keeps scope bounded
19
+ - `observer_evidence_seam.py` — observer identifies failing seam from evidence
20
+ - `observer_repro_creation.py` — observer creates narrow, reproducible repro
21
+ - `coder_bounded_fix.py` — coder implements targeted fix matching framed task
22
+ - `coder_artifact_alignment.py` — coder output aligns with observation_artifact
23
+
24
+ ## Running scenarios
25
+
26
+ ```bash
27
+ # Run all scenarios
28
+ /Users/devflow/repos/devflow_engine/.venv/bin/python3 \
29
+ /Users/devflow/repos/devflow_engine/playground/iterate_arm_playground.py
30
+
31
+ # Run specific scenario
32
+ /Users/devflow/repos/devflow_engine/.venv/bin/python3 \
33
+ /Users/devflow/repos/devflow_engine/playground/iterate_arm_playground.py \
34
+ --scenario-name <name>
35
+
36
+ # Run via shell runner
37
+ bash /Users/devflow/repos/devflow_engine/playground/run_iterate_scenarios.sh
38
+ ```
@@ -0,0 +1,101 @@
1
+ # Iterate artifact and loop scenarios
2
+
3
+ ## Framing scenarios
4
+
5
+ ### messy_error_report_with_partial_location
6
+ Expected checks:
7
+ - Framer derives `task_type=error_fix`
8
+ - captures partial surface or route hints
9
+ - separates facts from assumptions
10
+ - records blocking unknowns if needed
11
+ - uses the shared base `task_artifact` shape with `task_details.error_fix`
12
+
13
+ ### tiny_copy_or_behavior_tweak
14
+ Expected checks:
15
+ - Framer keeps scope small
16
+ - success criteria stay observable
17
+ - no inflation into broader planning
18
+ - uses `task_details.quick_change` rather than inventing a separate task schema
19
+
20
+ ### underspecified_but_repairable_request
21
+ Expected checks:
22
+ - Framer recommends `investigate_first`
23
+ - task artifact stays honest about unknowns
24
+ - Iterator does not move to `ready_for_coder` until observation truth exists
25
+
26
+ ## Observation scenarios
27
+
28
+ ### error_confirmed_by_logs
29
+ Expected checks:
30
+ - Observer records log evidence
31
+ - minimal repro exists or repeatability is confirmed
32
+ - verdict is `ready_for_coder`
33
+ - observation artifact names a green condition that later verifier summaries can cite
34
+
35
+ ### user_reported_error_not_confirmed
36
+ Expected checks:
37
+ - Observer reports `not_confirmed` or `inconclusive`
38
+ - no fake repro claim
39
+ - verdict is `needs_more_context`
40
+ - iterator run stays in a pre-coding state or blocks honestly
41
+
42
+ ### targeted_improvement_red_seam
43
+ Expected checks:
44
+ - Observer creates a bounded failing seam
45
+ - expected green condition is explicit
46
+ - seam remains narrow enough for attempt-scoped verifier artifacts
47
+ - later verifier records can state `green_condition_alignment` against one explicit seam without inventing verifier-specific top-level schemas
48
+
49
+ ## Iterator loop scenarios
50
+
51
+ ### coder_near_miss_then_repair_success
52
+ Expected checks:
53
+ - Iterator rejects premature success
54
+ - respawn reason is explicit
55
+ - first attempt stores summary in `iterator_run` and normalized verifier artifacts under `attempts/<attempt_id>/`
56
+ - first attempt uses `attempt-001`, second uses `attempt-002`
57
+ - second attempt converges on the scoped green condition
58
+ - `run_state` moves through `needs_respawn` to `completed`
59
+
60
+ ### missing_truth_blocks_safe_implementation
61
+ Expected checks:
62
+ - Iterator does not spawn Coder blindly and does not claim completion
63
+ - readiness is recorded as not ready or blocked, not buried in attempt notes
64
+ - terminal state is blocked with a concrete reason
65
+
66
+ ### request_outgrows_iterate_during_loop
67
+ Expected checks:
68
+ - Iterator promotes or escalates instead of hiding broader planning inside iterate
69
+ - run record lands in `promoted`
70
+ - promotion is visible in the durable iterate record rather than implied only in chat output
71
+ - `promotion_handoff.json` exists and points back to the exact task and observation revisions that justified the promotion
72
+
73
+ ### reroute_to_insight_after_observation
74
+ Expected checks:
75
+ - Observer discovers the user actually needed diagnosis or investigation, not implementation
76
+ - Iterator reroutes to `insight` instead of spawning Coder
77
+ - `promotion_handoff.json` records `target_lane=insight`
78
+ - downstream refs may be null, but the iterate-owned handoff record is present and auditable
79
+
80
+ ## Coder attempt scenarios
81
+
82
+ ### scoped_fix_succeeds_first_pass
83
+ Expected checks:
84
+ - Coder stays in scoped files or surfaces
85
+ - verification seam is relevant and narrow
86
+ - Iterator can validate completion without ambiguity
87
+ - run record references attempt-scoped verifier artifacts instead of inlining raw output
88
+ - verifier artifacts expose shared fields like `overall_result` and `green_condition_alignment` even if the underlying verifier payload differs
89
+
90
+ ### first_pass_fails_but_report_is_honest
91
+ Expected checks:
92
+ - Coder reports what failed and what remains blocked
93
+ - Iterator has enough signal to craft a repair-specific retry
94
+ - attempt summary stays concise while preserving a pointer to fuller verifier output
95
+ - fuller verifier output still keeps the shared normalization envelope, with tool-specific detail nested under `native_payload`
96
+
97
+ ### revised_task_before_coding
98
+ Expected checks:
99
+ - Framer can amend the task without creating a second task root
100
+ - `task_artifact.json` revision increases monotonically
101
+ - Iterator readiness cites the exact task and observation revisions it relied on
@@ -0,0 +1,32 @@
1
+ SCENARIO_NAME = "coder_artifact_alignment"
2
+ SCENARIO_DESCRIPTION = (
3
+ "Coder output should stay aligned to the supplied observation artifact and not solve a "
4
+ "different problem than the one observer bounded."
5
+ )
6
+ INPUT_PAYLOAD = {
7
+ "role": "coder",
8
+ "repo_root": "/Users/devflow/repos/devflow_engine",
9
+ "task_artifact": {
10
+ "task_type": "targeted_improvement",
11
+ "surface": "CSV import wizard",
12
+ "scope_boundary": "Trim whitespace around email values during import only.",
13
+ "success_criteria": ["emails with surrounding spaces import successfully"],
14
+ },
15
+ "observation_artifact": {
16
+ "failing_seam": "CSV preview rejects rows because email validation runs before whitespace trim.",
17
+ "expected_green_condition": "Import preview trims email values before validation.",
18
+ "repro_steps": [
19
+ "Upload CSV with email value ' alice@example.com '",
20
+ "Open import preview",
21
+ "Observe invalid email validation error",
22
+ ],
23
+ "ready_for_coder": True,
24
+ },
25
+ }
26
+ EXPECTED_BEHAVIOR = {
27
+ "produces_coder_report": True,
28
+ "aligns_to_observation_artifact": True,
29
+ "stays_in_scope": True,
30
+ "runs_narrow_verification": True,
31
+ "reports_attempt_honestly": True,
32
+ }
@@ -0,0 +1,45 @@
1
+ from devin.nodes.iterate.scenarios.coder_artifact_alignment import EXPECTED_BEHAVIOR
2
+
3
+ EVAL_CRITERIA = {
4
+ "produces_coder_report": True,
5
+ "aligns_to_observation_artifact": True,
6
+ "stays_in_scope": True,
7
+ "runs_narrow_verification": True,
8
+ "reports_attempt_honestly": True,
9
+ }
10
+
11
+
12
+
13
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check a coder output against the CSV email-trim observation seam.

    Args:
        actual_output: Mapping produced by the coder run. Read keys are
            ``changed_files``, ``verification``, ``verification_summary``,
            ``summary``, ``what_changed``, ``what_passed`` and ``what_failed``.

    Returns:
        ``(passed, notes)`` where ``notes`` lists every failed check in the
        order the checks run and ``passed`` is true only when it is empty.
    """
    seam_tokens = ("csv", "email", "trim", "validation", "preview")
    off_target_tokens = ("invite", "dashboard", "unrelated cleanup", "schema redesign")
    report_fields = ("summary", "what_changed", "what_passed", "what_failed")

    failures: list[str] = []

    if not actual_output.get("changed_files"):
        failures.append("missing changed_files for coder output")

    has_verification = bool(
        actual_output.get("verification") or actual_output.get("verification_summary")
    )
    if not has_verification:
        failures.append("missing verification aligned to observation_artifact")

    # Fold every populated report field into one lowercase haystack.
    fragments = []
    for field in report_fields:
        value = actual_output.get(field)
        if value:
            fragments.append(str(value))
    haystack = " ".join(fragments).lower()

    # Token checks only apply when the coder actually wrote some report text.
    if haystack:
        if not any(token in haystack for token in seam_tokens):
            failures.append("coder report does not align to the observation artifact seam")
        if any(token in haystack for token in off_target_tokens):
            failures.append(
                "coder output appears to solve a different problem than the observed seam"
            )

    # Every recorded note is a failure, so the verdict follows from the list.
    return not failures, failures
@@ -0,0 +1,27 @@
1
+ SCENARIO_NAME = "coder_bounded_fix"
2
+ SCENARIO_DESCRIPTION = (
3
+ "Coder receives framed and observed iterate artifacts and applies a minimal fix that "
4
+ "matches the scoped task instead of broadening the change."
5
+ )
6
+ INPUT_PAYLOAD = {
7
+ "role": "coder",
8
+ "repo_root": "/Users/devflow/repos/devflow_engine",
9
+ "task_artifact": {
10
+ "task_type": "error_fix",
11
+ "surface": "invite acceptance",
12
+ "scope_boundary": "Fix the accept-invite failure only. No refactor or UX redesign.",
13
+ "success_criteria": ["POST /api/invites/accept returns 200 for a valid invite token"],
14
+ },
15
+ "observation_artifact": {
16
+ "failing_seam": "Invite acceptance fails when accepted_at is None during persistence.",
17
+ "expected_green_condition": "Valid invite acceptance sets accepted_at before persistence.",
18
+ "ready_for_coder": True,
19
+ },
20
+ }
21
+ EXPECTED_BEHAVIOR = {
22
+ "produces_coder_report": True,
23
+ "implements_targeted_fix": True,
24
+ "stays_in_scope": True,
25
+ "runs_narrow_verification": True,
26
+ "reports_attempt_honestly": True,
27
+ }
@@ -0,0 +1,45 @@
1
+ from devin.nodes.iterate.scenarios.coder_bounded_fix import EXPECTED_BEHAVIOR
2
+
3
+ EVAL_CRITERIA = {
4
+ "produces_coder_report": True,
5
+ "implements_targeted_fix": True,
6
+ "stays_in_scope": True,
7
+ "runs_narrow_verification": True,
8
+ "reports_attempt_honestly": True,
9
+ }
10
+
11
+
12
+
13
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check a coder output against the bounded invite-acceptance fix.

    Args:
        actual_output: Mapping produced by the coder run. Read keys are
            ``changed_files``, ``verification``, ``verification_summary``,
            ``summary``, ``what_changed``, ``what_passed``, ``what_failed``
            and ``remaining_blockers``.

    Returns:
        ``(passed, notes)``. Missing changed files, missing verification and
        scope-drift wording each fail the check. A report that never mentions
        the invite seam only adds a note and does not fail on its own.
    """
    drift_tokens = ("refactor", "cleanup unrelated", "redesign")
    seam_tokens = ("invite", "accepted_at", "accept")
    report_fields = (
        "summary",
        "what_changed",
        "what_passed",
        "what_failed",
        "remaining_blockers",
    )

    passed = True
    remarks: list[str] = []

    if not actual_output.get("changed_files"):
        passed = False
        remarks.append("missing changed_files for coder output")

    has_verification = bool(
        actual_output.get("verification") or actual_output.get("verification_summary")
    )
    if not has_verification:
        passed = False
        remarks.append("missing narrow verification result")

    # Fold every populated report field into one lowercase haystack.
    fragments = []
    for field in report_fields:
        value = actual_output.get(field)
        if value:
            fragments.append(str(value))
    haystack = " ".join(fragments).lower()

    # Token checks only apply when the coder actually wrote some report text.
    if haystack:
        if any(token in haystack for token in drift_tokens):
            passed = False
            remarks.append("coder report suggests scope drift beyond the bounded fix")
        # Advisory only: the verdict is left untouched here.
        if not any(token in haystack for token in seam_tokens):
            remarks.append("coder report does not clearly align to the framed invite seam")

    return passed, remarks