devflow-engine 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393) hide show
  1. devflow_engine/__init__.py +3 -0
  2. devflow_engine/agentic_prompts.py +100 -0
  3. devflow_engine/agentic_runtime.py +398 -0
  4. devflow_engine/api_key_flow_harness.py +539 -0
  5. devflow_engine/api_keys.py +357 -0
  6. devflow_engine/bootstrap/__init__.py +2 -0
  7. devflow_engine/bootstrap/provision_from_template.py +84 -0
  8. devflow_engine/cli/__init__.py +0 -0
  9. devflow_engine/cli/app.py +7270 -0
  10. devflow_engine/core/__init__.py +0 -0
  11. devflow_engine/core/config.py +86 -0
  12. devflow_engine/core/logging.py +29 -0
  13. devflow_engine/core/paths.py +45 -0
  14. devflow_engine/core/toml_kv.py +33 -0
  15. devflow_engine/devflow_event_worker.py +1292 -0
  16. devflow_engine/devflow_state.py +201 -0
  17. devflow_engine/devin2/__init__.py +9 -0
  18. devflow_engine/devin2/agent_definition.py +120 -0
  19. devflow_engine/devin2/pi_runner.py +204 -0
  20. devflow_engine/devin_orchestration.py +69 -0
  21. devflow_engine/docs/prompts/anti-patterns.md +42 -0
  22. devflow_engine/docs/prompts/devin-agent-prompt.md +55 -0
  23. devflow_engine/docs/prompts/devin2-agent-prompt.md +81 -0
  24. devflow_engine/docs/prompts/examples/devin-vapi-clone-reference-exchange.json +85 -0
  25. devflow_engine/doctor/__init__.py +2 -0
  26. devflow_engine/doctor/triage.py +140 -0
  27. devflow_engine/error/__init__.py +0 -0
  28. devflow_engine/error/remediation.py +21 -0
  29. devflow_engine/errors/error_solver_dag.py +522 -0
  30. devflow_engine/errors/runtime_observability.py +67 -0
  31. devflow_engine/idea/__init__.py +4 -0
  32. devflow_engine/idea/actors.py +481 -0
  33. devflow_engine/idea/agentic.py +465 -0
  34. devflow_engine/idea/analyze.py +93 -0
  35. devflow_engine/idea/devin_chat_dag.py +1 -0
  36. devflow_engine/idea/diff.py +99 -0
  37. devflow_engine/idea/drafts.py +446 -0
  38. devflow_engine/idea/idea_creation_dag.py +643 -0
  39. devflow_engine/idea/ideation_enrichment.py +355 -0
  40. devflow_engine/idea/ideation_enrichment_worker.py +19 -0
  41. devflow_engine/idea/paths.py +28 -0
  42. devflow_engine/idea/promote.py +53 -0
  43. devflow_engine/idea/redaction.py +27 -0
  44. devflow_engine/idea/repo_tools.py +1277 -0
  45. devflow_engine/idea/response_mode.py +30 -0
  46. devflow_engine/idea/story_pipeline.py +1585 -0
  47. devflow_engine/idea/sufficiency.py +376 -0
  48. devflow_engine/idea/traditional_stories.py +1257 -0
  49. devflow_engine/implementation/__init__.py +0 -0
  50. devflow_engine/implementation/alembic_preflight.py +700 -0
  51. devflow_engine/implementation/dag.py +8450 -0
  52. devflow_engine/implementation/green_gate.py +93 -0
  53. devflow_engine/implementation/prompts.py +108 -0
  54. devflow_engine/implementation/test_runtime.py +623 -0
  55. devflow_engine/integration/__init__.py +19 -0
  56. devflow_engine/integration/agentic.py +66 -0
  57. devflow_engine/integration/dag.py +3539 -0
  58. devflow_engine/integration/prompts.py +114 -0
  59. devflow_engine/integration/supabase_schema.sql +31 -0
  60. devflow_engine/integration/supabase_sync.py +177 -0
  61. devflow_engine/llm/__init__.py +1 -0
  62. devflow_engine/llm/cli_one_shot.py +84 -0
  63. devflow_engine/llm/cli_stream.py +371 -0
  64. devflow_engine/llm/execution_context.py +26 -0
  65. devflow_engine/llm/invoke.py +1322 -0
  66. devflow_engine/llm/provider_api.py +304 -0
  67. devflow_engine/llm/repo_knowledge.py +588 -0
  68. devflow_engine/llm_primitives.py +315 -0
  69. devflow_engine/orchestration.py +62 -0
  70. devflow_engine/planning/__init__.py +0 -0
  71. devflow_engine/planning/analyze_repo.py +92 -0
  72. devflow_engine/planning/render_drafts.py +133 -0
  73. devflow_engine/playground/__init__.py +0 -0
  74. devflow_engine/playground/hooks.py +26 -0
  75. devflow_engine/playwright_workflow/__init__.py +5 -0
  76. devflow_engine/playwright_workflow/dag.py +1317 -0
  77. devflow_engine/process/__init__.py +5 -0
  78. devflow_engine/process/dag.py +59 -0
  79. devflow_engine/project_registration/__init__.py +3 -0
  80. devflow_engine/project_registration/dag.py +1581 -0
  81. devflow_engine/project_registry.py +109 -0
  82. devflow_engine/prompts/devin/generic/prompt.md +6 -0
  83. devflow_engine/prompts/devin/ideation/prompt.md +263 -0
  84. devflow_engine/prompts/devin/ideation/scenarios.md +5 -0
  85. devflow_engine/prompts/devin/ideation_loop/prompt.md +6 -0
  86. devflow_engine/prompts/devin/insight/prompt.md +11 -0
  87. devflow_engine/prompts/devin/insight/scenarios.md +5 -0
  88. devflow_engine/prompts/devin/intake/prompt.md +15 -0
  89. devflow_engine/prompts/devin/iterate/prompt.md +12 -0
  90. devflow_engine/prompts/devin/shared/eval_doctrine.md +9 -0
  91. devflow_engine/prompts/devin/shared/principles.md +246 -0
  92. devflow_engine/prompts/devin_eval/assessment/prompt.md +18 -0
  93. devflow_engine/prompts/idea/api_ideation_agent/prompt.md +8 -0
  94. devflow_engine/prompts/idea/api_insight_agent/prompt.md +8 -0
  95. devflow_engine/prompts/idea/response_doctrine/prompt.md +18 -0
  96. devflow_engine/prompts/implementation/dependency_assessment/prompt.md +12 -0
  97. devflow_engine/prompts/implementation/green/green/prompt.md +11 -0
  98. devflow_engine/prompts/implementation/green/node_config/prompt.md +3 -0
  99. devflow_engine/prompts/implementation/green_review/outcome_review/prompt.md +5 -0
  100. devflow_engine/prompts/implementation/green_review/prior_run_review/prompt.md +5 -0
  101. devflow_engine/prompts/implementation/red/prompt.md +27 -0
  102. devflow_engine/prompts/implementation/redreview/prompt.md +23 -0
  103. devflow_engine/prompts/implementation/redreview_repair/prompt.md +16 -0
  104. devflow_engine/prompts/implementation/setupdoc/prompt.md +10 -0
  105. devflow_engine/prompts/implementation/story_planning/prompt.md +13 -0
  106. devflow_engine/prompts/implementation/test_design/prompt.md +27 -0
  107. devflow_engine/prompts/integration/README.md +185 -0
  108. devflow_engine/prompts/integration/green/example.md +67 -0
  109. devflow_engine/prompts/integration/green/green/prompt.md +10 -0
  110. devflow_engine/prompts/integration/green/node_config/prompt.md +42 -0
  111. devflow_engine/prompts/integration/green/past_prompts/20260417T212300/green/prompt.md +15 -0
  112. devflow_engine/prompts/integration/green/past_prompts/20260417T212300/node_config/prompt.md +42 -0
  113. devflow_engine/prompts/integration/green_enrich/example.md +79 -0
  114. devflow_engine/prompts/integration/green_enrich/green_enrich/prompt.md +9 -0
  115. devflow_engine/prompts/integration/green_enrich/node_config/prompt.md +41 -0
  116. devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/green_enrich/prompt.md +14 -0
  117. devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/node_config/prompt.md +41 -0
  118. devflow_engine/prompts/integration/red/code_repair/prompt.md +12 -0
  119. devflow_engine/prompts/integration/red/example.md +152 -0
  120. devflow_engine/prompts/integration/red/node_config/prompt.md +86 -0
  121. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
  122. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/node_config/prompt.md +84 -0
  123. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red/prompt.md +16 -0
  124. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red_repair/prompt.md +15 -0
  125. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
  126. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/node_config/prompt.md +84 -0
  127. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/red_repair/prompt.md +11 -0
  128. devflow_engine/prompts/integration/red/red/prompt.md +11 -0
  129. devflow_engine/prompts/integration/red/red_repair/prompt.md +12 -0
  130. devflow_engine/prompts/integration/red_review/example.md +71 -0
  131. devflow_engine/prompts/integration/red_review/node_config/prompt.md +41 -0
  132. devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/node_config/prompt.md +41 -0
  133. devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/red_review/prompt.md +15 -0
  134. devflow_engine/prompts/integration/red_review/red_review/prompt.md +9 -0
  135. devflow_engine/prompts/integration/resolve/example.md +111 -0
  136. devflow_engine/prompts/integration/resolve/node_config/prompt.md +64 -0
  137. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/node_config/prompt.md +64 -0
  138. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_implicated_users/prompt.md +15 -0
  139. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_side_effects/prompt.md +15 -0
  140. devflow_engine/prompts/integration/resolve/resolve_implicated_users/prompt.md +10 -0
  141. devflow_engine/prompts/integration/resolve/resolve_side_effects/prompt.md +10 -0
  142. devflow_engine/prompts/integration/validate/build_idea_acceptance_coverage/prompt.md +12 -0
  143. devflow_engine/prompts/integration/validate/code_repair/prompt.md +13 -0
  144. devflow_engine/prompts/integration/validate/example.md +143 -0
  145. devflow_engine/prompts/integration/validate/node_config/prompt.md +87 -0
  146. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
  147. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/node_config/prompt.md +67 -0
  148. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_enrich_gate/prompt.md +17 -0
  149. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_repair/prompt.md +16 -0
  150. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
  151. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/node_config/prompt.md +67 -0
  152. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/validate_repair/prompt.md +9 -0
  153. devflow_engine/prompts/integration/validate/validate_enrich_gate/prompt.md +10 -0
  154. devflow_engine/prompts/integration/validate/validate_repair/prompt.md +20 -0
  155. devflow_engine/prompts/integration/write_workflows/example.md +100 -0
  156. devflow_engine/prompts/integration/write_workflows/node_config/prompt.md +44 -0
  157. devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/node_config/prompt.md +44 -0
  158. devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/write_workflows/prompt.md +17 -0
  159. devflow_engine/prompts/integration/write_workflows/write_workflows/prompt.md +11 -0
  160. devflow_engine/prompts/iterate/README.md +7 -0
  161. devflow_engine/prompts/iterate/coder/prompt.md +11 -0
  162. devflow_engine/prompts/iterate/framer/prompt.md +11 -0
  163. devflow_engine/prompts/iterate/iterator/prompt.md +13 -0
  164. devflow_engine/prompts/iterate/observer/prompt.md +11 -0
  165. devflow_engine/prompts/recovery/diagnosis/prompt.md +7 -0
  166. devflow_engine/prompts/recovery/execution/prompt.md +8 -0
  167. devflow_engine/prompts/recovery/execution_verification/prompt.md +7 -0
  168. devflow_engine/prompts/recovery/failure_investigation/prompt.md +10 -0
  169. devflow_engine/prompts/recovery/preflight_health_repo_repair/prompt.md +8 -0
  170. devflow_engine/prompts/recovery/remediation_execution/prompt.md +11 -0
  171. devflow_engine/prompts/recovery/root_cause_investigation/prompt.md +12 -0
  172. devflow_engine/prompts/scope_idea/doctrine/prompt.md +7 -0
  173. devflow_engine/prompts/source_doc_eval/document/prompt.md +6 -0
  174. devflow_engine/prompts/source_doc_eval/targeted_mutation/prompt.md +9 -0
  175. devflow_engine/prompts/source_doc_mutation/domain_entities/prompt.md +6 -0
  176. devflow_engine/prompts/source_doc_mutation/product_brief/prompt.md +6 -0
  177. devflow_engine/prompts/source_doc_mutation/project_doc_coherence/prompt.md +7 -0
  178. devflow_engine/prompts/source_doc_mutation/project_doc_render/prompt.md +9 -0
  179. devflow_engine/prompts/source_doc_mutation/source_doc_coherence/prompt.md +5 -0
  180. devflow_engine/prompts/source_doc_mutation/source_doc_enrichment_coherence/prompt.md +6 -0
  181. devflow_engine/prompts/source_doc_mutation/user_workflows/prompt.md +6 -0
  182. devflow_engine/prompts/source_scope/doctrine/prompt.md +10 -0
  183. devflow_engine/prompts/ui_grounding/doctrine/prompt.md +7 -0
  184. devflow_engine/recovery/__init__.py +3 -0
  185. devflow_engine/recovery/dag.py +2609 -0
  186. devflow_engine/recovery/models.py +220 -0
  187. devflow_engine/refactor.py +93 -0
  188. devflow_engine/registry/__init__.py +1 -0
  189. devflow_engine/registry/cards.py +238 -0
  190. devflow_engine/registry/domain_normalize.py +60 -0
  191. devflow_engine/registry/effects.py +65 -0
  192. devflow_engine/registry/enforce_report.py +150 -0
  193. devflow_engine/registry/module_cards_classify.py +164 -0
  194. devflow_engine/registry/module_cards_draft.py +184 -0
  195. devflow_engine/registry/module_cards_gate.py +59 -0
  196. devflow_engine/registry/packages.py +347 -0
  197. devflow_engine/registry/pathways.py +323 -0
  198. devflow_engine/review/__init__.py +11 -0
  199. devflow_engine/review/dag.py +588 -0
  200. devflow_engine/review/review_story.py +67 -0
  201. devflow_engine/scope_idea/__init__.py +3 -0
  202. devflow_engine/scope_idea/agentic.py +39 -0
  203. devflow_engine/scope_idea/dag.py +1069 -0
  204. devflow_engine/scope_idea/models.py +175 -0
  205. devflow_engine/skills/builtins/devflow/queue_failure_investigation/SKILL.md +112 -0
  206. devflow_engine/skills/builtins/devflow/queue_idea_to_story/SKILL.md +120 -0
  207. devflow_engine/skills/builtins/devflow/queue_integration/SKILL.md +105 -0
  208. devflow_engine/skills/builtins/devflow/queue_recovery/SKILL.md +108 -0
  209. devflow_engine/skills/builtins/devflow/queue_runtime_core/SKILL.md +155 -0
  210. devflow_engine/skills/builtins/devflow/queue_story_implementation/SKILL.md +122 -0
  211. devflow_engine/skills/builtins/devin/idea_to_story_handoff/SKILL.md +120 -0
  212. devflow_engine/skills/builtins/devin/ideation/SKILL.md +168 -0
  213. devflow_engine/skills/builtins/devin/ideation/state-and-phrasing-reference.md +18 -0
  214. devflow_engine/skills/builtins/devin/insight/SKILL.md +22 -0
  215. devflow_engine/skills/registry.example.yaml +42 -0
  216. devflow_engine/source_doc_assumptions.py +291 -0
  217. devflow_engine/source_doc_mutation_dag.py +1606 -0
  218. devflow_engine/source_doc_mutation_eval.py +417 -0
  219. devflow_engine/source_doc_mutation_worker.py +25 -0
  220. devflow_engine/source_docs_schema.py +207 -0
  221. devflow_engine/source_docs_updater.py +309 -0
  222. devflow_engine/source_scope/__init__.py +15 -0
  223. devflow_engine/source_scope/agentic.py +45 -0
  224. devflow_engine/source_scope/dag.py +1626 -0
  225. devflow_engine/source_scope/models.py +177 -0
  226. devflow_engine/stores/__init__.py +0 -0
  227. devflow_engine/stores/execution_store.py +3534 -0
  228. devflow_engine/story/__init__.py +0 -0
  229. devflow_engine/story/contracts.py +160 -0
  230. devflow_engine/story/discovery.py +47 -0
  231. devflow_engine/story/evidence.py +118 -0
  232. devflow_engine/story/hashing.py +27 -0
  233. devflow_engine/story/implemented_queue_purge.py +148 -0
  234. devflow_engine/story/indexer.py +105 -0
  235. devflow_engine/story/io.py +20 -0
  236. devflow_engine/story/markdown_contracts.py +298 -0
  237. devflow_engine/story/reconciliation.py +408 -0
  238. devflow_engine/story/validate_stories.py +149 -0
  239. devflow_engine/story/validate_tests_story.py +512 -0
  240. devflow_engine/story/validation.py +133 -0
  241. devflow_engine/ui_grounding/__init__.py +11 -0
  242. devflow_engine/ui_grounding/agentic.py +31 -0
  243. devflow_engine/ui_grounding/dag.py +874 -0
  244. devflow_engine/ui_grounding/models.py +224 -0
  245. devflow_engine/ui_grounding/pencil_bridge.py +247 -0
  246. devflow_engine/vendor/__init__.py +0 -0
  247. devflow_engine/vendor/datalumina_genai/__init__.py +11 -0
  248. devflow_engine/vendor/datalumina_genai/core/__init__.py +0 -0
  249. devflow_engine/vendor/datalumina_genai/core/exceptions.py +9 -0
  250. devflow_engine/vendor/datalumina_genai/core/nodes/__init__.py +0 -0
  251. devflow_engine/vendor/datalumina_genai/core/nodes/agent.py +48 -0
  252. devflow_engine/vendor/datalumina_genai/core/nodes/agent_streaming_node.py +26 -0
  253. devflow_engine/vendor/datalumina_genai/core/nodes/base.py +89 -0
  254. devflow_engine/vendor/datalumina_genai/core/nodes/concurrent.py +30 -0
  255. devflow_engine/vendor/datalumina_genai/core/nodes/router.py +69 -0
  256. devflow_engine/vendor/datalumina_genai/core/schema.py +72 -0
  257. devflow_engine/vendor/datalumina_genai/core/task.py +52 -0
  258. devflow_engine/vendor/datalumina_genai/core/validate.py +139 -0
  259. devflow_engine/vendor/datalumina_genai/core/workflow.py +200 -0
  260. devflow_engine/worker.py +1086 -0
  261. devflow_engine/worker_guard.py +233 -0
  262. devflow_engine-1.0.0.dist-info/METADATA +235 -0
  263. devflow_engine-1.0.0.dist-info/RECORD +393 -0
  264. devflow_engine-1.0.0.dist-info/WHEEL +4 -0
  265. devflow_engine-1.0.0.dist-info/entry_points.txt +3 -0
  266. devin/__init__.py +6 -0
  267. devin/dag.py +58 -0
  268. devin/dag_two_arm.py +138 -0
  269. devin/devin_chat_scenario_catalog.json +588 -0
  270. devin/devin_eval.py +677 -0
  271. devin/nodes/__init__.py +0 -0
  272. devin/nodes/ideation/__init__.py +0 -0
  273. devin/nodes/ideation/node.py +195 -0
  274. devin/nodes/ideation/playground.py +267 -0
  275. devin/nodes/ideation/prompt.md +65 -0
  276. devin/nodes/ideation/scenarios/continue_refinement.py +13 -0
  277. devin/nodes/ideation/scenarios/continue_refinement_evals.py +18 -0
  278. devin/nodes/ideation/scenarios/idea_fits_existing_patterns.py +17 -0
  279. devin/nodes/ideation/scenarios/idea_fits_existing_patterns_evals.py +16 -0
  280. devin/nodes/ideation/scenarios/large_idea_split.py +4 -0
  281. devin/nodes/ideation/scenarios/large_idea_split_evals.py +17 -0
  282. devin/nodes/ideation/scenarios/source_documentation_added.py +4 -0
  283. devin/nodes/ideation/scenarios/source_documentation_added_evals.py +16 -0
  284. devin/nodes/ideation/scenarios/user_says_create_it.py +30 -0
  285. devin/nodes/ideation/scenarios/user_says_create_it_evals.py +23 -0
  286. devin/nodes/ideation/scenarios/vague_idea.py +16 -0
  287. devin/nodes/ideation/scenarios/vague_idea_evals.py +47 -0
  288. devin/nodes/ideation/tools.json +312 -0
  289. devin/nodes/insight/__init__.py +0 -0
  290. devin/nodes/insight/node.py +49 -0
  291. devin/nodes/insight/playground.py +154 -0
  292. devin/nodes/insight/prompt.md +61 -0
  293. devin/nodes/insight/scenarios/architecture_pattern_query.py +15 -0
  294. devin/nodes/insight/scenarios/architecture_pattern_query_evals.py +25 -0
  295. devin/nodes/insight/scenarios/codebase_exploration.py +15 -0
  296. devin/nodes/insight/scenarios/codebase_exploration_evals.py +23 -0
  297. devin/nodes/insight/scenarios/devin_ideation_routing.py +19 -0
  298. devin/nodes/insight/scenarios/devin_ideation_routing_evals.py +39 -0
  299. devin/nodes/insight/scenarios/devin_insight_routing.py +20 -0
  300. devin/nodes/insight/scenarios/devin_insight_routing_evals.py +40 -0
  301. devin/nodes/insight/scenarios/operational_debugging.py +15 -0
  302. devin/nodes/insight/scenarios/operational_debugging_evals.py +23 -0
  303. devin/nodes/insight/scenarios/operational_question.py +9 -0
  304. devin/nodes/insight/scenarios/operational_question_evals.py +8 -0
  305. devin/nodes/insight/scenarios/queue_status.py +15 -0
  306. devin/nodes/insight/scenarios/queue_status_evals.py +23 -0
  307. devin/nodes/insight/scenarios/source_doc_explanation.py +14 -0
  308. devin/nodes/insight/scenarios/source_doc_explanation_evals.py +21 -0
  309. devin/nodes/insight/scenarios/worker_state_check.py +15 -0
  310. devin/nodes/insight/scenarios/worker_state_check_evals.py +22 -0
  311. devin/nodes/insight/tools.json +126 -0
  312. devin/nodes/intake/__init__.py +0 -0
  313. devin/nodes/intake/node.py +27 -0
  314. devin/nodes/intake/playground.py +47 -0
  315. devin/nodes/intake/prompt.md +12 -0
  316. devin/nodes/intake/scenarios/ideation_routing.py +4 -0
  317. devin/nodes/intake/scenarios/ideation_routing_evals.py +5 -0
  318. devin/nodes/intake/scenarios/insight_routing.py +4 -0
  319. devin/nodes/intake/scenarios/insight_routing_evals.py +5 -0
  320. devin/nodes/iterate/README.md +44 -0
  321. devin/nodes/iterate/__init__.py +1 -0
  322. devin/nodes/iterate/_archived_design_stages/01-objectives-requirements.md +112 -0
  323. devin/nodes/iterate/_archived_design_stages/02-evals.md +131 -0
  324. devin/nodes/iterate/_archived_design_stages/03-tools-and-boundaries.md +110 -0
  325. devin/nodes/iterate/_archived_design_stages/04-harness-and-playground.md +32 -0
  326. devin/nodes/iterate/_archived_design_stages/05-prompt-deferred.md +11 -0
  327. devin/nodes/iterate/_archived_design_stages/coder_agent_design/01-objectives-requirements.md +20 -0
  328. devin/nodes/iterate/_archived_design_stages/coder_agent_design/02-evals.md +8 -0
  329. devin/nodes/iterate/_archived_design_stages/coder_agent_design/03-tools-and-boundaries.md +14 -0
  330. devin/nodes/iterate/_archived_design_stages/coder_agent_design/04-harness-and-playground.md +12 -0
  331. devin/nodes/iterate/_archived_design_stages/framer_agent_design/01-objectives-requirements.md +20 -0
  332. devin/nodes/iterate/_archived_design_stages/framer_agent_design/02-evals.md +8 -0
  333. devin/nodes/iterate/_archived_design_stages/framer_agent_design/03-tools-and-boundaries.md +13 -0
  334. devin/nodes/iterate/_archived_design_stages/framer_agent_design/04-harness-and-playground.md +12 -0
  335. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/01-objectives-requirements.md +25 -0
  336. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/02-evals.md +9 -0
  337. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/03-tools-and-boundaries.md +14 -0
  338. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/04-harness-and-playground.md +12 -0
  339. devin/nodes/iterate/_archived_design_stages/observer_agent_design/01-objectives-requirements.md +20 -0
  340. devin/nodes/iterate/_archived_design_stages/observer_agent_design/02-evals.md +8 -0
  341. devin/nodes/iterate/_archived_design_stages/observer_agent_design/03-tools-and-boundaries.md +14 -0
  342. devin/nodes/iterate/_archived_design_stages/observer_agent_design/04-harness-and-playground.md +13 -0
  343. devin/nodes/iterate/agent-roles.md +89 -0
  344. devin/nodes/iterate/agents/README.md +10 -0
  345. devin/nodes/iterate/artifacts.md +504 -0
  346. devin/nodes/iterate/contract.md +100 -0
  347. devin/nodes/iterate/eval-plan.md +74 -0
  348. devin/nodes/iterate/node.py +100 -0
  349. devin/nodes/iterate/pipeline/README.md +13 -0
  350. devin/nodes/iterate/playground-contract.md +76 -0
  351. devin/nodes/iterate/prompt.md +11 -0
  352. devin/nodes/iterate/scenarios/README.md +38 -0
  353. devin/nodes/iterate/scenarios/artifact-and-loop-scenarios.md +101 -0
  354. devin/nodes/iterate/scenarios/coder_artifact_alignment.py +32 -0
  355. devin/nodes/iterate/scenarios/coder_artifact_alignment_evals.py +45 -0
  356. devin/nodes/iterate/scenarios/coder_bounded_fix.py +27 -0
  357. devin/nodes/iterate/scenarios/coder_bounded_fix_evals.py +45 -0
  358. devin/nodes/iterate/scenarios/devin_iterate_routing.py +21 -0
  359. devin/nodes/iterate/scenarios/devin_iterate_routing_evals.py +36 -0
  360. devin/nodes/iterate/scenarios/framer_scope_boundary.py +25 -0
  361. devin/nodes/iterate/scenarios/framer_scope_boundary_evals.py +57 -0
  362. devin/nodes/iterate/scenarios/framer_task_framing.py +25 -0
  363. devin/nodes/iterate/scenarios/framer_task_framing_evals.py +58 -0
  364. devin/nodes/iterate/scenarios/iterate_error_fix.py +21 -0
  365. devin/nodes/iterate/scenarios/iterate_error_fix_evals.py +39 -0
  366. devin/nodes/iterate/scenarios/iterate_quick_change.py +21 -0
  367. devin/nodes/iterate/scenarios/iterate_quick_change_evals.py +35 -0
  368. devin/nodes/iterate/scenarios/iterate_to_idea_promotion.py +23 -0
  369. devin/nodes/iterate/scenarios/iterate_to_idea_promotion_evals.py +53 -0
  370. devin/nodes/iterate/scenarios/iterate_to_insight_reroute.py +23 -0
  371. devin/nodes/iterate/scenarios/iterate_to_insight_reroute_evals.py +53 -0
  372. devin/nodes/iterate/scenarios/observer_evidence_seam.py +28 -0
  373. devin/nodes/iterate/scenarios/observer_evidence_seam_evals.py +55 -0
  374. devin/nodes/iterate/scenarios/observer_repro_creation.py +28 -0
  375. devin/nodes/iterate/scenarios/observer_repro_creation_evals.py +45 -0
  376. devin/nodes/iterate/scenarios/routing-matrix.md +45 -0
  377. devin/nodes/shared/__init__.py +0 -0
  378. devin/nodes/shared/filemaker_expert.md +80 -0
  379. devin/nodes/shared/filemaker_expert.py +354 -0
  380. devin/nodes/shared/filemaker_expert_eval/runner.py +176 -0
  381. devin/nodes/shared/filemaker_expert_eval/scenarios.json +65 -0
  382. devin/nodes/shared/goldilocks_advisor_eval/runner.py +214 -0
  383. devin/nodes/shared/goldilocks_advisor_eval/scenarios.json +58 -0
  384. devin/nodes/shared/helpers.py +156 -0
  385. devin/nodes/shared/idea_compliance_advisor_eval/runner.py +252 -0
  386. devin/nodes/shared/idea_compliance_advisor_eval/scenarios.json +75 -0
  387. devin/nodes/shared/models.py +44 -0
  388. devin/nodes/shared/post.py +40 -0
  389. devin/nodes/shared/router.py +107 -0
  390. devin/nodes/shared/tools.py +191 -0
  391. devin/shared/devin-chat-rubric.md +237 -0
  392. devin/shared/devin-chat-scenario-suite.md +90 -0
  393. devin/shared/eval_doctrine.md +9 -0
@@ -0,0 +1,21 @@
1
# Scenario fixture for the "devin_iterate_routing" case; the companion
# *_evals module imports these names.
SCENARIO_NAME = "devin_iterate_routing"
SCENARIO_DESCRIPTION = (
    "Devin two-arm DAG: bounded existing-surface change request stays in the "
    "iterate arm and does not drift into insight or idea handling."
)

# Input for the scenario: a narrowly scoped bug report against an existing page.
INPUT_PAYLOAD = dict(
    current_user_message=(
        "On the existing account settings page, clicking Save on the profile "
        "form does nothing. Fix that bug without changing anything else."
    ),
    idea_id="devin_eval_iterate_routing",
    project_id="proj_75f63d30",
    repo_root="/Users/devflow/repos/Spicy-Server",
)

# Expected outcome for the scenario.
EXPECTED_BEHAVIOR = dict(
    route_arm="iterate",
    response_kind_in=["completed", "blocked", "needs_more_context"],
    keeps_existing_surface_scope=True,
    does_not_reroute=True,
    does_not_promote=True,
)
@@ -0,0 +1,36 @@
1
+ from devin.nodes.iterate.scenarios.devin_iterate_routing import (
2
+ EXPECTED_BEHAVIOR,
3
+ INPUT_PAYLOAD,
4
+ SCENARIO_NAME,
5
+ )
6
+
7
# Criteria for this scenario: the route arm and the allowed response kinds are
# read from the scenario's EXPECTED_BEHAVIOR; the remaining flags are fixed here.
EVAL_CRITERIA = dict(
    route_arm_must_equal=EXPECTED_BEHAVIOR["route_arm"],
    response_kind_in=EXPECTED_BEHAVIOR["response_kind_in"],
    keeps_existing_surface_scope=True,
    does_not_reroute=True,
    does_not_promote=True,
)
14
+
15
+
16
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check a routing result against this scenario's EXPECTED_BEHAVIOR.

    Returns ``(passed, notes)``: ``notes`` holds one message per failed check,
    in check order, and ``passed`` is true only when no check failed.
    """
    wanted_arm = EXPECTED_BEHAVIOR["route_arm"]
    allowed_kinds = EXPECTED_BEHAVIOR["response_kind_in"]
    problems = []

    got_arm = actual_output.get("route_arm")
    if got_arm != wanted_arm:
        problems.append(f"expected route_arm={wanted_arm}, got {got_arm}")

    # A missing or falsy response_kind is compared as the empty string.
    got_kind = str(actual_output.get("response_kind") or "")
    if got_kind not in allowed_kinds:
        problems.append(f"expected response_kind in {allowed_kinds}, got {got_kind}")

    # The lane is compared case-insensitively; insight and idea are rejected.
    lane = str(actual_output.get("target_lane") or "").lower()
    if lane in ("insight", "idea"):
        problems.append(f"unexpected lane transition to {lane}")

    return not problems, problems
@@ -0,0 +1,25 @@
1
# Scenario fixture for the "framer_scope_boundary" case; the companion
# *_evals module imports EXPECTED_BEHAVIOR from here.
SCENARIO_NAME = "framer_scope_boundary"
SCENARIO_DESCRIPTION = (
    "Framer keeps a small existing-surface request bounded instead of "
    "inflating it into a redesign or broader product initiative."
)

# Input for the scenario: a one-label rename with an explicit "no redesign" constraint.
INPUT_PAYLOAD = dict(
    role="framer",
    repo_root="/Users/devflow/repos/devflow_engine",
    current_user_message=(
        "On the current dashboard, rename the KPI card label from Gross Revenue "
        "to Revenue. Please do not redesign the dashboard."
    ),
    context=dict(
        surface_hints=["dashboard", "KPI card label"],
        constraint="no redesign",
    ),
)

# Expected outcome for the scenario.
EXPECTED_BEHAVIOR = dict(
    produces_task_artifact=True,
    classifies_request_as="quick_change",
    keeps_scope_bounded=True,
    does_not_promote=True,
    does_not_expand_into_redesign=True,
    recommended_next_step_in=["stay_iterate"],
)
@@ -0,0 +1,57 @@
1
+ from devin.nodes.iterate.scenarios.framer_scope_boundary import EXPECTED_BEHAVIOR
2
+
3
# Criteria for this scenario: the classification and the allowed next steps are
# read from the scenario's EXPECTED_BEHAVIOR; the remaining flags are fixed here.
EVAL_CRITERIA = dict(
    produces_task_artifact=True,
    classifies_request_as=EXPECTED_BEHAVIOR["classifies_request_as"],
    keeps_scope_bounded=True,
    does_not_promote=True,
    does_not_expand_into_redesign=True,
    recommended_next_step_in=EXPECTED_BEHAVIOR["recommended_next_step_in"],
)
11
+
12
+
13
+
14
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check a framer result against this scenario's EXPECTED_BEHAVIOR.

    Args:
        actual_output: Framer output. ``task_artifact`` is required;
            ``recommended_next_step`` and ``target_lane`` are read from the
            top level first and from the artifact as a fallback.

    Returns:
        ``(passed, notes)`` where ``notes`` holds one message per failed check
        and ``passed`` is true only when no check failed.
    """
    exp = EXPECTED_BEHAVIOR
    notes: list[str] = []

    task_artifact = actual_output.get("task_artifact") or {}
    if not task_artifact:
        # Every remaining check reads the artifact, so stop at the first failure.
        return False, ["missing task_artifact"]

    # The classification may be reported under either key; task_type wins.
    classification = str(
        task_artifact.get("task_type") or task_artifact.get("classification") or ""
    )
    if classification != exp["classifies_request_as"]:
        notes.append(
            f"expected classification={exp['classifies_request_as']}, got {classification}"
        )

    next_step = str(
        actual_output.get("recommended_next_step")
        or task_artifact.get("recommended_next_step")
        or ""
    )
    if next_step not in exp["recommended_next_step_in"]:
        notes.append(
            f"expected recommended_next_step in {exp['recommended_next_step_in']}, got {next_step}"
        )

    target_lane = str(
        actual_output.get("target_lane") or task_artifact.get("target_lane") or ""
    ).lower()
    if target_lane in {"idea", "insight"}:
        notes.append(f"unexpected lane transition to {target_lane}")

    # Scan only the fields that describe the work to be done. ``non_goals``
    # lists what is excluded, so an artifact that honors the user's
    # "do not redesign" constraint is expected to mention "redesign" there;
    # including it in the scan failed exactly those correct outputs.
    # NOTE(review): this is still a plain substring heuristic, so a summary or
    # scope_boundary phrased as a negation ("no redesign") will also match.
    described_scope = " ".join(
        str(value)
        for value in (task_artifact.get("summary"), task_artifact.get("scope_boundary"))
        if value
    ).lower()
    broadening_tokens = ("redesign", "rewrite", "new dashboard", "broader analytics overhaul")
    if any(token in described_scope for token in broadening_tokens):
        notes.append("task_artifact broadened the task beyond the requested label change")

    return not notes, notes
@@ -0,0 +1,25 @@
1
# Scenario fixture for the "framer_task_framing" case; the companion
# *_evals module imports EXPECTED_BEHAVIOR from here.
SCENARIO_NAME = "framer_task_framing"
SCENARIO_DESCRIPTION = (
    "Framer receives a vague iterate request and turns it into a bounded, "
    "well-formed task artifact another iterate role can safely use."
)

# Input for the scenario: a vague symptom report against an existing page.
INPUT_PAYLOAD = dict(
    role="framer",
    repo_root="/Users/devflow/repos/devflow_engine",
    current_user_message=(
        "The invite flow feels broken somewhere. "
        "Please fix it without redoing the page."
    ),
    context=dict(
        surface_hints=["invite flow", "existing page"],
        reported_symptom="broken somewhere",
    ),
)

# Expected outcome for the scenario.
EXPECTED_BEHAVIOR = dict(
    produces_task_artifact=True,
    classifies_request_as="error_fix",
    keeps_scope_bounded=True,
    distinguishes_current_vs_desired=True,
    writes_success_criteria=True,
    separates_unknowns=True,
    recommended_next_step_in=["stay_iterate", "investigate_first"],
)
@@ -0,0 +1,58 @@
1
+ from devin.nodes.iterate.scenarios.framer_task_framing import EXPECTED_BEHAVIOR
2
+
3
# Criteria advertised by this eval; the two non-boolean entries are taken
# straight from the scenario's EXPECTED_BEHAVIOR so they cannot drift.
EVAL_CRITERIA = {
    "produces_task_artifact": True,
    "classifies_request_as": EXPECTED_BEHAVIOR["classifies_request_as"],
    "keeps_scope_bounded": True,
    "distinguishes_current_vs_desired": True,
    "writes_success_criteria": True,
    "separates_unknowns": True,
    "recommended_next_step_in": EXPECTED_BEHAVIOR["recommended_next_step_in"],
}
12
+
13
+
14
+
15
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Grade a framer run against the ``framer_task_framing`` expectations.

    Args:
        actual_output: Framer result; ``task_artifact`` and
            ``recommended_next_step`` are read from it.

    Returns:
        ``(ok, notes)``. ``ok`` is ``False`` when the artifact is missing, the
        classification differs from ``EXPECTED_BEHAVIOR``, current/desired
        behavior or success criteria are absent, unknowns are not split into
        ``blocking``/``non_blocking``, or a supplied next step is outside the
        allowed set. A loosely worded scope only adds an advisory note.
    """
    ok = True
    notes: list[str] = []
    exp = EXPECTED_BEHAVIOR

    task_artifact = actual_output.get("task_artifact") or {}
    if not task_artifact:
        return False, ["missing task_artifact"]

    classification = str(task_artifact.get("task_type") or task_artifact.get("classification") or "")
    if classification != exp["classifies_request_as"]:
        ok = False
        notes.append(
            f"expected classification={exp['classifies_request_as']}, got {classification}"
        )

    if not (task_artifact.get("current_behavior") and task_artifact.get("desired_behavior")):
        ok = False
        notes.append("task_artifact must distinguish current_behavior and desired_behavior")

    if not task_artifact.get("success_criteria"):
        ok = False
        notes.append("task_artifact missing observable success_criteria")

    # Only a mapping can carry the two buckets. A flat list or string of
    # unknowns is reported as a failed check instead of raising
    # AttributeError on ``.get``.
    unknowns = task_artifact.get("unknowns")
    unknowns_separated = (
        isinstance(unknowns, dict)
        and unknowns.get("blocking") is not None
        and unknowns.get("non_blocking") is not None
    )
    if not unknowns_separated:
        ok = False
        notes.append("task_artifact must separate blocking and non_blocking unknowns")

    # A missing next step is tolerated; only a wrong one fails.
    next_step = str(actual_output.get("recommended_next_step") or task_artifact.get("recommended_next_step") or "")
    if next_step and next_step not in exp["recommended_next_step_in"]:
        ok = False
        notes.append(
            f"expected recommended_next_step in {exp['recommended_next_step_in']}, got {next_step}"
        )

    # Advisory only: the scope wording check never flips ``ok``.
    scope = str(task_artifact.get("scope_boundary") or task_artifact.get("non_goals") or "").lower()
    if scope and not any(tok in scope for tok in ("no redesign", "existing", "invite", "bounded")):
        notes.append("scope boundary is present but may not be clearly bounded")

    return ok, notes
@@ -0,0 +1,21 @@
1
# Scenario fixture: a reproducible bug report that iterate should own and fix.
SCENARIO_NAME = "iterate_error_fix"
SCENARIO_DESCRIPTION = (
    "A reproducible existing-product error should stay in iterate, "
    "keep a repair scope, and return an iterate-owned outcome "
    "rather than diagnosis-only handling."
)

# Input handed to the node under evaluation.
INPUT_PAYLOAD = {
    "current_user_message": (
        "Fix this bug: opening /billing throws a 500 every time. "
        "The logs say KeyError: customer_id in billing_summary()."
    ),
    "idea_id": "devin_eval_iterate_error_fix",
    "project_id": "proj_75f63d30",
    "repo_root": "/Users/devflow/repos/Spicy-Server",
}

# Expectations consumed by the matching eval module.
EXPECTED_BEHAVIOR = {
    "route_arm": "iterate",
    "response_kind_in": ["completed", "blocked", "needs_more_context"],
    "treats_request_as_error_fix": True,
    "preserves_reproducible_failure_context": True,
    "does_not_reroute": True,
}
@@ -0,0 +1,39 @@
1
+ from devin.nodes.iterate.scenarios.iterate_error_fix import (
2
+ EXPECTED_BEHAVIOR,
3
+ INPUT_PAYLOAD,
4
+ SCENARIO_NAME,
5
+ )
6
+
7
# Criteria advertised by this eval: the routing values mirror the scenario's
# EXPECTED_BEHAVIOR, followed by the boolean behavior flags.
EVAL_CRITERIA = {
    "route_arm_must_equal": EXPECTED_BEHAVIOR["route_arm"],
    "response_kind_in": EXPECTED_BEHAVIOR["response_kind_in"],
    **dict.fromkeys(
        (
            "treats_request_as_error_fix",
            "preserves_reproducible_failure_context",
            "does_not_reroute",
        ),
        True,
    ),
}
14
+
15
+
16
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Grade one run of the ``iterate_error_fix`` scenario.

    Hard failures are a ``route_arm`` other than the expected one, a
    ``response_kind`` outside the allowed set, or a ``target_lane`` of
    ``insight``. A non-empty ``response_message`` that mentions none of the
    error keywords only adds an advisory note.

    Returns:
        ``(ok, notes)`` with ``ok`` false when any hard failure occurred and
        ``notes`` listing every finding in check order.
    """
    expected = EXPECTED_BEHAVIOR
    findings = []
    hard_failures = 0

    arm = actual_output.get("route_arm")
    if arm != expected["route_arm"]:
        hard_failures += 1
        findings.append(f"expected route_arm={expected['route_arm']}, got {arm}")

    response_kind = str(actual_output.get("response_kind") or "")
    if response_kind not in expected["response_kind_in"]:
        hard_failures += 1
        findings.append(
            f"expected response_kind in {expected['response_kind_in']}, got {response_kind}"
        )

    # Advisory: the reply should still talk about the reported failure.
    message = str(actual_output.get("response_message") or "").lower()
    error_markers = ("500", "keyerror", "billing", "error", "bug")
    if message and all(marker not in message for marker in error_markers):
        findings.append("response does not clearly preserve the reported error context")

    lane = str(actual_output.get("target_lane") or "").lower()
    if lane == "insight":
        hard_failures += 1
        findings.append("unexpected reroute to insight for a reproducible fix request")

    return hard_failures == 0, findings
@@ -0,0 +1,21 @@
1
# Scenario fixture: a small behavior tweak that must stay task-sized.
SCENARIO_NAME = "iterate_quick_change"
SCENARIO_DESCRIPTION = (
    "A tiny existing-surface behavior tweak should remain task-scale "
    "in iterate and avoid inflation into broader planning."
)

# Input handed to the node under evaluation.
INPUT_PAYLOAD = {
    "current_user_message": (
        "On the current invoices table, "
        "default the Status filter to Open instead of All. "
        "No redesign, just that behavior change."
    ),
    "idea_id": "devin_eval_iterate_quick_change",
    "project_id": "proj_75f63d30",
    "repo_root": "/Users/devflow/repos/Spicy-Server",
}

# Expectations consumed by the matching eval module.
EXPECTED_BEHAVIOR = {
    "route_arm": "iterate",
    "response_kind_in": ["completed", "blocked", "needs_more_context"],
    "keeps_scope_small": True,
    "treats_request_as_quick_change": True,
    "does_not_promote": True,
}
@@ -0,0 +1,35 @@
1
+ from devin.nodes.iterate.scenarios.iterate_quick_change import (
2
+ EXPECTED_BEHAVIOR,
3
+ INPUT_PAYLOAD,
4
+ SCENARIO_NAME,
5
+ )
6
+
7
# Criteria advertised by this eval: the routing values mirror the scenario's
# EXPECTED_BEHAVIOR, followed by the boolean behavior flags.
EVAL_CRITERIA = {
    "route_arm_must_equal": EXPECTED_BEHAVIOR["route_arm"],
    "response_kind_in": EXPECTED_BEHAVIOR["response_kind_in"],
    **dict.fromkeys(
        ("keeps_scope_small", "treats_request_as_quick_change", "does_not_promote"),
        True,
    ),
}
14
+
15
+
16
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Grade one run of the ``iterate_quick_change`` scenario.

    The run fails when ``route_arm`` differs from the expected arm, when
    ``response_kind`` is outside the allowed set, or when ``target_lane`` is
    ``idea`` (case-insensitive).

    Returns:
        ``(ok, notes)`` with one note per failed check, in check order.
    """
    expected = EXPECTED_BEHAVIOR
    problems = []

    arm = actual_output.get("route_arm")
    if arm != expected["route_arm"]:
        problems.append(f"expected route_arm={expected['route_arm']}, got {arm}")

    response_kind = str(actual_output.get("response_kind") or "")
    if response_kind not in expected["response_kind_in"]:
        problems.append(
            f"expected response_kind in {expected['response_kind_in']}, got {response_kind}"
        )

    lane = str(actual_output.get("target_lane") or "").lower()
    if lane == "idea":
        problems.append("unexpected promotion to idea for a scoped quick change")

    # Every check here is a hard failure, so the verdict is "no problems".
    return not problems, problems
@@ -0,0 +1,23 @@
1
# Scenario fixture: a small ask that grows into feature-level planning.
SCENARIO_NAME = "iterate_to_idea_promotion"
SCENARIO_DESCRIPTION = (
    "A task that starts as a small iterate ask but expands into "
    "broader feature or workflow planning should be promoted to idea "
    "with durable iterate linkage."
)

# Input handed to the node under evaluation.
INPUT_PAYLOAD = {
    "current_user_message": (
        "Start by making the current order export include discounts, "
        "but if that means we need a full configurable export builder "
        "across orders, refunds, and payouts, "
        "figure out the right next step."
    ),
    "idea_id": "devin_eval_iterate_to_idea_promotion",
    "project_id": "proj_75f63d30",
    "repo_root": "/Users/devflow/repos/Spicy-Server",
}

# Expectations consumed by the matching eval module.
EXPECTED_BEHAVIOR = {
    "route_arm": "iterate",
    "response_kind": "promote_to_idea",
    "target_lane": "idea",
    "uses_tool": "call_devin_ideation",
    "writes_promotion_handoff": True,
    "run_state": "promoted",
}
@@ -0,0 +1,53 @@
1
+ from devin.nodes.iterate.scenarios.iterate_to_idea_promotion import (
2
+ EXPECTED_BEHAVIOR,
3
+ INPUT_PAYLOAD,
4
+ SCENARIO_NAME,
5
+ )
6
+
7
# Criteria advertised by this eval; every "*_must_equal" value is read from
# the scenario's EXPECTED_BEHAVIOR rather than repeated here.
EVAL_CRITERIA = {
    "route_arm_must_equal": EXPECTED_BEHAVIOR["route_arm"],
    "response_kind_must_equal": EXPECTED_BEHAVIOR["response_kind"],
    "target_lane_must_equal": EXPECTED_BEHAVIOR["target_lane"],
    "writes_promotion_handoff": True,
    "run_state_must_equal": EXPECTED_BEHAVIOR["run_state"],
}
14
+
15
+
16
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Grade one run of the ``iterate_to_idea_promotion`` scenario.

    Args:
        actual_output: Node result; reads ``route_arm``, ``response_kind``,
            ``target_lane``, ``run_state``/``final_verdict``,
            ``promotion_handoff``, ``handoff_target_lane`` and ``handoff_ref``.

    Returns:
        ``(ok, notes)``. ``ok`` is ``False`` when route arm, response kind or
        target lane differ from ``EXPECTED_BEHAVIOR``, when a handoff lane is
        given but wrong, or when no handoff linkage is present at all. A
        mismatching run state is recorded as a note only.
    """
    ok = True
    notes: list[str] = []
    exp = EXPECTED_BEHAVIOR

    route_arm = actual_output.get("route_arm")
    if route_arm != exp["route_arm"]:
        ok = False
        notes.append(f"expected route_arm={exp['route_arm']}, got {route_arm}")

    kind = actual_output.get("response_kind")
    if kind != exp["response_kind"]:
        ok = False
        notes.append(f"expected response_kind={exp['response_kind']}, got {kind}")

    target_lane = str(actual_output.get("target_lane") or "").lower()
    if target_lane != exp["target_lane"]:
        ok = False
        notes.append(f"expected target_lane={exp['target_lane']}, got {target_lane}")

    # Advisory: a reported but different terminal state does not flip ``ok``.
    run_state = str(actual_output.get("run_state") or actual_output.get("final_verdict") or "")
    if run_state and run_state != exp["run_state"]:
        notes.append(f"expected promoted terminal state, got {run_state}")

    promotion_handoff = actual_output.get("promotion_handoff") or {}
    # A handoff given as a plain reference (e.g. a path string) still counts
    # as linkage; it just has no nested lane to read. Calling ``.get`` on it
    # would raise AttributeError.
    nested_lane = (
        promotion_handoff.get("target_lane") if isinstance(promotion_handoff, dict) else None
    )
    handoff_lane = str(nested_lane or actual_output.get("handoff_target_lane") or "").lower()
    if handoff_lane and handoff_lane != exp["target_lane"]:
        ok = False
        notes.append(f"expected promotion handoff target_lane={exp['target_lane']}, got {handoff_lane}")
    elif not promotion_handoff and not actual_output.get("handoff_ref"):
        ok = False
        notes.append("missing iterate-owned promotion_handoff linkage for idea promotion")

    return ok, notes
@@ -0,0 +1,23 @@
1
# Scenario fixture: a diagnosis-style request where iterate stays the owner
# and consults the insight arm as a subagent.
SCENARIO_NAME = "iterate_to_insight_reroute"
SCENARIO_DESCRIPTION = (
    "A request that initially looks like iterate but becomes "
    "diagnosis-only after observation should call the insight arm "
    "as a subagent while iterate remains the owner."
)

# Input handed to the node under evaluation.
INPUT_PAYLOAD = {
    "current_user_message": (
        "I thought the checkout totals were wrong, "
        "but before changing code I mostly need you to determine "
        "why tax is being calculated this way "
        "and explain whether it is correct."
    ),
    "idea_id": "devin_eval_iterate_to_insight_reroute",
    "project_id": "proj_75f63d30",
    "repo_root": "/Users/devflow/repos/Spicy-Server",
}

# Expectations consumed by the matching eval module.
EXPECTED_BEHAVIOR = {
    "route_arm": "iterate",
    "calls_tool": "devin_insight",
    "response_kind_in": ["completed", "blocked", "needs_more_context"],
    "does_not_return_route_to_insight_only": True,
    "does_not_spawn_coder": True,
    "stays_iterate_owned": True,
}
@@ -0,0 +1,53 @@
1
+ from devin.nodes.iterate.scenarios.iterate_to_insight_reroute import (
2
+ EXPECTED_BEHAVIOR,
3
+ INPUT_PAYLOAD,
4
+ SCENARIO_NAME,
5
+ )
6
+
7
# Criteria advertised by this eval. Only keys that the scenario's
# EXPECTED_BEHAVIOR actually defines are indexed: it has no "response_kind"
# or "target_lane" entry, so looking those up raised KeyError on import.
EVAL_CRITERIA = {
    "route_arm_must_equal": EXPECTED_BEHAVIOR["route_arm"],
    "response_kind_in": EXPECTED_BEHAVIOR["response_kind_in"],
    "calls_tool": EXPECTED_BEHAVIOR["calls_tool"],
    "does_not_return_route_to_insight_only": True,
    "does_not_spawn_coder": True,
    "stays_iterate_owned": True,
}
14
+
15
+
16
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Grade one run of the ``iterate_to_insight_reroute`` scenario.

    The scenario expects iterate to remain the owner and return one of the
    allowed response kinds, so the checks use the ``route_arm`` and
    ``response_kind_in`` entries of ``EXPECTED_BEHAVIOR``. The scenario
    defines no ``response_kind`` or ``target_lane`` key.

    Args:
        actual_output: Node result; reads ``route_arm``, ``response_kind``
            and ``target_lane``.

    Returns:
        ``(ok, notes)``. ``ok`` is ``False`` when the route arm differs, the
        response kind is outside the allowed set, or the run reports a lane
        transition to ``insight``.
    """
    ok = True
    notes: list[str] = []
    exp = EXPECTED_BEHAVIOR

    route_arm = actual_output.get("route_arm")
    if route_arm != exp["route_arm"]:
        ok = False
        notes.append(f"expected route_arm={exp['route_arm']}, got {route_arm}")

    kind = str(actual_output.get("response_kind") or "")
    if kind not in exp["response_kind_in"]:
        ok = False
        notes.append(f"expected response_kind in {exp['response_kind_in']}, got {kind}")

    # NOTE(review): assumes ``target_lane`` marks a handoff of ownership, as
    # the sibling iterate evals treat it - confirm against the runtime output.
    # The expected insight tool call is not checked here because no output
    # field carrying tool calls is visible from this module.
    target_lane = str(actual_output.get("target_lane") or "").lower()
    if target_lane == "insight":
        ok = False
        notes.append("unexpected lane transition to insight; iterate should remain the owner")

    return ok, notes
@@ -0,0 +1,28 @@
1
# Scenario fixture: the observer role gets a framed task plus evidence and
# must name the failing seam.
SCENARIO_NAME = "observer_evidence_seam"
SCENARIO_DESCRIPTION = (
    "Observer receives a framed iterate task plus evidence "
    "and identifies the bounded failing seam that should govern coding."
)

# Input handed to the role under evaluation.
INPUT_PAYLOAD = {
    "role": "observer",
    "repo_root": "/Users/devflow/repos/devflow_engine",
    "task_artifact": {
        "task_type": "error_fix",
        "surface": "invite acceptance",
        "current_behavior": "Submitting a valid invite token returns HTTP 500.",
        "desired_behavior": "Valid invite tokens complete account activation successfully.",
        "success_criteria": [
            "invite acceptance returns 200",
            "account activation completes",
        ],
    },
    "evidence": {
        "http_status": 500,
        "log_excerpt": "TypeError: invite.accepted_at must be datetime, got None",
        "route": "POST /api/invites/accept",
    },
}

# Expectations consumed by the matching eval module.
EXPECTED_BEHAVIOR = {
    "produces_observation_artifact": True,
    "confirms_failure": True,
    "identifies_failing_seam": True,
    "expected_green_condition_present": True,
    "ready_for_coder": True,
}
@@ -0,0 +1,55 @@
1
+ from devin.nodes.iterate.scenarios.observer_evidence_seam import EXPECTED_BEHAVIOR
2
+
3
# Criteria advertised by this eval: every one is a plain boolean flag.
EVAL_CRITERIA = dict.fromkeys(
    (
        "produces_observation_artifact",
        "confirms_failure",
        "identifies_failing_seam",
        "expected_green_condition_present",
        "ready_for_coder",
    ),
    True,
)
10
+
11
+
12
+
13
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Grade one run of the ``observer_evidence_seam`` scenario.

    Hard failures are a missing ``observation_artifact``, a verdict outside
    the confirmed-style set, a missing failing seam, a missing green
    condition, or ``ready_for_coder`` explicitly set to ``False``. Evidence
    text that mentions none of the supplied evidence keywords only adds an
    advisory note.

    Returns:
        ``(ok, notes)`` with ``notes`` in check order.
    """
    artifact = actual_output.get("observation_artifact") or {}
    if not artifact:
        return False, ["missing observation_artifact"]

    findings = []
    hard_failures = 0

    accepted_verdicts = {"confirmed", "ready_for_coder", "completed"}
    verdict = str(artifact.get("status") or actual_output.get("response_kind") or "").lower()
    if verdict not in accepted_verdicts:
        hard_failures += 1
        findings.append(f"expected confirmed-style verdict, got {verdict}")

    failing_seam = str(artifact.get("failing_seam") or artifact.get("seam") or "")
    if not failing_seam:
        hard_failures += 1
        findings.append("missing failing_seam in observation_artifact")

    green_condition = str(
        artifact.get("expected_green_condition") or artifact.get("green_condition") or ""
    )
    if not green_condition:
        hard_failures += 1
        findings.append("missing expected_green_condition")

    # Only an explicit False fails; an absent flag is tolerated.
    if artifact.get("ready_for_coder") is False:
        hard_failures += 1
        findings.append("observer should mark this evidence-backed seam as ready_for_coder")

    # Advisory: the artifact should echo some of the supplied evidence.
    evidence_parts = []
    for field in ("evidence_summary", "failing_seam", "log_excerpt"):
        value = artifact.get(field)
        if value:
            evidence_parts.append(str(value))
    evidence_text = " ".join(evidence_parts).lower()
    evidence_markers = ("500", "typeerror", "invite", "accept")
    if evidence_text and all(marker not in evidence_text for marker in evidence_markers):
        findings.append("observation_artifact does not clearly preserve the supplied evidence")

    return hard_failures == 0, findings
@@ -0,0 +1,28 @@
1
# Scenario fixture: the observer role must turn a bounded task into a narrow,
# deterministic repro.
SCENARIO_NAME = "observer_repro_creation"
SCENARIO_DESCRIPTION = (
    "Observer turns a bounded task into a narrow reproducible failing repro "
    "instead of jumping to implementation."
)

# Input handed to the role under evaluation.
INPUT_PAYLOAD = {
    "role": "observer",
    "repo_root": "/Users/devflow/repos/devflow_engine",
    "task_artifact": {
        "task_type": "targeted_improvement",
        "surface": "CSV import wizard",
        "current_behavior": "Rows with trailing spaces in email fields are rejected as invalid.",
        "desired_behavior": "Trailing whitespace is trimmed before validation during import.",
        "success_criteria": [
            "trimmed emails import successfully",
            "invalid emails still fail validation",
        ],
    },
    "evidence": {
        "sample_row": {"email": " alice@example.com "},
        "observed_result": "validation error: invalid email",
        "entry_point": "POST /api/imports/preview",
    },
}

# Expectations consumed by the matching eval module.
EXPECTED_BEHAVIOR = {
    "produces_observation_artifact": True,
    "creates_narrow_repro": True,
    "repro_is_deterministic": True,
    "expected_green_condition_present": True,
    "ready_for_coder": True,
}
@@ -0,0 +1,45 @@
1
+ from devin.nodes.iterate.scenarios.observer_repro_creation import EXPECTED_BEHAVIOR
2
+
3
# Criteria advertised by this eval: every one is a plain boolean flag.
EVAL_CRITERIA = dict.fromkeys(
    (
        "produces_observation_artifact",
        "creates_narrow_repro",
        "repro_is_deterministic",
        "expected_green_condition_present",
        "ready_for_coder",
    ),
    True,
)
10
+
11
+
12
+
13
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Grade one run of the ``observer_repro_creation`` scenario.

    Hard failures are a missing ``observation_artifact``, missing repro
    steps, a missing green condition, or ``ready_for_coder`` explicitly set
    to ``False``. A repro list longer than six steps, or repro text that
    mentions none of the seam keywords, only adds an advisory note.

    Returns:
        ``(ok, notes)`` with ``notes`` in check order.
    """
    artifact = actual_output.get("observation_artifact") or {}
    if not artifact:
        return False, ["missing observation_artifact"]

    findings = []
    hard_failures = 0

    repro = artifact.get("repro_steps") or artifact.get("repro") or []
    repro_is_list = isinstance(repro, list)
    if not repro:
        hard_failures += 1
        findings.append("missing bounded repro steps")

    # Advisory: a long step list suggests the repro is not narrow.
    if repro_is_list and len(repro) > 6:
        findings.append("repro exists but may be broader than necessary")

    green_condition = str(
        artifact.get("expected_green_condition") or artifact.get("green_condition") or ""
    )
    if not green_condition:
        hard_failures += 1
        findings.append("missing expected_green_condition")

    # Only an explicit False fails; an absent flag is tolerated.
    if artifact.get("ready_for_coder") is False:
        hard_failures += 1
        findings.append("observer should mark deterministic repro as ready_for_coder")

    # Advisory: the repro should mention the supplied seam somewhere.
    if repro_is_list:
        repro_text = " ".join(str(step) for step in repro).lower()
    else:
        repro_text = str(repro).lower()
    seam_markers = ("csv", "email", "preview", "trailing", "whitespace")
    if repro_text and all(marker not in repro_text for marker in seam_markers):
        findings.append("repro does not appear aligned to the supplied seam")

    return hard_failures == 0, findings