devflow-engine 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393) hide show
  1. devflow_engine/__init__.py +3 -0
  2. devflow_engine/agentic_prompts.py +100 -0
  3. devflow_engine/agentic_runtime.py +398 -0
  4. devflow_engine/api_key_flow_harness.py +539 -0
  5. devflow_engine/api_keys.py +357 -0
  6. devflow_engine/bootstrap/__init__.py +2 -0
  7. devflow_engine/bootstrap/provision_from_template.py +84 -0
  8. devflow_engine/cli/__init__.py +0 -0
  9. devflow_engine/cli/app.py +7270 -0
  10. devflow_engine/core/__init__.py +0 -0
  11. devflow_engine/core/config.py +86 -0
  12. devflow_engine/core/logging.py +29 -0
  13. devflow_engine/core/paths.py +45 -0
  14. devflow_engine/core/toml_kv.py +33 -0
  15. devflow_engine/devflow_event_worker.py +1292 -0
  16. devflow_engine/devflow_state.py +201 -0
  17. devflow_engine/devin2/__init__.py +9 -0
  18. devflow_engine/devin2/agent_definition.py +120 -0
  19. devflow_engine/devin2/pi_runner.py +204 -0
  20. devflow_engine/devin_orchestration.py +69 -0
  21. devflow_engine/docs/prompts/anti-patterns.md +42 -0
  22. devflow_engine/docs/prompts/devin-agent-prompt.md +55 -0
  23. devflow_engine/docs/prompts/devin2-agent-prompt.md +81 -0
  24. devflow_engine/docs/prompts/examples/devin-vapi-clone-reference-exchange.json +85 -0
  25. devflow_engine/doctor/__init__.py +2 -0
  26. devflow_engine/doctor/triage.py +140 -0
  27. devflow_engine/error/__init__.py +0 -0
  28. devflow_engine/error/remediation.py +21 -0
  29. devflow_engine/errors/error_solver_dag.py +522 -0
  30. devflow_engine/errors/runtime_observability.py +67 -0
  31. devflow_engine/idea/__init__.py +4 -0
  32. devflow_engine/idea/actors.py +481 -0
  33. devflow_engine/idea/agentic.py +465 -0
  34. devflow_engine/idea/analyze.py +93 -0
  35. devflow_engine/idea/devin_chat_dag.py +1 -0
  36. devflow_engine/idea/diff.py +99 -0
  37. devflow_engine/idea/drafts.py +446 -0
  38. devflow_engine/idea/idea_creation_dag.py +643 -0
  39. devflow_engine/idea/ideation_enrichment.py +355 -0
  40. devflow_engine/idea/ideation_enrichment_worker.py +19 -0
  41. devflow_engine/idea/paths.py +28 -0
  42. devflow_engine/idea/promote.py +53 -0
  43. devflow_engine/idea/redaction.py +27 -0
  44. devflow_engine/idea/repo_tools.py +1277 -0
  45. devflow_engine/idea/response_mode.py +30 -0
  46. devflow_engine/idea/story_pipeline.py +1585 -0
  47. devflow_engine/idea/sufficiency.py +376 -0
  48. devflow_engine/idea/traditional_stories.py +1257 -0
  49. devflow_engine/implementation/__init__.py +0 -0
  50. devflow_engine/implementation/alembic_preflight.py +700 -0
  51. devflow_engine/implementation/dag.py +8450 -0
  52. devflow_engine/implementation/green_gate.py +93 -0
  53. devflow_engine/implementation/prompts.py +108 -0
  54. devflow_engine/implementation/test_runtime.py +623 -0
  55. devflow_engine/integration/__init__.py +19 -0
  56. devflow_engine/integration/agentic.py +66 -0
  57. devflow_engine/integration/dag.py +3539 -0
  58. devflow_engine/integration/prompts.py +114 -0
  59. devflow_engine/integration/supabase_schema.sql +31 -0
  60. devflow_engine/integration/supabase_sync.py +177 -0
  61. devflow_engine/llm/__init__.py +1 -0
  62. devflow_engine/llm/cli_one_shot.py +84 -0
  63. devflow_engine/llm/cli_stream.py +371 -0
  64. devflow_engine/llm/execution_context.py +26 -0
  65. devflow_engine/llm/invoke.py +1322 -0
  66. devflow_engine/llm/provider_api.py +304 -0
  67. devflow_engine/llm/repo_knowledge.py +588 -0
  68. devflow_engine/llm_primitives.py +315 -0
  69. devflow_engine/orchestration.py +62 -0
  70. devflow_engine/planning/__init__.py +0 -0
  71. devflow_engine/planning/analyze_repo.py +92 -0
  72. devflow_engine/planning/render_drafts.py +133 -0
  73. devflow_engine/playground/__init__.py +0 -0
  74. devflow_engine/playground/hooks.py +26 -0
  75. devflow_engine/playwright_workflow/__init__.py +5 -0
  76. devflow_engine/playwright_workflow/dag.py +1317 -0
  77. devflow_engine/process/__init__.py +5 -0
  78. devflow_engine/process/dag.py +59 -0
  79. devflow_engine/project_registration/__init__.py +3 -0
  80. devflow_engine/project_registration/dag.py +1581 -0
  81. devflow_engine/project_registry.py +109 -0
  82. devflow_engine/prompts/devin/generic/prompt.md +6 -0
  83. devflow_engine/prompts/devin/ideation/prompt.md +263 -0
  84. devflow_engine/prompts/devin/ideation/scenarios.md +5 -0
  85. devflow_engine/prompts/devin/ideation_loop/prompt.md +6 -0
  86. devflow_engine/prompts/devin/insight/prompt.md +11 -0
  87. devflow_engine/prompts/devin/insight/scenarios.md +5 -0
  88. devflow_engine/prompts/devin/intake/prompt.md +15 -0
  89. devflow_engine/prompts/devin/iterate/prompt.md +12 -0
  90. devflow_engine/prompts/devin/shared/eval_doctrine.md +9 -0
  91. devflow_engine/prompts/devin/shared/principles.md +246 -0
  92. devflow_engine/prompts/devin_eval/assessment/prompt.md +18 -0
  93. devflow_engine/prompts/idea/api_ideation_agent/prompt.md +8 -0
  94. devflow_engine/prompts/idea/api_insight_agent/prompt.md +8 -0
  95. devflow_engine/prompts/idea/response_doctrine/prompt.md +18 -0
  96. devflow_engine/prompts/implementation/dependency_assessment/prompt.md +12 -0
  97. devflow_engine/prompts/implementation/green/green/prompt.md +11 -0
  98. devflow_engine/prompts/implementation/green/node_config/prompt.md +3 -0
  99. devflow_engine/prompts/implementation/green_review/outcome_review/prompt.md +5 -0
  100. devflow_engine/prompts/implementation/green_review/prior_run_review/prompt.md +5 -0
  101. devflow_engine/prompts/implementation/red/prompt.md +27 -0
  102. devflow_engine/prompts/implementation/redreview/prompt.md +23 -0
  103. devflow_engine/prompts/implementation/redreview_repair/prompt.md +16 -0
  104. devflow_engine/prompts/implementation/setupdoc/prompt.md +10 -0
  105. devflow_engine/prompts/implementation/story_planning/prompt.md +13 -0
  106. devflow_engine/prompts/implementation/test_design/prompt.md +27 -0
  107. devflow_engine/prompts/integration/README.md +185 -0
  108. devflow_engine/prompts/integration/green/example.md +67 -0
  109. devflow_engine/prompts/integration/green/green/prompt.md +10 -0
  110. devflow_engine/prompts/integration/green/node_config/prompt.md +42 -0
  111. devflow_engine/prompts/integration/green/past_prompts/20260417T212300/green/prompt.md +15 -0
  112. devflow_engine/prompts/integration/green/past_prompts/20260417T212300/node_config/prompt.md +42 -0
  113. devflow_engine/prompts/integration/green_enrich/example.md +79 -0
  114. devflow_engine/prompts/integration/green_enrich/green_enrich/prompt.md +9 -0
  115. devflow_engine/prompts/integration/green_enrich/node_config/prompt.md +41 -0
  116. devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/green_enrich/prompt.md +14 -0
  117. devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/node_config/prompt.md +41 -0
  118. devflow_engine/prompts/integration/red/code_repair/prompt.md +12 -0
  119. devflow_engine/prompts/integration/red/example.md +152 -0
  120. devflow_engine/prompts/integration/red/node_config/prompt.md +86 -0
  121. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
  122. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/node_config/prompt.md +84 -0
  123. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red/prompt.md +16 -0
  124. devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red_repair/prompt.md +15 -0
  125. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
  126. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/node_config/prompt.md +84 -0
  127. devflow_engine/prompts/integration/red/past_prompts/20260417T215032/red_repair/prompt.md +11 -0
  128. devflow_engine/prompts/integration/red/red/prompt.md +11 -0
  129. devflow_engine/prompts/integration/red/red_repair/prompt.md +12 -0
  130. devflow_engine/prompts/integration/red_review/example.md +71 -0
  131. devflow_engine/prompts/integration/red_review/node_config/prompt.md +41 -0
  132. devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/node_config/prompt.md +41 -0
  133. devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/red_review/prompt.md +15 -0
  134. devflow_engine/prompts/integration/red_review/red_review/prompt.md +9 -0
  135. devflow_engine/prompts/integration/resolve/example.md +111 -0
  136. devflow_engine/prompts/integration/resolve/node_config/prompt.md +64 -0
  137. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/node_config/prompt.md +64 -0
  138. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_implicated_users/prompt.md +15 -0
  139. devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_side_effects/prompt.md +15 -0
  140. devflow_engine/prompts/integration/resolve/resolve_implicated_users/prompt.md +10 -0
  141. devflow_engine/prompts/integration/resolve/resolve_side_effects/prompt.md +10 -0
  142. devflow_engine/prompts/integration/validate/build_idea_acceptance_coverage/prompt.md +12 -0
  143. devflow_engine/prompts/integration/validate/code_repair/prompt.md +13 -0
  144. devflow_engine/prompts/integration/validate/example.md +143 -0
  145. devflow_engine/prompts/integration/validate/node_config/prompt.md +87 -0
  146. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
  147. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/node_config/prompt.md +67 -0
  148. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_enrich_gate/prompt.md +17 -0
  149. devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_repair/prompt.md +16 -0
  150. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
  151. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/node_config/prompt.md +67 -0
  152. devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/validate_repair/prompt.md +9 -0
  153. devflow_engine/prompts/integration/validate/validate_enrich_gate/prompt.md +10 -0
  154. devflow_engine/prompts/integration/validate/validate_repair/prompt.md +20 -0
  155. devflow_engine/prompts/integration/write_workflows/example.md +100 -0
  156. devflow_engine/prompts/integration/write_workflows/node_config/prompt.md +44 -0
  157. devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/node_config/prompt.md +44 -0
  158. devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/write_workflows/prompt.md +17 -0
  159. devflow_engine/prompts/integration/write_workflows/write_workflows/prompt.md +11 -0
  160. devflow_engine/prompts/iterate/README.md +7 -0
  161. devflow_engine/prompts/iterate/coder/prompt.md +11 -0
  162. devflow_engine/prompts/iterate/framer/prompt.md +11 -0
  163. devflow_engine/prompts/iterate/iterator/prompt.md +13 -0
  164. devflow_engine/prompts/iterate/observer/prompt.md +11 -0
  165. devflow_engine/prompts/recovery/diagnosis/prompt.md +7 -0
  166. devflow_engine/prompts/recovery/execution/prompt.md +8 -0
  167. devflow_engine/prompts/recovery/execution_verification/prompt.md +7 -0
  168. devflow_engine/prompts/recovery/failure_investigation/prompt.md +10 -0
  169. devflow_engine/prompts/recovery/preflight_health_repo_repair/prompt.md +8 -0
  170. devflow_engine/prompts/recovery/remediation_execution/prompt.md +11 -0
  171. devflow_engine/prompts/recovery/root_cause_investigation/prompt.md +12 -0
  172. devflow_engine/prompts/scope_idea/doctrine/prompt.md +7 -0
  173. devflow_engine/prompts/source_doc_eval/document/prompt.md +6 -0
  174. devflow_engine/prompts/source_doc_eval/targeted_mutation/prompt.md +9 -0
  175. devflow_engine/prompts/source_doc_mutation/domain_entities/prompt.md +6 -0
  176. devflow_engine/prompts/source_doc_mutation/product_brief/prompt.md +6 -0
  177. devflow_engine/prompts/source_doc_mutation/project_doc_coherence/prompt.md +7 -0
  178. devflow_engine/prompts/source_doc_mutation/project_doc_render/prompt.md +9 -0
  179. devflow_engine/prompts/source_doc_mutation/source_doc_coherence/prompt.md +5 -0
  180. devflow_engine/prompts/source_doc_mutation/source_doc_enrichment_coherence/prompt.md +6 -0
  181. devflow_engine/prompts/source_doc_mutation/user_workflows/prompt.md +6 -0
  182. devflow_engine/prompts/source_scope/doctrine/prompt.md +10 -0
  183. devflow_engine/prompts/ui_grounding/doctrine/prompt.md +7 -0
  184. devflow_engine/recovery/__init__.py +3 -0
  185. devflow_engine/recovery/dag.py +2609 -0
  186. devflow_engine/recovery/models.py +220 -0
  187. devflow_engine/refactor.py +93 -0
  188. devflow_engine/registry/__init__.py +1 -0
  189. devflow_engine/registry/cards.py +238 -0
  190. devflow_engine/registry/domain_normalize.py +60 -0
  191. devflow_engine/registry/effects.py +65 -0
  192. devflow_engine/registry/enforce_report.py +150 -0
  193. devflow_engine/registry/module_cards_classify.py +164 -0
  194. devflow_engine/registry/module_cards_draft.py +184 -0
  195. devflow_engine/registry/module_cards_gate.py +59 -0
  196. devflow_engine/registry/packages.py +347 -0
  197. devflow_engine/registry/pathways.py +323 -0
  198. devflow_engine/review/__init__.py +11 -0
  199. devflow_engine/review/dag.py +588 -0
  200. devflow_engine/review/review_story.py +67 -0
  201. devflow_engine/scope_idea/__init__.py +3 -0
  202. devflow_engine/scope_idea/agentic.py +39 -0
  203. devflow_engine/scope_idea/dag.py +1069 -0
  204. devflow_engine/scope_idea/models.py +175 -0
  205. devflow_engine/skills/builtins/devflow/queue_failure_investigation/SKILL.md +112 -0
  206. devflow_engine/skills/builtins/devflow/queue_idea_to_story/SKILL.md +120 -0
  207. devflow_engine/skills/builtins/devflow/queue_integration/SKILL.md +105 -0
  208. devflow_engine/skills/builtins/devflow/queue_recovery/SKILL.md +108 -0
  209. devflow_engine/skills/builtins/devflow/queue_runtime_core/SKILL.md +155 -0
  210. devflow_engine/skills/builtins/devflow/queue_story_implementation/SKILL.md +122 -0
  211. devflow_engine/skills/builtins/devin/idea_to_story_handoff/SKILL.md +120 -0
  212. devflow_engine/skills/builtins/devin/ideation/SKILL.md +168 -0
  213. devflow_engine/skills/builtins/devin/ideation/state-and-phrasing-reference.md +18 -0
  214. devflow_engine/skills/builtins/devin/insight/SKILL.md +22 -0
  215. devflow_engine/skills/registry.example.yaml +42 -0
  216. devflow_engine/source_doc_assumptions.py +291 -0
  217. devflow_engine/source_doc_mutation_dag.py +1606 -0
  218. devflow_engine/source_doc_mutation_eval.py +417 -0
  219. devflow_engine/source_doc_mutation_worker.py +25 -0
  220. devflow_engine/source_docs_schema.py +207 -0
  221. devflow_engine/source_docs_updater.py +309 -0
  222. devflow_engine/source_scope/__init__.py +15 -0
  223. devflow_engine/source_scope/agentic.py +45 -0
  224. devflow_engine/source_scope/dag.py +1626 -0
  225. devflow_engine/source_scope/models.py +177 -0
  226. devflow_engine/stores/__init__.py +0 -0
  227. devflow_engine/stores/execution_store.py +3534 -0
  228. devflow_engine/story/__init__.py +0 -0
  229. devflow_engine/story/contracts.py +160 -0
  230. devflow_engine/story/discovery.py +47 -0
  231. devflow_engine/story/evidence.py +118 -0
  232. devflow_engine/story/hashing.py +27 -0
  233. devflow_engine/story/implemented_queue_purge.py +148 -0
  234. devflow_engine/story/indexer.py +105 -0
  235. devflow_engine/story/io.py +20 -0
  236. devflow_engine/story/markdown_contracts.py +298 -0
  237. devflow_engine/story/reconciliation.py +408 -0
  238. devflow_engine/story/validate_stories.py +149 -0
  239. devflow_engine/story/validate_tests_story.py +512 -0
  240. devflow_engine/story/validation.py +133 -0
  241. devflow_engine/ui_grounding/__init__.py +11 -0
  242. devflow_engine/ui_grounding/agentic.py +31 -0
  243. devflow_engine/ui_grounding/dag.py +874 -0
  244. devflow_engine/ui_grounding/models.py +224 -0
  245. devflow_engine/ui_grounding/pencil_bridge.py +247 -0
  246. devflow_engine/vendor/__init__.py +0 -0
  247. devflow_engine/vendor/datalumina_genai/__init__.py +11 -0
  248. devflow_engine/vendor/datalumina_genai/core/__init__.py +0 -0
  249. devflow_engine/vendor/datalumina_genai/core/exceptions.py +9 -0
  250. devflow_engine/vendor/datalumina_genai/core/nodes/__init__.py +0 -0
  251. devflow_engine/vendor/datalumina_genai/core/nodes/agent.py +48 -0
  252. devflow_engine/vendor/datalumina_genai/core/nodes/agent_streaming_node.py +26 -0
  253. devflow_engine/vendor/datalumina_genai/core/nodes/base.py +89 -0
  254. devflow_engine/vendor/datalumina_genai/core/nodes/concurrent.py +30 -0
  255. devflow_engine/vendor/datalumina_genai/core/nodes/router.py +69 -0
  256. devflow_engine/vendor/datalumina_genai/core/schema.py +72 -0
  257. devflow_engine/vendor/datalumina_genai/core/task.py +52 -0
  258. devflow_engine/vendor/datalumina_genai/core/validate.py +139 -0
  259. devflow_engine/vendor/datalumina_genai/core/workflow.py +200 -0
  260. devflow_engine/worker.py +1086 -0
  261. devflow_engine/worker_guard.py +233 -0
  262. devflow_engine-1.0.0.dist-info/METADATA +235 -0
  263. devflow_engine-1.0.0.dist-info/RECORD +393 -0
  264. devflow_engine-1.0.0.dist-info/WHEEL +4 -0
  265. devflow_engine-1.0.0.dist-info/entry_points.txt +3 -0
  266. devin/__init__.py +6 -0
  267. devin/dag.py +58 -0
  268. devin/dag_two_arm.py +138 -0
  269. devin/devin_chat_scenario_catalog.json +588 -0
  270. devin/devin_eval.py +677 -0
  271. devin/nodes/__init__.py +0 -0
  272. devin/nodes/ideation/__init__.py +0 -0
  273. devin/nodes/ideation/node.py +195 -0
  274. devin/nodes/ideation/playground.py +267 -0
  275. devin/nodes/ideation/prompt.md +65 -0
  276. devin/nodes/ideation/scenarios/continue_refinement.py +13 -0
  277. devin/nodes/ideation/scenarios/continue_refinement_evals.py +18 -0
  278. devin/nodes/ideation/scenarios/idea_fits_existing_patterns.py +17 -0
  279. devin/nodes/ideation/scenarios/idea_fits_existing_patterns_evals.py +16 -0
  280. devin/nodes/ideation/scenarios/large_idea_split.py +4 -0
  281. devin/nodes/ideation/scenarios/large_idea_split_evals.py +17 -0
  282. devin/nodes/ideation/scenarios/source_documentation_added.py +4 -0
  283. devin/nodes/ideation/scenarios/source_documentation_added_evals.py +16 -0
  284. devin/nodes/ideation/scenarios/user_says_create_it.py +30 -0
  285. devin/nodes/ideation/scenarios/user_says_create_it_evals.py +23 -0
  286. devin/nodes/ideation/scenarios/vague_idea.py +16 -0
  287. devin/nodes/ideation/scenarios/vague_idea_evals.py +47 -0
  288. devin/nodes/ideation/tools.json +312 -0
  289. devin/nodes/insight/__init__.py +0 -0
  290. devin/nodes/insight/node.py +49 -0
  291. devin/nodes/insight/playground.py +154 -0
  292. devin/nodes/insight/prompt.md +61 -0
  293. devin/nodes/insight/scenarios/architecture_pattern_query.py +15 -0
  294. devin/nodes/insight/scenarios/architecture_pattern_query_evals.py +25 -0
  295. devin/nodes/insight/scenarios/codebase_exploration.py +15 -0
  296. devin/nodes/insight/scenarios/codebase_exploration_evals.py +23 -0
  297. devin/nodes/insight/scenarios/devin_ideation_routing.py +19 -0
  298. devin/nodes/insight/scenarios/devin_ideation_routing_evals.py +39 -0
  299. devin/nodes/insight/scenarios/devin_insight_routing.py +20 -0
  300. devin/nodes/insight/scenarios/devin_insight_routing_evals.py +40 -0
  301. devin/nodes/insight/scenarios/operational_debugging.py +15 -0
  302. devin/nodes/insight/scenarios/operational_debugging_evals.py +23 -0
  303. devin/nodes/insight/scenarios/operational_question.py +9 -0
  304. devin/nodes/insight/scenarios/operational_question_evals.py +8 -0
  305. devin/nodes/insight/scenarios/queue_status.py +15 -0
  306. devin/nodes/insight/scenarios/queue_status_evals.py +23 -0
  307. devin/nodes/insight/scenarios/source_doc_explanation.py +14 -0
  308. devin/nodes/insight/scenarios/source_doc_explanation_evals.py +21 -0
  309. devin/nodes/insight/scenarios/worker_state_check.py +15 -0
  310. devin/nodes/insight/scenarios/worker_state_check_evals.py +22 -0
  311. devin/nodes/insight/tools.json +126 -0
  312. devin/nodes/intake/__init__.py +0 -0
  313. devin/nodes/intake/node.py +27 -0
  314. devin/nodes/intake/playground.py +47 -0
  315. devin/nodes/intake/prompt.md +12 -0
  316. devin/nodes/intake/scenarios/ideation_routing.py +4 -0
  317. devin/nodes/intake/scenarios/ideation_routing_evals.py +5 -0
  318. devin/nodes/intake/scenarios/insight_routing.py +4 -0
  319. devin/nodes/intake/scenarios/insight_routing_evals.py +5 -0
  320. devin/nodes/iterate/README.md +44 -0
  321. devin/nodes/iterate/__init__.py +1 -0
  322. devin/nodes/iterate/_archived_design_stages/01-objectives-requirements.md +112 -0
  323. devin/nodes/iterate/_archived_design_stages/02-evals.md +131 -0
  324. devin/nodes/iterate/_archived_design_stages/03-tools-and-boundaries.md +110 -0
  325. devin/nodes/iterate/_archived_design_stages/04-harness-and-playground.md +32 -0
  326. devin/nodes/iterate/_archived_design_stages/05-prompt-deferred.md +11 -0
  327. devin/nodes/iterate/_archived_design_stages/coder_agent_design/01-objectives-requirements.md +20 -0
  328. devin/nodes/iterate/_archived_design_stages/coder_agent_design/02-evals.md +8 -0
  329. devin/nodes/iterate/_archived_design_stages/coder_agent_design/03-tools-and-boundaries.md +14 -0
  330. devin/nodes/iterate/_archived_design_stages/coder_agent_design/04-harness-and-playground.md +12 -0
  331. devin/nodes/iterate/_archived_design_stages/framer_agent_design/01-objectives-requirements.md +20 -0
  332. devin/nodes/iterate/_archived_design_stages/framer_agent_design/02-evals.md +8 -0
  333. devin/nodes/iterate/_archived_design_stages/framer_agent_design/03-tools-and-boundaries.md +13 -0
  334. devin/nodes/iterate/_archived_design_stages/framer_agent_design/04-harness-and-playground.md +12 -0
  335. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/01-objectives-requirements.md +25 -0
  336. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/02-evals.md +9 -0
  337. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/03-tools-and-boundaries.md +14 -0
  338. devin/nodes/iterate/_archived_design_stages/iterator_agent_design/04-harness-and-playground.md +12 -0
  339. devin/nodes/iterate/_archived_design_stages/observer_agent_design/01-objectives-requirements.md +20 -0
  340. devin/nodes/iterate/_archived_design_stages/observer_agent_design/02-evals.md +8 -0
  341. devin/nodes/iterate/_archived_design_stages/observer_agent_design/03-tools-and-boundaries.md +14 -0
  342. devin/nodes/iterate/_archived_design_stages/observer_agent_design/04-harness-and-playground.md +13 -0
  343. devin/nodes/iterate/agent-roles.md +89 -0
  344. devin/nodes/iterate/agents/README.md +10 -0
  345. devin/nodes/iterate/artifacts.md +504 -0
  346. devin/nodes/iterate/contract.md +100 -0
  347. devin/nodes/iterate/eval-plan.md +74 -0
  348. devin/nodes/iterate/node.py +100 -0
  349. devin/nodes/iterate/pipeline/README.md +13 -0
  350. devin/nodes/iterate/playground-contract.md +76 -0
  351. devin/nodes/iterate/prompt.md +11 -0
  352. devin/nodes/iterate/scenarios/README.md +38 -0
  353. devin/nodes/iterate/scenarios/artifact-and-loop-scenarios.md +101 -0
  354. devin/nodes/iterate/scenarios/coder_artifact_alignment.py +32 -0
  355. devin/nodes/iterate/scenarios/coder_artifact_alignment_evals.py +45 -0
  356. devin/nodes/iterate/scenarios/coder_bounded_fix.py +27 -0
  357. devin/nodes/iterate/scenarios/coder_bounded_fix_evals.py +45 -0
  358. devin/nodes/iterate/scenarios/devin_iterate_routing.py +21 -0
  359. devin/nodes/iterate/scenarios/devin_iterate_routing_evals.py +36 -0
  360. devin/nodes/iterate/scenarios/framer_scope_boundary.py +25 -0
  361. devin/nodes/iterate/scenarios/framer_scope_boundary_evals.py +57 -0
  362. devin/nodes/iterate/scenarios/framer_task_framing.py +25 -0
  363. devin/nodes/iterate/scenarios/framer_task_framing_evals.py +58 -0
  364. devin/nodes/iterate/scenarios/iterate_error_fix.py +21 -0
  365. devin/nodes/iterate/scenarios/iterate_error_fix_evals.py +39 -0
  366. devin/nodes/iterate/scenarios/iterate_quick_change.py +21 -0
  367. devin/nodes/iterate/scenarios/iterate_quick_change_evals.py +35 -0
  368. devin/nodes/iterate/scenarios/iterate_to_idea_promotion.py +23 -0
  369. devin/nodes/iterate/scenarios/iterate_to_idea_promotion_evals.py +53 -0
  370. devin/nodes/iterate/scenarios/iterate_to_insight_reroute.py +23 -0
  371. devin/nodes/iterate/scenarios/iterate_to_insight_reroute_evals.py +53 -0
  372. devin/nodes/iterate/scenarios/observer_evidence_seam.py +28 -0
  373. devin/nodes/iterate/scenarios/observer_evidence_seam_evals.py +55 -0
  374. devin/nodes/iterate/scenarios/observer_repro_creation.py +28 -0
  375. devin/nodes/iterate/scenarios/observer_repro_creation_evals.py +45 -0
  376. devin/nodes/iterate/scenarios/routing-matrix.md +45 -0
  377. devin/nodes/shared/__init__.py +0 -0
  378. devin/nodes/shared/filemaker_expert.md +80 -0
  379. devin/nodes/shared/filemaker_expert.py +354 -0
  380. devin/nodes/shared/filemaker_expert_eval/runner.py +176 -0
  381. devin/nodes/shared/filemaker_expert_eval/scenarios.json +65 -0
  382. devin/nodes/shared/goldilocks_advisor_eval/runner.py +214 -0
  383. devin/nodes/shared/goldilocks_advisor_eval/scenarios.json +58 -0
  384. devin/nodes/shared/helpers.py +156 -0
  385. devin/nodes/shared/idea_compliance_advisor_eval/runner.py +252 -0
  386. devin/nodes/shared/idea_compliance_advisor_eval/scenarios.json +75 -0
  387. devin/nodes/shared/models.py +44 -0
  388. devin/nodes/shared/post.py +40 -0
  389. devin/nodes/shared/router.py +107 -0
  390. devin/nodes/shared/tools.py +191 -0
  391. devin/shared/devin-chat-rubric.md +237 -0
  392. devin/shared/devin-chat-scenario-suite.md +90 -0
  393. devin/shared/eval_doctrine.md +9 -0
@@ -0,0 +1,191 @@
1
+ """Devin tools available to IdeationAgent and InsightAgent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import subprocess
8
+ import time
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from devflow_engine.stores.execution_store import ExecutionStore
14
+ from devflow_engine.vendor.datalumina_genai.core.nodes.base import Node as DataluminaNode
15
+
16
+ from .helpers import DAG_ID, store_run
17
+
18
+ # -------------------------------------------------------------------
19
+ # Agent tool result types
20
+ # -------------------------------------------------------------------
21
+
22
+ @dataclass
23
+ class ToolResult:
24
+ ok: bool
25
+ tool_name: str
26
+ output: dict[str, Any]
27
+ error: str | None = None
28
+
29
+ # -------------------------------------------------------------------
30
+ # Devflow primitives
31
+ # -------------------------------------------------------------------
32
+
33
def devflow_init_idea(*, idea_id: str, title: str, repo_root: Path) -> ToolResult:
    """Create a new idea by invoking ``devflow idea init``.

    The command runs with ``repo_root`` as its working directory and a
    30 second timeout.

    Returns:
        A successful ``ToolResult`` carrying ``idea_id`` and the command's
        stdout when the exit code is 0. A non-zero exit yields a failed result
        whose error is stderr (falling back to stdout); any exception (for
        example a timeout or a missing executable) yields a failed result
        whose error is ``str(exception)``.
    """
    tool = 'Devflow_Init_Idea'
    command = ['devflow', 'idea', 'init', '--idea', idea_id, '--title', title]
    try:
        completed = subprocess.run(
            command,
            cwd=str(repo_root),
            capture_output=True,
            text=True,
            timeout=30,
        )
        if completed.returncode != 0:
            failure_text = completed.stderr or completed.stdout
            return ToolResult(ok=False, tool_name=tool, output={}, error=failure_text)
        payload = {'idea_id': idea_id, 'stdout': completed.stdout}
        return ToolResult(ok=True, tool_name=tool, output=payload)
    except Exception as exc:
        return ToolResult(ok=False, tool_name=tool, output={}, error=str(exc))
48
+
49
+
50
def devflow_amend_idea(*, idea_id: str, refined_text: str, repo_root: Path) -> ToolResult:
    """Store refined text and a fresh sufficiency report on an existing idea.

    Steps:
      1. Run ``devflow idea sufficiency --text <refined_text>`` in
         ``repo_root`` (30 second timeout) and parse its stdout as JSON.
         A non-zero exit or unparseable stdout is tolerated; the sufficiency
         report is then an empty dict.
      2. Rewrite ``<repo_root>/.devflow/ideas/<idea_id>/idea.json`` with its
         ``refined_text`` and ``sufficiency`` keys updated.

    Returns:
        ``ToolResult(ok=True)`` with ``idea_id`` and ``sufficiency`` once the
        artifact has been rewritten. When ``idea.json`` does not exist nothing
        is written, so a failed result is returned (still carrying the
        sufficiency report) instead of reporting success for a no-op. Any
        exception (timeout, missing CLI, unreadable or invalid ``idea.json``)
        also yields a failed result with ``str(exception)`` as the error.
    """
    tool = 'Devflow_Amend_Idea'
    try:
        # Run sufficiency to get the updated shape for the refined text.
        result = subprocess.run(
            ['devflow', 'idea', 'sufficiency', '--text', refined_text],
            cwd=str(repo_root),
            capture_output=True,
            text=True,
            timeout=30,
        )
        sufficiency: Any = {}
        if result.returncode == 0:
            try:
                sufficiency = json.loads(result.stdout)
            except ValueError:
                # Best effort: stdout that is not valid JSON leaves the report
                # empty (json.JSONDecodeError is a ValueError subclass).
                sufficiency = {}

        idea_json_path = repo_root / '.devflow' / 'ideas' / idea_id / 'idea.json'
        if not idea_json_path.exists():
            # Nothing to amend: surface this instead of claiming success.
            return ToolResult(
                ok=False,
                tool_name=tool,
                output={'idea_id': idea_id, 'sufficiency': sufficiency},
                error=f'idea artifact not found: {idea_json_path}',
            )

        # Write the refined text and sufficiency back to the idea artifact.
        current = json.loads(idea_json_path.read_text(encoding='utf-8'))
        current['refined_text'] = refined_text
        current['sufficiency'] = sufficiency
        idea_json_path.write_text(json.dumps(current, indent=2, sort_keys=True) + '\n', encoding='utf-8')
        return ToolResult(ok=True, tool_name=tool, output={'idea_id': idea_id, 'sufficiency': sufficiency})
    except Exception as e:
        return ToolResult(ok=False, tool_name=tool, output={}, error=str(e))
77
+
78
+
79
def devflow_commit_idea(*, idea_id: str, draft_set: str = 'current', repo_root: Path) -> ToolResult:
    """Promote an idea by invoking ``devflow idea promote``.

    The command receives the idea id, the draft set (``'current'`` unless
    overridden) and ``repo_root`` as ``--dest``; it runs with ``repo_root`` as
    working directory and a 30 second timeout.

    Returns:
        A successful ``ToolResult`` carrying ``idea_id`` and the command's
        stdout when the exit code is 0. A non-zero exit yields a failed result
        whose error is stderr (falling back to stdout); any exception yields a
        failed result whose error is ``str(exception)``.
    """
    tool = 'Devflow_Commit_Idea'
    destination = str(repo_root)
    command = [
        'devflow', 'idea', 'promote',
        '--idea', idea_id,
        '--draft-set', draft_set,
        '--dest', destination,
    ]
    try:
        completed = subprocess.run(
            command,
            cwd=str(repo_root),
            capture_output=True,
            text=True,
            timeout=30,
        )
        if completed.returncode != 0:
            failure_text = completed.stderr or completed.stdout
            return ToolResult(ok=False, tool_name=tool, output={}, error=failure_text)
        payload = {'idea_id': idea_id, 'stdout': completed.stdout}
        return ToolResult(ok=True, tool_name=tool, output=payload)
    except Exception as exc:
        return ToolResult(ok=False, tool_name=tool, output={}, error=str(exc))
94
+
95
+
96
+ # -------------------------------------------------------------------
97
+ # DevinInsight as subagent
98
+ # -------------------------------------------------------------------
99
+
100
def devin_insight(*, current_user_message: str, repo_root: Path, context: dict[str, Any]) -> ToolResult:
    """Run the Devin InsightAgent node as a subagent and return its guidance.

    Builds a ``DevinChatDagEvent`` from the user message, ``repo_root`` and
    ``context['idea_id']`` (``'unknown'`` when absent), then runs
    ``InsightAgentNode.process`` to completion on a ``TaskContext`` whose
    metadata is a shallow copy of ``context``.

    ``asyncio.run`` cannot be used in a thread that already has a running
    event loop, so when this tool is called from async code the node is run on
    a fresh event loop in a worker thread instead. The calling thread blocks
    until the node finishes in both cases.

    Returns:
        A successful ``ToolResult`` whose output holds ``insight_response``
        (the ``response_guidance`` entry of the resulting context metadata, or
        ``{}``) and ``idea_id``. Any exception yields a failed result with
        ``str(exception)`` as the error.
    """
    try:
        import asyncio
        from concurrent.futures import ThreadPoolExecutor

        from devin.nodes.insight.node import InsightAgentNode
        from devin.nodes.shared.models import DevinChatDagEvent
        from devflow_engine.vendor.datalumina_genai.core.task import TaskContext

        event = DevinChatDagEvent(
            repo_root=str(repo_root),
            idea_id=context.get('idea_id', 'unknown'),
            raw_text=current_user_message,
            pipeline_key=f'insight_subagent_{int(time.time())}',
        )
        node = InsightAgentNode(task_context=None)

        def _run_node() -> Any:
            # Create the coroutine inside the thread that runs it so it is
            # never left un-awaited if loop setup fails.
            return asyncio.run(node.process(TaskContext(event=event, metadata=context.copy())))

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop in this thread: run synchronously right here.
            ctx = _run_node()
        else:
            # A loop is already running here; asyncio.run() would raise, so
            # use a dedicated thread with its own event loop.
            with ThreadPoolExecutor(max_workers=1) as pool:
                ctx = pool.submit(_run_node).result()

        return ToolResult(
            ok=True,
            tool_name='DevinInsight',
            output={'insight_response': ctx.metadata.get('response_guidance', {}), 'idea_id': context.get('idea_id')},
        )
    except Exception as e:
        return ToolResult(ok=False, tool_name='DevinInsight', output={}, error=str(e))
124
+
125
+
126
+ # -------------------------------------------------------------------
127
+ # Emit_Response — real-time UI feedback via agent_devin_messages
128
+ # -------------------------------------------------------------------
129
+
130
def emit_response(*, message: str, emit_type: str, metadata: dict[str, Any] | None = None, session_id: str) -> ToolResult:
    """Post a user-facing feedback message from Devin via ``_post_agent_message``.

    The posted row is addressed from ``'devin'`` to ``'user'`` with
    ``message_type`` ``'emit'`` and no project id. Its metadata starts with
    ``emit_type`` and ``tool='Emit_Response'``; entries of ``metadata`` are
    merged afterwards and therefore override those two keys on collision.

    emit_type values used by this module's docs and helpers:
      - start_working: a "working on it" indicator
      - stop_working: finalizes the working feedback
      - progress: incremental progress update
      - info: informational message
      - conclude_node: sent by ``emit_conclude_node``

    Returns:
        A ``ToolResult`` with ``ok=True`` whose output echoes ``emit_type`` and
        ``message`` and reports the posted row's ``id`` under ``posted``.
    """
    extra_metadata = metadata or {}
    row_metadata = {
        'emit_type': emit_type,
        'tool': 'Emit_Response',
        **extra_metadata,
    }
    row = {
        'session_id': session_id,
        'from_agent': 'devin',
        'to_agent': 'user',
        'message': message,
        'metadata': row_metadata,
        'project_id': None,
        'message_type': 'emit',
    }
    posted_row = _post_agent_message(row)
    summary = {'emit_type': emit_type, 'message': message, 'posted': posted_row.get('id')}
    return ToolResult(ok=True, tool_name='Emit_Response', output=summary)
154
+
155
+
156
def emit_start_working(*, activity: str | None = None, session_id: str) -> ToolResult:
    """Emit a ``start_working`` message such as ``"Reviewing..."``.

    When ``activity`` is missing or empty, a verb is picked from a fixed list
    using the current Unix time in seconds as the index. The message is the
    title-cased activity followed by an ellipsis.
    """
    if not activity:
        fallback_verbs = ['running', 'reviewing', 'shaping', 'crafting', 'exploring', 'analyzing', 'mapping', 'building', 'checking', 'preparing']
        activity = fallback_verbs[int(time.time()) % len(fallback_verbs)]
    text = f"{activity.title()}..."
    return emit_response(message=text, emit_type='start_working', session_id=session_id)
161
+
162
+
163
def emit_stop_working(*, activity: str | None = None, session_id: str) -> ToolResult:
    """Emit a ``stop_working`` message such as ``"Done Reviewing."``.

    When ``activity`` is missing or empty, a verb is picked from a fixed list
    using the current Unix time in seconds as the index, so it is not
    guaranteed to match the verb of an earlier start message.
    """
    if not activity:
        fallback_verbs = ['running', 'reviewing', 'shaping', 'crafting', 'exploring', 'analyzing', 'mapping', 'building', 'checking', 'preparing']
        activity = fallback_verbs[int(time.time()) % len(fallback_verbs)]
    text = f"Done {activity.title()}."
    return emit_response(message=text, emit_type='stop_working', session_id=session_id)
168
+
169
+
170
def emit_conclude_node(*, node_output: dict[str, Any], session_id: str) -> ToolResult:
    """Emit a ``conclude_node`` message carrying the node's output.

    The output travels in the message metadata under ``node_output``, and the
    metadata's ``tool`` entry is set to ``'Conclude_Node'``.
    """
    conclude_metadata = {'node_output': node_output, 'tool': 'Conclude_Node'}
    return emit_response(
        message="Node complete.",
        emit_type='conclude_node',
        metadata=conclude_metadata,
        session_id=session_id,
    )
178
+
179
+
180
+ # -------------------------------------------------------------------
181
+ # Internal helpers
182
+ # -------------------------------------------------------------------
183
+
184
+ def _post_agent_message(row: dict[str, Any]) -> dict[str, Any]:
185
+ if os.environ.get('PYTEST_CURRENT_TEST'):
186
+ return {'id': 'test-agent-agent-message', **row}
187
+ try:
188
+ from devflow_engine.devin_orchestration import maybe_post_devin_message
189
+ return maybe_post_devin_message(row=row) or {'id': 'local-only'}
190
+ except Exception:
191
+ return {'id': 'local-only', **row, 'status': 'local_only'}
@@ -0,0 +1,237 @@
1
+ # Devin chat eval rubric
2
+
3
+ Use this rubric to evaluate Devin chat outputs for future DAG-output testing.
4
+
5
+ This rubric is intentionally compact. It is meant to be operational, not aspirational wallpaper.
6
+
7
+ Primary companion doc:
8
+ - [Devin chat principles](../devin-chat-principles.md)
9
+
10
+ ## Scoring shape
11
+
12
+ Recommended per-dimension scoring:
13
+ - `1.0` = clear pass
14
+ - `0.5` = mixed / borderline
15
+ - `0.0` = fail
16
+
17
+ A strong response should pass every hard-gate dimension and score well on conversational quality.
18
+
19
+ ## Dimensions
20
+
21
+ ### 1. Approach ownership
22
+
23
+ Question:
24
+ - Did Devin own the implementation approach while leaving outcome ownership with the user?
25
+
26
+ Pass signals:
27
+ - proposes a concrete framing or direction
28
+ - chooses sensible defaults inside stated constraints
29
+ - does not push technical decision-making onto the user prematurely
30
+
31
+ Fail signals:
32
+ - asks the user to choose architecture, stack, or decomposition Devin should own
33
+ - behaves like a form intake clerk
34
+
35
+ ### 2. Outcome-focused clarification
36
+
37
+ Question:
38
+ - If Devin asked a question, was it about business need, UX, constraints, users, or approval boundaries rather than low-level implementation choice?
39
+
40
+ Pass signals:
41
+ - asks one sharp question that changes the solution materially
42
+ - question is outcome- or constraint-oriented
43
+
44
+ Fail signals:
45
+ - asks multiple low-value questions
46
+ - asks the user to pick stack, schema, transport, framework, or similar Devin-owned details
47
+ - asks a question when a reasonable assumption would have been enough
48
+
49
+ ### 3. Momentum
50
+
51
+ Question:
52
+ - Did the response make the work feel underway?
53
+
54
+ Pass signals:
55
+ - direct answer first
56
+ - concrete next-step framing
57
+ - reasonable assumptions used to keep motion
58
+
59
+ Fail signals:
60
+ - stalls in generic planning language
61
+ - turns the reply into a checklist interview
62
+ - produces commentary instead of progress
63
+
64
+ ### 4. No fake progress
65
+
66
+ Question:
67
+ - Did Devin avoid implying work happened when it did not?
68
+
69
+ Hard fail examples:
70
+ - claims implementation exists when it does not
71
+ - claims queues ran when they did not
72
+ - claims downstream docs/stories were generated when they were not
73
+ - presents placeholder/scaffold output as complete
74
+
75
+ ### 5. No unsafe overreach
76
+
77
+ Question:
78
+ - Did Devin stay within grounded knowledge and safe assumption boundaries?
79
+
80
+ Pass signals:
81
+ - uses grounded assumptions
82
+ - asks for clarification when ambiguity materially affects correctness or risk
83
+ - does not invent repo facts or runtime state
84
+
85
+ Fail signals:
86
+ - fabricated codebase/runtime claims
87
+ - reckless certainty on ambiguous high-impact details
88
+ - unauthorized or unjustified high-risk leaps
89
+
90
+ ### 6. Attention discipline
91
+
92
+ Question:
93
+ - Did Devin answer the current request instead of getting hijacked by stale context?
94
+
95
+ Pass signals:
96
+ - current message is clearly answered
97
+ - thread continuity is preserved without changing the subject
98
+
99
+ Fail signals:
100
+ - responds to an older unresolved issue instead of the current ask
101
+ - drifts back into ideation when the user asked an ops/status question
102
+ - carries stale clarifications into the wrong turn
103
+
104
+ ### 7. Conversational quality
105
+
106
+ Question:
107
+ - Did the reply feel like a capable implementation partner?
108
+
109
+ Pass signals:
110
+ - plainspoken
111
+ - concise
112
+ - confident without bluffing
113
+ - helpful without fluff
114
+ - structure only when useful
115
+
116
+ Fail signals:
117
+ - robotic or bureaucratic tone
118
+ - overformatted reply for a simple turn
119
+ - hedgy filler or apology-heavy language
120
+ - internal orchestration jargon by default
121
+
122
+ ## Hard gates
123
+
124
+ A response should be marked overall fail if any of these are true:
125
+ - fake progress is present
126
+ - unsafe overreach is present
127
+ - attention discipline fails badly enough that the current request is not answered
128
+
129
+ ## Pass / fail exemplars
130
+
131
+ ### Pass: approach ownership + one sharp clarification
132
+
133
+ User:
134
+ > Build an internal support approvals workflow for risky requests.
135
+
136
+ Response:
137
+ > I’d treat this as an operator-first workflow with intake, risk scoring, approval, and an audit trail. I’ll assume internal-only and single-team ownership unless you say otherwise. The one thing that changes the shape meaningfully is whether approval is always single-step or can require multiple approvers.
138
+
139
+ Why it passes:
140
+ - approach is owned
141
+ - assumptions are reasonable
142
+ - only one high-leverage question
143
+ - momentum preserved
144
+
145
+ ### Fail: user forced to do Devin’s job
146
+
147
+ User:
148
+ > Build an internal support approvals workflow for risky requests.
149
+
150
+ Response:
151
+ > What framework, database, auth provider, transport protocol, and architecture style do you want?
152
+
153
+ Why it fails:
154
+ - no approach ownership
155
+ - form-filler behavior
156
+ - momentum collapse
157
+
158
+ ### Pass: direct current-turn answer
159
+
160
+ User:
161
+ > What is the source-doc queue status right now?
162
+
163
+ Response:
164
+ > There are no source-doc mutations queued right now.
165
+
166
+ Why it passes:
167
+ - direct answer
168
+ - no stale-context hijack
169
+ - no unnecessary orchestration dump
170
+
171
+ ### Fail: stale context hijack
172
+
173
+ User:
174
+ > What is the source-doc queue status right now?
175
+
176
+ Response:
177
+ > Before that, I still need you to clarify the primary workflow and target persona for the idea.
178
+
179
+ Why it fails:
180
+ - did not answer the actual question
181
+ - stale ideation context hijacked the turn
182
+
183
+ ### Fail: fake progress
184
+
185
+ User:
186
+ > Keep refining the idea; don’t generate anything yet.
187
+
188
+ Response:
189
+ > Done — I already generated stories and downstream planning artifacts.
190
+
191
+ Why it fails:
192
+ - directly contradicts the request
193
+ - claims work that should not have happened
194
+
195
+ ## Suggested evaluator prompts / checks
196
+
197
+ When building evaluator logic, check for:
198
+ - direct answer to current request
199
+ - evidence of owned framing / proposed approach
200
+ - whether any question is outcome-focused and singular
201
+ - whether prohibited claims of completed work appear
202
+ - whether stale-context drift appears
203
+ - whether response tone stays concise and plainspoken
204
+
205
+ Useful string-level negative checks:
206
+ - claims of `implemented`, `completed`, `generated`, `queued`, or `ran` when artifacts/state do not support that
207
+ - unexplained internal terms like `DAG`, `node`, `handoff`, `router` in ordinary user-facing replies
208
+ - multi-question interrogation patterns in early ideation turns
209
+ - implementation-choice questions like `what framework`, `what database`, `what auth provider`, or `what architecture` in sparse greenfield turns unless the ambiguity is truly outcome-shaping
210
+
211
+ ## Persona suite coverage in the existing multi-turn eval
212
+
213
+ The existing `devin_multi_turn` live eval should cover these default personas:
214
+ - **Sally ExplicitApproval** — wants to build a new app; tests forward-ready momentum, approach ownership, and explicit-approval gating.
215
+ - **Jimmy ExistingRepo** — needs a new idea added to an existing repo; tests repo-grounded assumptions and approval handoff in an existing product context.
216
+ - **Jeff SparseBrief** — brings very little detail on a greenfield idea and wants to get coding; tests aggressive default assumptions and anti-form-filler behavior.
217
+ - **Cleo ReviewFirst** — plans and reviews patiently; tests current-turn attention, review-first behavior, and no premature downstream handoff.
218
+
219
+ Evaluator expectations for this suite should explicitly check:
220
+ - Devin behaves like an implementation partner, not a form collector.
221
+ - Devin owns approach while the user owns outcome, UX intent, and business need.
222
+ - The first reply is forward-ready and momentum-preserving.
223
+ - Missing details are filled with grounded assumptions before low-value questioning.
224
+ - Internal orchestration stays abstracted unless operationally relevant.
225
+ - Multi-turn quality is judged mainly on trajectory, continuity, and decision points, not isolated single-turn polish.
226
+
227
+ ## Minimal overall judgment rule
228
+
229
+ A response passes if it:
230
+ - owns the approach
231
+ - maintains momentum
232
+ - asks only necessary, outcome-shaping clarification
233
+ - avoids fake progress
234
+ - avoids unsafe overreach
235
+ - answers the current request cleanly
236
+
237
+ If those are not all true, it should not be treated as a good Devin turn.
@@ -0,0 +1,90 @@
1
+ # Devin chat scenario suite
2
+
3
+ This document defines the next concrete evaluation set for Devin chat, aligned with Marcus's clarified doctrine.
4
+
5
+ Structured fixture source:
6
+ - `src/devflow_engine/devin/devin_chat_scenario_catalog.json`
7
+
8
+ Runtime loader:
9
+ - `devflow_engine.devin.devin_eval.load_devin_chat_eval_catalog()`
10
+
11
+ ## Design goals
12
+
13
+ - Keep a **few fast single-turn screens** for first-turn posture and current-request discipline.
14
+ - Put the real weight on **multi-turn evals** because Devin should be judged mainly on **trajectory** and **decision-point handling across turns**.
15
+ - Evaluate Devin as an **implementation partner**:
16
+ - user owns outcome / UX / business need
17
+ - Devin owns approach
18
+ - first response should feel forward-ready
19
+ - assumptions should be grounded and aggressive enough to preserve momentum
20
+ - internal orchestration should stay abstracted by default
21
+
22
+ ## Suite shape
23
+
24
+ - **Single-turn screens:** 3
25
+ - **Multi-turn scenarios:** 6
26
+
27
+ ## Persona coverage
28
+
29
+ Required personas included:
30
+
31
+ - **Sally ForwardReady** — first-turn new-app posture and momentum
32
+ - **Jimmy RepoExtension** — existing-repo extension posture without form-filler drift
33
+ - **Jeff SparseBrief** — sparse greenfield input with aggressive assumptions and no questionnaire collapse
34
+ - **Cleo ReviewFirst** — review-first planning without premature downstream handoff
35
+
36
+ Additional personas cover doctrine edges:
37
+
38
+ - **Nora CurrentRequest** — current request beats stale ideation context
39
+ - **Priya RiskBoundary** — high-risk constraints require one sharp outcome-level clarification
40
+ - **Omar ContextSwitch** — ops/status detour, then clean return to ideation
41
+
42
+ ## Scenario inventory
43
+
44
+ ### Single-turn screens
45
+
46
+ 1. **sally_forward_ready_screen**
47
+ - Checks first-turn approach ownership, momentum, and outcome-focused clarification.
48
+ 2. **jimmy_repo_extension_screen**
49
+ - Checks repo-extension posture without form-filler behavior.
50
+ 3. **nora_current_request_screen**
51
+ - Checks attention to the current request over stale history.
52
+
53
+ ### Multi-turn scenarios
54
+
55
+ 1. **sally_explicit_approval_new_app**
56
+ - New app trajectory from first prompt through explicit approval.
57
+ 2. **jimmy_existing_repo_handoff**
58
+ - Existing-repo addition with UX constraints and approval handoff.
59
+ 3. **jeff_sparse_brief_fast_assumptions**
60
+ - Sparse input, aggressive assumptions, no questionnaire collapse.
61
+ 4. **cleo_review_first_preactivation**
62
+ - Planning/review thread that must avoid fake progress until approval.
63
+ 5. **priya_risk_boundary_clarification**
64
+ - Safety/constraint-sensitive ideation without unsafe overreach.
65
+ 6. **omar_status_detour_return_to_ideation**
66
+ - Status detour plus clean return to ideation before approval.
67
+
68
+ ## Doctrine coverage map
69
+
70
+ The suite explicitly covers:
71
+
72
+ - approach ownership
73
+ - outcome/UX-focused clarification
74
+ - momentum
75
+ - no form-filler behavior
76
+ - no fake progress
77
+ - no unsafe overreach
78
+ - attention to the current request instead of stale context
79
+ - correct decision-point handling across turns
80
+
81
+ ## Why this moves the work forward
82
+
83
+ Before this change, the eval shape existed mostly as hard-coded scenarios. The new suite makes the doctrine concrete in a **repo-native fixture catalog** that can be loaded directly by code and expanded later into richer evaluators.
84
+
85
+ That gives us:
86
+
87
+ - a durable scenario source of truth
88
+ - explicit persona coverage
89
+ - explicit doctrine-tag coverage
90
+ - a clean handoff point for future live eval runners, graders, and reporting
@@ -0,0 +1,9 @@
1
+ # Devin eval doctrine
2
+
3
+ - Scenarios describe the user request, the minimal input payload, and the expected behavior boundary.
4
+ - Scenario evals define explicit pass/fail checks.
5
+ - Node playgrounds should report: scenario name, pass/fail, actual output, expected behavior, and notes.
6
+ - Failing a scenario should never be hidden behind fallback prose.
7
+ - Insight evals should reward direct grounded answers.
8
+ - Ideation evals should reward momentum, truthful assumptions, and at most one sharp clarifying question when needed.
9
+ - Intake evals should reward correct routing and clear routing rationale.