devflow-engine 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devflow_engine/__init__.py +3 -0
- devflow_engine/agentic_prompts.py +100 -0
- devflow_engine/agentic_runtime.py +398 -0
- devflow_engine/api_key_flow_harness.py +539 -0
- devflow_engine/api_keys.py +357 -0
- devflow_engine/bootstrap/__init__.py +2 -0
- devflow_engine/bootstrap/provision_from_template.py +84 -0
- devflow_engine/cli/__init__.py +0 -0
- devflow_engine/cli/app.py +7270 -0
- devflow_engine/core/__init__.py +0 -0
- devflow_engine/core/config.py +86 -0
- devflow_engine/core/logging.py +29 -0
- devflow_engine/core/paths.py +45 -0
- devflow_engine/core/toml_kv.py +33 -0
- devflow_engine/devflow_event_worker.py +1292 -0
- devflow_engine/devflow_state.py +201 -0
- devflow_engine/devin2/__init__.py +9 -0
- devflow_engine/devin2/agent_definition.py +120 -0
- devflow_engine/devin2/pi_runner.py +204 -0
- devflow_engine/devin_orchestration.py +69 -0
- devflow_engine/docs/prompts/anti-patterns.md +42 -0
- devflow_engine/docs/prompts/devin-agent-prompt.md +55 -0
- devflow_engine/docs/prompts/devin2-agent-prompt.md +81 -0
- devflow_engine/docs/prompts/examples/devin-vapi-clone-reference-exchange.json +85 -0
- devflow_engine/doctor/__init__.py +2 -0
- devflow_engine/doctor/triage.py +140 -0
- devflow_engine/error/__init__.py +0 -0
- devflow_engine/error/remediation.py +21 -0
- devflow_engine/errors/error_solver_dag.py +522 -0
- devflow_engine/errors/runtime_observability.py +67 -0
- devflow_engine/idea/__init__.py +4 -0
- devflow_engine/idea/actors.py +481 -0
- devflow_engine/idea/agentic.py +465 -0
- devflow_engine/idea/analyze.py +93 -0
- devflow_engine/idea/devin_chat_dag.py +1 -0
- devflow_engine/idea/diff.py +99 -0
- devflow_engine/idea/drafts.py +446 -0
- devflow_engine/idea/idea_creation_dag.py +643 -0
- devflow_engine/idea/ideation_enrichment.py +355 -0
- devflow_engine/idea/ideation_enrichment_worker.py +19 -0
- devflow_engine/idea/paths.py +28 -0
- devflow_engine/idea/promote.py +53 -0
- devflow_engine/idea/redaction.py +27 -0
- devflow_engine/idea/repo_tools.py +1277 -0
- devflow_engine/idea/response_mode.py +30 -0
- devflow_engine/idea/story_pipeline.py +1585 -0
- devflow_engine/idea/sufficiency.py +376 -0
- devflow_engine/idea/traditional_stories.py +1257 -0
- devflow_engine/implementation/__init__.py +0 -0
- devflow_engine/implementation/alembic_preflight.py +700 -0
- devflow_engine/implementation/dag.py +8450 -0
- devflow_engine/implementation/green_gate.py +93 -0
- devflow_engine/implementation/prompts.py +108 -0
- devflow_engine/implementation/test_runtime.py +623 -0
- devflow_engine/integration/__init__.py +19 -0
- devflow_engine/integration/agentic.py +66 -0
- devflow_engine/integration/dag.py +3539 -0
- devflow_engine/integration/prompts.py +114 -0
- devflow_engine/integration/supabase_schema.sql +31 -0
- devflow_engine/integration/supabase_sync.py +177 -0
- devflow_engine/llm/__init__.py +1 -0
- devflow_engine/llm/cli_one_shot.py +84 -0
- devflow_engine/llm/cli_stream.py +371 -0
- devflow_engine/llm/execution_context.py +26 -0
- devflow_engine/llm/invoke.py +1322 -0
- devflow_engine/llm/provider_api.py +304 -0
- devflow_engine/llm/repo_knowledge.py +588 -0
- devflow_engine/llm_primitives.py +315 -0
- devflow_engine/orchestration.py +62 -0
- devflow_engine/planning/__init__.py +0 -0
- devflow_engine/planning/analyze_repo.py +92 -0
- devflow_engine/planning/render_drafts.py +133 -0
- devflow_engine/playground/__init__.py +0 -0
- devflow_engine/playground/hooks.py +26 -0
- devflow_engine/playwright_workflow/__init__.py +5 -0
- devflow_engine/playwright_workflow/dag.py +1317 -0
- devflow_engine/process/__init__.py +5 -0
- devflow_engine/process/dag.py +59 -0
- devflow_engine/project_registration/__init__.py +3 -0
- devflow_engine/project_registration/dag.py +1581 -0
- devflow_engine/project_registry.py +109 -0
- devflow_engine/prompts/devin/generic/prompt.md +6 -0
- devflow_engine/prompts/devin/ideation/prompt.md +263 -0
- devflow_engine/prompts/devin/ideation/scenarios.md +5 -0
- devflow_engine/prompts/devin/ideation_loop/prompt.md +6 -0
- devflow_engine/prompts/devin/insight/prompt.md +11 -0
- devflow_engine/prompts/devin/insight/scenarios.md +5 -0
- devflow_engine/prompts/devin/intake/prompt.md +15 -0
- devflow_engine/prompts/devin/iterate/prompt.md +12 -0
- devflow_engine/prompts/devin/shared/eval_doctrine.md +9 -0
- devflow_engine/prompts/devin/shared/principles.md +246 -0
- devflow_engine/prompts/devin_eval/assessment/prompt.md +18 -0
- devflow_engine/prompts/idea/api_ideation_agent/prompt.md +8 -0
- devflow_engine/prompts/idea/api_insight_agent/prompt.md +8 -0
- devflow_engine/prompts/idea/response_doctrine/prompt.md +18 -0
- devflow_engine/prompts/implementation/dependency_assessment/prompt.md +12 -0
- devflow_engine/prompts/implementation/green/green/prompt.md +11 -0
- devflow_engine/prompts/implementation/green/node_config/prompt.md +3 -0
- devflow_engine/prompts/implementation/green_review/outcome_review/prompt.md +5 -0
- devflow_engine/prompts/implementation/green_review/prior_run_review/prompt.md +5 -0
- devflow_engine/prompts/implementation/red/prompt.md +27 -0
- devflow_engine/prompts/implementation/redreview/prompt.md +23 -0
- devflow_engine/prompts/implementation/redreview_repair/prompt.md +16 -0
- devflow_engine/prompts/implementation/setupdoc/prompt.md +10 -0
- devflow_engine/prompts/implementation/story_planning/prompt.md +13 -0
- devflow_engine/prompts/implementation/test_design/prompt.md +27 -0
- devflow_engine/prompts/integration/README.md +185 -0
- devflow_engine/prompts/integration/green/example.md +67 -0
- devflow_engine/prompts/integration/green/green/prompt.md +10 -0
- devflow_engine/prompts/integration/green/node_config/prompt.md +42 -0
- devflow_engine/prompts/integration/green/past_prompts/20260417T212300/green/prompt.md +15 -0
- devflow_engine/prompts/integration/green/past_prompts/20260417T212300/node_config/prompt.md +42 -0
- devflow_engine/prompts/integration/green_enrich/example.md +79 -0
- devflow_engine/prompts/integration/green_enrich/green_enrich/prompt.md +9 -0
- devflow_engine/prompts/integration/green_enrich/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/green_enrich/prompt.md +14 -0
- devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red/code_repair/prompt.md +12 -0
- devflow_engine/prompts/integration/red/example.md +152 -0
- devflow_engine/prompts/integration/red/node_config/prompt.md +86 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/node_config/prompt.md +84 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red/prompt.md +16 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red_repair/prompt.md +15 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/node_config/prompt.md +84 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/red_repair/prompt.md +11 -0
- devflow_engine/prompts/integration/red/red/prompt.md +11 -0
- devflow_engine/prompts/integration/red/red_repair/prompt.md +12 -0
- devflow_engine/prompts/integration/red_review/example.md +71 -0
- devflow_engine/prompts/integration/red_review/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/red_review/prompt.md +15 -0
- devflow_engine/prompts/integration/red_review/red_review/prompt.md +9 -0
- devflow_engine/prompts/integration/resolve/example.md +111 -0
- devflow_engine/prompts/integration/resolve/node_config/prompt.md +64 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/node_config/prompt.md +64 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_implicated_users/prompt.md +15 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_side_effects/prompt.md +15 -0
- devflow_engine/prompts/integration/resolve/resolve_implicated_users/prompt.md +10 -0
- devflow_engine/prompts/integration/resolve/resolve_side_effects/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/build_idea_acceptance_coverage/prompt.md +12 -0
- devflow_engine/prompts/integration/validate/code_repair/prompt.md +13 -0
- devflow_engine/prompts/integration/validate/example.md +143 -0
- devflow_engine/prompts/integration/validate/node_config/prompt.md +87 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/node_config/prompt.md +67 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_enrich_gate/prompt.md +17 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_repair/prompt.md +16 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/node_config/prompt.md +67 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/validate_repair/prompt.md +9 -0
- devflow_engine/prompts/integration/validate/validate_enrich_gate/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/validate_repair/prompt.md +20 -0
- devflow_engine/prompts/integration/write_workflows/example.md +100 -0
- devflow_engine/prompts/integration/write_workflows/node_config/prompt.md +44 -0
- devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/node_config/prompt.md +44 -0
- devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/write_workflows/prompt.md +17 -0
- devflow_engine/prompts/integration/write_workflows/write_workflows/prompt.md +11 -0
- devflow_engine/prompts/iterate/README.md +7 -0
- devflow_engine/prompts/iterate/coder/prompt.md +11 -0
- devflow_engine/prompts/iterate/framer/prompt.md +11 -0
- devflow_engine/prompts/iterate/iterator/prompt.md +13 -0
- devflow_engine/prompts/iterate/observer/prompt.md +11 -0
- devflow_engine/prompts/recovery/diagnosis/prompt.md +7 -0
- devflow_engine/prompts/recovery/execution/prompt.md +8 -0
- devflow_engine/prompts/recovery/execution_verification/prompt.md +7 -0
- devflow_engine/prompts/recovery/failure_investigation/prompt.md +10 -0
- devflow_engine/prompts/recovery/preflight_health_repo_repair/prompt.md +8 -0
- devflow_engine/prompts/recovery/remediation_execution/prompt.md +11 -0
- devflow_engine/prompts/recovery/root_cause_investigation/prompt.md +12 -0
- devflow_engine/prompts/scope_idea/doctrine/prompt.md +7 -0
- devflow_engine/prompts/source_doc_eval/document/prompt.md +6 -0
- devflow_engine/prompts/source_doc_eval/targeted_mutation/prompt.md +9 -0
- devflow_engine/prompts/source_doc_mutation/domain_entities/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/product_brief/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/project_doc_coherence/prompt.md +7 -0
- devflow_engine/prompts/source_doc_mutation/project_doc_render/prompt.md +9 -0
- devflow_engine/prompts/source_doc_mutation/source_doc_coherence/prompt.md +5 -0
- devflow_engine/prompts/source_doc_mutation/source_doc_enrichment_coherence/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/user_workflows/prompt.md +6 -0
- devflow_engine/prompts/source_scope/doctrine/prompt.md +10 -0
- devflow_engine/prompts/ui_grounding/doctrine/prompt.md +7 -0
- devflow_engine/recovery/__init__.py +3 -0
- devflow_engine/recovery/dag.py +2609 -0
- devflow_engine/recovery/models.py +220 -0
- devflow_engine/refactor.py +93 -0
- devflow_engine/registry/__init__.py +1 -0
- devflow_engine/registry/cards.py +238 -0
- devflow_engine/registry/domain_normalize.py +60 -0
- devflow_engine/registry/effects.py +65 -0
- devflow_engine/registry/enforce_report.py +150 -0
- devflow_engine/registry/module_cards_classify.py +164 -0
- devflow_engine/registry/module_cards_draft.py +184 -0
- devflow_engine/registry/module_cards_gate.py +59 -0
- devflow_engine/registry/packages.py +347 -0
- devflow_engine/registry/pathways.py +323 -0
- devflow_engine/review/__init__.py +11 -0
- devflow_engine/review/dag.py +588 -0
- devflow_engine/review/review_story.py +67 -0
- devflow_engine/scope_idea/__init__.py +3 -0
- devflow_engine/scope_idea/agentic.py +39 -0
- devflow_engine/scope_idea/dag.py +1069 -0
- devflow_engine/scope_idea/models.py +175 -0
- devflow_engine/skills/builtins/devflow/queue_failure_investigation/SKILL.md +112 -0
- devflow_engine/skills/builtins/devflow/queue_idea_to_story/SKILL.md +120 -0
- devflow_engine/skills/builtins/devflow/queue_integration/SKILL.md +105 -0
- devflow_engine/skills/builtins/devflow/queue_recovery/SKILL.md +108 -0
- devflow_engine/skills/builtins/devflow/queue_runtime_core/SKILL.md +155 -0
- devflow_engine/skills/builtins/devflow/queue_story_implementation/SKILL.md +122 -0
- devflow_engine/skills/builtins/devin/idea_to_story_handoff/SKILL.md +120 -0
- devflow_engine/skills/builtins/devin/ideation/SKILL.md +168 -0
- devflow_engine/skills/builtins/devin/ideation/state-and-phrasing-reference.md +18 -0
- devflow_engine/skills/builtins/devin/insight/SKILL.md +22 -0
- devflow_engine/skills/registry.example.yaml +42 -0
- devflow_engine/source_doc_assumptions.py +291 -0
- devflow_engine/source_doc_mutation_dag.py +1606 -0
- devflow_engine/source_doc_mutation_eval.py +417 -0
- devflow_engine/source_doc_mutation_worker.py +25 -0
- devflow_engine/source_docs_schema.py +207 -0
- devflow_engine/source_docs_updater.py +309 -0
- devflow_engine/source_scope/__init__.py +15 -0
- devflow_engine/source_scope/agentic.py +45 -0
- devflow_engine/source_scope/dag.py +1626 -0
- devflow_engine/source_scope/models.py +177 -0
- devflow_engine/stores/__init__.py +0 -0
- devflow_engine/stores/execution_store.py +3534 -0
- devflow_engine/story/__init__.py +0 -0
- devflow_engine/story/contracts.py +160 -0
- devflow_engine/story/discovery.py +47 -0
- devflow_engine/story/evidence.py +118 -0
- devflow_engine/story/hashing.py +27 -0
- devflow_engine/story/implemented_queue_purge.py +148 -0
- devflow_engine/story/indexer.py +105 -0
- devflow_engine/story/io.py +20 -0
- devflow_engine/story/markdown_contracts.py +298 -0
- devflow_engine/story/reconciliation.py +408 -0
- devflow_engine/story/validate_stories.py +149 -0
- devflow_engine/story/validate_tests_story.py +512 -0
- devflow_engine/story/validation.py +133 -0
- devflow_engine/ui_grounding/__init__.py +11 -0
- devflow_engine/ui_grounding/agentic.py +31 -0
- devflow_engine/ui_grounding/dag.py +874 -0
- devflow_engine/ui_grounding/models.py +224 -0
- devflow_engine/ui_grounding/pencil_bridge.py +247 -0
- devflow_engine/vendor/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/__init__.py +11 -0
- devflow_engine/vendor/datalumina_genai/core/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/core/exceptions.py +9 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/agent.py +48 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/agent_streaming_node.py +26 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/base.py +89 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/concurrent.py +30 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/router.py +69 -0
- devflow_engine/vendor/datalumina_genai/core/schema.py +72 -0
- devflow_engine/vendor/datalumina_genai/core/task.py +52 -0
- devflow_engine/vendor/datalumina_genai/core/validate.py +139 -0
- devflow_engine/vendor/datalumina_genai/core/workflow.py +200 -0
- devflow_engine/worker.py +1086 -0
- devflow_engine/worker_guard.py +233 -0
- devflow_engine-1.0.0.dist-info/METADATA +235 -0
- devflow_engine-1.0.0.dist-info/RECORD +393 -0
- devflow_engine-1.0.0.dist-info/WHEEL +4 -0
- devflow_engine-1.0.0.dist-info/entry_points.txt +3 -0
- devin/__init__.py +6 -0
- devin/dag.py +58 -0
- devin/dag_two_arm.py +138 -0
- devin/devin_chat_scenario_catalog.json +588 -0
- devin/devin_eval.py +677 -0
- devin/nodes/__init__.py +0 -0
- devin/nodes/ideation/__init__.py +0 -0
- devin/nodes/ideation/node.py +195 -0
- devin/nodes/ideation/playground.py +267 -0
- devin/nodes/ideation/prompt.md +65 -0
- devin/nodes/ideation/scenarios/continue_refinement.py +13 -0
- devin/nodes/ideation/scenarios/continue_refinement_evals.py +18 -0
- devin/nodes/ideation/scenarios/idea_fits_existing_patterns.py +17 -0
- devin/nodes/ideation/scenarios/idea_fits_existing_patterns_evals.py +16 -0
- devin/nodes/ideation/scenarios/large_idea_split.py +4 -0
- devin/nodes/ideation/scenarios/large_idea_split_evals.py +17 -0
- devin/nodes/ideation/scenarios/source_documentation_added.py +4 -0
- devin/nodes/ideation/scenarios/source_documentation_added_evals.py +16 -0
- devin/nodes/ideation/scenarios/user_says_create_it.py +30 -0
- devin/nodes/ideation/scenarios/user_says_create_it_evals.py +23 -0
- devin/nodes/ideation/scenarios/vague_idea.py +16 -0
- devin/nodes/ideation/scenarios/vague_idea_evals.py +47 -0
- devin/nodes/ideation/tools.json +312 -0
- devin/nodes/insight/__init__.py +0 -0
- devin/nodes/insight/node.py +49 -0
- devin/nodes/insight/playground.py +154 -0
- devin/nodes/insight/prompt.md +61 -0
- devin/nodes/insight/scenarios/architecture_pattern_query.py +15 -0
- devin/nodes/insight/scenarios/architecture_pattern_query_evals.py +25 -0
- devin/nodes/insight/scenarios/codebase_exploration.py +15 -0
- devin/nodes/insight/scenarios/codebase_exploration_evals.py +23 -0
- devin/nodes/insight/scenarios/devin_ideation_routing.py +19 -0
- devin/nodes/insight/scenarios/devin_ideation_routing_evals.py +39 -0
- devin/nodes/insight/scenarios/devin_insight_routing.py +20 -0
- devin/nodes/insight/scenarios/devin_insight_routing_evals.py +40 -0
- devin/nodes/insight/scenarios/operational_debugging.py +15 -0
- devin/nodes/insight/scenarios/operational_debugging_evals.py +23 -0
- devin/nodes/insight/scenarios/operational_question.py +9 -0
- devin/nodes/insight/scenarios/operational_question_evals.py +8 -0
- devin/nodes/insight/scenarios/queue_status.py +15 -0
- devin/nodes/insight/scenarios/queue_status_evals.py +23 -0
- devin/nodes/insight/scenarios/source_doc_explanation.py +14 -0
- devin/nodes/insight/scenarios/source_doc_explanation_evals.py +21 -0
- devin/nodes/insight/scenarios/worker_state_check.py +15 -0
- devin/nodes/insight/scenarios/worker_state_check_evals.py +22 -0
- devin/nodes/insight/tools.json +126 -0
- devin/nodes/intake/__init__.py +0 -0
- devin/nodes/intake/node.py +27 -0
- devin/nodes/intake/playground.py +47 -0
- devin/nodes/intake/prompt.md +12 -0
- devin/nodes/intake/scenarios/ideation_routing.py +4 -0
- devin/nodes/intake/scenarios/ideation_routing_evals.py +5 -0
- devin/nodes/intake/scenarios/insight_routing.py +4 -0
- devin/nodes/intake/scenarios/insight_routing_evals.py +5 -0
- devin/nodes/iterate/README.md +44 -0
- devin/nodes/iterate/__init__.py +1 -0
- devin/nodes/iterate/_archived_design_stages/01-objectives-requirements.md +112 -0
- devin/nodes/iterate/_archived_design_stages/02-evals.md +131 -0
- devin/nodes/iterate/_archived_design_stages/03-tools-and-boundaries.md +110 -0
- devin/nodes/iterate/_archived_design_stages/04-harness-and-playground.md +32 -0
- devin/nodes/iterate/_archived_design_stages/05-prompt-deferred.md +11 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/03-tools-and-boundaries.md +13 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/01-objectives-requirements.md +25 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/02-evals.md +9 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/04-harness-and-playground.md +13 -0
- devin/nodes/iterate/agent-roles.md +89 -0
- devin/nodes/iterate/agents/README.md +10 -0
- devin/nodes/iterate/artifacts.md +504 -0
- devin/nodes/iterate/contract.md +100 -0
- devin/nodes/iterate/eval-plan.md +74 -0
- devin/nodes/iterate/node.py +100 -0
- devin/nodes/iterate/pipeline/README.md +13 -0
- devin/nodes/iterate/playground-contract.md +76 -0
- devin/nodes/iterate/prompt.md +11 -0
- devin/nodes/iterate/scenarios/README.md +38 -0
- devin/nodes/iterate/scenarios/artifact-and-loop-scenarios.md +101 -0
- devin/nodes/iterate/scenarios/coder_artifact_alignment.py +32 -0
- devin/nodes/iterate/scenarios/coder_artifact_alignment_evals.py +45 -0
- devin/nodes/iterate/scenarios/coder_bounded_fix.py +27 -0
- devin/nodes/iterate/scenarios/coder_bounded_fix_evals.py +45 -0
- devin/nodes/iterate/scenarios/devin_iterate_routing.py +21 -0
- devin/nodes/iterate/scenarios/devin_iterate_routing_evals.py +36 -0
- devin/nodes/iterate/scenarios/framer_scope_boundary.py +25 -0
- devin/nodes/iterate/scenarios/framer_scope_boundary_evals.py +57 -0
- devin/nodes/iterate/scenarios/framer_task_framing.py +25 -0
- devin/nodes/iterate/scenarios/framer_task_framing_evals.py +58 -0
- devin/nodes/iterate/scenarios/iterate_error_fix.py +21 -0
- devin/nodes/iterate/scenarios/iterate_error_fix_evals.py +39 -0
- devin/nodes/iterate/scenarios/iterate_quick_change.py +21 -0
- devin/nodes/iterate/scenarios/iterate_quick_change_evals.py +35 -0
- devin/nodes/iterate/scenarios/iterate_to_idea_promotion.py +23 -0
- devin/nodes/iterate/scenarios/iterate_to_idea_promotion_evals.py +53 -0
- devin/nodes/iterate/scenarios/iterate_to_insight_reroute.py +23 -0
- devin/nodes/iterate/scenarios/iterate_to_insight_reroute_evals.py +53 -0
- devin/nodes/iterate/scenarios/observer_evidence_seam.py +28 -0
- devin/nodes/iterate/scenarios/observer_evidence_seam_evals.py +55 -0
- devin/nodes/iterate/scenarios/observer_repro_creation.py +28 -0
- devin/nodes/iterate/scenarios/observer_repro_creation_evals.py +45 -0
- devin/nodes/iterate/scenarios/routing-matrix.md +45 -0
- devin/nodes/shared/__init__.py +0 -0
- devin/nodes/shared/filemaker_expert.md +80 -0
- devin/nodes/shared/filemaker_expert.py +354 -0
- devin/nodes/shared/filemaker_expert_eval/runner.py +176 -0
- devin/nodes/shared/filemaker_expert_eval/scenarios.json +65 -0
- devin/nodes/shared/goldilocks_advisor_eval/runner.py +214 -0
- devin/nodes/shared/goldilocks_advisor_eval/scenarios.json +58 -0
- devin/nodes/shared/helpers.py +156 -0
- devin/nodes/shared/idea_compliance_advisor_eval/runner.py +252 -0
- devin/nodes/shared/idea_compliance_advisor_eval/scenarios.json +75 -0
- devin/nodes/shared/models.py +44 -0
- devin/nodes/shared/post.py +40 -0
- devin/nodes/shared/router.py +107 -0
- devin/nodes/shared/tools.py +191 -0
- devin/shared/devin-chat-rubric.md +237 -0
- devin/shared/devin-chat-scenario-suite.md +90 -0
- devin/shared/eval_doctrine.md +9 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# Iterate arm contract
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
`Iterate` is the Devin chat arm for hyper-specific fixes, quick changes, and targeted improvements against an existing surface.
|
|
6
|
+
|
|
7
|
+
It fills the gap between:
|
|
8
|
+
- `Idea`, which shapes planning truth for broader downstream work
|
|
9
|
+
- `Insight`, which stays read-only and explanatory
|
|
10
|
+
|
|
11
|
+
## Recommended route set
|
|
12
|
+
|
|
13
|
+
Top-level intake routing should become:
|
|
14
|
+
- `idea`
|
|
15
|
+
- `insight`
|
|
16
|
+
- `iterate`
|
|
17
|
+
- `neither`
|
|
18
|
+
|
|
19
|
+
## Route into Iterate when
|
|
20
|
+
|
|
21
|
+
The user is asking for a bounded change against an existing surface, for example:
|
|
22
|
+
- fixing a concrete error
|
|
23
|
+
- making a small behavior change
|
|
24
|
+
- tweaking a page, component, route, or flow
|
|
25
|
+
- improving a narrow interaction without opening broad product planning
|
|
26
|
+
|
|
27
|
+
## Route away from Iterate when
|
|
28
|
+
|
|
29
|
+
### To `insight`
|
|
30
|
+
- the user only wants explanation, diagnosis, or investigation
|
|
31
|
+
- no implementation path is being requested yet
|
|
32
|
+
|
|
33
|
+
### To `idea`
|
|
34
|
+
- the request is broader feature or workflow planning
|
|
35
|
+
- the change is no longer task-scale
|
|
36
|
+
- success depends on new planning truth rather than a targeted delta
|
|
37
|
+
|
|
38
|
+
## Core objective
|
|
39
|
+
|
|
40
|
+
Own a targeted request from framing through verified completion, or stop with an honest blocked or promotion verdict.
|
|
41
|
+
|
|
42
|
+
## Primary objectives
|
|
43
|
+
|
|
44
|
+
1. Convert the request into a precise task artifact.
|
|
45
|
+
2. Ground the task in evidence, repro, or a red verification seam.
|
|
46
|
+
3. Supervise implementation through a bounded coding loop.
|
|
47
|
+
4. Keep scope tight and resist turning task work into ideation theater.
|
|
48
|
+
5. Preserve ownership boundaries: Iterator owns truth and validation, Coder owns implementation attempts.
|
|
49
|
+
|
|
50
|
+
## Derived non-goals
|
|
51
|
+
|
|
52
|
+
- Do not behave like broad ideation intake.
|
|
53
|
+
- Do not stay read-only when the user clearly wants a fix or change.
|
|
54
|
+
- Do not claim reproducibility without evidence.
|
|
55
|
+
- Do not claim completion without scoped verification.
|
|
56
|
+
- Do not broaden scope without explicit approval.
|
|
57
|
+
- Do not let the implementation worker redefine the task contract.
|
|
58
|
+
|
|
59
|
+
## Orchestration pattern
|
|
60
|
+
|
|
61
|
+
Preferred pattern: advisor-primary with one supervised coding subagent.
|
|
62
|
+
|
|
63
|
+
- Primary: `Iterator`
|
|
64
|
+
- Advisors: `Framer`, `Observer`
|
|
65
|
+
- Worker subagent: `Coder`
|
|
66
|
+
|
|
67
|
+
This should remain a simple accountable structure, not a peer swarm.
|
|
68
|
+
|
|
69
|
+
## Pipeline order for this node
|
|
70
|
+
|
|
71
|
+
The iterate lane should be designed in this order:
|
|
72
|
+
1. objectives and requirements
|
|
73
|
+
2. evals
|
|
74
|
+
3. tools and boundaries
|
|
75
|
+
4. harness and playground
|
|
76
|
+
5. prompt content only after the first four are stable
|
|
77
|
+
|
|
78
|
+
Cross-agent stage docs live in `pipeline/`.
|
|
79
|
+
Per-agent stage docs live in `agents/<agent>/`.
|
|
80
|
+
|
|
81
|
+
## Verification loop
|
|
82
|
+
|
|
83
|
+
1. Framer produces the task artifact.
|
|
84
|
+
2. Observer produces the observation artifact.
|
|
85
|
+
3. Iterator decides whether truth is sufficient to proceed.
|
|
86
|
+
4. Iterator spawns Coder.
|
|
87
|
+
5. Coder attempts the scoped delta.
|
|
88
|
+
6. Iterator validates against the observation seam, success criteria, and scope boundary.
|
|
89
|
+
7. If repairable but not aligned, Iterator respawns Coder with repair-specific context.
|
|
90
|
+
8. If aligned, Iterator returns completion.
|
|
91
|
+
9. If the work is no longer task-scale or truth is missing, Iterator blocks or escalates clearly.
|
|
92
|
+
|
|
93
|
+
## Completion gate
|
|
94
|
+
|
|
95
|
+
Iterator may only return completion when all of the following are true:
|
|
96
|
+
1. the confirmed repro no longer reproduces, or the scoped failing seam is green
|
|
97
|
+
2. success criteria are satisfied
|
|
98
|
+
3. the implemented change stayed within task scope
|
|
99
|
+
4. observation evidence and final behavior agree
|
|
100
|
+
5. no blocker remains that invalidates the claim
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Iterate eval plan
|
|
2
|
+
|
|
3
|
+
This file captures the pre-prompt eval targets that should exist before prompt writing or runtime wiring.
|
|
4
|
+
|
|
5
|
+
## Pipeline stance
|
|
6
|
+
|
|
7
|
+
The eval layer comes after objectives and requirements, but before tool affordances or playground implementation details.
|
|
8
|
+
|
|
9
|
+
For the iterate lane, evals should be readable at three levels:
|
|
10
|
+
- route and lane level
|
|
11
|
+
- per-agent accountability level
|
|
12
|
+
- end-to-end truthful completion level
|
|
13
|
+
|
|
14
|
+
## Route evals
|
|
15
|
+
|
|
16
|
+
1. route a concrete fix request into `iterate`
|
|
17
|
+
2. route an investigation-only request into `insight`
|
|
18
|
+
3. promote a broader feature or workflow request into `idea`
|
|
19
|
+
4. keep a small request in `iterate` instead of inflating it
|
|
20
|
+
|
|
21
|
+
## Framing evals
|
|
22
|
+
|
|
23
|
+
1. turn messy conversational input into a bounded task artifact
|
|
24
|
+
2. extract route, page, component, file, or function hints when present
|
|
25
|
+
3. distinguish current behavior from desired behavior clearly
|
|
26
|
+
4. write observable success criteria rather than vague aspirations
|
|
27
|
+
5. surface blocking unknowns when the request is underspecified
|
|
28
|
+
|
|
29
|
+
## Observation evals
|
|
30
|
+
|
|
31
|
+
1. confirm a reported error when evidence exists
|
|
32
|
+
2. report `not_confirmed` honestly when the error cannot be reproduced
|
|
33
|
+
3. create a bounded red seam for a targeted improvement
|
|
34
|
+
4. provide repro steps another agent can run
|
|
35
|
+
5. avoid inventing evidence when logs are silent
|
|
36
|
+
6. ask for more context when truth is genuinely missing
|
|
37
|
+
|
|
38
|
+
## Iterator supervision evals
|
|
39
|
+
|
|
40
|
+
1. refuse to spawn Coder before framing and observation are sufficient
|
|
41
|
+
2. respawn Coder after a near miss with tighter repair context
|
|
42
|
+
3. refuse completion when the observation seam is still red
|
|
43
|
+
4. refuse completion when success criteria are unmet
|
|
44
|
+
5. block honestly when missing truth prevents safe implementation
|
|
45
|
+
6. escalate when the task grows past iterate scale
|
|
46
|
+
|
|
47
|
+
## Coder evals
|
|
48
|
+
|
|
49
|
+
1. fix a reproducible error without unrelated drift
|
|
50
|
+
2. satisfy a targeted improvement seam
|
|
51
|
+
3. report partial progress honestly when the first attempt fails
|
|
52
|
+
4. stay within the scoped files and surfaces when the task is narrow
|
|
53
|
+
5. return actionable blocker detail when safe completion is impossible
|
|
54
|
+
6. improve on a second attempt when respawned with repair context
|
|
55
|
+
|
|
56
|
+
## Completion truth evals
|
|
57
|
+
|
|
58
|
+
1. no green claim without green seam
|
|
59
|
+
2. no success claim without the requested user-visible outcome
|
|
60
|
+
3. no completion claim when required repro truth was never established
|
|
61
|
+
4. no completion claim after unauthorized scope expansion
|
|
62
|
+
|
|
63
|
+
## Detailed stage mapping
|
|
64
|
+
|
|
65
|
+
- cross-agent eval design: `pipeline/02-evals.md`
|
|
66
|
+
- agent-specific eval design: `agents/*/02-evals.md`
|
|
67
|
+
- scenario planning inputs: `scenarios/`
|
|
68
|
+
|
|
69
|
+
## Expected future repo mapping
|
|
70
|
+
|
|
71
|
+
When implementation begins, these eval categories should map cleanly into:
|
|
72
|
+
- intake routing scenarios for the new `iterate` arm
|
|
73
|
+
- iterate scenario fixtures under `src/devin/nodes/iterate/scenarios/`
|
|
74
|
+
- eventual `_evals.py` checks that mirror the existing ideation and insight harness style
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from devflow_engine.devin2.pi_runner import run_devin2_pi_agent
|
|
6
|
+
from devflow_engine.vendor.datalumina_genai.core.nodes.base import Node
|
|
7
|
+
from devflow_engine.vendor.datalumina_genai.core.task import TaskContext
|
|
8
|
+
from devin.nodes.shared.helpers import (
|
|
9
|
+
dfs_node_running,
|
|
10
|
+
load_node_prompt_lines,
|
|
11
|
+
pipeline_root,
|
|
12
|
+
resolve_project_id,
|
|
13
|
+
store_run,
|
|
14
|
+
write_json,
|
|
15
|
+
)
|
|
16
|
+
from devin.nodes.shared.models import DevinAgentResponse
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class IterateAgentNode(Node):
    """Workflow node that runs the Devin iterate agent for one event.

    The node registers a node attempt in the run store, invokes the agent
    through ``run_devin2_pi_agent``, writes the structured response to
    ``iterate_response.json`` under the pipeline directory, records it as an
    artifact, and publishes it on ``task_context.metadata``.
    """

    async def process(self, task_context: TaskContext) -> TaskContext:
        """Run the iterate agent and record its response.

        Args:
            task_context: Carries the triggering ``event`` (``repo_root``,
                ``idea_id``, ``raw_text``, ``pipeline_key`` are read here) and
                a mutable ``metadata`` mapping.

        Returns:
            The same ``task_context``, with ``response_guidance`` and
            ``agent_loop_terminal`` set in its metadata.
        """
        event = task_context.event
        repo_root = Path(event.repo_root)

        # Register this execution so the artifact and final status below can
        # be attached to it.
        store, run_id = store_run()
        node_exec_id = store.create_node_attempt(
            run_id=run_id,
            node_id='iterate_agent',
            node_name='IterateAgent',
            attempt=1,
        )

        # An explicit project id in metadata wins; otherwise resolve it from
        # the repository and idea.
        project_id = str(
            task_context.metadata.get('project_id')
            or resolve_project_id(repo_root, idea_id=event.idea_id)
        )
        dfs_node_running(
            project_id=project_id,
            run_id=run_id,
            node_id='iterate_agent',
            summary='Running Devin iterate agent',
            idea_id=event.idea_id,
        )

        # Node-specific prompt lines first, then the fixed guardrails.
        guidance = load_node_prompt_lines(__file__) + [
            'Return exactly one truthful outcome kind.',
            'Keep the work task-scale and tied to the existing surface described by the user.',
            'Use available context and artifacts as the only truth source.',
        ]

        session_id = f"iterate:{project_id}:{event.idea_id}"

        context_payload = {
            'idea_id': event.idea_id,
            # Metadata text takes precedence over the event's own text.
            'current_user_message': str(task_context.metadata.get('raw_text') or event.raw_text or ''),
            'route': task_context.metadata.get('route') or {},
            'project_id': project_id,
            'repo_root': str(repo_root),
            'session_id': session_id,
        }

        # NOTE(review): this is a plain (non-awaited) call inside a coroutine;
        # if the runner blocks for up to ``timeout_seconds`` it will hold the
        # event loop for that long - confirm whether that is acceptable.
        # NOTE(review): if this call raises, the node attempt created above is
        # never marked finished - confirm whether a caller handles that.
        result = run_devin2_pi_agent(
            repo_root=repo_root,
            stage_name='devin_iterate_response',
            route_arm='iterate',
            context_payload=context_payload,
            operational_guidance=guidance,
            output_model=DevinAgentResponse,
            timeout_seconds=90,
        )
        model = DevinAgentResponse.model_validate(result.response_model.model_dump())

        # Resolve the pipeline directory once so the path reported in the
        # payload and the path the artifact is written to cannot diverge.
        pipeline_dir = pipeline_root(
            repo_root,
            idea_id=event.idea_id,
            pipeline_key=event.pipeline_key,
        )
        response_payload = {
            'idea_id': event.idea_id,
            'pipeline_dir': str(pipeline_dir),
            'response_message': model.response_message,
            'response_kind': model.response_kind,
            'suggested_next_step': model.suggested_next_step,
            'follow_up_questions': model.follow_up_questions,
            'response_style_notes': model.style_notes,
        }
        out_path = pipeline_dir / 'iterate_response.json'
        write_json(out_path, response_payload)

        store.add_artifact(
            run_id=run_id,
            node_exec_id=node_exec_id,
            kind='devin_iterate_response',
            uri=str(out_path),
            metadata={'response_kind': model.response_kind},
        )
        store.mark_node_finished(
            node_exec_id=node_exec_id,
            status='succeeded',
            output=response_payload,
        )

        task_context.metadata['response_guidance'] = response_payload
        # The terminal record is the response payload plus a ``status`` key
        # mirroring the agent's response kind.
        task_context.metadata['agent_loop_terminal'] = {
            'status': model.response_kind,
            **response_payload,
        }
        return task_context
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Iterate design pipeline
|
|
2
|
+
|
|
3
|
+
This folder makes the development order explicit for the iterate lane.
|
|
4
|
+
|
|
5
|
+
Read and review in order:
|
|
6
|
+
1. `01-objectives-requirements.md`
|
|
7
|
+
2. `02-evals.md`
|
|
8
|
+
3. `03-tools-and-boundaries.md`
|
|
9
|
+
4. `04-harness-and-playground.md`
|
|
10
|
+
5. `05-prompt-deferred.md`
|
|
11
|
+
|
|
12
|
+
The first four stages are design inputs.
|
|
13
|
+
The fifth is a reminder that prompt authoring is intentionally delayed.
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Iterate playground contract
|
|
2
|
+
|
|
3
|
+
A future Iterate playground should validate the arm as a whole, not just isolated prompt wording.
|
|
4
|
+
|
|
5
|
+
## Placement in the development pipeline
|
|
6
|
+
|
|
7
|
+
Harness and playground design is the fourth stage, after:
|
|
8
|
+
1. objectives and requirements
|
|
9
|
+
2. evals
|
|
10
|
+
3. tools and boundaries
|
|
11
|
+
|
|
12
|
+
This doc stays downstream of those decisions and should not invent them.
|
|
13
|
+
|
|
14
|
+
## Required scenario coverage
|
|
15
|
+
|
|
16
|
+
- reproducible error with logs
|
|
17
|
+
- non-reproducible reported error
|
|
18
|
+
- quick UI or behavior tweak
|
|
19
|
+
- targeted improvement with explicit success criteria
|
|
20
|
+
- coder first pass fails, second pass succeeds
|
|
21
|
+
- request that should route to `idea`
|
|
22
|
+
- request that should route to `insight`
|
|
23
|
+
|
|
24
|
+
## What the playground should verify
|
|
25
|
+
|
|
26
|
+
- routing correctness
|
|
27
|
+
- task artifact completeness
|
|
28
|
+
- observation honesty
|
|
29
|
+
- iterator supervision behavior
|
|
30
|
+
- coder respawn behavior
|
|
31
|
+
- completion truthfulness
|
|
32
|
+
- scope discipline
|
|
33
|
+
- readiness state transitions in `iterator_run`
|
|
34
|
+
- monotonic top-level artifact revisions
|
|
35
|
+
- promotion linkage through `promotion_handoff.json` when the lane exits to `idea` or `insight`
|
|
36
|
+
- attempt-scoped verifier artifact references and summaries
|
|
37
|
+
- exact ordinal attempt ids of the form `attempt-001`, `attempt-002`, and so on
|
|
38
|
+
- normalized verifier envelopes that stay parseable across verifier types
|
|
39
|
+
|
|
40
|
+
## Harness expectations
|
|
41
|
+
|
|
42
|
+
The eventual harness should be able to:
|
|
43
|
+
- load prior conversation turns
|
|
44
|
+
- provide repo root and project context
|
|
45
|
+
- feed a route payload into the iterate arm
|
|
46
|
+
- inspect generated artifacts under `.devflow/iterate/<task_id>/`
|
|
47
|
+
- inspect `iterator_run.run_state` and `iterator_run.readiness`
|
|
48
|
+
- inspect top-level artifact revisions and the exact revisions cited by readiness or promotion decisions
|
|
49
|
+
- inspect `promotion_handoff.json` and its source refs when work leaves iterate
|
|
50
|
+
- inspect attempt-scoped verification summaries and raw verifier outputs separately
|
|
51
|
+
- assert that attempt directories sort in execution order via `attempt-<NNN...>` ids
|
|
52
|
+
- read shared verifier-envelope fields such as `overall_result`, `green_condition_alignment`, and `evidence_refs` without depending on verifier-specific payload structure
|
|
53
|
+
- inspect tool-call or worker-attempt traces
|
|
54
|
+
- evaluate terminal response guidance and disposition
|
|
55
|
+
|
|
56
|
+
## Fixture expectations
|
|
57
|
+
|
|
58
|
+
Each fixture should make clear:
|
|
59
|
+
- user request
|
|
60
|
+
- relevant prior turns
|
|
61
|
+
- project and repo context
|
|
62
|
+
- expected route outcome
|
|
63
|
+
- expected artifact shape
|
|
64
|
+
- expected readiness or blocker verdict
|
|
65
|
+
- expected completion or escalation behavior
|
|
66
|
+
|
|
67
|
+
## Per-agent harness hooks
|
|
68
|
+
|
|
69
|
+
- `Iterator` needs supervision and terminal-verdict inspection
|
|
70
|
+
- `Framer` needs task-artifact inspection
|
|
71
|
+
- `Observer` needs evidence and green-condition inspection
|
|
72
|
+
- `Coder` needs attempt-report and narrow-verification inspection
|
|
73
|
+
|
|
74
|
+
## Guardrail
|
|
75
|
+
|
|
76
|
+
The playground should not treat superficial harness success as proof of real coding quality. It should only verify the bounded contract the Iterate arm claims to satisfy.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Iterator
|
|
2
|
+
|
|
3
|
+
Own the iterate task end to end using only the provided request, artifacts, and evidence, then return one truthful lane outcome.
|
|
4
|
+
|
|
5
|
+
- Synthesize framing, observation, and attempt evidence without collapsing their roles.
|
|
6
|
+
- Decide readiness before coding starts.
|
|
7
|
+
- Keep scope aligned to the task contract.
|
|
8
|
+
- Respawn only with repair-specific guidance when the task is still viable.
|
|
9
|
+
- Block, reroute, or promote when truth or scope is insufficient.
|
|
10
|
+
- Do not rewrite advisor conclusions casually.
|
|
11
|
+
- Do not claim success without verification against the stated green condition.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Iterate scenarios
|
|
2
|
+
|
|
3
|
+
Runnable scenario modules and evals for the Iterate arm.
|
|
4
|
+
|
|
5
|
+
## Scenario files
|
|
6
|
+
|
|
7
|
+
Each scenario has a `.py` file (scenario definition) and a matching `_evals.py` (evaluation criteria and `evaluate()` function).
|
|
8
|
+
|
|
9
|
+
### Arm-level scenarios
|
|
10
|
+
- `devin_iterate_routing.py` — routes to iterate correctly
|
|
11
|
+
- `iterate_error_fix.py` — reproducible error fix
|
|
12
|
+
- `iterate_quick_change.py` — targeted behavior change
|
|
13
|
+
- `iterate_to_insight_reroute.py` — reroute to insight via tool call
|
|
14
|
+
- `iterate_to_idea_promotion.py` — promote to idea via tool call
|
|
15
|
+
|
|
16
|
+
### Subagent scenarios
|
|
17
|
+
- `framer_task_framing.py` — vague request → well-formed task_artifact
|
|
18
|
+
- `framer_scope_boundary.py` — framer keeps scope bounded
|
|
19
|
+
- `observer_evidence_seam.py` — observer identifies failing seam from evidence
|
|
20
|
+
- `observer_repro_creation.py` — observer creates narrow, reproducible repro
|
|
21
|
+
- `coder_bounded_fix.py` — coder implements targeted fix matching framed task
|
|
22
|
+
- `coder_artifact_alignment.py` — coder output aligns with observation_artifact
|
|
23
|
+
|
|
24
|
+
## Running scenarios
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Run all scenarios
|
|
28
|
+
/Users/devflow/repos/devflow_engine/.venv/bin/python3 \
|
|
29
|
+
/Users/devflow/repos/devflow_engine/playground/iterate_arm_playground.py
|
|
30
|
+
|
|
31
|
+
# Run specific scenario
|
|
32
|
+
/Users/devflow/repos/devflow_engine/.venv/bin/python3 \
|
|
33
|
+
/Users/devflow/repos/devflow_engine/playground/iterate_arm_playground.py \
|
|
34
|
+
--scenario-name <name>
|
|
35
|
+
|
|
36
|
+
# Run via shell runner
|
|
37
|
+
bash /Users/devflow/repos/devflow_engine/playground/run_iterate_scenarios.sh
|
|
38
|
+
```
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Iterate artifact and loop scenarios
|
|
2
|
+
|
|
3
|
+
## Framing scenarios
|
|
4
|
+
|
|
5
|
+
### messy_error_report_with_partial_location
|
|
6
|
+
Expected checks:
|
|
7
|
+
- Framer derives `task_type=error_fix`
|
|
8
|
+
- captures partial surface or route hints
|
|
9
|
+
- separates facts from assumptions
|
|
10
|
+
- records blocking unknowns if needed
|
|
11
|
+
- uses the shared base `task_artifact` shape with `task_details.error_fix`
|
|
12
|
+
|
|
13
|
+
### tiny_copy_or_behavior_tweak
|
|
14
|
+
Expected checks:
|
|
15
|
+
- Framer keeps scope small
|
|
16
|
+
- success criteria stay observable
|
|
17
|
+
- no inflation into broader planning
|
|
18
|
+
- uses `task_details.quick_change` rather than inventing a separate task schema
|
|
19
|
+
|
|
20
|
+
### underspecified_but_repairable_request
|
|
21
|
+
Expected checks:
|
|
22
|
+
- Framer recommends `investigate_first`
|
|
23
|
+
- task artifact stays honest about unknowns
|
|
24
|
+
- Iterator does not move to `ready_for_coder` until observation truth exists
|
|
25
|
+
|
|
26
|
+
## Observation scenarios
|
|
27
|
+
|
|
28
|
+
### error_confirmed_by_logs
|
|
29
|
+
Expected checks:
|
|
30
|
+
- Observer records log evidence
|
|
31
|
+
- minimal repro exists or repeatability is confirmed
|
|
32
|
+
- verdict is `ready_for_coder`
|
|
33
|
+
- observation artifact names a green condition that later verifier summaries can cite
|
|
34
|
+
|
|
35
|
+
### user_reported_error_not_confirmed
|
|
36
|
+
Expected checks:
|
|
37
|
+
- Observer reports `not_confirmed` or `inconclusive`
|
|
38
|
+
- no fake repro claim
|
|
39
|
+
- verdict is `needs_more_context`
|
|
40
|
+
- iterator run stays in a pre-coding state or blocks honestly
|
|
41
|
+
|
|
42
|
+
### targeted_improvement_red_seam
|
|
43
|
+
Expected checks:
|
|
44
|
+
- Observer creates a bounded failing seam
|
|
45
|
+
- expected green condition is explicit
|
|
46
|
+
- seam remains narrow enough for attempt-scoped verifier artifacts
|
|
47
|
+
- later verifier records can state `green_condition_alignment` against one explicit seam without inventing verifier-specific top-level schemas
|
|
48
|
+
|
|
49
|
+
## Iterator loop scenarios
|
|
50
|
+
|
|
51
|
+
### coder_near_miss_then_repair_success
|
|
52
|
+
Expected checks:
|
|
53
|
+
- Iterator rejects premature success
|
|
54
|
+
- respawn reason is explicit
|
|
55
|
+
- first attempt stores summary in `iterator_run` and normalized verifier artifacts under `attempts/<attempt_id>/`
|
|
56
|
+
- first attempt uses `attempt-001`, second uses `attempt-002`
|
|
57
|
+
- second attempt converges on the scoped green condition
|
|
58
|
+
- `run_state` moves through `needs_respawn` to `completed`
|
|
59
|
+
|
|
60
|
+
### missing_truth_blocks_safe_implementation
|
|
61
|
+
Expected checks:
|
|
62
|
+
- Iterator neither spawns Coder blindly nor claims completion
|
|
63
|
+
- readiness is recorded as not ready or blocked, not buried in attempt notes
|
|
64
|
+
- terminal state is blocked with a concrete reason
|
|
65
|
+
|
|
66
|
+
### request_outgrows_iterate_during_loop
|
|
67
|
+
Expected checks:
|
|
68
|
+
- Iterator promotes or escalates instead of hiding broader planning inside iterate
|
|
69
|
+
- run record lands in `promoted`
|
|
70
|
+
- promotion is visible in the durable iterate record rather than implied only in chat output
|
|
71
|
+
- `promotion_handoff.json` exists and points back to the exact task and observation revisions that justified the promotion
|
|
72
|
+
|
|
73
|
+
### reroute_to_insight_after_observation
|
|
74
|
+
Expected checks:
|
|
75
|
+
- Observer discovers the user actually needed diagnosis or investigation, not implementation
|
|
76
|
+
- Iterator reroutes to `insight` instead of spawning Coder
|
|
77
|
+
- `promotion_handoff.json` records `target_lane=insight`
|
|
78
|
+
- downstream refs may be null, but the iterate-owned handoff record is present and auditable
|
|
79
|
+
|
|
80
|
+
## Coder attempt scenarios
|
|
81
|
+
|
|
82
|
+
### scoped_fix_succeeds_first_pass
|
|
83
|
+
Expected checks:
|
|
84
|
+
- Coder stays in scoped files or surfaces
|
|
85
|
+
- verification seam is relevant and narrow
|
|
86
|
+
- Iterator can validate completion without ambiguity
|
|
87
|
+
- run record references attempt-scoped verifier artifacts instead of inlining raw output
|
|
88
|
+
- verifier artifacts expose shared fields like `overall_result` and `green_condition_alignment` even if the underlying verifier payload differs
|
|
89
|
+
|
|
90
|
+
### first_pass_fails_but_report_is_honest
|
|
91
|
+
Expected checks:
|
|
92
|
+
- Coder reports what failed and what remains blocked
|
|
93
|
+
- Iterator has enough signal to craft a repair-specific retry
|
|
94
|
+
- attempt summary stays concise while preserving a pointer to fuller verifier output
|
|
95
|
+
- fuller verifier output still keeps the shared normalization envelope, with tool-specific detail nested under `native_payload`
|
|
96
|
+
|
|
97
|
+
### revised_task_before_coding
|
|
98
|
+
Expected checks:
|
|
99
|
+
- Framer can amend the task without creating a second task root
|
|
100
|
+
- `task_artifact.json` revision increases monotonically
|
|
101
|
+
- Iterator readiness cites the exact task and observation revisions it relied on
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Scenario: the coder must solve the problem the observer bounded, not a
# neighbouring one.
SCENARIO_NAME = "coder_artifact_alignment"
SCENARIO_DESCRIPTION = (
    "Coder output should stay aligned to the supplied observation artifact "
    "and not solve a different problem than the one observer bounded."
)

# Input handed to the coder role: the framed task plus the observer's seam.
INPUT_PAYLOAD = dict(
    role="coder",
    repo_root="/Users/devflow/repos/devflow_engine",
    task_artifact=dict(
        task_type="targeted_improvement",
        surface="CSV import wizard",
        scope_boundary="Trim whitespace around email values during import only.",
        success_criteria=["emails with surrounding spaces import successfully"],
    ),
    observation_artifact=dict(
        failing_seam="CSV preview rejects rows because email validation runs before whitespace trim.",
        expected_green_condition="Import preview trims email values before validation.",
        repro_steps=[
            "Upload CSV with email value ' alice@example.com '",
            "Open import preview",
            "Observe invalid email validation error",
        ],
        ready_for_coder=True,
    ),
)

# Every listed behavior is expected to hold for a passing run.
EXPECTED_BEHAVIOR = dict.fromkeys(
    (
        "produces_coder_report",
        "aligns_to_observation_artifact",
        "stays_in_scope",
        "runs_narrow_verification",
        "reports_attempt_honestly",
    ),
    True,
)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.coder_artifact_alignment import EXPECTED_BEHAVIOR
|
|
2
|
+
|
|
3
|
+
# Criteria for the coder_artifact_alignment scenario; all must hold.
EVAL_CRITERIA = dict.fromkeys(
    (
        "produces_coder_report",
        "aligns_to_observation_artifact",
        "stays_in_scope",
        "runs_narrow_verification",
        "reports_attempt_honestly",
    ),
    True,
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check a coder report against the CSV email-trim observation seam.

    Args:
        actual_output: The coder's report as a dict. Keys read here are
            ``changed_files``, ``verification`` / ``verification_summary``,
            ``summary``, ``what_changed``, ``what_passed`` and ``what_failed``.

    Returns:
        ``(ok, notes)`` where ``notes`` lists every failed check in order and
        ``ok`` is true only when no check failed.
    """
    problems: list[str] = []

    if not actual_output.get("changed_files"):
        problems.append("missing changed_files for coder output")

    has_verification = actual_output.get("verification") or actual_output.get("verification_summary")
    if not has_verification:
        problems.append("missing verification aligned to observation_artifact")

    # Fold the free-text report fields into one lowercase string for the
    # keyword checks; empty or missing fields are skipped.
    fragments = []
    for field in ("summary", "what_changed", "what_passed", "what_failed"):
        value = actual_output.get(field)
        if value:
            fragments.append(str(value))
    narrative = " ".join(fragments).lower()

    # The keyword checks only apply when there is report text to inspect.
    if narrative:
        seam_terms = ("csv", "email", "trim", "validation", "preview")
        if all(term not in narrative for term in seam_terms):
            problems.append("coder report does not align to the observation artifact seam")

        drift_terms = ("invite", "dashboard", "unrelated cleanup", "schema redesign")
        if any(term in narrative for term in drift_terms):
            problems.append("coder output appears to solve a different problem than the observed seam")

    return not problems, problems
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Scenario: the coder applies the smallest fix that satisfies the framed task.
SCENARIO_NAME = "coder_bounded_fix"
SCENARIO_DESCRIPTION = (
    "Coder receives framed and observed iterate artifacts and applies a "
    "minimal fix that matches the scoped task instead of broadening the change."
)

# Input handed to the coder role: the framed task plus the observer's seam.
INPUT_PAYLOAD = dict(
    role="coder",
    repo_root="/Users/devflow/repos/devflow_engine",
    task_artifact=dict(
        task_type="error_fix",
        surface="invite acceptance",
        scope_boundary="Fix the accept-invite failure only. No refactor or UX redesign.",
        success_criteria=["POST /api/invites/accept returns 200 for a valid invite token"],
    ),
    observation_artifact=dict(
        failing_seam="Invite acceptance fails when accepted_at is None during persistence.",
        expected_green_condition="Valid invite acceptance sets accepted_at before persistence.",
        ready_for_coder=True,
    ),
)

# Every listed behavior is expected to hold for a passing run.
EXPECTED_BEHAVIOR = dict.fromkeys(
    (
        "produces_coder_report",
        "implements_targeted_fix",
        "stays_in_scope",
        "runs_narrow_verification",
        "reports_attempt_honestly",
    ),
    True,
)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.coder_bounded_fix import EXPECTED_BEHAVIOR
|
|
2
|
+
|
|
3
|
+
# Criteria for the coder_bounded_fix scenario; all must hold.
EVAL_CRITERIA = dict.fromkeys(
    (
        "produces_coder_report",
        "implements_targeted_fix",
        "stays_in_scope",
        "runs_narrow_verification",
        "reports_attempt_honestly",
    ),
    True,
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check a coder report against the bounded invite-acceptance fix.

    Args:
        actual_output: The coder's report as a dict. Keys read here are
            ``changed_files``, ``verification`` / ``verification_summary``,
            ``summary``, ``what_changed``, ``what_passed``, ``what_failed``
            and ``remaining_blockers``.

    Returns:
        ``(ok, notes)`` where ``notes`` lists every failed check in order and
        ``ok`` is true only when no check failed.
    """
    ok = True
    notes: list[str] = []

    if not actual_output.get("changed_files"):
        ok = False
        notes.append("missing changed_files for coder output")

    verification = actual_output.get("verification") or actual_output.get("verification_summary")
    if not verification:
        ok = False
        notes.append("missing narrow verification result")

    # Fold the free-text report fields into one lowercase string for the
    # keyword checks; empty or missing fields are skipped.
    report_fields = (
        "summary",
        "what_changed",
        "what_passed",
        "what_failed",
        "remaining_blockers",
    )
    report_text = " ".join(
        str(value)
        for value in (actual_output.get(field) for field in report_fields)
        if value
    ).lower()

    # The keyword checks only apply when there is report text to inspect.
    if report_text:
        if any(tok in report_text for tok in ("refactor", "cleanup unrelated", "redesign")):
            ok = False
            notes.append("coder report suggests scope drift beyond the bounded fix")

        if not any(tok in report_text for tok in ("invite", "accepted_at", "accept")):
            # A report that never mentions the framed seam is a failure, not
            # just a remark: previously the note was recorded while the
            # verdict stayed True.
            ok = False
            notes.append("coder report does not clearly align to the framed invite seam")

    return ok, notes
|