devflow-engine 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devflow_engine/__init__.py +3 -0
- devflow_engine/agentic_prompts.py +100 -0
- devflow_engine/agentic_runtime.py +398 -0
- devflow_engine/api_key_flow_harness.py +539 -0
- devflow_engine/api_keys.py +357 -0
- devflow_engine/bootstrap/__init__.py +2 -0
- devflow_engine/bootstrap/provision_from_template.py +84 -0
- devflow_engine/cli/__init__.py +0 -0
- devflow_engine/cli/app.py +7270 -0
- devflow_engine/core/__init__.py +0 -0
- devflow_engine/core/config.py +86 -0
- devflow_engine/core/logging.py +29 -0
- devflow_engine/core/paths.py +45 -0
- devflow_engine/core/toml_kv.py +33 -0
- devflow_engine/devflow_event_worker.py +1292 -0
- devflow_engine/devflow_state.py +201 -0
- devflow_engine/devin2/__init__.py +9 -0
- devflow_engine/devin2/agent_definition.py +120 -0
- devflow_engine/devin2/pi_runner.py +204 -0
- devflow_engine/devin_orchestration.py +69 -0
- devflow_engine/docs/prompts/anti-patterns.md +42 -0
- devflow_engine/docs/prompts/devin-agent-prompt.md +55 -0
- devflow_engine/docs/prompts/devin2-agent-prompt.md +81 -0
- devflow_engine/docs/prompts/examples/devin-vapi-clone-reference-exchange.json +85 -0
- devflow_engine/doctor/__init__.py +2 -0
- devflow_engine/doctor/triage.py +140 -0
- devflow_engine/error/__init__.py +0 -0
- devflow_engine/error/remediation.py +21 -0
- devflow_engine/errors/error_solver_dag.py +522 -0
- devflow_engine/errors/runtime_observability.py +67 -0
- devflow_engine/idea/__init__.py +4 -0
- devflow_engine/idea/actors.py +481 -0
- devflow_engine/idea/agentic.py +465 -0
- devflow_engine/idea/analyze.py +93 -0
- devflow_engine/idea/devin_chat_dag.py +1 -0
- devflow_engine/idea/diff.py +99 -0
- devflow_engine/idea/drafts.py +446 -0
- devflow_engine/idea/idea_creation_dag.py +643 -0
- devflow_engine/idea/ideation_enrichment.py +355 -0
- devflow_engine/idea/ideation_enrichment_worker.py +19 -0
- devflow_engine/idea/paths.py +28 -0
- devflow_engine/idea/promote.py +53 -0
- devflow_engine/idea/redaction.py +27 -0
- devflow_engine/idea/repo_tools.py +1277 -0
- devflow_engine/idea/response_mode.py +30 -0
- devflow_engine/idea/story_pipeline.py +1585 -0
- devflow_engine/idea/sufficiency.py +376 -0
- devflow_engine/idea/traditional_stories.py +1257 -0
- devflow_engine/implementation/__init__.py +0 -0
- devflow_engine/implementation/alembic_preflight.py +700 -0
- devflow_engine/implementation/dag.py +8450 -0
- devflow_engine/implementation/green_gate.py +93 -0
- devflow_engine/implementation/prompts.py +108 -0
- devflow_engine/implementation/test_runtime.py +623 -0
- devflow_engine/integration/__init__.py +19 -0
- devflow_engine/integration/agentic.py +66 -0
- devflow_engine/integration/dag.py +3539 -0
- devflow_engine/integration/prompts.py +114 -0
- devflow_engine/integration/supabase_schema.sql +31 -0
- devflow_engine/integration/supabase_sync.py +177 -0
- devflow_engine/llm/__init__.py +1 -0
- devflow_engine/llm/cli_one_shot.py +84 -0
- devflow_engine/llm/cli_stream.py +371 -0
- devflow_engine/llm/execution_context.py +26 -0
- devflow_engine/llm/invoke.py +1322 -0
- devflow_engine/llm/provider_api.py +304 -0
- devflow_engine/llm/repo_knowledge.py +588 -0
- devflow_engine/llm_primitives.py +315 -0
- devflow_engine/orchestration.py +62 -0
- devflow_engine/planning/__init__.py +0 -0
- devflow_engine/planning/analyze_repo.py +92 -0
- devflow_engine/planning/render_drafts.py +133 -0
- devflow_engine/playground/__init__.py +0 -0
- devflow_engine/playground/hooks.py +26 -0
- devflow_engine/playwright_workflow/__init__.py +5 -0
- devflow_engine/playwright_workflow/dag.py +1317 -0
- devflow_engine/process/__init__.py +5 -0
- devflow_engine/process/dag.py +59 -0
- devflow_engine/project_registration/__init__.py +3 -0
- devflow_engine/project_registration/dag.py +1581 -0
- devflow_engine/project_registry.py +109 -0
- devflow_engine/prompts/devin/generic/prompt.md +6 -0
- devflow_engine/prompts/devin/ideation/prompt.md +263 -0
- devflow_engine/prompts/devin/ideation/scenarios.md +5 -0
- devflow_engine/prompts/devin/ideation_loop/prompt.md +6 -0
- devflow_engine/prompts/devin/insight/prompt.md +11 -0
- devflow_engine/prompts/devin/insight/scenarios.md +5 -0
- devflow_engine/prompts/devin/intake/prompt.md +15 -0
- devflow_engine/prompts/devin/iterate/prompt.md +12 -0
- devflow_engine/prompts/devin/shared/eval_doctrine.md +9 -0
- devflow_engine/prompts/devin/shared/principles.md +246 -0
- devflow_engine/prompts/devin_eval/assessment/prompt.md +18 -0
- devflow_engine/prompts/idea/api_ideation_agent/prompt.md +8 -0
- devflow_engine/prompts/idea/api_insight_agent/prompt.md +8 -0
- devflow_engine/prompts/idea/response_doctrine/prompt.md +18 -0
- devflow_engine/prompts/implementation/dependency_assessment/prompt.md +12 -0
- devflow_engine/prompts/implementation/green/green/prompt.md +11 -0
- devflow_engine/prompts/implementation/green/node_config/prompt.md +3 -0
- devflow_engine/prompts/implementation/green_review/outcome_review/prompt.md +5 -0
- devflow_engine/prompts/implementation/green_review/prior_run_review/prompt.md +5 -0
- devflow_engine/prompts/implementation/red/prompt.md +27 -0
- devflow_engine/prompts/implementation/redreview/prompt.md +23 -0
- devflow_engine/prompts/implementation/redreview_repair/prompt.md +16 -0
- devflow_engine/prompts/implementation/setupdoc/prompt.md +10 -0
- devflow_engine/prompts/implementation/story_planning/prompt.md +13 -0
- devflow_engine/prompts/implementation/test_design/prompt.md +27 -0
- devflow_engine/prompts/integration/README.md +185 -0
- devflow_engine/prompts/integration/green/example.md +67 -0
- devflow_engine/prompts/integration/green/green/prompt.md +10 -0
- devflow_engine/prompts/integration/green/node_config/prompt.md +42 -0
- devflow_engine/prompts/integration/green/past_prompts/20260417T212300/green/prompt.md +15 -0
- devflow_engine/prompts/integration/green/past_prompts/20260417T212300/node_config/prompt.md +42 -0
- devflow_engine/prompts/integration/green_enrich/example.md +79 -0
- devflow_engine/prompts/integration/green_enrich/green_enrich/prompt.md +9 -0
- devflow_engine/prompts/integration/green_enrich/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/green_enrich/prompt.md +14 -0
- devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red/code_repair/prompt.md +12 -0
- devflow_engine/prompts/integration/red/example.md +152 -0
- devflow_engine/prompts/integration/red/node_config/prompt.md +86 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/node_config/prompt.md +84 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red/prompt.md +16 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red_repair/prompt.md +15 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/node_config/prompt.md +84 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/red_repair/prompt.md +11 -0
- devflow_engine/prompts/integration/red/red/prompt.md +11 -0
- devflow_engine/prompts/integration/red/red_repair/prompt.md +12 -0
- devflow_engine/prompts/integration/red_review/example.md +71 -0
- devflow_engine/prompts/integration/red_review/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/red_review/prompt.md +15 -0
- devflow_engine/prompts/integration/red_review/red_review/prompt.md +9 -0
- devflow_engine/prompts/integration/resolve/example.md +111 -0
- devflow_engine/prompts/integration/resolve/node_config/prompt.md +64 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/node_config/prompt.md +64 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_implicated_users/prompt.md +15 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_side_effects/prompt.md +15 -0
- devflow_engine/prompts/integration/resolve/resolve_implicated_users/prompt.md +10 -0
- devflow_engine/prompts/integration/resolve/resolve_side_effects/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/build_idea_acceptance_coverage/prompt.md +12 -0
- devflow_engine/prompts/integration/validate/code_repair/prompt.md +13 -0
- devflow_engine/prompts/integration/validate/example.md +143 -0
- devflow_engine/prompts/integration/validate/node_config/prompt.md +87 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/node_config/prompt.md +67 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_enrich_gate/prompt.md +17 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_repair/prompt.md +16 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/node_config/prompt.md +67 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/validate_repair/prompt.md +9 -0
- devflow_engine/prompts/integration/validate/validate_enrich_gate/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/validate_repair/prompt.md +20 -0
- devflow_engine/prompts/integration/write_workflows/example.md +100 -0
- devflow_engine/prompts/integration/write_workflows/node_config/prompt.md +44 -0
- devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/node_config/prompt.md +44 -0
- devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/write_workflows/prompt.md +17 -0
- devflow_engine/prompts/integration/write_workflows/write_workflows/prompt.md +11 -0
- devflow_engine/prompts/iterate/README.md +7 -0
- devflow_engine/prompts/iterate/coder/prompt.md +11 -0
- devflow_engine/prompts/iterate/framer/prompt.md +11 -0
- devflow_engine/prompts/iterate/iterator/prompt.md +13 -0
- devflow_engine/prompts/iterate/observer/prompt.md +11 -0
- devflow_engine/prompts/recovery/diagnosis/prompt.md +7 -0
- devflow_engine/prompts/recovery/execution/prompt.md +8 -0
- devflow_engine/prompts/recovery/execution_verification/prompt.md +7 -0
- devflow_engine/prompts/recovery/failure_investigation/prompt.md +10 -0
- devflow_engine/prompts/recovery/preflight_health_repo_repair/prompt.md +8 -0
- devflow_engine/prompts/recovery/remediation_execution/prompt.md +11 -0
- devflow_engine/prompts/recovery/root_cause_investigation/prompt.md +12 -0
- devflow_engine/prompts/scope_idea/doctrine/prompt.md +7 -0
- devflow_engine/prompts/source_doc_eval/document/prompt.md +6 -0
- devflow_engine/prompts/source_doc_eval/targeted_mutation/prompt.md +9 -0
- devflow_engine/prompts/source_doc_mutation/domain_entities/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/product_brief/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/project_doc_coherence/prompt.md +7 -0
- devflow_engine/prompts/source_doc_mutation/project_doc_render/prompt.md +9 -0
- devflow_engine/prompts/source_doc_mutation/source_doc_coherence/prompt.md +5 -0
- devflow_engine/prompts/source_doc_mutation/source_doc_enrichment_coherence/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/user_workflows/prompt.md +6 -0
- devflow_engine/prompts/source_scope/doctrine/prompt.md +10 -0
- devflow_engine/prompts/ui_grounding/doctrine/prompt.md +7 -0
- devflow_engine/recovery/__init__.py +3 -0
- devflow_engine/recovery/dag.py +2609 -0
- devflow_engine/recovery/models.py +220 -0
- devflow_engine/refactor.py +93 -0
- devflow_engine/registry/__init__.py +1 -0
- devflow_engine/registry/cards.py +238 -0
- devflow_engine/registry/domain_normalize.py +60 -0
- devflow_engine/registry/effects.py +65 -0
- devflow_engine/registry/enforce_report.py +150 -0
- devflow_engine/registry/module_cards_classify.py +164 -0
- devflow_engine/registry/module_cards_draft.py +184 -0
- devflow_engine/registry/module_cards_gate.py +59 -0
- devflow_engine/registry/packages.py +347 -0
- devflow_engine/registry/pathways.py +323 -0
- devflow_engine/review/__init__.py +11 -0
- devflow_engine/review/dag.py +588 -0
- devflow_engine/review/review_story.py +67 -0
- devflow_engine/scope_idea/__init__.py +3 -0
- devflow_engine/scope_idea/agentic.py +39 -0
- devflow_engine/scope_idea/dag.py +1069 -0
- devflow_engine/scope_idea/models.py +175 -0
- devflow_engine/skills/builtins/devflow/queue_failure_investigation/SKILL.md +112 -0
- devflow_engine/skills/builtins/devflow/queue_idea_to_story/SKILL.md +120 -0
- devflow_engine/skills/builtins/devflow/queue_integration/SKILL.md +105 -0
- devflow_engine/skills/builtins/devflow/queue_recovery/SKILL.md +108 -0
- devflow_engine/skills/builtins/devflow/queue_runtime_core/SKILL.md +155 -0
- devflow_engine/skills/builtins/devflow/queue_story_implementation/SKILL.md +122 -0
- devflow_engine/skills/builtins/devin/idea_to_story_handoff/SKILL.md +120 -0
- devflow_engine/skills/builtins/devin/ideation/SKILL.md +168 -0
- devflow_engine/skills/builtins/devin/ideation/state-and-phrasing-reference.md +18 -0
- devflow_engine/skills/builtins/devin/insight/SKILL.md +22 -0
- devflow_engine/skills/registry.example.yaml +42 -0
- devflow_engine/source_doc_assumptions.py +291 -0
- devflow_engine/source_doc_mutation_dag.py +1606 -0
- devflow_engine/source_doc_mutation_eval.py +417 -0
- devflow_engine/source_doc_mutation_worker.py +25 -0
- devflow_engine/source_docs_schema.py +207 -0
- devflow_engine/source_docs_updater.py +309 -0
- devflow_engine/source_scope/__init__.py +15 -0
- devflow_engine/source_scope/agentic.py +45 -0
- devflow_engine/source_scope/dag.py +1626 -0
- devflow_engine/source_scope/models.py +177 -0
- devflow_engine/stores/__init__.py +0 -0
- devflow_engine/stores/execution_store.py +3534 -0
- devflow_engine/story/__init__.py +0 -0
- devflow_engine/story/contracts.py +160 -0
- devflow_engine/story/discovery.py +47 -0
- devflow_engine/story/evidence.py +118 -0
- devflow_engine/story/hashing.py +27 -0
- devflow_engine/story/implemented_queue_purge.py +148 -0
- devflow_engine/story/indexer.py +105 -0
- devflow_engine/story/io.py +20 -0
- devflow_engine/story/markdown_contracts.py +298 -0
- devflow_engine/story/reconciliation.py +408 -0
- devflow_engine/story/validate_stories.py +149 -0
- devflow_engine/story/validate_tests_story.py +512 -0
- devflow_engine/story/validation.py +133 -0
- devflow_engine/ui_grounding/__init__.py +11 -0
- devflow_engine/ui_grounding/agentic.py +31 -0
- devflow_engine/ui_grounding/dag.py +874 -0
- devflow_engine/ui_grounding/models.py +224 -0
- devflow_engine/ui_grounding/pencil_bridge.py +247 -0
- devflow_engine/vendor/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/__init__.py +11 -0
- devflow_engine/vendor/datalumina_genai/core/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/core/exceptions.py +9 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/agent.py +48 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/agent_streaming_node.py +26 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/base.py +89 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/concurrent.py +30 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/router.py +69 -0
- devflow_engine/vendor/datalumina_genai/core/schema.py +72 -0
- devflow_engine/vendor/datalumina_genai/core/task.py +52 -0
- devflow_engine/vendor/datalumina_genai/core/validate.py +139 -0
- devflow_engine/vendor/datalumina_genai/core/workflow.py +200 -0
- devflow_engine/worker.py +1086 -0
- devflow_engine/worker_guard.py +233 -0
- devflow_engine-1.0.0.dist-info/METADATA +235 -0
- devflow_engine-1.0.0.dist-info/RECORD +393 -0
- devflow_engine-1.0.0.dist-info/WHEEL +4 -0
- devflow_engine-1.0.0.dist-info/entry_points.txt +3 -0
- devin/__init__.py +6 -0
- devin/dag.py +58 -0
- devin/dag_two_arm.py +138 -0
- devin/devin_chat_scenario_catalog.json +588 -0
- devin/devin_eval.py +677 -0
- devin/nodes/__init__.py +0 -0
- devin/nodes/ideation/__init__.py +0 -0
- devin/nodes/ideation/node.py +195 -0
- devin/nodes/ideation/playground.py +267 -0
- devin/nodes/ideation/prompt.md +65 -0
- devin/nodes/ideation/scenarios/continue_refinement.py +13 -0
- devin/nodes/ideation/scenarios/continue_refinement_evals.py +18 -0
- devin/nodes/ideation/scenarios/idea_fits_existing_patterns.py +17 -0
- devin/nodes/ideation/scenarios/idea_fits_existing_patterns_evals.py +16 -0
- devin/nodes/ideation/scenarios/large_idea_split.py +4 -0
- devin/nodes/ideation/scenarios/large_idea_split_evals.py +17 -0
- devin/nodes/ideation/scenarios/source_documentation_added.py +4 -0
- devin/nodes/ideation/scenarios/source_documentation_added_evals.py +16 -0
- devin/nodes/ideation/scenarios/user_says_create_it.py +30 -0
- devin/nodes/ideation/scenarios/user_says_create_it_evals.py +23 -0
- devin/nodes/ideation/scenarios/vague_idea.py +16 -0
- devin/nodes/ideation/scenarios/vague_idea_evals.py +47 -0
- devin/nodes/ideation/tools.json +312 -0
- devin/nodes/insight/__init__.py +0 -0
- devin/nodes/insight/node.py +49 -0
- devin/nodes/insight/playground.py +154 -0
- devin/nodes/insight/prompt.md +61 -0
- devin/nodes/insight/scenarios/architecture_pattern_query.py +15 -0
- devin/nodes/insight/scenarios/architecture_pattern_query_evals.py +25 -0
- devin/nodes/insight/scenarios/codebase_exploration.py +15 -0
- devin/nodes/insight/scenarios/codebase_exploration_evals.py +23 -0
- devin/nodes/insight/scenarios/devin_ideation_routing.py +19 -0
- devin/nodes/insight/scenarios/devin_ideation_routing_evals.py +39 -0
- devin/nodes/insight/scenarios/devin_insight_routing.py +20 -0
- devin/nodes/insight/scenarios/devin_insight_routing_evals.py +40 -0
- devin/nodes/insight/scenarios/operational_debugging.py +15 -0
- devin/nodes/insight/scenarios/operational_debugging_evals.py +23 -0
- devin/nodes/insight/scenarios/operational_question.py +9 -0
- devin/nodes/insight/scenarios/operational_question_evals.py +8 -0
- devin/nodes/insight/scenarios/queue_status.py +15 -0
- devin/nodes/insight/scenarios/queue_status_evals.py +23 -0
- devin/nodes/insight/scenarios/source_doc_explanation.py +14 -0
- devin/nodes/insight/scenarios/source_doc_explanation_evals.py +21 -0
- devin/nodes/insight/scenarios/worker_state_check.py +15 -0
- devin/nodes/insight/scenarios/worker_state_check_evals.py +22 -0
- devin/nodes/insight/tools.json +126 -0
- devin/nodes/intake/__init__.py +0 -0
- devin/nodes/intake/node.py +27 -0
- devin/nodes/intake/playground.py +47 -0
- devin/nodes/intake/prompt.md +12 -0
- devin/nodes/intake/scenarios/ideation_routing.py +4 -0
- devin/nodes/intake/scenarios/ideation_routing_evals.py +5 -0
- devin/nodes/intake/scenarios/insight_routing.py +4 -0
- devin/nodes/intake/scenarios/insight_routing_evals.py +5 -0
- devin/nodes/iterate/README.md +44 -0
- devin/nodes/iterate/__init__.py +1 -0
- devin/nodes/iterate/_archived_design_stages/01-objectives-requirements.md +112 -0
- devin/nodes/iterate/_archived_design_stages/02-evals.md +131 -0
- devin/nodes/iterate/_archived_design_stages/03-tools-and-boundaries.md +110 -0
- devin/nodes/iterate/_archived_design_stages/04-harness-and-playground.md +32 -0
- devin/nodes/iterate/_archived_design_stages/05-prompt-deferred.md +11 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/03-tools-and-boundaries.md +13 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/01-objectives-requirements.md +25 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/02-evals.md +9 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/04-harness-and-playground.md +13 -0
- devin/nodes/iterate/agent-roles.md +89 -0
- devin/nodes/iterate/agents/README.md +10 -0
- devin/nodes/iterate/artifacts.md +504 -0
- devin/nodes/iterate/contract.md +100 -0
- devin/nodes/iterate/eval-plan.md +74 -0
- devin/nodes/iterate/node.py +100 -0
- devin/nodes/iterate/pipeline/README.md +13 -0
- devin/nodes/iterate/playground-contract.md +76 -0
- devin/nodes/iterate/prompt.md +11 -0
- devin/nodes/iterate/scenarios/README.md +38 -0
- devin/nodes/iterate/scenarios/artifact-and-loop-scenarios.md +101 -0
- devin/nodes/iterate/scenarios/coder_artifact_alignment.py +32 -0
- devin/nodes/iterate/scenarios/coder_artifact_alignment_evals.py +45 -0
- devin/nodes/iterate/scenarios/coder_bounded_fix.py +27 -0
- devin/nodes/iterate/scenarios/coder_bounded_fix_evals.py +45 -0
- devin/nodes/iterate/scenarios/devin_iterate_routing.py +21 -0
- devin/nodes/iterate/scenarios/devin_iterate_routing_evals.py +36 -0
- devin/nodes/iterate/scenarios/framer_scope_boundary.py +25 -0
- devin/nodes/iterate/scenarios/framer_scope_boundary_evals.py +57 -0
- devin/nodes/iterate/scenarios/framer_task_framing.py +25 -0
- devin/nodes/iterate/scenarios/framer_task_framing_evals.py +58 -0
- devin/nodes/iterate/scenarios/iterate_error_fix.py +21 -0
- devin/nodes/iterate/scenarios/iterate_error_fix_evals.py +39 -0
- devin/nodes/iterate/scenarios/iterate_quick_change.py +21 -0
- devin/nodes/iterate/scenarios/iterate_quick_change_evals.py +35 -0
- devin/nodes/iterate/scenarios/iterate_to_idea_promotion.py +23 -0
- devin/nodes/iterate/scenarios/iterate_to_idea_promotion_evals.py +53 -0
- devin/nodes/iterate/scenarios/iterate_to_insight_reroute.py +23 -0
- devin/nodes/iterate/scenarios/iterate_to_insight_reroute_evals.py +53 -0
- devin/nodes/iterate/scenarios/observer_evidence_seam.py +28 -0
- devin/nodes/iterate/scenarios/observer_evidence_seam_evals.py +55 -0
- devin/nodes/iterate/scenarios/observer_repro_creation.py +28 -0
- devin/nodes/iterate/scenarios/observer_repro_creation_evals.py +45 -0
- devin/nodes/iterate/scenarios/routing-matrix.md +45 -0
- devin/nodes/shared/__init__.py +0 -0
- devin/nodes/shared/filemaker_expert.md +80 -0
- devin/nodes/shared/filemaker_expert.py +354 -0
- devin/nodes/shared/filemaker_expert_eval/runner.py +176 -0
- devin/nodes/shared/filemaker_expert_eval/scenarios.json +65 -0
- devin/nodes/shared/goldilocks_advisor_eval/runner.py +214 -0
- devin/nodes/shared/goldilocks_advisor_eval/scenarios.json +58 -0
- devin/nodes/shared/helpers.py +156 -0
- devin/nodes/shared/idea_compliance_advisor_eval/runner.py +252 -0
- devin/nodes/shared/idea_compliance_advisor_eval/scenarios.json +75 -0
- devin/nodes/shared/models.py +44 -0
- devin/nodes/shared/post.py +40 -0
- devin/nodes/shared/router.py +107 -0
- devin/nodes/shared/tools.py +191 -0
- devin/shared/devin-chat-rubric.md +237 -0
- devin/shared/devin-chat-scenario-suite.md +90 -0
- devin/shared/eval_doctrine.md +9 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Scenario fixture: a bounded bug-fix request on an existing surface, together
# with the routing outcome the matching eval module compares against.
SCENARIO_NAME = "devin_iterate_routing"
SCENARIO_DESCRIPTION = (
    "Devin two-arm DAG: bounded existing-surface change request "
    "stays in the iterate arm and does not drift into insight or idea handling."
)

# Input handed to the system under evaluation.
INPUT_PAYLOAD = dict(
    current_user_message=(
        "On the existing account settings page, clicking Save on the profile form "
        "does nothing. Fix that bug without changing anything else."
    ),
    idea_id="devin_eval_iterate_routing",
    project_id="proj_75f63d30",
    # NOTE(review): absolute, machine-specific path - confirm the harness
    # overrides or tolerates it on other hosts.
    repo_root="/Users/devflow/repos/Spicy-Server",
)

# Expected outcome: the request is routed to the iterate arm, ends with one of
# the listed response kinds, and is neither rerouted nor promoted.
EXPECTED_BEHAVIOR = dict(
    route_arm="iterate",
    response_kind_in=["completed", "blocked", "needs_more_context"],
    keeps_existing_surface_scope=True,
    does_not_reroute=True,
    does_not_promote=True,
)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.devin_iterate_routing import (
|
|
2
|
+
EXPECTED_BEHAVIOR,
|
|
3
|
+
INPUT_PAYLOAD,
|
|
4
|
+
SCENARIO_NAME,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
# Criteria advertised for this scenario. The route arm and the allowed response
# kinds are read from EXPECTED_BEHAVIOR; the boolean flags are spelled out here.
EVAL_CRITERIA = dict(
    route_arm_must_equal=EXPECTED_BEHAVIOR["route_arm"],
    response_kind_in=EXPECTED_BEHAVIOR["response_kind_in"],
    keeps_existing_surface_scope=True,
    does_not_reroute=True,
    does_not_promote=True,
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Compare a routing result with this scenario's expected behaviour.

    Args:
        actual_output: Result mapping; the keys ``route_arm``,
            ``response_kind`` and ``target_lane`` are read.

    Returns:
        ``(ok, notes)`` where ``notes`` holds one message per failed check and
        ``ok`` is true only when no check failed.
    """
    failures = []

    # The route arm must match the expected one exactly.
    arm = actual_output.get("route_arm")
    wanted_arm = EXPECTED_BEHAVIOR["route_arm"]
    if arm != wanted_arm:
        failures.append(f"expected route_arm={wanted_arm}, got {arm}")

    # A missing or falsy response_kind is compared as the empty string.
    response_kind = str(actual_output.get("response_kind") or "")
    allowed_kinds = EXPECTED_BEHAVIOR["response_kind_in"]
    if response_kind not in allowed_kinds:
        failures.append(f"expected response_kind in {allowed_kinds}, got {response_kind}")

    # Any case-insensitive hand-off to the insight or idea lane is a failure.
    lane = str(actual_output.get("target_lane") or "").lower()
    if lane in ("insight", "idea"):
        failures.append(f"unexpected lane transition to {lane}")

    return not failures, failures
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Scenario fixture: a small label rename on an existing dashboard, together
# with the framing outcome the matching eval module compares against.
SCENARIO_NAME = "framer_scope_boundary"
SCENARIO_DESCRIPTION = (
    "Framer keeps a small existing-surface request bounded "
    "instead of inflating it into a redesign or broader product initiative."
)

# Input handed to the framer role.
INPUT_PAYLOAD = dict(
    role="framer",
    # NOTE(review): absolute, machine-specific path - confirm the harness
    # overrides or tolerates it on other hosts.
    repo_root="/Users/devflow/repos/devflow_engine",
    current_user_message=(
        "On the current dashboard, rename the KPI card label "
        "from Gross Revenue to Revenue. Please do not redesign the dashboard."
    ),
    context=dict(
        surface_hints=["dashboard", "KPI card label"],
        constraint="no redesign",
    ),
)

# Expected outcome: a task artifact classified as a quick change that stays in
# the iterate lane without promotion or redesign.
EXPECTED_BEHAVIOR = dict(
    produces_task_artifact=True,
    classifies_request_as="quick_change",
    keeps_scope_bounded=True,
    does_not_promote=True,
    does_not_expand_into_redesign=True,
    recommended_next_step_in=["stay_iterate"],
)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.framer_scope_boundary import EXPECTED_BEHAVIOR
|
|
2
|
+
|
|
3
|
+
# Criteria advertised for this scenario. The classification and the allowed
# next steps are read from EXPECTED_BEHAVIOR; the boolean flags are spelled out here.
EVAL_CRITERIA = dict(
    produces_task_artifact=True,
    classifies_request_as=EXPECTED_BEHAVIOR["classifies_request_as"],
    keeps_scope_bounded=True,
    does_not_promote=True,
    does_not_expand_into_redesign=True,
    recommended_next_step_in=EXPECTED_BEHAVIOR["recommended_next_step_in"],
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Score a framer result for the scope-boundary scenario.

    Checks, in order: a task artifact is present; its classification matches
    the expected one; the recommended next step is one of the allowed values;
    no hand-off to the idea or insight lane is requested; and the artifact's
    summary and scope boundary do not describe a broadened task.

    Args:
        actual_output: Result mapping; the keys ``task_artifact``,
            ``recommended_next_step`` and ``target_lane`` are read.

    Returns:
        ``(ok, notes)`` where ``notes`` holds one message per failed check.
        When the task artifact is missing or empty, the remaining checks are
        skipped and only that failure is reported.
    """
    import re  # function-scope import keeps this block self-contained

    ok = True
    notes = []
    exp = EXPECTED_BEHAVIOR

    task_artifact = actual_output.get("task_artifact") or {}
    if not task_artifact:
        ok = False
        notes.append("missing task_artifact")
        return ok, notes

    # The artifact may carry its classification under either key.
    classification = str(task_artifact.get("task_type") or task_artifact.get("classification") or "")
    if classification != exp["classifies_request_as"]:
        ok = False
        notes.append(
            f"expected classification={exp['classifies_request_as']}, got {classification}"
        )

    # A top-level value wins over the one nested in the artifact. A missing
    # next step is a failure here because "" is not an allowed value.
    next_step = str(actual_output.get("recommended_next_step") or task_artifact.get("recommended_next_step") or "")
    if next_step not in exp["recommended_next_step_in"]:
        ok = False
        notes.append(
            f"expected recommended_next_step in {exp['recommended_next_step_in']}, got {next_step}"
        )

    target_lane = str(actual_output.get("target_lane") or task_artifact.get("target_lane") or "").lower()
    if target_lane in {"idea", "insight"}:
        ok = False
        notes.append(f"unexpected lane transition to {target_lane}")

    broadening_tokens = ("redesign", "rewrite", "new dashboard", "broader analytics overhaul")

    # Only the summary and scope boundary describe what will be done.
    # non_goals lists what is excluded, so naming "redesign" there is the
    # desired behaviour and must not count as scope growth.
    text = " ".join(
        str(v)
        for v in [
            task_artifact.get("summary"),
            task_artifact.get("scope_boundary"),
        ]
        if v
    ).lower()

    # Heuristic: drop negated mentions ("no redesign", "do not rewrite",
    # "without a redesign") so an artifact that restates the user's constraint
    # is not penalised. At most two words may sit between negation and token.
    negated_mention = re.compile(
        r"\b(?:no|not|never|without|avoid|avoiding|don't|won't)\b(?:\s+\w+){0,2}?\s+(?:"
        + "|".join(re.escape(tok) for tok in broadening_tokens)
        + r")"
    )
    text = negated_mention.sub(" ", text)

    if any(tok in text for tok in broadening_tokens):
        ok = False
        notes.append("task_artifact broadened the task beyond the requested label change")

    return ok, notes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Scenario fixture: a vague "something is broken" request on an existing
# page, together with the framing outcome the matching eval module compares against.
SCENARIO_NAME = "framer_task_framing"
SCENARIO_DESCRIPTION = (
    "Framer receives a vague iterate request and turns it into "
    "a bounded, well-formed task artifact another iterate role can safely use."
)

# Input handed to the framer role.
INPUT_PAYLOAD = dict(
    role="framer",
    # NOTE(review): absolute, machine-specific path - confirm the harness
    # overrides or tolerates it on other hosts.
    repo_root="/Users/devflow/repos/devflow_engine",
    current_user_message="The invite flow feels broken somewhere. Please fix it without redoing the page.",
    context=dict(
        surface_hints=["invite flow", "existing page"],
        reported_symptom="broken somewhere",
    ),
)

# Expected outcome: an error-fix task artifact with current/desired behaviour,
# success criteria and separated unknowns, staying in iterate or investigating first.
EXPECTED_BEHAVIOR = dict(
    produces_task_artifact=True,
    classifies_request_as="error_fix",
    keeps_scope_bounded=True,
    distinguishes_current_vs_desired=True,
    writes_success_criteria=True,
    separates_unknowns=True,
    recommended_next_step_in=["stay_iterate", "investigate_first"],
)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.framer_task_framing import EXPECTED_BEHAVIOR
|
|
2
|
+
|
|
3
|
+
# Criteria advertised for this scenario. The classification and the allowed
# next steps are read from EXPECTED_BEHAVIOR; the boolean flags are spelled out here.
EVAL_CRITERIA = dict(
    produces_task_artifact=True,
    classifies_request_as=EXPECTED_BEHAVIOR["classifies_request_as"],
    keeps_scope_bounded=True,
    distinguishes_current_vs_desired=True,
    writes_success_criteria=True,
    separates_unknowns=True,
    recommended_next_step_in=EXPECTED_BEHAVIOR["recommended_next_step_in"],
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Score a framer run against the ``framer_task_framing`` expectations.

    Args:
        actual_output: Output of the role under evaluation. The keys read are
            ``task_artifact`` and ``recommended_next_step``.

    Returns:
        ``(ok, notes)``. ``ok`` is False as soon as any hard check fails.
        ``notes`` holds one message per failed check, plus an advisory note
        about the scope boundary that does not affect ``ok``.
    """
    ok = True
    notes: list[str] = []
    exp = EXPECTED_BEHAVIOR

    task_artifact = actual_output.get("task_artifact") or {}
    if not task_artifact:
        return False, ["missing task_artifact"]
    if not isinstance(task_artifact, dict):
        # A truthy non-mapping artifact (e.g. free text) cannot be inspected
        # field by field; report it instead of raising AttributeError below.
        return False, ["task_artifact is not a mapping"]

    classification = str(task_artifact.get("task_type") or task_artifact.get("classification") or "")
    if classification != exp["classifies_request_as"]:
        ok = False
        notes.append(
            f"expected classification={exp['classifies_request_as']}, got {classification}"
        )

    if not (task_artifact.get("current_behavior") and task_artifact.get("desired_behavior")):
        ok = False
        notes.append("task_artifact must distinguish current_behavior and desired_behavior")

    if not task_artifact.get("success_criteria"):
        ok = False
        notes.append("task_artifact missing observable success_criteria")

    unknowns = task_artifact.get("unknowns")
    if not isinstance(unknowns, dict):
        # A flat list or string of unknowns is exactly the "not separated"
        # case; treat it as having neither bucket rather than crashing.
        unknowns = {}
    # Empty buckets are fine; only a missing (None) bucket counts as unseparated.
    if unknowns.get("blocking") is None or unknowns.get("non_blocking") is None:
        ok = False
        notes.append("task_artifact must separate blocking and non_blocking unknowns")

    next_step = str(actual_output.get("recommended_next_step") or task_artifact.get("recommended_next_step") or "")
    # An absent next step is tolerated; only a present, unexpected one fails.
    if next_step and next_step not in exp["recommended_next_step_in"]:
        ok = False
        notes.append(
            f"expected recommended_next_step in {exp['recommended_next_step_in']}, got {next_step}"
        )

    scope = str(task_artifact.get("scope_boundary") or task_artifact.get("non_goals") or "").lower()
    # Advisory only: a scope statement without any bounding keyword is noted, not failed.
    if scope and not any(tok in scope for tok in ("no redesign", "existing", "invite", "bounded")):
        notes.append("scope boundary is present but may not be clearly bounded")

    return ok, notes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Scenario fixture: a reproducible error report that should be handled inside iterate.
SCENARIO_NAME = "iterate_error_fix"
SCENARIO_DESCRIPTION = " ".join(
    (
        "A reproducible existing-product error should stay in iterate, keep a repair scope,",
        "and return an iterate-owned outcome rather than diagnosis-only handling.",
    )
)

# Payload handed to the run under evaluation.
INPUT_PAYLOAD = dict(
    current_user_message=" ".join(
        (
            "Fix this bug: opening /billing throws a 500 every time. The logs say",
            "KeyError: customer_id in billing_summary().",
        )
    ),
    idea_id="devin_eval_iterate_error_fix",
    project_id="proj_75f63d30",
    repo_root="/Users/devflow/repos/Spicy-Server",
)

# Expectations read by the matching eval module.
EXPECTED_BEHAVIOR = dict(
    route_arm="iterate",
    response_kind_in=["completed", "blocked", "needs_more_context"],
    treats_request_as_error_fix=True,
    preserves_reproducible_failure_context=True,
    does_not_reroute=True,
)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.iterate_error_fix import (
|
|
2
|
+
EXPECTED_BEHAVIOR,
|
|
3
|
+
INPUT_PAYLOAD,
|
|
4
|
+
SCENARIO_NAME,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
# Declared criteria for the error-fix scenario; the route arm and allowed
# response kinds are taken directly from the scenario's EXPECTED_BEHAVIOR.
EVAL_CRITERIA = dict(
    route_arm_must_equal=EXPECTED_BEHAVIOR["route_arm"],
    response_kind_in=EXPECTED_BEHAVIOR["response_kind_in"],
    treats_request_as_error_fix=True,
    preserves_reproducible_failure_context=True,
    does_not_reroute=True,
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check an iterate run against the ``iterate_error_fix`` expectations.

    Args:
        actual_output: Output of the run. The keys read are ``route_arm``,
            ``response_kind``, ``response_message`` and ``target_lane``.

    Returns:
        ``(passed, findings)``. A wrong route arm, a disallowed response kind
        or a reroute to the insight lane fails the run. A response message
        that mentions none of the error-context markers is only noted.
    """
    expected = EXPECTED_BEHAVIOR
    findings: list[str] = []
    passed = True

    wanted_arm = expected["route_arm"]
    observed_arm = actual_output.get("route_arm")
    if observed_arm != wanted_arm:
        passed = False
        findings.append(f"expected route_arm={wanted_arm}, got {observed_arm}")

    allowed_kinds = expected["response_kind_in"]
    observed_kind = str(actual_output.get("response_kind") or "")
    if observed_kind not in allowed_kinds:
        passed = False
        findings.append(f"expected response_kind in {allowed_kinds}, got {observed_kind}")

    # Advisory check: an empty message is skipped entirely.
    message = str(actual_output.get("response_message") or "").lower()
    context_markers = ("500", "keyerror", "billing", "error", "bug")
    if message and all(marker not in message for marker in context_markers):
        findings.append("response does not clearly preserve the reported error context")

    lane = str(actual_output.get("target_lane") or "").lower()
    if lane == "insight":
        passed = False
        findings.append("unexpected reroute to insight for a reproducible fix request")

    return passed, findings
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Scenario fixture: a small, explicitly scoped behavior tweak on an existing surface.
SCENARIO_NAME = "iterate_quick_change"
SCENARIO_DESCRIPTION = " ".join(
    (
        "A tiny existing-surface behavior tweak should remain task-scale in iterate and",
        "avoid inflation into broader planning.",
    )
)

# Payload handed to the run under evaluation.
INPUT_PAYLOAD = dict(
    current_user_message=" ".join(
        (
            "On the current invoices table, default the Status filter to Open instead of All.",
            "No redesign, just that behavior change.",
        )
    ),
    idea_id="devin_eval_iterate_quick_change",
    project_id="proj_75f63d30",
    repo_root="/Users/devflow/repos/Spicy-Server",
)

# Expectations read by the matching eval module.
EXPECTED_BEHAVIOR = dict(
    route_arm="iterate",
    response_kind_in=["completed", "blocked", "needs_more_context"],
    keeps_scope_small=True,
    treats_request_as_quick_change=True,
    does_not_promote=True,
)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.iterate_quick_change import (
|
|
2
|
+
EXPECTED_BEHAVIOR,
|
|
3
|
+
INPUT_PAYLOAD,
|
|
4
|
+
SCENARIO_NAME,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
# Declared criteria for the quick-change scenario; the route arm and allowed
# response kinds are taken directly from the scenario's EXPECTED_BEHAVIOR.
EVAL_CRITERIA = dict(
    route_arm_must_equal=EXPECTED_BEHAVIOR["route_arm"],
    response_kind_in=EXPECTED_BEHAVIOR["response_kind_in"],
    keeps_scope_small=True,
    treats_request_as_quick_change=True,
    does_not_promote=True,
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check an iterate run against the ``iterate_quick_change`` expectations.

    Args:
        actual_output: Output of the run. The keys read are ``route_arm``,
            ``response_kind`` and ``target_lane``.

    Returns:
        ``(passed, findings)``. The run fails on a wrong route arm, a
        disallowed response kind, or a promotion to the idea lane.
    """
    expected = EXPECTED_BEHAVIOR
    findings: list[str] = []
    passed = True

    wanted_arm = expected["route_arm"]
    observed_arm = actual_output.get("route_arm")
    if observed_arm != wanted_arm:
        passed = False
        findings.append(f"expected route_arm={wanted_arm}, got {observed_arm}")

    allowed_kinds = expected["response_kind_in"]
    observed_kind = str(actual_output.get("response_kind") or "")
    if observed_kind not in allowed_kinds:
        passed = False
        findings.append(f"expected response_kind in {allowed_kinds}, got {observed_kind}")

    lane = str(actual_output.get("target_lane") or "").lower()
    if lane == "idea":
        passed = False
        findings.append("unexpected promotion to idea for a scoped quick change")

    return passed, findings
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Scenario fixture: a small ask that may expand into broader planning and be promoted to idea.
SCENARIO_NAME = "iterate_to_idea_promotion"
SCENARIO_DESCRIPTION = " ".join(
    (
        "A task that starts as a small iterate ask but expands into broader feature or",
        "workflow planning should be promoted to idea with durable iterate linkage.",
    )
)

# Payload handed to the run under evaluation.
INPUT_PAYLOAD = dict(
    current_user_message=" ".join(
        (
            "Start by making the current order export include discounts, but if that means we",
            "need a full configurable export builder across orders, refunds, and payouts, figure",
            "out the right next step.",
        )
    ),
    idea_id="devin_eval_iterate_to_idea_promotion",
    project_id="proj_75f63d30",
    repo_root="/Users/devflow/repos/Spicy-Server",
)

# Expectations read by the matching eval module.
EXPECTED_BEHAVIOR = dict(
    route_arm="iterate",
    response_kind="promote_to_idea",
    target_lane="idea",
    uses_tool="call_devin_ideation",
    writes_promotion_handoff=True,
    run_state="promoted",
)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.iterate_to_idea_promotion import (
|
|
2
|
+
EXPECTED_BEHAVIOR,
|
|
3
|
+
INPUT_PAYLOAD,
|
|
4
|
+
SCENARIO_NAME,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
# Declared criteria for the promotion scenario; every value-bearing entry is
# taken directly from the scenario's EXPECTED_BEHAVIOR.
EVAL_CRITERIA = dict(
    route_arm_must_equal=EXPECTED_BEHAVIOR["route_arm"],
    response_kind_must_equal=EXPECTED_BEHAVIOR["response_kind"],
    target_lane_must_equal=EXPECTED_BEHAVIOR["target_lane"],
    writes_promotion_handoff=True,
    run_state_must_equal=EXPECTED_BEHAVIOR["run_state"],
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check an iterate run against the ``iterate_to_idea_promotion`` expectations.

    Args:
        actual_output: Output of the run. The keys read are ``route_arm``,
            ``response_kind``, ``target_lane``, ``run_state`` (falling back to
            ``final_verdict``), ``promotion_handoff``, ``handoff_target_lane``
            and ``handoff_ref``.

    Returns:
        ``(passed, findings)``. Route arm, response kind, target lane and the
        handoff linkage are hard checks. A run state that is present but
        differs from the expected one is only noted and does not fail the run.
    """
    expected = EXPECTED_BEHAVIOR
    findings: list[str] = []
    passed = True

    wanted_arm = expected["route_arm"]
    observed_arm = actual_output.get("route_arm")
    if observed_arm != wanted_arm:
        passed = False
        findings.append(f"expected route_arm={wanted_arm}, got {observed_arm}")

    wanted_kind = expected["response_kind"]
    observed_kind = actual_output.get("response_kind")
    if observed_kind != wanted_kind:
        passed = False
        findings.append(f"expected response_kind={wanted_kind}, got {observed_kind}")

    wanted_lane = expected["target_lane"]
    observed_lane = str(actual_output.get("target_lane") or "").lower()
    if observed_lane != wanted_lane:
        passed = False
        findings.append(f"expected target_lane={wanted_lane}, got {observed_lane}")

    # Advisory only: an absent run state is skipped, a differing one is noted.
    terminal_state = str(actual_output.get("run_state") or actual_output.get("final_verdict") or "")
    if terminal_state and terminal_state != expected["run_state"]:
        findings.append(f"expected promoted terminal state, got {terminal_state}")

    handoff = actual_output.get("promotion_handoff") or {}
    declared_lane = handoff.get("target_lane") or actual_output.get("handoff_target_lane") or ""
    handoff_lane = str(declared_lane).lower()
    lane_mismatch = bool(handoff_lane) and handoff_lane != wanted_lane
    if lane_mismatch:
        passed = False
        findings.append(f"expected promotion handoff target_lane={wanted_lane}, got {handoff_lane}")
    elif not (handoff or actual_output.get("handoff_ref")):
        # Neither a handoff payload nor a handoff reference was produced.
        passed = False
        findings.append("missing iterate-owned promotion_handoff linkage for idea promotion")

    return passed, findings
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Scenario fixture: a request that turns out to be diagnosis-only, where iterate
# is expected to call the insight arm while remaining the owner.
SCENARIO_NAME = "iterate_to_insight_reroute"
SCENARIO_DESCRIPTION = " ".join(
    (
        "A request that initially looks like iterate but becomes diagnosis-only after",
        "observation should call the insight arm as a subagent while iterate remains the owner.",
    )
)

# Payload handed to the run under evaluation.
INPUT_PAYLOAD = dict(
    current_user_message=" ".join(
        (
            "I thought the checkout totals were wrong, but before changing code I mostly need",
            "you to determine why tax is being calculated this way and explain whether it is",
            "correct.",
        )
    ),
    idea_id="devin_eval_iterate_to_insight_reroute",
    project_id="proj_75f63d30",
    repo_root="/Users/devflow/repos/Spicy-Server",
)

# Expectations read by the matching eval module.
EXPECTED_BEHAVIOR = dict(
    route_arm="iterate",
    calls_tool="devin_insight",
    response_kind_in=["completed", "blocked", "needs_more_context"],
    does_not_return_route_to_insight_only=True,
    does_not_spawn_coder=True,
    stays_iterate_owned=True,
)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.iterate_to_insight_reroute import (
|
|
2
|
+
EXPECTED_BEHAVIOR,
|
|
3
|
+
INPUT_PAYLOAD,
|
|
4
|
+
SCENARIO_NAME,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
# Criteria mirror the keys the iterate_to_insight_reroute scenario actually
# defines. The previous version read EXPECTED_BEHAVIOR["response_kind"] and
# EXPECTED_BEHAVIOR["target_lane"], which that scenario does not provide, so
# importing this module raised KeyError.
EVAL_CRITERIA = {
    "route_arm_must_equal": EXPECTED_BEHAVIOR["route_arm"],
    "response_kind_in": EXPECTED_BEHAVIOR["response_kind_in"],
    "calls_tool": EXPECTED_BEHAVIOR["calls_tool"],
    "does_not_return_route_to_insight_only": True,
    "does_not_spawn_coder": True,
    "stays_iterate_owned": True,
}


def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check an iterate run against the ``iterate_to_insight_reroute`` expectations.

    The scenario expects iterate to stay the owner and use insight as a
    subagent, so an iterate-owned outcome (including ``completed``) passes and
    handing the request off to the insight lane fails.

    Args:
        actual_output: Output of the run. The keys read are ``route_arm``,
            ``response_kind`` and ``target_lane``.

    Returns:
        ``(ok, notes)``. ``ok`` is False when the route arm is wrong, the
        response kind is not one of the allowed kinds, or the output routes to
        the insight lane. ``notes`` holds one message per failed check.
    """
    ok = True
    notes: list[str] = []
    exp = EXPECTED_BEHAVIOR

    route_arm = actual_output.get("route_arm")
    if route_arm != exp["route_arm"]:
        ok = False
        notes.append(f"expected route_arm={exp['route_arm']}, got {route_arm}")

    kind = str(actual_output.get("response_kind") or "")
    if kind not in exp["response_kind_in"]:
        ok = False
        notes.append(f"expected response_kind in {exp['response_kind_in']}, got {kind}")

    # Same reroute signal the sibling error-fix evaluator uses: a target_lane
    # of "insight" means the run returned a route instead of owning the outcome.
    if str(actual_output.get("target_lane") or "").lower() == "insight":
        ok = False
        notes.append("iterate should call insight as a subagent instead of returning a route to insight")

    # NOTE(review): "calls_tool" and "does_not_spawn_coder" are declared but not
    # verified here; how tool calls appear in actual_output is not visible from
    # this module. Add those checks once that schema is confirmed.
    return ok, notes
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Scenario fixture for the observer role: a framed error-fix task plus evidence.
SCENARIO_NAME = "observer_evidence_seam"
SCENARIO_DESCRIPTION = " ".join(
    (
        "Observer receives a framed iterate task plus evidence and identifies the bounded",
        "failing seam that should govern coding.",
    )
)

# Payload handed to the role under evaluation.
INPUT_PAYLOAD = dict(
    role="observer",
    repo_root="/Users/devflow/repos/devflow_engine",
    task_artifact=dict(
        task_type="error_fix",
        surface="invite acceptance",
        current_behavior="Submitting a valid invite token returns HTTP 500.",
        desired_behavior="Valid invite tokens complete account activation successfully.",
        success_criteria=["invite acceptance returns 200", "account activation completes"],
    ),
    evidence=dict(
        http_status=500,
        log_excerpt="TypeError: invite.accepted_at must be datetime, got None",
        route="POST /api/invites/accept",
    ),
)

# Expectations read by the matching eval module; every flag is True.
EXPECTED_BEHAVIOR = dict.fromkeys(
    (
        "produces_observation_artifact",
        "confirms_failure",
        "identifies_failing_seam",
        "expected_green_condition_present",
        "ready_for_coder",
    ),
    True,
)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.observer_evidence_seam import EXPECTED_BEHAVIOR
|
|
2
|
+
|
|
3
|
+
# Declared criteria for the observer evidence-seam scenario; every flag is True.
EVAL_CRITERIA = dict.fromkeys(
    (
        "produces_observation_artifact",
        "confirms_failure",
        "identifies_failing_seam",
        "expected_green_condition_present",
        "ready_for_coder",
    ),
    True,
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check an observer run against the ``observer_evidence_seam`` expectations.

    Args:
        actual_output: Output of the role under evaluation. The keys read are
            ``observation_artifact`` and ``response_kind``.

    Returns:
        ``(ok, notes)``. ``ok`` is False when the artifact is missing or not a
        mapping, the verdict is not confirmed-style, the failing seam or green
        condition is absent, or ``ready_for_coder`` is explicitly False.
        ``notes`` holds one message per failed check, plus an advisory note
        when the artifact text mentions none of the supplied evidence markers.
    """
    ok = True
    notes: list[str] = []

    observation_artifact = actual_output.get("observation_artifact") or {}
    if not observation_artifact:
        return False, ["missing observation_artifact"]
    if not isinstance(observation_artifact, dict):
        # A truthy non-mapping artifact (e.g. free text) cannot be inspected
        # field by field; report it instead of raising AttributeError below.
        return False, ["observation_artifact is not a mapping"]

    verdict = str(observation_artifact.get("status") or actual_output.get("response_kind") or "").lower()
    if verdict not in {"confirmed", "ready_for_coder", "completed"}:
        ok = False
        notes.append(f"expected confirmed-style verdict, got {verdict}")

    seam = str(observation_artifact.get("failing_seam") or observation_artifact.get("seam") or "")
    if not seam:
        ok = False
        notes.append("missing failing_seam in observation_artifact")

    green = str(observation_artifact.get("expected_green_condition") or observation_artifact.get("green_condition") or "")
    if not green:
        ok = False
        notes.append("missing expected_green_condition")

    # Only an explicit False fails; an absent flag is tolerated.
    if observation_artifact.get("ready_for_coder") is False:
        ok = False
        notes.append("observer should mark this evidence-backed seam as ready_for_coder")

    # Use the resolved seam so the "seam" alias accepted above is also
    # considered when checking that the supplied evidence was preserved.
    evidence_parts = [
        observation_artifact.get("evidence_summary"),
        seam,
        observation_artifact.get("log_excerpt"),
    ]
    evidence_text = " ".join(str(part) for part in evidence_parts if part).lower()
    # Advisory only: never changes ok.
    if evidence_text and not any(tok in evidence_text for tok in ("500", "typeerror", "invite", "accept")):
        notes.append("observation_artifact does not clearly preserve the supplied evidence")

    return ok, notes
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Scenario fixture for the observer role: a bounded improvement task plus a sample failing input.
SCENARIO_NAME = "observer_repro_creation"
SCENARIO_DESCRIPTION = " ".join(
    (
        "Observer turns a bounded task into a narrow reproducible failing repro instead of",
        "jumping to implementation.",
    )
)

# Payload handed to the role under evaluation.
INPUT_PAYLOAD = dict(
    role="observer",
    repo_root="/Users/devflow/repos/devflow_engine",
    task_artifact=dict(
        task_type="targeted_improvement",
        surface="CSV import wizard",
        current_behavior="Rows with trailing spaces in email fields are rejected as invalid.",
        desired_behavior="Trailing whitespace is trimmed before validation during import.",
        success_criteria=["trimmed emails import successfully", "invalid emails still fail validation"],
    ),
    evidence=dict(
        # The surrounding spaces in the address are part of the sample input.
        sample_row={"email": " alice@example.com "},
        observed_result="validation error: invalid email",
        entry_point="POST /api/imports/preview",
    ),
)

# Expectations read by the matching eval module; every flag is True.
EXPECTED_BEHAVIOR = dict.fromkeys(
    (
        "produces_observation_artifact",
        "creates_narrow_repro",
        "repro_is_deterministic",
        "expected_green_condition_present",
        "ready_for_coder",
    ),
    True,
)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from devin.nodes.iterate.scenarios.observer_repro_creation import EXPECTED_BEHAVIOR
|
|
2
|
+
|
|
3
|
+
# Declared criteria for the observer repro-creation scenario; every flag is True.
EVAL_CRITERIA = dict.fromkeys(
    (
        "produces_observation_artifact",
        "creates_narrow_repro",
        "repro_is_deterministic",
        "expected_green_condition_present",
        "ready_for_coder",
    ),
    True,
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def evaluate(actual_output: dict) -> tuple[bool, list[str]]:
    """Check an observer run against the ``observer_repro_creation`` expectations.

    Args:
        actual_output: Output of the role under evaluation. Only
            ``observation_artifact`` is read.

    Returns:
        ``(ok, notes)``. ``ok`` is False when the artifact is missing or not a
        mapping, the repro steps or green condition are absent, or
        ``ready_for_coder`` is explicitly False. ``notes`` holds one message
        per failed check, plus advisory notes for an over-long repro or a
        repro that mentions none of the expected seam markers.
    """
    ok = True
    notes: list[str] = []

    observation_artifact = actual_output.get("observation_artifact") or {}
    if not observation_artifact:
        return False, ["missing observation_artifact"]
    if not isinstance(observation_artifact, dict):
        # A truthy non-mapping artifact (e.g. a bare list of steps) cannot be
        # inspected field by field; report it instead of raising AttributeError.
        return False, ["observation_artifact is not a mapping"]

    repro = observation_artifact.get("repro_steps") or observation_artifact.get("repro") or []
    if not repro:
        ok = False
        notes.append("missing bounded repro steps")

    # Advisory only: more than six steps is noted, not failed.
    if isinstance(repro, list) and len(repro) > 6:
        notes.append("repro exists but may be broader than necessary")

    green = str(observation_artifact.get("expected_green_condition") or observation_artifact.get("green_condition") or "")
    if not green:
        ok = False
        notes.append("missing expected_green_condition")

    # Only an explicit False fails; an absent flag is tolerated.
    if observation_artifact.get("ready_for_coder") is False:
        ok = False
        notes.append("observer should mark deterministic repro as ready_for_coder")

    # The repro may be a list of steps or a single free-text value.
    if isinstance(repro, list):
        repro_text = " ".join(str(step) for step in repro).lower()
    else:
        repro_text = str(repro).lower()
    # Advisory only: never changes ok.
    if repro_text and not any(tok in repro_text for tok in ("csv", "email", "preview", "trailing", "whitespace")):
        notes.append("repro does not appear aligned to the supplied seam")

    return ok, notes
|