devflow-engine 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devflow_engine/__init__.py +3 -0
- devflow_engine/agentic_prompts.py +100 -0
- devflow_engine/agentic_runtime.py +398 -0
- devflow_engine/api_key_flow_harness.py +539 -0
- devflow_engine/api_keys.py +357 -0
- devflow_engine/bootstrap/__init__.py +2 -0
- devflow_engine/bootstrap/provision_from_template.py +84 -0
- devflow_engine/cli/__init__.py +0 -0
- devflow_engine/cli/app.py +7270 -0
- devflow_engine/core/__init__.py +0 -0
- devflow_engine/core/config.py +86 -0
- devflow_engine/core/logging.py +29 -0
- devflow_engine/core/paths.py +45 -0
- devflow_engine/core/toml_kv.py +33 -0
- devflow_engine/devflow_event_worker.py +1292 -0
- devflow_engine/devflow_state.py +201 -0
- devflow_engine/devin2/__init__.py +9 -0
- devflow_engine/devin2/agent_definition.py +120 -0
- devflow_engine/devin2/pi_runner.py +204 -0
- devflow_engine/devin_orchestration.py +69 -0
- devflow_engine/docs/prompts/anti-patterns.md +42 -0
- devflow_engine/docs/prompts/devin-agent-prompt.md +55 -0
- devflow_engine/docs/prompts/devin2-agent-prompt.md +81 -0
- devflow_engine/docs/prompts/examples/devin-vapi-clone-reference-exchange.json +85 -0
- devflow_engine/doctor/__init__.py +2 -0
- devflow_engine/doctor/triage.py +140 -0
- devflow_engine/error/__init__.py +0 -0
- devflow_engine/error/remediation.py +21 -0
- devflow_engine/errors/error_solver_dag.py +522 -0
- devflow_engine/errors/runtime_observability.py +67 -0
- devflow_engine/idea/__init__.py +4 -0
- devflow_engine/idea/actors.py +481 -0
- devflow_engine/idea/agentic.py +465 -0
- devflow_engine/idea/analyze.py +93 -0
- devflow_engine/idea/devin_chat_dag.py +1 -0
- devflow_engine/idea/diff.py +99 -0
- devflow_engine/idea/drafts.py +446 -0
- devflow_engine/idea/idea_creation_dag.py +643 -0
- devflow_engine/idea/ideation_enrichment.py +355 -0
- devflow_engine/idea/ideation_enrichment_worker.py +19 -0
- devflow_engine/idea/paths.py +28 -0
- devflow_engine/idea/promote.py +53 -0
- devflow_engine/idea/redaction.py +27 -0
- devflow_engine/idea/repo_tools.py +1277 -0
- devflow_engine/idea/response_mode.py +30 -0
- devflow_engine/idea/story_pipeline.py +1585 -0
- devflow_engine/idea/sufficiency.py +376 -0
- devflow_engine/idea/traditional_stories.py +1257 -0
- devflow_engine/implementation/__init__.py +0 -0
- devflow_engine/implementation/alembic_preflight.py +700 -0
- devflow_engine/implementation/dag.py +8450 -0
- devflow_engine/implementation/green_gate.py +93 -0
- devflow_engine/implementation/prompts.py +108 -0
- devflow_engine/implementation/test_runtime.py +623 -0
- devflow_engine/integration/__init__.py +19 -0
- devflow_engine/integration/agentic.py +66 -0
- devflow_engine/integration/dag.py +3539 -0
- devflow_engine/integration/prompts.py +114 -0
- devflow_engine/integration/supabase_schema.sql +31 -0
- devflow_engine/integration/supabase_sync.py +177 -0
- devflow_engine/llm/__init__.py +1 -0
- devflow_engine/llm/cli_one_shot.py +84 -0
- devflow_engine/llm/cli_stream.py +371 -0
- devflow_engine/llm/execution_context.py +26 -0
- devflow_engine/llm/invoke.py +1322 -0
- devflow_engine/llm/provider_api.py +304 -0
- devflow_engine/llm/repo_knowledge.py +588 -0
- devflow_engine/llm_primitives.py +315 -0
- devflow_engine/orchestration.py +62 -0
- devflow_engine/planning/__init__.py +0 -0
- devflow_engine/planning/analyze_repo.py +92 -0
- devflow_engine/planning/render_drafts.py +133 -0
- devflow_engine/playground/__init__.py +0 -0
- devflow_engine/playground/hooks.py +26 -0
- devflow_engine/playwright_workflow/__init__.py +5 -0
- devflow_engine/playwright_workflow/dag.py +1317 -0
- devflow_engine/process/__init__.py +5 -0
- devflow_engine/process/dag.py +59 -0
- devflow_engine/project_registration/__init__.py +3 -0
- devflow_engine/project_registration/dag.py +1581 -0
- devflow_engine/project_registry.py +109 -0
- devflow_engine/prompts/devin/generic/prompt.md +6 -0
- devflow_engine/prompts/devin/ideation/prompt.md +263 -0
- devflow_engine/prompts/devin/ideation/scenarios.md +5 -0
- devflow_engine/prompts/devin/ideation_loop/prompt.md +6 -0
- devflow_engine/prompts/devin/insight/prompt.md +11 -0
- devflow_engine/prompts/devin/insight/scenarios.md +5 -0
- devflow_engine/prompts/devin/intake/prompt.md +15 -0
- devflow_engine/prompts/devin/iterate/prompt.md +12 -0
- devflow_engine/prompts/devin/shared/eval_doctrine.md +9 -0
- devflow_engine/prompts/devin/shared/principles.md +246 -0
- devflow_engine/prompts/devin_eval/assessment/prompt.md +18 -0
- devflow_engine/prompts/idea/api_ideation_agent/prompt.md +8 -0
- devflow_engine/prompts/idea/api_insight_agent/prompt.md +8 -0
- devflow_engine/prompts/idea/response_doctrine/prompt.md +18 -0
- devflow_engine/prompts/implementation/dependency_assessment/prompt.md +12 -0
- devflow_engine/prompts/implementation/green/green/prompt.md +11 -0
- devflow_engine/prompts/implementation/green/node_config/prompt.md +3 -0
- devflow_engine/prompts/implementation/green_review/outcome_review/prompt.md +5 -0
- devflow_engine/prompts/implementation/green_review/prior_run_review/prompt.md +5 -0
- devflow_engine/prompts/implementation/red/prompt.md +27 -0
- devflow_engine/prompts/implementation/redreview/prompt.md +23 -0
- devflow_engine/prompts/implementation/redreview_repair/prompt.md +16 -0
- devflow_engine/prompts/implementation/setupdoc/prompt.md +10 -0
- devflow_engine/prompts/implementation/story_planning/prompt.md +13 -0
- devflow_engine/prompts/implementation/test_design/prompt.md +27 -0
- devflow_engine/prompts/integration/README.md +185 -0
- devflow_engine/prompts/integration/green/example.md +67 -0
- devflow_engine/prompts/integration/green/green/prompt.md +10 -0
- devflow_engine/prompts/integration/green/node_config/prompt.md +42 -0
- devflow_engine/prompts/integration/green/past_prompts/20260417T212300/green/prompt.md +15 -0
- devflow_engine/prompts/integration/green/past_prompts/20260417T212300/node_config/prompt.md +42 -0
- devflow_engine/prompts/integration/green_enrich/example.md +79 -0
- devflow_engine/prompts/integration/green_enrich/green_enrich/prompt.md +9 -0
- devflow_engine/prompts/integration/green_enrich/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/green_enrich/prompt.md +14 -0
- devflow_engine/prompts/integration/green_enrich/past_prompts/20260417T212300/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red/code_repair/prompt.md +12 -0
- devflow_engine/prompts/integration/red/example.md +152 -0
- devflow_engine/prompts/integration/red/node_config/prompt.md +86 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/node_config/prompt.md +84 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red/prompt.md +16 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T212300/red_repair/prompt.md +15 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/node_config/prompt.md +84 -0
- devflow_engine/prompts/integration/red/past_prompts/20260417T215032/red_repair/prompt.md +11 -0
- devflow_engine/prompts/integration/red/red/prompt.md +11 -0
- devflow_engine/prompts/integration/red/red_repair/prompt.md +12 -0
- devflow_engine/prompts/integration/red_review/example.md +71 -0
- devflow_engine/prompts/integration/red_review/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/node_config/prompt.md +41 -0
- devflow_engine/prompts/integration/red_review/past_prompts/20260417T212300/red_review/prompt.md +15 -0
- devflow_engine/prompts/integration/red_review/red_review/prompt.md +9 -0
- devflow_engine/prompts/integration/resolve/example.md +111 -0
- devflow_engine/prompts/integration/resolve/node_config/prompt.md +64 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/node_config/prompt.md +64 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_implicated_users/prompt.md +15 -0
- devflow_engine/prompts/integration/resolve/past_prompts/20260417T212300/resolve_side_effects/prompt.md +15 -0
- devflow_engine/prompts/integration/resolve/resolve_implicated_users/prompt.md +10 -0
- devflow_engine/prompts/integration/resolve/resolve_side_effects/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/build_idea_acceptance_coverage/prompt.md +12 -0
- devflow_engine/prompts/integration/validate/code_repair/prompt.md +13 -0
- devflow_engine/prompts/integration/validate/example.md +143 -0
- devflow_engine/prompts/integration/validate/node_config/prompt.md +87 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/code_repair/prompt.md +19 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/node_config/prompt.md +67 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_enrich_gate/prompt.md +17 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T212300/validate_repair/prompt.md +16 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/code_repair/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/node_config/prompt.md +67 -0
- devflow_engine/prompts/integration/validate/past_prompts/20260417T215032/validate_repair/prompt.md +9 -0
- devflow_engine/prompts/integration/validate/validate_enrich_gate/prompt.md +10 -0
- devflow_engine/prompts/integration/validate/validate_repair/prompt.md +20 -0
- devflow_engine/prompts/integration/write_workflows/example.md +100 -0
- devflow_engine/prompts/integration/write_workflows/node_config/prompt.md +44 -0
- devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/node_config/prompt.md +44 -0
- devflow_engine/prompts/integration/write_workflows/past_prompts/20260417T212300/write_workflows/prompt.md +17 -0
- devflow_engine/prompts/integration/write_workflows/write_workflows/prompt.md +11 -0
- devflow_engine/prompts/iterate/README.md +7 -0
- devflow_engine/prompts/iterate/coder/prompt.md +11 -0
- devflow_engine/prompts/iterate/framer/prompt.md +11 -0
- devflow_engine/prompts/iterate/iterator/prompt.md +13 -0
- devflow_engine/prompts/iterate/observer/prompt.md +11 -0
- devflow_engine/prompts/recovery/diagnosis/prompt.md +7 -0
- devflow_engine/prompts/recovery/execution/prompt.md +8 -0
- devflow_engine/prompts/recovery/execution_verification/prompt.md +7 -0
- devflow_engine/prompts/recovery/failure_investigation/prompt.md +10 -0
- devflow_engine/prompts/recovery/preflight_health_repo_repair/prompt.md +8 -0
- devflow_engine/prompts/recovery/remediation_execution/prompt.md +11 -0
- devflow_engine/prompts/recovery/root_cause_investigation/prompt.md +12 -0
- devflow_engine/prompts/scope_idea/doctrine/prompt.md +7 -0
- devflow_engine/prompts/source_doc_eval/document/prompt.md +6 -0
- devflow_engine/prompts/source_doc_eval/targeted_mutation/prompt.md +9 -0
- devflow_engine/prompts/source_doc_mutation/domain_entities/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/product_brief/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/project_doc_coherence/prompt.md +7 -0
- devflow_engine/prompts/source_doc_mutation/project_doc_render/prompt.md +9 -0
- devflow_engine/prompts/source_doc_mutation/source_doc_coherence/prompt.md +5 -0
- devflow_engine/prompts/source_doc_mutation/source_doc_enrichment_coherence/prompt.md +6 -0
- devflow_engine/prompts/source_doc_mutation/user_workflows/prompt.md +6 -0
- devflow_engine/prompts/source_scope/doctrine/prompt.md +10 -0
- devflow_engine/prompts/ui_grounding/doctrine/prompt.md +7 -0
- devflow_engine/recovery/__init__.py +3 -0
- devflow_engine/recovery/dag.py +2609 -0
- devflow_engine/recovery/models.py +220 -0
- devflow_engine/refactor.py +93 -0
- devflow_engine/registry/__init__.py +1 -0
- devflow_engine/registry/cards.py +238 -0
- devflow_engine/registry/domain_normalize.py +60 -0
- devflow_engine/registry/effects.py +65 -0
- devflow_engine/registry/enforce_report.py +150 -0
- devflow_engine/registry/module_cards_classify.py +164 -0
- devflow_engine/registry/module_cards_draft.py +184 -0
- devflow_engine/registry/module_cards_gate.py +59 -0
- devflow_engine/registry/packages.py +347 -0
- devflow_engine/registry/pathways.py +323 -0
- devflow_engine/review/__init__.py +11 -0
- devflow_engine/review/dag.py +588 -0
- devflow_engine/review/review_story.py +67 -0
- devflow_engine/scope_idea/__init__.py +3 -0
- devflow_engine/scope_idea/agentic.py +39 -0
- devflow_engine/scope_idea/dag.py +1069 -0
- devflow_engine/scope_idea/models.py +175 -0
- devflow_engine/skills/builtins/devflow/queue_failure_investigation/SKILL.md +112 -0
- devflow_engine/skills/builtins/devflow/queue_idea_to_story/SKILL.md +120 -0
- devflow_engine/skills/builtins/devflow/queue_integration/SKILL.md +105 -0
- devflow_engine/skills/builtins/devflow/queue_recovery/SKILL.md +108 -0
- devflow_engine/skills/builtins/devflow/queue_runtime_core/SKILL.md +155 -0
- devflow_engine/skills/builtins/devflow/queue_story_implementation/SKILL.md +122 -0
- devflow_engine/skills/builtins/devin/idea_to_story_handoff/SKILL.md +120 -0
- devflow_engine/skills/builtins/devin/ideation/SKILL.md +168 -0
- devflow_engine/skills/builtins/devin/ideation/state-and-phrasing-reference.md +18 -0
- devflow_engine/skills/builtins/devin/insight/SKILL.md +22 -0
- devflow_engine/skills/registry.example.yaml +42 -0
- devflow_engine/source_doc_assumptions.py +291 -0
- devflow_engine/source_doc_mutation_dag.py +1606 -0
- devflow_engine/source_doc_mutation_eval.py +417 -0
- devflow_engine/source_doc_mutation_worker.py +25 -0
- devflow_engine/source_docs_schema.py +207 -0
- devflow_engine/source_docs_updater.py +309 -0
- devflow_engine/source_scope/__init__.py +15 -0
- devflow_engine/source_scope/agentic.py +45 -0
- devflow_engine/source_scope/dag.py +1626 -0
- devflow_engine/source_scope/models.py +177 -0
- devflow_engine/stores/__init__.py +0 -0
- devflow_engine/stores/execution_store.py +3534 -0
- devflow_engine/story/__init__.py +0 -0
- devflow_engine/story/contracts.py +160 -0
- devflow_engine/story/discovery.py +47 -0
- devflow_engine/story/evidence.py +118 -0
- devflow_engine/story/hashing.py +27 -0
- devflow_engine/story/implemented_queue_purge.py +148 -0
- devflow_engine/story/indexer.py +105 -0
- devflow_engine/story/io.py +20 -0
- devflow_engine/story/markdown_contracts.py +298 -0
- devflow_engine/story/reconciliation.py +408 -0
- devflow_engine/story/validate_stories.py +149 -0
- devflow_engine/story/validate_tests_story.py +512 -0
- devflow_engine/story/validation.py +133 -0
- devflow_engine/ui_grounding/__init__.py +11 -0
- devflow_engine/ui_grounding/agentic.py +31 -0
- devflow_engine/ui_grounding/dag.py +874 -0
- devflow_engine/ui_grounding/models.py +224 -0
- devflow_engine/ui_grounding/pencil_bridge.py +247 -0
- devflow_engine/vendor/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/__init__.py +11 -0
- devflow_engine/vendor/datalumina_genai/core/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/core/exceptions.py +9 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/__init__.py +0 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/agent.py +48 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/agent_streaming_node.py +26 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/base.py +89 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/concurrent.py +30 -0
- devflow_engine/vendor/datalumina_genai/core/nodes/router.py +69 -0
- devflow_engine/vendor/datalumina_genai/core/schema.py +72 -0
- devflow_engine/vendor/datalumina_genai/core/task.py +52 -0
- devflow_engine/vendor/datalumina_genai/core/validate.py +139 -0
- devflow_engine/vendor/datalumina_genai/core/workflow.py +200 -0
- devflow_engine/worker.py +1086 -0
- devflow_engine/worker_guard.py +233 -0
- devflow_engine-1.0.0.dist-info/METADATA +235 -0
- devflow_engine-1.0.0.dist-info/RECORD +393 -0
- devflow_engine-1.0.0.dist-info/WHEEL +4 -0
- devflow_engine-1.0.0.dist-info/entry_points.txt +3 -0
- devin/__init__.py +6 -0
- devin/dag.py +58 -0
- devin/dag_two_arm.py +138 -0
- devin/devin_chat_scenario_catalog.json +588 -0
- devin/devin_eval.py +677 -0
- devin/nodes/__init__.py +0 -0
- devin/nodes/ideation/__init__.py +0 -0
- devin/nodes/ideation/node.py +195 -0
- devin/nodes/ideation/playground.py +267 -0
- devin/nodes/ideation/prompt.md +65 -0
- devin/nodes/ideation/scenarios/continue_refinement.py +13 -0
- devin/nodes/ideation/scenarios/continue_refinement_evals.py +18 -0
- devin/nodes/ideation/scenarios/idea_fits_existing_patterns.py +17 -0
- devin/nodes/ideation/scenarios/idea_fits_existing_patterns_evals.py +16 -0
- devin/nodes/ideation/scenarios/large_idea_split.py +4 -0
- devin/nodes/ideation/scenarios/large_idea_split_evals.py +17 -0
- devin/nodes/ideation/scenarios/source_documentation_added.py +4 -0
- devin/nodes/ideation/scenarios/source_documentation_added_evals.py +16 -0
- devin/nodes/ideation/scenarios/user_says_create_it.py +30 -0
- devin/nodes/ideation/scenarios/user_says_create_it_evals.py +23 -0
- devin/nodes/ideation/scenarios/vague_idea.py +16 -0
- devin/nodes/ideation/scenarios/vague_idea_evals.py +47 -0
- devin/nodes/ideation/tools.json +312 -0
- devin/nodes/insight/__init__.py +0 -0
- devin/nodes/insight/node.py +49 -0
- devin/nodes/insight/playground.py +154 -0
- devin/nodes/insight/prompt.md +61 -0
- devin/nodes/insight/scenarios/architecture_pattern_query.py +15 -0
- devin/nodes/insight/scenarios/architecture_pattern_query_evals.py +25 -0
- devin/nodes/insight/scenarios/codebase_exploration.py +15 -0
- devin/nodes/insight/scenarios/codebase_exploration_evals.py +23 -0
- devin/nodes/insight/scenarios/devin_ideation_routing.py +19 -0
- devin/nodes/insight/scenarios/devin_ideation_routing_evals.py +39 -0
- devin/nodes/insight/scenarios/devin_insight_routing.py +20 -0
- devin/nodes/insight/scenarios/devin_insight_routing_evals.py +40 -0
- devin/nodes/insight/scenarios/operational_debugging.py +15 -0
- devin/nodes/insight/scenarios/operational_debugging_evals.py +23 -0
- devin/nodes/insight/scenarios/operational_question.py +9 -0
- devin/nodes/insight/scenarios/operational_question_evals.py +8 -0
- devin/nodes/insight/scenarios/queue_status.py +15 -0
- devin/nodes/insight/scenarios/queue_status_evals.py +23 -0
- devin/nodes/insight/scenarios/source_doc_explanation.py +14 -0
- devin/nodes/insight/scenarios/source_doc_explanation_evals.py +21 -0
- devin/nodes/insight/scenarios/worker_state_check.py +15 -0
- devin/nodes/insight/scenarios/worker_state_check_evals.py +22 -0
- devin/nodes/insight/tools.json +126 -0
- devin/nodes/intake/__init__.py +0 -0
- devin/nodes/intake/node.py +27 -0
- devin/nodes/intake/playground.py +47 -0
- devin/nodes/intake/prompt.md +12 -0
- devin/nodes/intake/scenarios/ideation_routing.py +4 -0
- devin/nodes/intake/scenarios/ideation_routing_evals.py +5 -0
- devin/nodes/intake/scenarios/insight_routing.py +4 -0
- devin/nodes/intake/scenarios/insight_routing_evals.py +5 -0
- devin/nodes/iterate/README.md +44 -0
- devin/nodes/iterate/__init__.py +1 -0
- devin/nodes/iterate/_archived_design_stages/01-objectives-requirements.md +112 -0
- devin/nodes/iterate/_archived_design_stages/02-evals.md +131 -0
- devin/nodes/iterate/_archived_design_stages/03-tools-and-boundaries.md +110 -0
- devin/nodes/iterate/_archived_design_stages/04-harness-and-playground.md +32 -0
- devin/nodes/iterate/_archived_design_stages/05-prompt-deferred.md +11 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/coder_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/03-tools-and-boundaries.md +13 -0
- devin/nodes/iterate/_archived_design_stages/framer_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/01-objectives-requirements.md +25 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/02-evals.md +9 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/iterator_agent_design/04-harness-and-playground.md +12 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/01-objectives-requirements.md +20 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/02-evals.md +8 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/03-tools-and-boundaries.md +14 -0
- devin/nodes/iterate/_archived_design_stages/observer_agent_design/04-harness-and-playground.md +13 -0
- devin/nodes/iterate/agent-roles.md +89 -0
- devin/nodes/iterate/agents/README.md +10 -0
- devin/nodes/iterate/artifacts.md +504 -0
- devin/nodes/iterate/contract.md +100 -0
- devin/nodes/iterate/eval-plan.md +74 -0
- devin/nodes/iterate/node.py +100 -0
- devin/nodes/iterate/pipeline/README.md +13 -0
- devin/nodes/iterate/playground-contract.md +76 -0
- devin/nodes/iterate/prompt.md +11 -0
- devin/nodes/iterate/scenarios/README.md +38 -0
- devin/nodes/iterate/scenarios/artifact-and-loop-scenarios.md +101 -0
- devin/nodes/iterate/scenarios/coder_artifact_alignment.py +32 -0
- devin/nodes/iterate/scenarios/coder_artifact_alignment_evals.py +45 -0
- devin/nodes/iterate/scenarios/coder_bounded_fix.py +27 -0
- devin/nodes/iterate/scenarios/coder_bounded_fix_evals.py +45 -0
- devin/nodes/iterate/scenarios/devin_iterate_routing.py +21 -0
- devin/nodes/iterate/scenarios/devin_iterate_routing_evals.py +36 -0
- devin/nodes/iterate/scenarios/framer_scope_boundary.py +25 -0
- devin/nodes/iterate/scenarios/framer_scope_boundary_evals.py +57 -0
- devin/nodes/iterate/scenarios/framer_task_framing.py +25 -0
- devin/nodes/iterate/scenarios/framer_task_framing_evals.py +58 -0
- devin/nodes/iterate/scenarios/iterate_error_fix.py +21 -0
- devin/nodes/iterate/scenarios/iterate_error_fix_evals.py +39 -0
- devin/nodes/iterate/scenarios/iterate_quick_change.py +21 -0
- devin/nodes/iterate/scenarios/iterate_quick_change_evals.py +35 -0
- devin/nodes/iterate/scenarios/iterate_to_idea_promotion.py +23 -0
- devin/nodes/iterate/scenarios/iterate_to_idea_promotion_evals.py +53 -0
- devin/nodes/iterate/scenarios/iterate_to_insight_reroute.py +23 -0
- devin/nodes/iterate/scenarios/iterate_to_insight_reroute_evals.py +53 -0
- devin/nodes/iterate/scenarios/observer_evidence_seam.py +28 -0
- devin/nodes/iterate/scenarios/observer_evidence_seam_evals.py +55 -0
- devin/nodes/iterate/scenarios/observer_repro_creation.py +28 -0
- devin/nodes/iterate/scenarios/observer_repro_creation_evals.py +45 -0
- devin/nodes/iterate/scenarios/routing-matrix.md +45 -0
- devin/nodes/shared/__init__.py +0 -0
- devin/nodes/shared/filemaker_expert.md +80 -0
- devin/nodes/shared/filemaker_expert.py +354 -0
- devin/nodes/shared/filemaker_expert_eval/runner.py +176 -0
- devin/nodes/shared/filemaker_expert_eval/scenarios.json +65 -0
- devin/nodes/shared/goldilocks_advisor_eval/runner.py +214 -0
- devin/nodes/shared/goldilocks_advisor_eval/scenarios.json +58 -0
- devin/nodes/shared/helpers.py +156 -0
- devin/nodes/shared/idea_compliance_advisor_eval/runner.py +252 -0
- devin/nodes/shared/idea_compliance_advisor_eval/scenarios.json +75 -0
- devin/nodes/shared/models.py +44 -0
- devin/nodes/shared/post.py +40 -0
- devin/nodes/shared/router.py +107 -0
- devin/nodes/shared/tools.py +191 -0
- devin/shared/devin-chat-rubric.md +237 -0
- devin/shared/devin-chat-scenario-suite.md +90 -0
- devin/shared/eval_doctrine.md +9 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Devin tools available to IdeationAgent and InsightAgent."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import subprocess
|
|
8
|
+
import time
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from devflow_engine.stores.execution_store import ExecutionStore
|
|
14
|
+
from devflow_engine.vendor.datalumina_genai.core.nodes.base import Node as DataluminaNode
|
|
15
|
+
|
|
16
|
+
from .helpers import DAG_ID, store_run
|
|
17
|
+
|
|
18
|
+
# -------------------------------------------------------------------
|
|
19
|
+
# Agent tool result types
|
|
20
|
+
# -------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
@dataclass
class ToolResult:
    """Outcome of a single agent tool invocation.

    Instances are built by the tool functions in this module; the fields
    are declared in the order ``ok``, ``tool_name``, ``output``, ``error``.
    """

    # True when the tool call succeeded, False otherwise.
    ok: bool
    # Name the tool reports itself under (e.g. 'Devflow_Init_Idea').
    tool_name: str
    # Tool-specific payload; the tools in this module pass {} on failure.
    output: dict[str, Any]
    # Failure description; remains None unless the caller supplies one.
    error: str | None = None
|
|
28
|
+
|
|
29
|
+
# -------------------------------------------------------------------
|
|
30
|
+
# Devflow primitives
|
|
31
|
+
# -------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
def devflow_init_idea(*, idea_id: str, title: str, repo_root: Path) -> ToolResult:
    """Create a new idea by running ``devflow idea init`` inside ``repo_root``.

    The CLI is expected to create the artifact under
    ``.devflow/ideas/<idea_id>/``; this function only shells out to it.

    Returns:
        A successful ToolResult carrying ``idea_id`` and the command's stdout
        when the CLI exits with 0. On a non-zero exit the result is a failure
        whose error is stderr (or stdout when stderr is empty). Any exception
        raised while running the command (missing executable, 30 s timeout,
        ...) is converted into a failure carrying ``str(exc)``.
    """
    tool = 'Devflow_Init_Idea'
    command = ['devflow', 'idea', 'init', '--idea', idea_id, '--title', title]
    try:
        proc = subprocess.run(
            command,
            cwd=str(repo_root),
            capture_output=True,
            text=True,
            timeout=30,
        )
    except Exception as exc:
        return ToolResult(ok=False, tool_name=tool, output={}, error=str(exc))

    if proc.returncode != 0:
        return ToolResult(ok=False, tool_name=tool, output={}, error=proc.stderr or proc.stdout)
    return ToolResult(ok=True, tool_name=tool, output={'idea_id': idea_id, 'stdout': proc.stdout})
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def devflow_amend_idea(*, idea_id: str, refined_text: str, repo_root: Path) -> ToolResult:
    """Store refined text and a fresh sufficiency result on an existing idea.

    Steps:
      1. Run ``devflow idea sufficiency --text <refined_text>`` in ``repo_root``
         and parse its stdout as JSON.
      2. If ``.devflow/ideas/<idea_id>/idea.json`` exists, set its
         ``refined_text`` and ``sufficiency`` keys and rewrite the file.

    The sufficiency step is best-effort: a failing or unparseable CLI run
    leaves ``sufficiency`` as ``{}`` and does not fail the tool. Unlike the
    previous implementation, such a problem is no longer swallowed silently:
    it is reported under ``output['sufficiency_error']``.

    Returns:
        ``ok=True`` with ``idea_id``, ``sufficiency`` and ``artifact_updated``
        (False when ``idea.json`` was not found, so nothing was written), plus
        ``sufficiency_error`` when the sufficiency step did not yield JSON.
        Any other exception (timeout, unreadable/invalid ``idea.json``, ...)
        yields ``ok=False`` with ``str(exc)`` as the error.
    """
    tool = 'Devflow_Amend_Idea'
    try:
        # Run sufficiency to get the updated shape for the refined text.
        result = subprocess.run(
            ['devflow', 'idea', 'sufficiency', '--text', refined_text],
            cwd=str(repo_root),
            capture_output=True,
            text=True,
            timeout=30,
        )
        sufficiency: dict[str, Any] = {}
        sufficiency_error: str | None = None
        if result.returncode == 0:
            try:
                sufficiency = json.loads(result.stdout)
            except ValueError as exc:  # json.JSONDecodeError subclasses ValueError
                sufficiency_error = f'unparseable sufficiency output: {exc}'
        else:
            sufficiency_error = (
                result.stderr or result.stdout or f'exit code {result.returncode}'
            ).strip()

        # Write back to the idea artifact, but only when it already exists.
        idea_json_path = repo_root / '.devflow' / 'ideas' / idea_id / 'idea.json'
        artifact_updated = idea_json_path.exists()
        if artifact_updated:
            current = json.loads(idea_json_path.read_text(encoding='utf-8'))
            current['refined_text'] = refined_text
            current['sufficiency'] = sufficiency
            idea_json_path.write_text(
                json.dumps(current, indent=2, sort_keys=True) + '\n',
                encoding='utf-8',
            )

        output: dict[str, Any] = {
            'idea_id': idea_id,
            'sufficiency': sufficiency,
            # Lets the caller tell a real amend from a no-op on a missing idea.
            'artifact_updated': artifact_updated,
        }
        if sufficiency_error is not None:
            output['sufficiency_error'] = sufficiency_error
        return ToolResult(ok=True, tool_name=tool, output=output)
    except Exception as e:
        return ToolResult(ok=False, tool_name=tool, output={}, error=str(e))
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def devflow_commit_idea(*, idea_id: str, draft_set: str = 'current', repo_root: Path) -> ToolResult:
    """Promote an idea by running ``devflow idea promote`` inside ``repo_root``.

    ``repo_root`` is used both as the working directory and as the value of
    the CLI's ``--dest`` option; ``draft_set`` is forwarded as ``--draft-set``.

    Returns:
        A successful ToolResult with ``idea_id`` and the command's stdout when
        the CLI exits with 0; a failure carrying stderr (or stdout when stderr
        is empty) on a non-zero exit; a failure carrying ``str(exc)`` when
        running the command raises (including the 30 s timeout).
    """
    tool = 'Devflow_Commit_Idea'
    try:
        destination = str(repo_root)
        command = [
            'devflow', 'idea', 'promote',
            '--idea', idea_id,
            '--draft-set', draft_set,
            '--dest', destination,
        ]
        proc = subprocess.run(
            command,
            cwd=destination,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except Exception as exc:
        return ToolResult(ok=False, tool_name=tool, output={}, error=str(exc))

    if proc.returncode != 0:
        return ToolResult(ok=False, tool_name=tool, output={}, error=proc.stderr or proc.stdout)
    return ToolResult(ok=True, tool_name=tool, output={'idea_id': idea_id, 'stdout': proc.stdout})
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# -------------------------------------------------------------------
|
|
97
|
+
# DevinInsight as subagent
|
|
98
|
+
# -------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
def devin_insight(*, current_user_message: str, repo_root: Path, context: dict[str, Any]) -> ToolResult:
    """Run the Devin InsightAgent node as a subagent and return its guidance.

    An event is built from the user message, ``repo_root`` and the
    ``idea_id`` found in ``context`` ('unknown' when absent); the node is then
    driven to completion with ``asyncio.run`` on a task context whose metadata
    is a shallow copy of ``context``.

    Returns:
        On success, a ToolResult whose output holds ``insight_response`` (the
        ``response_guidance`` entry of the resulting metadata, ``{}`` when
        missing) and ``idea_id`` (``None`` when not in ``context``). Any
        exception, including a failing lazy import, is reported as a failed
        ToolResult carrying ``str(exc)``.
    """
    tool = 'DevinInsight'
    try:
        # Imported lazily inside the try so an import failure becomes a failed result.
        from devin.nodes.insight.node import InsightAgentNode
        from devin.nodes.shared.models import DevinChatDagEvent
        from devflow_engine.vendor.datalumina_genai.core.task import TaskContext

        event_fields = {
            'repo_root': str(repo_root),
            'idea_id': context.get('idea_id', 'unknown'),
            'raw_text': current_user_message,
            'pipeline_key': f'insight_subagent_{int(time.time())}',
        }
        event = DevinChatDagEvent(**event_fields)
        node = InsightAgentNode(task_context=None)

        # Drive the node's coroutine to completion from this synchronous tool.
        import asyncio
        task = TaskContext(event=event, metadata=context.copy())
        finished = asyncio.run(node.process(task))

        guidance = finished.metadata.get('response_guidance', {})
        payload = {'insight_response': guidance, 'idea_id': context.get('idea_id')}
        return ToolResult(ok=True, tool_name=tool, output=payload)
    except Exception as exc:
        return ToolResult(ok=False, tool_name=tool, output={}, error=str(exc))
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# -------------------------------------------------------------------
|
|
127
|
+
# Emit_Response — real-time UI feedback via agent_devin_messages
|
|
128
|
+
# -------------------------------------------------------------------
|
|
129
|
+
|
|
130
|
+
def emit_response(*, message: str, emit_type: str, metadata: dict[str, Any] | None = None, session_id: str) -> ToolResult:
    """Post a real-time feedback row to agent_devin_messages for the frontend.

    Known ``emit_type`` values:
      - start_working: "Running...", "Reviewing...", "Splunking...", etc.
      - stop_working: finalizes the working feedback
      - progress: incremental progress update
      - info: informational message

    The row's metadata starts from ``emit_type`` and ``tool='Emit_Response'``;
    keys supplied via ``metadata`` are merged last and therefore override
    those defaults.

    Returns:
        A ToolResult with ``ok=True`` echoing ``emit_type`` and ``message``,
        plus ``posted`` set to the ``id`` of the row returned by the poster.
    """
    extra = metadata or {}
    row_metadata = {'emit_type': emit_type, 'tool': 'Emit_Response', **extra}
    payload = dict(
        session_id=session_id,
        from_agent='devin',
        to_agent='user',
        message=message,
        metadata=row_metadata,
        project_id=None,
        message_type='emit',
    )
    posted_row = _post_agent_message(payload)
    summary = {
        'emit_type': emit_type,
        'message': message,
        'posted': posted_row.get('id'),
    }
    return ToolResult(ok=True, tool_name='Emit_Response', output=summary)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def emit_start_working(*, activity: str | None = None, session_id: str) -> ToolResult:
    """Emit a ``start_working`` message such as ``"Reviewing..."``.

    Args:
        activity: Verb to display. When missing or empty, one is picked from
            a fixed list using the current Unix time in seconds.
        session_id: Session the message belongs to.

    Returns:
        The ``ToolResult`` produced by ``emit_response``.
    """
    if not activity:
        fallback_verbs = (
            'running', 'reviewing', 'shaping', 'crafting', 'exploring',
            'analyzing', 'mapping', 'building', 'checking', 'preparing',
        )
        # Clock-derived index: rotates the verb once per second.
        activity = fallback_verbs[int(time.time()) % len(fallback_verbs)]

    label = activity.title()
    return emit_response(message=f"{label}...", emit_type='start_working', session_id=session_id)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def emit_stop_working(*, activity: str | None = None, session_id: str) -> ToolResult:
    """Emit a ``stop_working`` message such as ``"Done Reviewing."``.

    Args:
        activity: Verb to display. When missing or empty, one is picked from
            a fixed list using the current Unix time in seconds, so it is not
            guaranteed to match the verb of an earlier start message.
        session_id: Session the message belongs to.

    Returns:
        The ``ToolResult`` produced by ``emit_response``.
    """
    if not activity:
        fallback_verbs = (
            'running', 'reviewing', 'shaping', 'crafting', 'exploring',
            'analyzing', 'mapping', 'building', 'checking', 'preparing',
        )
        # Clock-derived index: rotates the verb once per second.
        activity = fallback_verbs[int(time.time()) % len(fallback_verbs)]

    label = activity.title()
    return emit_response(message=f"Done {label}.", emit_type='stop_working', session_id=session_id)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def emit_conclude_node(*, node_output: dict[str, Any], session_id: str) -> ToolResult:
    """Emit a ``conclude_node`` message carrying the node's output.

    Args:
        node_output: Output of the finished node, attached to the message
            metadata under ``node_output``.
        session_id: Session the message belongs to.

    Returns:
        The ``ToolResult`` produced by ``emit_response``.
    """
    # 'tool' here replaces emit_response's default 'Emit_Response' tag.
    conclude_meta = {'node_output': node_output, 'tool': 'Conclude_Node'}
    return emit_response(
        message="Node complete.",
        emit_type='conclude_node',
        metadata=conclude_meta,
        session_id=session_id,
    )
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# -------------------------------------------------------------------
|
|
181
|
+
# Internal helpers
|
|
182
|
+
# -------------------------------------------------------------------
|
|
183
|
+
|
|
184
|
+
def _post_agent_message(row: dict[str, Any]) -> dict[str, Any]:
    """Post ``row`` through ``maybe_post_devin_message`` on a best-effort basis.

    Args:
        row: Message fields to post.

    Returns:
        One of three shapes:

        * When the ``PYTEST_CURRENT_TEST`` environment variable is set,
          nothing is posted and ``row`` is echoed back with a fixed test id
          (an ``id`` key already in ``row`` takes precedence).
        * Otherwise, the value returned by ``maybe_post_devin_message``, or
          ``{'id': 'local-only'}`` when that value is falsy.
        * If importing or calling the poster raises, ``row`` echoed back
          with ``status`` set to ``'local_only'``. This function never
          raises for posting failures.
    """
    # Short-circuit under pytest so tests never reach the real poster.
    if os.environ.get('PYTEST_CURRENT_TEST'):
        return {'id': 'test-agent-agent-message', **row}
    try:
        # Imported lazily; an ImportError is handled like any other failure.
        from devflow_engine.devin_orchestration import maybe_post_devin_message
        return maybe_post_devin_message(row=row) or {'id': 'local-only'}
    except Exception:
        # Still best-effort, but record the failure instead of hiding it so
        # dropped UI messages can be diagnosed.
        import logging
        logging.getLogger(__name__).warning(
            'Posting agent message failed; returning local-only result',
            exc_info=True,
        )
        return {'id': 'local-only', **row, 'status': 'local_only'}
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# Devin chat eval rubric
|
|
2
|
+
|
|
3
|
+
Use this rubric to evaluate Devin chat outputs for future DAG-output testing.
|
|
4
|
+
|
|
5
|
+
This rubric is intentionally compact. It is meant to be operational, not aspirational wallpaper.
|
|
6
|
+
|
|
7
|
+
Primary companion doc:
|
|
8
|
+
- [Devin chat principles](../devin-chat-principles.md)
|
|
9
|
+
|
|
10
|
+
## Scoring shape
|
|
11
|
+
|
|
12
|
+
Recommended per-dimension scoring:
|
|
13
|
+
- `1.0` = clear pass
|
|
14
|
+
- `0.5` = mixed / borderline
|
|
15
|
+
- `0.0` = fail
|
|
16
|
+
|
|
17
|
+
A strong response should pass every hard-gate dimension and score well on conversational quality.
|
|
18
|
+
|
|
19
|
+
## Dimensions
|
|
20
|
+
|
|
21
|
+
### 1. Approach ownership
|
|
22
|
+
|
|
23
|
+
Question:
|
|
24
|
+
- Did Devin own the implementation approach while leaving outcome ownership with the user?
|
|
25
|
+
|
|
26
|
+
Pass signals:
|
|
27
|
+
- proposes a concrete framing or direction
|
|
28
|
+
- chooses sensible defaults inside stated constraints
|
|
29
|
+
- does not push technical decision-making onto the user prematurely
|
|
30
|
+
|
|
31
|
+
Fail signals:
|
|
32
|
+
- asks the user to choose architecture, stack, or decomposition Devin should own
|
|
33
|
+
- behaves like a form intake clerk
|
|
34
|
+
|
|
35
|
+
### 2. Outcome-focused clarification
|
|
36
|
+
|
|
37
|
+
Question:
|
|
38
|
+
- If Devin asked a question, was it about business need, UX, constraints, users, or approval boundaries rather than low-level implementation choice?
|
|
39
|
+
|
|
40
|
+
Pass signals:
|
|
41
|
+
- asks one sharp question that changes the solution materially
|
|
42
|
+
- question is outcome- or constraint-oriented
|
|
43
|
+
|
|
44
|
+
Fail signals:
|
|
45
|
+
- asks multiple low-value questions
|
|
46
|
+
- asks the user to pick stack, schema, transport, framework, or similar Devin-owned details
|
|
47
|
+
- asks a question when a reasonable assumption would have been enough
|
|
48
|
+
|
|
49
|
+
### 3. Momentum
|
|
50
|
+
|
|
51
|
+
Question:
|
|
52
|
+
- Did the response make the work feel underway?
|
|
53
|
+
|
|
54
|
+
Pass signals:
|
|
55
|
+
- direct answer first
|
|
56
|
+
- concrete next-step framing
|
|
57
|
+
- reasonable assumptions used to keep motion
|
|
58
|
+
|
|
59
|
+
Fail signals:
|
|
60
|
+
- stalls in generic planning language
|
|
61
|
+
- turns the turn into a checklist interview
|
|
62
|
+
- produces commentary instead of progress
|
|
63
|
+
|
|
64
|
+
### 4. No fake progress
|
|
65
|
+
|
|
66
|
+
Question:
|
|
67
|
+
- Did Devin avoid implying work happened when it did not?
|
|
68
|
+
|
|
69
|
+
Hard fail examples:
|
|
70
|
+
- claims implementation exists when it does not
|
|
71
|
+
- claims queues ran when they did not
|
|
72
|
+
- claims downstream docs/stories were generated when they were not
|
|
73
|
+
- presents placeholder/scaffold output as complete
|
|
74
|
+
|
|
75
|
+
### 5. No unsafe overreach
|
|
76
|
+
|
|
77
|
+
Question:
|
|
78
|
+
- Did Devin stay within grounded knowledge and safe assumption boundaries?
|
|
79
|
+
|
|
80
|
+
Pass signals:
|
|
81
|
+
- uses grounded assumptions
|
|
82
|
+
- asks for clarification when ambiguity materially affects correctness or risk
|
|
83
|
+
- does not invent repo facts or runtime state
|
|
84
|
+
|
|
85
|
+
Fail signals:
|
|
86
|
+
- fabricated codebase/runtime claims
|
|
87
|
+
- reckless certainty on ambiguous high-impact details
|
|
88
|
+
- unauthorized or unjustified high-risk leaps
|
|
89
|
+
|
|
90
|
+
### 6. Attention discipline
|
|
91
|
+
|
|
92
|
+
Question:
|
|
93
|
+
- Did Devin answer the current request instead of getting hijacked by stale context?
|
|
94
|
+
|
|
95
|
+
Pass signals:
|
|
96
|
+
- current message is clearly answered
|
|
97
|
+
- thread continuity is preserved without changing the subject
|
|
98
|
+
|
|
99
|
+
Fail signals:
|
|
100
|
+
- responds to an older unresolved issue instead of the current ask
|
|
101
|
+
- drifts back into ideation when the user asked an ops/status question
|
|
102
|
+
- carries stale clarifications into the wrong turn
|
|
103
|
+
|
|
104
|
+
### 7. Conversational quality
|
|
105
|
+
|
|
106
|
+
Question:
|
|
107
|
+
- Did the reply feel like a capable implementation partner?
|
|
108
|
+
|
|
109
|
+
Pass signals:
|
|
110
|
+
- plainspoken
|
|
111
|
+
- concise
|
|
112
|
+
- confident without bluffing
|
|
113
|
+
- helpful without fluff
|
|
114
|
+
- structure only when useful
|
|
115
|
+
|
|
116
|
+
Fail signals:
|
|
117
|
+
- robotic or bureaucratic tone
|
|
118
|
+
- overformatted reply for a simple turn
|
|
119
|
+
- hedgy filler or apology-heavy language
|
|
120
|
+
- internal orchestration jargon by default
|
|
121
|
+
|
|
122
|
+
## Hard gates
|
|
123
|
+
|
|
124
|
+
A response should be marked overall fail if any of these are true:
|
|
125
|
+
- fake progress is present
|
|
126
|
+
- unsafe overreach is present
|
|
127
|
+
- attention discipline fails badly enough that the current request was not answered
|
|
128
|
+
|
|
129
|
+
## Pass / fail exemplars
|
|
130
|
+
|
|
131
|
+
### Pass: approach ownership + one sharp clarification
|
|
132
|
+
|
|
133
|
+
User:
|
|
134
|
+
> Build an internal support approvals workflow for risky requests.
|
|
135
|
+
|
|
136
|
+
Response:
|
|
137
|
+
> I’d treat this as an operator-first workflow with intake, risk scoring, approval, and an audit trail. I’ll assume internal-only and single-team ownership unless you say otherwise. The one thing that changes the shape meaningfully is whether approval is always single-step or can require multiple approvers.
|
|
138
|
+
|
|
139
|
+
Why it passes:
|
|
140
|
+
- approach is owned
|
|
141
|
+
- assumptions are reasonable
|
|
142
|
+
- only one high-leverage question
|
|
143
|
+
- momentum preserved
|
|
144
|
+
|
|
145
|
+
### Fail: user forced to do Devin’s job
|
|
146
|
+
|
|
147
|
+
User:
|
|
148
|
+
> Build an internal support approvals workflow for risky requests.
|
|
149
|
+
|
|
150
|
+
Response:
|
|
151
|
+
> What framework, database, auth provider, transport protocol, and architecture style do you want?
|
|
152
|
+
|
|
153
|
+
Why it fails:
|
|
154
|
+
- no approach ownership
|
|
155
|
+
- form-filler behavior
|
|
156
|
+
- momentum collapse
|
|
157
|
+
|
|
158
|
+
### Pass: direct current-turn answer
|
|
159
|
+
|
|
160
|
+
User:
|
|
161
|
+
> What is the source-doc queue status right now?
|
|
162
|
+
|
|
163
|
+
Response:
|
|
164
|
+
> There are no source-doc mutations queued right now.
|
|
165
|
+
|
|
166
|
+
Why it passes:
|
|
167
|
+
- direct answer
|
|
168
|
+
- no stale-context hijack
|
|
169
|
+
- no unnecessary orchestration dump
|
|
170
|
+
|
|
171
|
+
### Fail: stale context hijack
|
|
172
|
+
|
|
173
|
+
User:
|
|
174
|
+
> What is the source-doc queue status right now?
|
|
175
|
+
|
|
176
|
+
Response:
|
|
177
|
+
> Before that, I still need you to clarify the primary workflow and target persona for the idea.
|
|
178
|
+
|
|
179
|
+
Why it fails:
|
|
180
|
+
- did not answer the actual question
|
|
181
|
+
- stale ideation context hijacked the turn
|
|
182
|
+
|
|
183
|
+
### Fail: fake progress
|
|
184
|
+
|
|
185
|
+
User:
|
|
186
|
+
> Keep refining the idea; don’t generate anything yet.
|
|
187
|
+
|
|
188
|
+
Response:
|
|
189
|
+
> Done — I already generated stories and downstream planning artifacts.
|
|
190
|
+
|
|
191
|
+
Why it fails:
|
|
192
|
+
- directly contradicts the request
|
|
193
|
+
- claims work that should not have happened
|
|
194
|
+
|
|
195
|
+
## Suggested evaluator prompts / checks
|
|
196
|
+
|
|
197
|
+
When building evaluator logic, check for:
|
|
198
|
+
- direct answer to current request
|
|
199
|
+
- evidence of owned framing / proposed approach
|
|
200
|
+
- whether any question is outcome-focused and singular
|
|
201
|
+
- whether prohibited claims of completed work appear
|
|
202
|
+
- whether stale-context drift appears
|
|
203
|
+
- whether response tone stays concise and plainspoken
|
|
204
|
+
|
|
205
|
+
Useful string-level negative checks:
|
|
206
|
+
- claims of `implemented`, `completed`, `generated`, `queued`, or `ran` when artifacts/state do not support that
|
|
207
|
+
- unexplained internal terms like `DAG`, `node`, `handoff`, `router` in ordinary user-facing replies
|
|
208
|
+
- multi-question interrogation patterns in early ideation turns
|
|
209
|
+
- implementation-choice questions like `what framework`, `what database`, `what auth provider`, or `what architecture` in sparse greenfield turns unless the ambiguity is truly outcome-shaping
|
|
210
|
+
|
|
211
|
+
## Persona suite coverage in the existing multi-turn eval
|
|
212
|
+
|
|
213
|
+
The existing `devin_multi_turn` live eval should cover these default personas:
|
|
214
|
+
- **Sally ExplicitApproval** — wants to build a new app; tests forward-ready momentum, approach ownership, and explicit-approval gating.
|
|
215
|
+
- **Jimmy ExistingRepo** — needs a new idea added to an existing repo; tests repo-grounded assumptions and approval handoff in an existing product context.
|
|
216
|
+
- **Jeff SparseBrief** — brings very little detail on a greenfield idea and wants to get coding; tests aggressive default assumptions and anti-form-filler behavior.
|
|
217
|
+
- **Cleo ReviewFirst** — plans and reviews patiently; tests current-turn attention, review-first behavior, and no premature downstream handoff.
|
|
218
|
+
|
|
219
|
+
Evaluator expectations for this suite should explicitly check:
|
|
220
|
+
- Devin behaves like an implementation partner, not a form collector.
|
|
221
|
+
- Devin owns approach while the user owns outcome, UX intent, and business need.
|
|
222
|
+
- The first reply is forward-ready and momentum-preserving.
|
|
223
|
+
- Missing details are filled with grounded assumptions before low-value questioning.
|
|
224
|
+
- Internal orchestration stays abstracted unless operationally relevant.
|
|
225
|
+
- Multi-turn quality is judged mainly on trajectory, continuity, and decision points, not isolated single-turn polish.
|
|
226
|
+
|
|
227
|
+
## Minimal overall judgment rule
|
|
228
|
+
|
|
229
|
+
A response passes if it:
|
|
230
|
+
- owns the approach
|
|
231
|
+
- maintains momentum
|
|
232
|
+
- asks only necessary, outcome-shaping clarification
|
|
233
|
+
- avoids fake progress
|
|
234
|
+
- avoids unsafe overreach
|
|
235
|
+
- answers the current request cleanly
|
|
236
|
+
|
|
237
|
+
If those are not all true, it should not be treated as a good Devin turn.
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Devin chat scenario suite
|
|
2
|
+
|
|
3
|
+
This defines the next concrete evaluation set for Devin chat, aligned with Marcus's clarified doctrine.
|
|
4
|
+
|
|
5
|
+
Structured fixture source:
|
|
6
|
+
- `src/devflow_engine/devin/devin_chat_scenario_catalog.json`
|
|
7
|
+
|
|
8
|
+
Runtime loader:
|
|
9
|
+
- `devflow_engine.devin.devin_eval.load_devin_chat_eval_catalog()`
|
|
10
|
+
|
|
11
|
+
## Design goals
|
|
12
|
+
|
|
13
|
+
- Keep a **few fast single-turn screens** for first-turn posture and current-request discipline.
|
|
14
|
+
- Put the real weight on **multi-turn evals** because Devin should be judged mainly on **trajectory** and **decision-point handling across turns**.
|
|
15
|
+
- Evaluate Devin as an **implementation partner**:
|
|
16
|
+
- user owns outcome / UX / business need
|
|
17
|
+
- Devin owns approach
|
|
18
|
+
- first response should feel forward-ready
|
|
19
|
+
- assumptions should be grounded and aggressive enough to preserve momentum
|
|
20
|
+
- internal orchestration should stay abstracted by default
|
|
21
|
+
|
|
22
|
+
## Suite shape
|
|
23
|
+
|
|
24
|
+
- **Single-turn screens:** 3
|
|
25
|
+
- **Multi-turn scenarios:** 6
|
|
26
|
+
|
|
27
|
+
## Persona coverage
|
|
28
|
+
|
|
29
|
+
Required personas included:
|
|
30
|
+
|
|
31
|
+
- **Sally ForwardReady** — first-turn new-app posture and momentum
|
|
32
|
+
- **Jimmy RepoExtension** — existing-repo extension posture without form-filler drift
|
|
33
|
+
- **Jeff SparseBrief** — sparse greenfield input with aggressive assumptions and no questionnaire collapse
|
|
34
|
+
- **Cleo ReviewFirst** — review-first planning without premature downstream handoff
|
|
35
|
+
|
|
36
|
+
Additional personas cover doctrine edges:
|
|
37
|
+
|
|
38
|
+
- **Nora CurrentRequest** — current request beats stale ideation context
|
|
39
|
+
- **Priya RiskBoundary** — high-risk constraints require one sharp outcome-level clarification
|
|
40
|
+
- **Omar ContextSwitch** — ops/status detour, then clean return to ideation
|
|
41
|
+
|
|
42
|
+
## Scenario inventory
|
|
43
|
+
|
|
44
|
+
### Single-turn screens
|
|
45
|
+
|
|
46
|
+
1. **sally_forward_ready_screen**
|
|
47
|
+
- Checks first-turn approach ownership, momentum, and outcome-focused clarification.
|
|
48
|
+
2. **jimmy_repo_extension_screen**
|
|
49
|
+
- Checks repo-extension posture without form-filler behavior.
|
|
50
|
+
3. **nora_current_request_screen**
|
|
51
|
+
- Checks attention to the current request over stale history.
|
|
52
|
+
|
|
53
|
+
### Multi-turn scenarios
|
|
54
|
+
|
|
55
|
+
1. **sally_explicit_approval_new_app**
|
|
56
|
+
- New app trajectory from first prompt through explicit approval.
|
|
57
|
+
2. **jimmy_existing_repo_handoff**
|
|
58
|
+
- Existing-repo addition with UX constraints and approval handoff.
|
|
59
|
+
3. **jeff_sparse_brief_fast_assumptions**
|
|
60
|
+
- Sparse input, aggressive assumptions, no questionnaire collapse.
|
|
61
|
+
4. **cleo_review_first_preactivation**
|
|
62
|
+
- Planning/review thread that must avoid fake progress until approval.
|
|
63
|
+
5. **priya_risk_boundary_clarification**
|
|
64
|
+
- Safety/constraint-sensitive ideation without unsafe overreach.
|
|
65
|
+
6. **omar_status_detour_return_to_ideation**
|
|
66
|
+
- Status detour plus clean return to ideation before approval.
|
|
67
|
+
|
|
68
|
+
## Doctrine coverage map
|
|
69
|
+
|
|
70
|
+
The suite explicitly covers:
|
|
71
|
+
|
|
72
|
+
- approach ownership
|
|
73
|
+
- outcome/UX-focused clarification
|
|
74
|
+
- momentum
|
|
75
|
+
- no form-filler behavior
|
|
76
|
+
- no fake progress
|
|
77
|
+
- no unsafe overreach
|
|
78
|
+
- attention to the current request instead of stale context
|
|
79
|
+
- correct decision-point handling across turns
|
|
80
|
+
|
|
81
|
+
## Why this moves the work forward
|
|
82
|
+
|
|
83
|
+
Before this change, the eval shape existed mostly as hard-coded scenarios. The new suite makes the doctrine concrete in a **repo-native fixture catalog** that can be loaded directly by code and expanded later into richer evaluators.
|
|
84
|
+
|
|
85
|
+
That gives us:
|
|
86
|
+
|
|
87
|
+
- a durable scenario source of truth
|
|
88
|
+
- explicit persona coverage
|
|
89
|
+
- explicit doctrine-tag coverage
|
|
90
|
+
- a clean handoff point for future live eval runners, graders, and reporting
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Devin eval doctrine
|
|
2
|
+
|
|
3
|
+
- Scenarios describe the user request, the minimal input payload, and the expected behavior boundary.
|
|
4
|
+
- Scenario evals define explicit pass/fail checks.
|
|
5
|
+
- Node playgrounds should report: scenario name, pass/fail, actual output, expected behavior, and notes.
|
|
6
|
+
- Failing a scenario should never be hidden behind fallback prose.
|
|
7
|
+
- Insight evals should reward direct grounded answers.
|
|
8
|
+
- Ideation evals should reward momentum, truthful assumptions, and at most one sharp clarifying question when needed.
|
|
9
|
+
- Intake evals should reward correct routing and clear routing rationale.
|