npm - feed-the-machine - Versions diffs - 1.5.0 → 1.6.0 - Mend

feed-the-machine 1.5.0 → 1.6.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (224)

package/LICENSE +21 -21
package/README.md +170 -170
package/bin/generate-manifest.mjs +463 -463
package/bin/install.mjs +491 -491
package/docs/HOOKS.md +243 -243
package/docs/INBOX.md +233 -233
package/ftm/SKILL.md +122 -122
package/ftm-audit/SKILL.md +623 -541
package/ftm-audit/references/protocols/PROJECT-PATTERNS.md +91 -91
package/ftm-audit/references/protocols/RUNTIME-WIRING.md +66 -66
package/ftm-audit/references/protocols/WIRING-CONTRACTS.md +135 -135
package/ftm-audit/references/strategies/AUTO-FIX-STRATEGIES.md +69 -69
package/ftm-audit/references/templates/REPORT-FORMAT.md +96 -96
package/ftm-audit/scripts/run-knip.sh +23 -23
package/ftm-audit.yml +2 -2
package/ftm-brainstorm/SKILL.md +498 -498
package/ftm-brainstorm/evals/evals.json +100 -100
package/ftm-brainstorm/evals/promptfoo.yaml +109 -109
package/ftm-brainstorm/references/agent-prompts.md +224 -224
package/ftm-brainstorm/references/plan-template.md +121 -121
package/ftm-brainstorm.yml +2 -2
package/ftm-browse/SKILL.md +454 -454
package/ftm-browse/daemon/browser-manager.ts +206 -206
package/ftm-browse/daemon/bun.lock +30 -30
package/ftm-browse/daemon/cli.ts +347 -347
package/ftm-browse/daemon/commands.ts +410 -410
package/ftm-browse/daemon/main.ts +357 -357
package/ftm-browse/daemon/package.json +17 -17
package/ftm-browse/daemon/server.ts +189 -189
package/ftm-browse/daemon/snapshot.ts +519 -519
package/ftm-browse/daemon/tsconfig.json +22 -22
package/ftm-browse.yml +4 -4
package/ftm-capture/SKILL.md +370 -370
package/ftm-capture.yml +4 -4
package/ftm-codex-gate/SKILL.md +361 -361
package/ftm-codex-gate.yml +2 -2
package/ftm-config/SKILL.md +345 -345
package/ftm-config.default.yml +82 -80
package/ftm-config.yml +2 -2
package/ftm-council/SKILL.md +416 -416
package/ftm-council/references/prompts/CLAUDE-INVESTIGATION.md +60 -60
package/ftm-council/references/prompts/CODEX-INVESTIGATION.md +58 -58
package/ftm-council/references/prompts/GEMINI-INVESTIGATION.md +58 -58
package/ftm-council/references/prompts/REBUTTAL-TEMPLATE.md +57 -57
package/ftm-council/references/protocols/PREREQUISITES.md +47 -47
package/ftm-council/references/protocols/STEP-0-FRAMING.md +46 -46
package/ftm-council.yml +2 -2
package/ftm-dashboard/SKILL.md +163 -163
package/ftm-dashboard.yml +4 -4
package/ftm-debug/SKILL.md +1037 -1037
package/ftm-debug/references/phases/PHASE-0-INTAKE.md +58 -58
package/ftm-debug/references/phases/PHASE-1-TRIAGE.md +46 -46
package/ftm-debug/references/phases/PHASE-2-WAR-ROOM-AGENTS.md +279 -279
package/ftm-debug/references/phases/PHASE-3-TO-6-EXECUTION.md +436 -436
package/ftm-debug/references/protocols/BLACKBOARD.md +86 -86
package/ftm-debug/references/protocols/EDGE-CASES.md +103 -103
package/ftm-debug.yml +2 -2
package/ftm-diagram/SKILL.md +277 -277
package/ftm-diagram.yml +2 -2
package/ftm-executor/SKILL.md +777 -767
package/ftm-executor/references/STYLE-TEMPLATE.md +73 -73
package/ftm-executor/references/phases/PHASE-0-VERIFICATION.md +62 -62
package/ftm-executor/references/phases/PHASE-2-AGENT-ASSEMBLY.md +34 -34
package/ftm-executor/references/phases/PHASE-3-WORKTREES.md +38 -38
package/ftm-executor/references/phases/PHASE-4-5-AUDIT.md +72 -72
package/ftm-executor/references/phases/PHASE-4-DISPATCH.md +66 -66
package/ftm-executor/references/phases/PHASE-5-5-CODEX-GATE.md +73 -73
package/ftm-executor/references/protocols/DOCUMENTATION-BOOTSTRAP.md +36 -36
package/ftm-executor/references/protocols/MODEL-PROFILE.md +59 -44
package/ftm-executor/references/protocols/PROGRESS-TRACKING.md +66 -66
package/ftm-executor/runtime/ftm-runtime.mjs +252 -252
package/ftm-executor/runtime/package.json +8 -8
package/ftm-executor.yml +2 -2
package/ftm-git/SKILL.md +441 -441
package/ftm-git/evals/evals.json +26 -26
package/ftm-git/evals/promptfoo.yaml +75 -75
package/ftm-git/hooks/post-commit-experience.sh +92 -92
package/ftm-git/references/patterns/SECRET-PATTERNS.md +104 -104
package/ftm-git/references/protocols/REMEDIATION.md +139 -139
package/ftm-git/scripts/pre-commit-secrets.sh +110 -110
package/ftm-git.yml +2 -2
package/ftm-inbox/backend/adapters/_retry.py +64 -64
package/ftm-inbox/backend/adapters/base.py +230 -230
package/ftm-inbox/backend/adapters/freshservice.py +104 -104
package/ftm-inbox/backend/adapters/gmail.py +125 -125
package/ftm-inbox/backend/adapters/jira.py +136 -136
package/ftm-inbox/backend/adapters/registry.py +192 -192
package/ftm-inbox/backend/adapters/slack.py +110 -110
package/ftm-inbox/backend/db/connection.py +54 -54
package/ftm-inbox/backend/db/schema.py +78 -78
package/ftm-inbox/backend/executor/__init__.py +7 -7
package/ftm-inbox/backend/executor/engine.py +149 -149
package/ftm-inbox/backend/executor/step_runner.py +98 -98
package/ftm-inbox/backend/main.py +103 -103
package/ftm-inbox/backend/models/__init__.py +1 -1
package/ftm-inbox/backend/models/unified_task.py +36 -36
package/ftm-inbox/backend/planner/__init__.py +6 -6
package/ftm-inbox/backend/planner/generator.py +127 -127
package/ftm-inbox/backend/planner/schema.py +34 -34
package/ftm-inbox/backend/requirements.txt +5 -5
package/ftm-inbox/backend/routes/execute.py +186 -186
package/ftm-inbox/backend/routes/health.py +52 -52
package/ftm-inbox/backend/routes/inbox.py +68 -68
package/ftm-inbox/backend/routes/plan.py +271 -271
package/ftm-inbox/bin/launchagent.mjs +91 -91
package/ftm-inbox/bin/setup.mjs +188 -188
package/ftm-inbox/bin/start.sh +10 -10
package/ftm-inbox/bin/status.sh +17 -17
package/ftm-inbox/bin/stop.sh +8 -8
package/ftm-inbox/config.example.yml +55 -55
package/ftm-inbox/package-lock.json +2898 -2898
package/ftm-inbox/package.json +26 -26
package/ftm-inbox/postcss.config.js +6 -6
package/ftm-inbox/src/app.css +199 -199
package/ftm-inbox/src/app.html +18 -18
package/ftm-inbox/src/lib/api.ts +166 -166
package/ftm-inbox/src/lib/components/ExecutionLog.svelte +81 -81
package/ftm-inbox/src/lib/components/InboxFeed.svelte +143 -143
package/ftm-inbox/src/lib/components/PlanStep.svelte +271 -271
package/ftm-inbox/src/lib/components/PlanView.svelte +206 -206
package/ftm-inbox/src/lib/components/StreamPanel.svelte +99 -99
package/ftm-inbox/src/lib/components/TaskCard.svelte +190 -190
package/ftm-inbox/src/lib/components/ui/EmptyState.svelte +63 -63
package/ftm-inbox/src/lib/components/ui/KawaiiCard.svelte +86 -86
package/ftm-inbox/src/lib/components/ui/PillButton.svelte +106 -106
package/ftm-inbox/src/lib/components/ui/StatusBadge.svelte +67 -67
package/ftm-inbox/src/lib/components/ui/StreamDrawer.svelte +149 -149
package/ftm-inbox/src/lib/components/ui/ThemeToggle.svelte +80 -80
package/ftm-inbox/src/lib/theme.ts +47 -47
package/ftm-inbox/src/routes/+layout.svelte +76 -76
package/ftm-inbox/src/routes/+page.svelte +401 -401
package/ftm-inbox/svelte.config.js +12 -12
package/ftm-inbox/tailwind.config.ts +63 -63
package/ftm-inbox/tsconfig.json +13 -13
package/ftm-inbox/vite.config.ts +6 -6
package/ftm-intent/SKILL.md +241 -241
package/ftm-intent.yml +2 -2
package/ftm-manifest.json +3794 -3794
package/ftm-map/SKILL.md +291 -291
package/ftm-map/scripts/db.py +712 -712
package/ftm-map/scripts/index.py +415 -415
package/ftm-map/scripts/parser.py +224 -224
package/ftm-map/scripts/queries/go-tags.scm +20 -20
package/ftm-map/scripts/queries/javascript-tags.scm +35 -35
package/ftm-map/scripts/queries/python-tags.scm +31 -31
package/ftm-map/scripts/queries/ruby-tags.scm +19 -19
package/ftm-map/scripts/queries/rust-tags.scm +37 -37
package/ftm-map/scripts/queries/typescript-tags.scm +41 -41
package/ftm-map/scripts/query.py +301 -301
package/ftm-map/scripts/ranker.py +377 -377
package/ftm-map/scripts/requirements.txt +5 -5
package/ftm-map/scripts/setup-hooks.sh +27 -27
package/ftm-map/scripts/setup.sh +56 -56
package/ftm-map/scripts/test_db.py +364 -364
package/ftm-map/scripts/test_parser.py +174 -174
package/ftm-map/scripts/test_query.py +183 -183
package/ftm-map/scripts/test_ranker.py +199 -199
package/ftm-map/scripts/views.py +591 -591
package/ftm-map.yml +2 -2
package/ftm-mind/SKILL.md +1943 -1943
package/ftm-mind/evals/promptfoo.yaml +142 -142
package/ftm-mind/references/blackboard-schema.md +328 -328
package/ftm-mind/references/complexity-guide.md +110 -110
package/ftm-mind/references/event-registry.md +319 -319
package/ftm-mind/references/mcp-inventory.md +296 -296
package/ftm-mind/references/protocols/COMPLEXITY-SIZING.md +72 -72
package/ftm-mind/references/protocols/MCP-HEURISTICS.md +32 -32
package/ftm-mind/references/protocols/PLAN-APPROVAL.md +80 -80
package/ftm-mind/references/reflexion-protocol.md +249 -249
package/ftm-mind/references/routing/SCENARIOS.md +22 -22
package/ftm-mind/references/routing-scenarios.md +35 -35
package/ftm-mind.yml +2 -2
package/ftm-pause/SKILL.md +395 -395
package/ftm-pause/references/protocols/SKILL-RESTORE-PROTOCOLS.md +186 -186
package/ftm-pause/references/protocols/VALIDATION.md +80 -80
package/ftm-pause.yml +2 -2
package/ftm-researcher/SKILL.md +275 -275
package/ftm-researcher/evals/agent-diversity.yaml +17 -17
package/ftm-researcher/evals/synthesis-quality.yaml +12 -12
package/ftm-researcher/evals/trigger-accuracy.yaml +39 -39
package/ftm-researcher/references/adaptive-search.md +116 -116
package/ftm-researcher/references/agent-prompts.md +193 -193
package/ftm-researcher/references/council-integration.md +193 -193
package/ftm-researcher/references/output-format.md +203 -203
package/ftm-researcher/references/synthesis-pipeline.md +165 -165
package/ftm-researcher/scripts/score_credibility.py +234 -234
package/ftm-researcher/scripts/validate_research.py +92 -92
package/ftm-researcher.yml +2 -2
package/ftm-resume/SKILL.md +518 -518
package/ftm-resume/references/protocols/VALIDATION.md +172 -172
package/ftm-resume.yml +2 -2
package/ftm-retro/SKILL.md +380 -380
package/ftm-retro/references/protocols/SCORING-RUBRICS.md +89 -89
package/ftm-retro/references/templates/REPORT-FORMAT.md +109 -109
package/ftm-retro.yml +2 -2
package/ftm-routine/SKILL.md +170 -170
package/ftm-routine.yml +4 -4
package/ftm-state/blackboard/capabilities.json +5 -5
package/ftm-state/blackboard/capabilities.schema.json +27 -27
package/ftm-state/blackboard/context.json +23 -23
package/ftm-state/blackboard/experiences/index.json +9 -9
package/ftm-state/blackboard/patterns.json +6 -6
package/ftm-state/schemas/context.schema.json +130 -130
package/ftm-state/schemas/experience-index.schema.json +77 -77
package/ftm-state/schemas/experience.schema.json +78 -78
package/ftm-state/schemas/patterns.schema.json +44 -44
package/ftm-upgrade/SKILL.md +194 -194
package/ftm-upgrade/scripts/check-version.sh +76 -76
package/ftm-upgrade/scripts/upgrade.sh +143 -143
package/ftm-upgrade.yml +2 -2
package/ftm-verify.yml +2 -2
package/ftm.yml +2 -2
package/hooks/ftm-blackboard-enforcer.sh +93 -93
package/hooks/ftm-discovery-reminder.sh +90 -90
package/hooks/ftm-drafts-gate.sh +61 -61
package/hooks/ftm-event-logger.mjs +107 -107
package/hooks/ftm-map-autodetect.sh +79 -79
package/hooks/ftm-pending-sync-check.sh +22 -22
package/hooks/ftm-plan-gate.sh +92 -92
package/hooks/ftm-post-commit-trigger.sh +57 -57
package/hooks/settings-template.json +81 -81
package/install.sh +363 -363
package/package.json +84 -84
package/uninstall.sh +25 -25

package/ftm-brainstorm/evals/evals.json CHANGED

@@ -1,100 +1,100 @@
-{
-  "skill_name": "ftm-brainstorm",
-  "evals": [
-    {
-      "id": 0,
-      "name": "fresh-idea-intake",
-      "prompt": "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying.",
-      "expected_output": "Phase 0 repo scan launched in background, 1-2 intake questions, hard STOP",
-      "files": [],
-      "assertions": [
-        {"name": "one_or_two_questions", "description": "Asks 1-2 questions max, not a wall of questions"},
-        {"name": "no_research_sprint_turn1", "description": "Does NOT dispatch research agents on the very first turn — intake only"},
-        {"name": "hard_stop", "description": "Ends with a question and waits — does NOT proceed to synthesize or generate suggestions"},
-        {"name": "repo_scan_launched", "description": "Mentions or silently launches a repo/codebase scan agent in background"}
-      ]
-    },
-    {
-      "id": 1,
-      "name": "fresh-idea-turn2-research",
-      "prompt": "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability.",
-      "expected_output": "First research sprint dispatched (3 agents), synthesized suggestions with citations, challenge, 1-2 questions, STOP",
-      "files": [],
-      "multi_turn_context": "Turn 2. Turn 1: user said 'building study buddy app like Tinder for studying', skill asked intake questions, user now responds with details.",
-      "assertions": [
-        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents (web, github, competitive) — not fewer"},
-        {"name": "real_citations", "description": "At least 3 unique URLs to real repos/articles/products in the synthesis"},
-        {"name": "suggestions_with_evidence", "description": "Presents numbered suggestions (3-5) with real-world evidence backing each one"},
-        {"name": "recommendation_labeled", "description": "Suggestion #1 is labeled RECOMMENDED with rationale"},
-        {"name": "challenge_present", "description": "Includes at least one challenge/pushback after suggestions"},
-        {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions to drive next research sprint"},
-        {"name": "hard_stop", "description": "Does NOT continue past the questions — waits for user response"},
-        {"name": "depth_is_broad", "description": "Research queries are landscape-level (major approaches, who's done this) not implementation-specific"}
-      ]
-    },
-    {
-      "id": 2,
-      "name": "turn3-deeper-research",
-      "prompt": "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins.",
-      "expected_output": "Second research sprint (deeper, focused on RN+Firebase+matching), new suggestions building on prior, new challenges, new questions",
-      "files": [],
-      "multi_turn_context": "Turn 3. Prior turns: user described study buddy app, first research sprint found 5 approaches, user now picks one and raises two specific concerns.",
-      "assertions": [
-        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents again — every turn gets a research sprint"},
-        {"name": "research_is_deeper", "description": "Search queries target matching algorithms and cold-start specifically, NOT broad 'study buddy app' landscape again"},
-        {"name": "builds_on_prior", "description": "References prior turn's findings — does not re-present the same 5 approaches"},
-        {"name": "new_citations", "description": "At least 2 URLs not seen in prior turns — fresh research, not recycled"},
-        {"name": "addresses_user_concerns", "description": "Suggestions specifically address matching algorithm complexity AND cold-start problem"},
-        {"name": "challenge_present", "description": "Challenges the user on at least one assumption or pushes back on scope"},
-        {"name": "ends_with_question", "description": "Ends with 1-2 questions that unlock the NEXT research vector"},
-        {"name": "hard_stop", "description": "Does NOT continue past the questions"}
-      ]
-    },
-    {
-      "id": 3,
-      "name": "brain-dump-intake",
-      "prompt": "help me build all the suggestions in this chat: [brain dump about eng-buddy]",
-      "expected_output": "Path B structured extraction with repo context, confirmation gate, no research yet",
-      "files": ["brain-dump-input.md"],
-      "assertions": [
-        {"name": "decided_section", "description": "Contains a 'Decided' or 'Decisions already made' section"},
-        {"name": "open_questions_section", "description": "Contains an 'Open questions' or 'Gaps' section"},
-        {"name": "no_basic_questions", "description": "Does NOT ask basic 'what are you building?' questions already answered by the paste"},
-        {"name": "confirmation_gate", "description": "Ends with a confirmation question before proceeding to research"},
-        {"name": "no_research_sprint", "description": "Does NOT dispatch research agents or present suggestions on this turn"},
-        {"name": "hard_stop", "description": "Stops after asking for confirmation — does not proceed"}
-      ]
-    },
-    {
-      "id": 4,
-      "name": "brain-dump-turn2-research",
-      "prompt": "Yeah that looks right, go ahead and research it",
-      "expected_output": "First research sprint in brain dump mode: novelty map, suggestions with citations, challenges",
-      "files": ["brain-dump-input.md"],
-      "multi_turn_context": "Turn 2 of brain dump. Turn 1: user pasted eng-buddy brain dump, skill extracted structured summary, user now confirms.",
-      "assertions": [
-        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents searching for specific brain dump claims"},
-        {"name": "novelty_map_present", "description": "Contains a Novelty Map table with solved/partially solved/novel verdicts"},
-        {"name": "real_citations", "description": "At least 5 unique URLs to real repos/articles/products"},
-        {"name": "brain_dump_claims_researched", "description": "Agents searched for specific architectural claims from the dump, not just broad topic searches"},
-        {"name": "challenge_present", "description": "At least one challenge/pushback raised"},
-        {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions"},
-        {"name": "hard_stop", "description": "Does NOT proceed past questions"}
-      ]
-    },
-    {
-      "id": 5,
-      "name": "phase3-gate",
-      "prompt": "Ok I think I know what I want. Let's turn this into a plan.",
-      "expected_output": "Vision summary for approval, NOT the full plan yet",
-      "files": [],
-      "multi_turn_context": "Turn 6+ of brainstorming. Previous turns explored study-buddy app, settled on React Native + Firebase, matching algorithm, cold-start solution. User now wants to move to planning.",
-      "assertions": [
-        {"name": "vision_summary", "description": "Presents a brief 'here's what we've landed on' summary before generating the full plan"},
-        {"name": "approval_gate", "description": "Asks for explicit confirmation before proceeding to full plan generation"},
-        {"name": "does_not_dump_full_plan", "description": "Does NOT generate the entire task breakdown, agent assignments, and wave structure in this message"},
-        {"name": "references_plan_template", "description": "Reads or references references/plan-template.md for plan generation"}
-      ]
-    }
-  ]
-}
+{
+  "skill_name": "ftm-brainstorm",
+  "evals": [
+    {
+      "id": 0,
+      "name": "fresh-idea-intake",
+      "prompt": "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying.",
+      "expected_output": "Phase 0 repo scan launched in background, 1-2 intake questions, hard STOP",
+      "files": [],
+      "assertions": [
+        {"name": "one_or_two_questions", "description": "Asks 1-2 questions max, not a wall of questions"},
+        {"name": "no_research_sprint_turn1", "description": "Does NOT dispatch research agents on the very first turn — intake only"},
+        {"name": "hard_stop", "description": "Ends with a question and waits — does NOT proceed to synthesize or generate suggestions"},
+        {"name": "repo_scan_launched", "description": "Mentions or silently launches a repo/codebase scan agent in background"}
+      ]
+    },
+    {
+      "id": 1,
+      "name": "fresh-idea-turn2-research",
+      "prompt": "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability.",
+      "expected_output": "First research sprint dispatched (3 agents), synthesized suggestions with citations, challenge, 1-2 questions, STOP",
+      "files": [],
+      "multi_turn_context": "Turn 2. Turn 1: user said 'building study buddy app like Tinder for studying', skill asked intake questions, user now responds with details.",
+      "assertions": [
+        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents (web, github, competitive) — not fewer"},
+        {"name": "real_citations", "description": "At least 3 unique URLs to real repos/articles/products in the synthesis"},
+        {"name": "suggestions_with_evidence", "description": "Presents numbered suggestions (3-5) with real-world evidence backing each one"},
+        {"name": "recommendation_labeled", "description": "Suggestion #1 is labeled RECOMMENDED with rationale"},
+        {"name": "challenge_present", "description": "Includes at least one challenge/pushback after suggestions"},
+        {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions to drive next research sprint"},
+        {"name": "hard_stop", "description": "Does NOT continue past the questions — waits for user response"},
+        {"name": "depth_is_broad", "description": "Research queries are landscape-level (major approaches, who's done this) not implementation-specific"}
+      ]
+    },
+    {
+      "id": 2,
+      "name": "turn3-deeper-research",
+      "prompt": "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins.",
+      "expected_output": "Second research sprint (deeper, focused on RN+Firebase+matching), new suggestions building on prior, new challenges, new questions",
+      "files": [],
+      "multi_turn_context": "Turn 3. Prior turns: user described study buddy app, first research sprint found 5 approaches, user now picks one and raises two specific concerns.",
+      "assertions": [
+        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents again — every turn gets a research sprint"},
+        {"name": "research_is_deeper", "description": "Search queries target matching algorithms and cold-start specifically, NOT broad 'study buddy app' landscape again"},
+        {"name": "builds_on_prior", "description": "References prior turn's findings — does not re-present the same 5 approaches"},
+        {"name": "new_citations", "description": "At least 2 URLs not seen in prior turns — fresh research, not recycled"},
+        {"name": "addresses_user_concerns", "description": "Suggestions specifically address matching algorithm complexity AND cold-start problem"},
+        {"name": "challenge_present", "description": "Challenges the user on at least one assumption or pushes back on scope"},
+        {"name": "ends_with_question", "description": "Ends with 1-2 questions that unlock the NEXT research vector"},
+        {"name": "hard_stop", "description": "Does NOT continue past the questions"}
+      ]
+    },
+    {
+      "id": 3,
+      "name": "brain-dump-intake",
+      "prompt": "help me build all the suggestions in this chat: [brain dump about eng-buddy]",
+      "expected_output": "Path B structured extraction with repo context, confirmation gate, no research yet",
+      "files": ["brain-dump-input.md"],
+      "assertions": [
+        {"name": "decided_section", "description": "Contains a 'Decided' or 'Decisions already made' section"},
+        {"name": "open_questions_section", "description": "Contains an 'Open questions' or 'Gaps' section"},
+        {"name": "no_basic_questions", "description": "Does NOT ask basic 'what are you building?' questions already answered by the paste"},
+        {"name": "confirmation_gate", "description": "Ends with a confirmation question before proceeding to research"},
+        {"name": "no_research_sprint", "description": "Does NOT dispatch research agents or present suggestions on this turn"},
+        {"name": "hard_stop", "description": "Stops after asking for confirmation — does not proceed"}
+      ]
+    },
+    {
+      "id": 4,
+      "name": "brain-dump-turn2-research",
+      "prompt": "Yeah that looks right, go ahead and research it",
+      "expected_output": "First research sprint in brain dump mode: novelty map, suggestions with citations, challenges",
+      "files": ["brain-dump-input.md"],
+      "multi_turn_context": "Turn 2 of brain dump. Turn 1: user pasted eng-buddy brain dump, skill extracted structured summary, user now confirms.",
+      "assertions": [
+        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents searching for specific brain dump claims"},
+        {"name": "novelty_map_present", "description": "Contains a Novelty Map table with solved/partially solved/novel verdicts"},
+        {"name": "real_citations", "description": "At least 5 unique URLs to real repos/articles/products"},
+        {"name": "brain_dump_claims_researched", "description": "Agents searched for specific architectural claims from the dump, not just broad topic searches"},
+        {"name": "challenge_present", "description": "At least one challenge/pushback raised"},
+        {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions"},
+        {"name": "hard_stop", "description": "Does NOT proceed past questions"}
+      ]
+    },
+    {
+      "id": 5,
+      "name": "phase3-gate",
+      "prompt": "Ok I think I know what I want. Let's turn this into a plan.",
+      "expected_output": "Vision summary for approval, NOT the full plan yet",
+      "files": [],
+      "multi_turn_context": "Turn 6+ of brainstorming. Previous turns explored study-buddy app, settled on React Native + Firebase, matching algorithm, cold-start solution. User now wants to move to planning.",
+      "assertions": [
+        {"name": "vision_summary", "description": "Presents a brief 'here's what we've landed on' summary before generating the full plan"},
+        {"name": "approval_gate", "description": "Asks for explicit confirmation before proceeding to full plan generation"},
+        {"name": "does_not_dump_full_plan", "description": "Does NOT generate the entire task breakdown, agent assignments, and wave structure in this message"},
+        {"name": "references_plan_template", "description": "Reads or references references/plan-template.md for plan generation"}
+      ]
+    }
+  ]
+}

package/ftm-brainstorm/evals/promptfoo.yaml CHANGED

@@ -1,109 +1,109 @@
-description: "ftm-brainstorm behavior across multi-turn brainstorming sessions"
-prompts:
-  - "{{input}}"
-providers:
-  - id: "exec:claude --print"
-    label: "claude-code"
-defaultTest:
-  options:
-    transformVars: "vars"
-tests:
-  # Eval 0: fresh-idea-intake
-  - description: "fresh idea intake — asks 1-2 questions, no research sprint, hard stop"
-    vars:
-      input: "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying."
-    assert:
-      - type: regex
-        value: "\\?"
-        description: "Response must contain at least one question"
-      - type: not-contains
-        value: "sprint"
-        description: "Does not dispatch research sprint on first turn"
-      - type: not-contains
-        value: "dispatch"
-        description: "Does not dispatch agents on first turn"
-  # Eval 1: fresh-idea-turn2-research
-  - description: "turn 2 — dispatches research agents, citations, labeled recommendation, challenge, ends with question"
-    vars:
-      input: "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability."
-    assert:
-      - type: regex
-        value: "https?://"
-        description: "Response includes at least one URL citation"
-      - type: regex
-        value: "RECOMMENDED|recommended|#1"
-        description: "At least one suggestion is labeled as recommended"
-      - type: regex
-        value: "\\?"
-        description: "Ends with a question"
-  # Eval 2: turn3-deeper-research
-  - description: "turn 3 — deeper research on specific concerns, builds on prior, fresh citations, challenge"
-    vars:
-      input: "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins."
-    assert:
-      - type: regex
-        value: "matching|algorithm|cold.start"
-        description: "Response addresses the specific concerns raised"
-      - type: regex
-        value: "https?://"
-        description: "Response includes at least one URL citation"
-      - type: regex
-        value: "\\?"
-        description: "Ends with a question"
-  # Eval 3: brain-dump-intake
-  - description: "brain dump intake — extracts structure, confirmation gate, no research yet"
-    vars:
-      input: "help me build all the suggestions in this chat: [brain dump about eng-buddy]"
-    assert:
-      - type: regex
-        value: "Decided|decided|Decision|decision"
-        description: "Contains a 'Decided' section"
-      - type: regex
-        value: "open question|Open question|gap|Gap"
-        description: "Contains an open questions or gaps section"
-      - type: regex
-        value: "\\?"
-        description: "Ends with a confirmation question"
-      - type: not-contains
-        value: "dispatch"
-        description: "Does not dispatch research agents on intake turn"
-  # Eval 4: brain-dump-turn2-research
-  - description: "brain dump turn 2 — novelty map, research citations, challenge, question"
-    vars:
-      input: "Yeah that looks right, go ahead and research it"
-    assert:
-      - type: regex
-        value: "Novelty|novelty|novel|Novel"
-        description: "Contains a Novelty Map"
-      - type: regex
-        value: "https?://"
-        description: "Response includes at least one URL citation"
-      - type: regex
-        value: "\\?"
-        description: "Ends with a question"
-  # Eval 5: phase3-gate
-  - description: "phase 3 gate — vision summary and approval before generating full plan"
-    vars:
-      input: "Ok I think I know what I want. Let's turn this into a plan."
-    assert:
-      - type: regex
-        value: "summary|Summary|we've landed|landed on|here's what"
-        description: "Presents a vision summary before generating the full plan"
-      - type: regex
-        value: "confirm|Confirm|proceed|Proceed|ready|Ready|go ahead|approve"
-        description: "Asks for confirmation before proceeding to full plan"
-      - type: not-contains
-        value: "Wave 1"
-        description: "Does not dump the full plan structure prematurely"
-      - type: not-contains
-        value: "Wave 2"
-        description: "Does not dump the full plan structure prematurely"
+description: "ftm-brainstorm behavior across multi-turn brainstorming sessions"
+prompts:
+  - "{{input}}"
+providers:
+  - id: "exec:claude --print"
+    label: "claude-code"
+defaultTest:
+  options:
+    transformVars: "vars"
+tests:
+  # Eval 0: fresh-idea-intake
+  - description: "fresh idea intake — asks 1-2 questions, no research sprint, hard stop"
+    vars:
+      input: "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying."
+    assert:
+      - type: regex
+        value: "\\?"
+        description: "Response must contain at least one question"
+      - type: not-contains
+        value: "sprint"
+        description: "Does not dispatch research sprint on first turn"
+      - type: not-contains
+        value: "dispatch"
+        description: "Does not dispatch agents on first turn"
+  # Eval 1: fresh-idea-turn2-research
+  - description: "turn 2 — dispatches research agents, citations, labeled recommendation, challenge, ends with question"
+    vars:
+      input: "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability."
+    assert:
+      - type: regex
+        value: "https?://"
+        description: "Response includes at least one URL citation"
+      - type: regex
+        value: "RECOMMENDED|recommended|#1"
+        description: "At least one suggestion is labeled as recommended"
+      - type: regex
+        value: "\\?"
+        description: "Ends with a question"
+  # Eval 2: turn3-deeper-research
+  - description: "turn 3 — deeper research on specific concerns, builds on prior, fresh citations, challenge"
+    vars:
+      input: "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins."
+    assert:
+      - type: regex
+        value: "matching|algorithm|cold.start"
+        description: "Response addresses the specific concerns raised"
+      - type: regex
+        value: "https?://"
+        description: "Response includes at least one URL citation"
+      - type: regex
+        value: "\\?"
+        description: "Ends with a question"
+  # Eval 3: brain-dump-intake
+  - description: "brain dump intake — extracts structure, confirmation gate, no research yet"
+    vars:
+      input: "help me build all the suggestions in this chat: [brain dump about eng-buddy]"
+    assert:
+      - type: regex
+        value: "Decided|decided|Decision|decision"
+        description: "Contains a 'Decided' section"
+      - type: regex
+        value: "open question|Open question|gap|Gap"
+        description: "Contains an open questions or gaps section"
+      - type: regex
+        value: "\\?"
+        description: "Ends with a confirmation question"
+      - type: not-contains
+        value: "dispatch"
+        description: "Does not dispatch research agents on intake turn"
+  # Eval 4: brain-dump-turn2-research
+  - description: "brain dump turn 2 — novelty map, research citations, challenge, question"
+    vars:
+      input: "Yeah that looks right, go ahead and research it"
+    assert:
+      - type: regex
+        value: "Novelty|novelty|novel|Novel"
+        description: "Contains a Novelty Map"
+      - type: regex
+        value: "https?://"
+        description: "Response includes at least one URL citation"
+      - type: regex
+        value: "\\?"
+        description: "Ends with a question"
+  # Eval 5: phase3-gate
+  - description: "phase 3 gate — vision summary and approval before generating full plan"
+    vars:
+      input: "Ok I think I know what I want. Let's turn this into a plan."
+    assert:
+      - type: regex
+        value: "summary|Summary|we've landed|landed on|here's what"
+        description: "Presents a vision summary before generating the full plan"
+      - type: regex
+        value: "confirm|Confirm|proceed|Proceed|ready|Ready|go ahead|approve"
+        description: "Asks for confirmation before proceeding to full plan"
+      - type: not-contains
+        value: "Wave 1"
+        description: "Does not dump the full plan structure prematurely"
+      - type: not-contains
+        value: "Wave 2"
+        description: "Does not dump the full plan structure prematurely"