npm - feed-the-machine - Versions diffs - 1.6.0 → 1.7.0 - Mend

feed-the-machine 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (269) hide show

package/LICENSE +21 -21
package/README.md +170 -170
package/bin/brain.py +1340 -0
package/bin/convert_claude_skills_to_codex.py +490 -0
package/bin/generate-manifest.mjs +463 -463
package/bin/harden_codex_skills.py +141 -0
package/bin/install.mjs +491 -491
package/bin/migrate-eng-buddy-data.py +875 -0
package/bin/playbook_engine/__init__.py +1 -0
package/bin/playbook_engine/conftest.py +8 -0
package/bin/playbook_engine/extractor.py +33 -0
package/bin/playbook_engine/manager.py +102 -0
package/bin/playbook_engine/models.py +84 -0
package/bin/playbook_engine/registry.py +35 -0
package/bin/playbook_engine/test_extractor.py +72 -0
package/bin/playbook_engine/test_integration.py +129 -0
package/bin/playbook_engine/test_manager.py +85 -0
package/bin/playbook_engine/test_models.py +166 -0
package/bin/playbook_engine/test_registry.py +67 -0
package/bin/playbook_engine/test_tracer.py +86 -0
package/bin/playbook_engine/tracer.py +93 -0
package/bin/tasks_db.py +456 -0
package/docs/HOOKS.md +243 -243
package/docs/INBOX.md +233 -233
package/ftm/SKILL.md +125 -122
package/ftm-audit/SKILL.md +623 -623
package/ftm-audit/references/protocols/PROJECT-PATTERNS.md +91 -91
package/ftm-audit/references/protocols/RUNTIME-WIRING.md +66 -66
package/ftm-audit/references/protocols/WIRING-CONTRACTS.md +135 -135
package/ftm-audit/references/strategies/AUTO-FIX-STRATEGIES.md +69 -69
package/ftm-audit/references/templates/REPORT-FORMAT.md +96 -96
package/ftm-audit/scripts/run-knip.sh +23 -23
package/ftm-audit.yml +2 -2
package/ftm-brainstorm/SKILL.md +1003 -498
package/ftm-brainstorm/evals/evals.json +180 -100
package/ftm-brainstorm/evals/promptfoo.yaml +109 -109
package/ftm-brainstorm/references/agent-prompts.md +552 -224
package/ftm-brainstorm/references/plan-template.md +209 -121
package/ftm-brainstorm.yml +2 -2
package/ftm-browse/SKILL.md +454 -454
package/ftm-browse/daemon/browser-manager.ts +206 -206
package/ftm-browse/daemon/bun.lock +30 -30
package/ftm-browse/daemon/cli.ts +347 -347
package/ftm-browse/daemon/commands.ts +410 -410
package/ftm-browse/daemon/main.ts +357 -357
package/ftm-browse/daemon/package.json +17 -17
package/ftm-browse/daemon/server.ts +189 -189
package/ftm-browse/daemon/snapshot.ts +519 -519
package/ftm-browse/daemon/tsconfig.json +22 -22
package/ftm-browse.yml +4 -4
package/ftm-capture/SKILL.md +370 -370
package/ftm-capture.yml +4 -4
package/ftm-codex-gate/SKILL.md +361 -361
package/ftm-codex-gate.yml +2 -2
package/ftm-config/SKILL.md +422 -345
package/ftm-config.default.yml +125 -82
package/ftm-config.yml +44 -2
package/ftm-council/SKILL.md +416 -416
package/ftm-council/references/prompts/CLAUDE-INVESTIGATION.md +60 -60
package/ftm-council/references/prompts/CODEX-INVESTIGATION.md +58 -58
package/ftm-council/references/prompts/GEMINI-INVESTIGATION.md +58 -58
package/ftm-council/references/prompts/REBUTTAL-TEMPLATE.md +57 -57
package/ftm-council/references/protocols/PREREQUISITES.md +47 -47
package/ftm-council/references/protocols/STEP-0-FRAMING.md +46 -46
package/ftm-council.yml +2 -2
package/ftm-dashboard/SKILL.md +163 -163
package/ftm-dashboard.yml +4 -4
package/ftm-debug/SKILL.md +1037 -1037
package/ftm-debug/references/phases/PHASE-0-INTAKE.md +58 -58
package/ftm-debug/references/phases/PHASE-1-TRIAGE.md +46 -46
package/ftm-debug/references/phases/PHASE-2-WAR-ROOM-AGENTS.md +279 -279
package/ftm-debug/references/phases/PHASE-3-TO-6-EXECUTION.md +436 -436
package/ftm-debug/references/protocols/BLACKBOARD.md +86 -86
package/ftm-debug/references/protocols/EDGE-CASES.md +103 -103
package/ftm-debug.yml +2 -2
package/ftm-diagram/SKILL.md +277 -277
package/ftm-diagram.yml +2 -2
package/ftm-executor/SKILL.md +777 -777
package/ftm-executor/references/STYLE-TEMPLATE.md +73 -73
package/ftm-executor/references/phases/PHASE-0-VERIFICATION.md +62 -62
package/ftm-executor/references/phases/PHASE-2-AGENT-ASSEMBLY.md +34 -34
package/ftm-executor/references/phases/PHASE-3-WORKTREES.md +38 -38
package/ftm-executor/references/phases/PHASE-4-5-AUDIT.md +72 -72
package/ftm-executor/references/phases/PHASE-4-DISPATCH.md +66 -66
package/ftm-executor/references/phases/PHASE-5-5-CODEX-GATE.md +73 -73
package/ftm-executor/references/protocols/DOCUMENTATION-BOOTSTRAP.md +36 -36
package/ftm-executor/references/protocols/MODEL-PROFILE.md +59 -59
package/ftm-executor/references/protocols/PROGRESS-TRACKING.md +66 -66
package/ftm-executor/runtime/ftm-runtime.mjs +252 -252
package/ftm-executor/runtime/package.json +8 -8
package/ftm-executor.yml +2 -2
package/ftm-git/SKILL.md +441 -441
package/ftm-git/evals/evals.json +26 -26
package/ftm-git/evals/promptfoo.yaml +75 -75
package/ftm-git/hooks/post-commit-experience.sh +92 -92
package/ftm-git/references/patterns/SECRET-PATTERNS.md +104 -104
package/ftm-git/references/protocols/REMEDIATION.md +139 -139
package/ftm-git/scripts/pre-commit-secrets.sh +110 -110
package/ftm-git.yml +2 -2
package/ftm-inbox/backend/__pycache__/main.cpython-314.pyc +0 -0
package/ftm-inbox/backend/adapters/_retry.py +64 -64
package/ftm-inbox/backend/adapters/base.py +230 -230
package/ftm-inbox/backend/adapters/freshservice.py +104 -104
package/ftm-inbox/backend/adapters/gmail.py +125 -125
package/ftm-inbox/backend/adapters/jira.py +136 -136
package/ftm-inbox/backend/adapters/registry.py +192 -192
package/ftm-inbox/backend/adapters/slack.py +110 -110
package/ftm-inbox/backend/db/connection.py +54 -54
package/ftm-inbox/backend/db/schema.py +78 -78
package/ftm-inbox/backend/executor/__init__.py +7 -7
package/ftm-inbox/backend/executor/engine.py +149 -149
package/ftm-inbox/backend/executor/step_runner.py +98 -98
package/ftm-inbox/backend/main.py +103 -103
package/ftm-inbox/backend/models/__init__.py +1 -1
package/ftm-inbox/backend/models/unified_task.py +36 -36
package/ftm-inbox/backend/planner/__init__.py +6 -6
package/ftm-inbox/backend/planner/__pycache__/__init__.cpython-314.pyc +0 -0
package/ftm-inbox/backend/planner/__pycache__/generator.cpython-314.pyc +0 -0
package/ftm-inbox/backend/planner/__pycache__/schema.cpython-314.pyc +0 -0
package/ftm-inbox/backend/planner/generator.py +127 -127
package/ftm-inbox/backend/planner/schema.py +34 -34
package/ftm-inbox/backend/requirements.txt +5 -5
package/ftm-inbox/backend/routes/__pycache__/plan.cpython-314.pyc +0 -0
package/ftm-inbox/backend/routes/execute.py +186 -186
package/ftm-inbox/backend/routes/health.py +52 -52
package/ftm-inbox/backend/routes/inbox.py +68 -68
package/ftm-inbox/backend/routes/plan.py +271 -271
package/ftm-inbox/bin/launchagent.mjs +91 -91
package/ftm-inbox/bin/setup.mjs +188 -188
package/ftm-inbox/bin/start.sh +10 -10
package/ftm-inbox/bin/status.sh +17 -17
package/ftm-inbox/bin/stop.sh +8 -8
package/ftm-inbox/config.example.yml +55 -55
package/ftm-inbox/package-lock.json +2898 -2898
package/ftm-inbox/package.json +26 -26
package/ftm-inbox/postcss.config.js +6 -6
package/ftm-inbox/src/app.css +199 -199
package/ftm-inbox/src/app.html +18 -18
package/ftm-inbox/src/lib/api.ts +166 -166
package/ftm-inbox/src/lib/components/ExecutionLog.svelte +81 -81
package/ftm-inbox/src/lib/components/InboxFeed.svelte +143 -143
package/ftm-inbox/src/lib/components/PlanStep.svelte +271 -271
package/ftm-inbox/src/lib/components/PlanView.svelte +206 -206
package/ftm-inbox/src/lib/components/StreamPanel.svelte +99 -99
package/ftm-inbox/src/lib/components/TaskCard.svelte +190 -190
package/ftm-inbox/src/lib/components/ui/EmptyState.svelte +63 -63
package/ftm-inbox/src/lib/components/ui/KawaiiCard.svelte +86 -86
package/ftm-inbox/src/lib/components/ui/PillButton.svelte +106 -106
package/ftm-inbox/src/lib/components/ui/StatusBadge.svelte +67 -67
package/ftm-inbox/src/lib/components/ui/StreamDrawer.svelte +149 -149
package/ftm-inbox/src/lib/components/ui/ThemeToggle.svelte +80 -80
package/ftm-inbox/src/lib/theme.ts +47 -47
package/ftm-inbox/src/routes/+layout.svelte +76 -76
package/ftm-inbox/src/routes/+page.svelte +401 -401
package/ftm-inbox/svelte.config.js +12 -12
package/ftm-inbox/tailwind.config.ts +63 -63
package/ftm-inbox/tsconfig.json +13 -13
package/ftm-inbox/vite.config.ts +6 -6
package/ftm-intent/SKILL.md +241 -241
package/ftm-intent.yml +2 -2
package/ftm-manifest.json +3794 -3794
package/ftm-map/SKILL.md +291 -291
package/ftm-map/scripts/db.py +712 -712
package/ftm-map/scripts/index.py +415 -415
package/ftm-map/scripts/parser.py +224 -224
package/ftm-map/scripts/queries/go-tags.scm +20 -20
package/ftm-map/scripts/queries/javascript-tags.scm +35 -35
package/ftm-map/scripts/queries/python-tags.scm +31 -31
package/ftm-map/scripts/queries/ruby-tags.scm +19 -19
package/ftm-map/scripts/queries/rust-tags.scm +37 -37
package/ftm-map/scripts/queries/typescript-tags.scm +41 -41
package/ftm-map/scripts/query.py +301 -301
package/ftm-map/scripts/ranker.py +377 -377
package/ftm-map/scripts/requirements.txt +5 -5
package/ftm-map/scripts/setup-hooks.sh +27 -27
package/ftm-map/scripts/setup.sh +56 -56
package/ftm-map/scripts/test_db.py +364 -364
package/ftm-map/scripts/test_parser.py +174 -174
package/ftm-map/scripts/test_query.py +183 -183
package/ftm-map/scripts/test_ranker.py +199 -199
package/ftm-map/scripts/views.py +591 -591
package/ftm-map.yml +2 -2
package/ftm-mind/SKILL.md +201 -1943
package/ftm-mind/evals/promptfoo.yaml +142 -142
package/ftm-mind/references/blackboard-protocol.md +110 -0
package/ftm-mind/references/blackboard-schema.md +328 -328
package/ftm-mind/references/complexity-guide.md +110 -110
package/ftm-mind/references/complexity-sizing.md +138 -0
package/ftm-mind/references/decide-act-protocol.md +172 -0
package/ftm-mind/references/direct-execution.md +51 -0
package/ftm-mind/references/environment-discovery.md +77 -0
package/ftm-mind/references/event-registry.md +319 -319
package/ftm-mind/references/mcp-inventory.md +300 -296
package/ftm-mind/references/ops-routing.md +47 -0
package/ftm-mind/references/orient-protocol.md +234 -0
package/ftm-mind/references/personality.md +40 -0
package/ftm-mind/references/protocols/COMPLEXITY-SIZING.md +72 -72
package/ftm-mind/references/protocols/MCP-HEURISTICS.md +32 -32
package/ftm-mind/references/protocols/PLAN-APPROVAL.md +80 -80
package/ftm-mind/references/reflexion-protocol.md +249 -249
package/ftm-mind/references/routing/SCENARIOS.md +22 -22
package/ftm-mind/references/routing-scenarios.md +35 -35
package/ftm-mind.yml +2 -2
package/ftm-ops.yml +4 -0
package/ftm-pause/SKILL.md +395 -395
package/ftm-pause/references/protocols/SKILL-RESTORE-PROTOCOLS.md +186 -186
package/ftm-pause/references/protocols/VALIDATION.md +80 -80
package/ftm-pause.yml +2 -2
package/ftm-researcher/SKILL.md +275 -275
package/ftm-researcher/evals/agent-diversity.yaml +17 -17
package/ftm-researcher/evals/synthesis-quality.yaml +12 -12
package/ftm-researcher/evals/trigger-accuracy.yaml +39 -39
package/ftm-researcher/references/adaptive-search.md +116 -116
package/ftm-researcher/references/agent-prompts.md +193 -193
package/ftm-researcher/references/council-integration.md +193 -193
package/ftm-researcher/references/output-format.md +203 -203
package/ftm-researcher/references/synthesis-pipeline.md +165 -165
package/ftm-researcher/scripts/score_credibility.py +234 -234
package/ftm-researcher/scripts/validate_research.py +92 -92
package/ftm-researcher.yml +2 -2
package/ftm-resume/SKILL.md +518 -518
package/ftm-resume/references/protocols/VALIDATION.md +172 -172
package/ftm-resume.yml +2 -2
package/ftm-retro/SKILL.md +380 -380
package/ftm-retro/references/protocols/SCORING-RUBRICS.md +89 -89
package/ftm-retro/references/templates/REPORT-FORMAT.md +109 -109
package/ftm-retro.yml +2 -2
package/ftm-routine/SKILL.md +170 -170
package/ftm-routine.yml +4 -4
package/ftm-state/blackboard/capabilities.json +5 -5
package/ftm-state/blackboard/capabilities.schema.json +27 -27
package/ftm-state/blackboard/context.json +37 -23
package/ftm-state/blackboard/experiences/doom-statusline-fix.json +26 -0
package/ftm-state/blackboard/experiences/hackathon-pages-site.json +26 -0
package/ftm-state/blackboard/experiences/hindsight-sso-kickoff.json +42 -0
package/ftm-state/blackboard/experiences/index.json +58 -9
package/ftm-state/blackboard/experiences/learning-ragnarok-api-access.json +23 -0
package/ftm-state/blackboard/experiences/nordlayer-members-auto-assign.json +26 -0
package/ftm-state/blackboard/experiences/saml2aws-stale-session-fix.json +41 -0
package/ftm-state/blackboard/patterns.json +6 -6
package/ftm-state/schemas/context.schema.json +130 -130
package/ftm-state/schemas/experience-index.schema.json +77 -77
package/ftm-state/schemas/experience.schema.json +78 -78
package/ftm-state/schemas/patterns.schema.json +44 -44
package/ftm-upgrade/SKILL.md +194 -194
package/ftm-upgrade/scripts/check-version.sh +76 -76
package/ftm-upgrade/scripts/upgrade.sh +143 -143
package/ftm-upgrade.yml +2 -2
package/ftm-verify.yml +2 -2
package/ftm.yml +2 -2
package/hooks/ftm-auto-log.sh +137 -0
package/hooks/ftm-blackboard-enforcer.sh +93 -93
package/hooks/ftm-discovery-reminder.sh +90 -90
package/hooks/ftm-drafts-gate.sh +61 -61
package/hooks/ftm-event-logger.mjs +107 -107
package/hooks/ftm-install-hooks.sh +240 -0
package/hooks/ftm-learning-capture.sh +117 -0
package/hooks/ftm-map-autodetect.sh +79 -79
package/hooks/ftm-pending-sync-check.sh +22 -22
package/hooks/ftm-plan-gate.sh +92 -92
package/hooks/ftm-post-commit-trigger.sh +57 -57
package/hooks/ftm-post-compaction.sh +138 -0
package/hooks/ftm-pre-compaction.sh +147 -0
package/hooks/ftm-session-end.sh +52 -0
package/hooks/ftm-session-snapshot.sh +213 -0
package/hooks/settings-template.json +81 -81
package/install.sh +363 -363
package/package.json +84 -84
package/uninstall.sh +25 -25

package/ftm-brainstorm/evals/evals.json CHANGED Viewed

@@ -1,100 +1,180 @@
-{
-  "skill_name": "ftm-brainstorm",
-  "evals": [
-    {
-      "id": 0,
-      "name": "fresh-idea-intake",
-      "prompt": "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying.",
-      "expected_output": "Phase 0 repo scan launched in background, 1-2 intake questions, hard STOP",
-      "files": [],
-      "assertions": [
-        {"name": "one_or_two_questions", "description": "Asks 1-2 questions max, not a wall of questions"},
-        {"name": "no_research_sprint_turn1", "description": "Does NOT dispatch research agents on the very first turn — intake only"},
-        {"name": "hard_stop", "description": "Ends with a question and waits — does NOT proceed to synthesize or generate suggestions"},
-        {"name": "repo_scan_launched", "description": "Mentions or silently launches a repo/codebase scan agent in background"}
-      ]
-    },
-    {
-      "id": 1,
-      "name": "fresh-idea-turn2-research",
-      "prompt": "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability.",
-      "expected_output": "First research sprint dispatched (3 agents), synthesized suggestions with citations, challenge, 1-2 questions, STOP",
-      "files": [],
-      "multi_turn_context": "Turn 2. Turn 1: user said 'building study buddy app like Tinder for studying', skill asked intake questions, user now responds with details.",
-      "assertions": [
-        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents (web, github, competitive) — not fewer"},
-        {"name": "real_citations", "description": "At least 3 unique URLs to real repos/articles/products in the synthesis"},
-        {"name": "suggestions_with_evidence", "description": "Presents numbered suggestions (3-5) with real-world evidence backing each one"},
-        {"name": "recommendation_labeled", "description": "Suggestion #1 is labeled RECOMMENDED with rationale"},
-        {"name": "challenge_present", "description": "Includes at least one challenge/pushback after suggestions"},
-        {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions to drive next research sprint"},
-        {"name": "hard_stop", "description": "Does NOT continue past the questions — waits for user response"},
-        {"name": "depth_is_broad", "description": "Research queries are landscape-level (major approaches, who's done this) not implementation-specific"}
-      ]
-    },
-    {
-      "id": 2,
-      "name": "turn3-deeper-research",
-      "prompt": "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins.",
-      "expected_output": "Second research sprint (deeper, focused on RN+Firebase+matching), new suggestions building on prior, new challenges, new questions",
-      "files": [],
-      "multi_turn_context": "Turn 3. Prior turns: user described study buddy app, first research sprint found 5 approaches, user now picks one and raises two specific concerns.",
-      "assertions": [
-        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents again — every turn gets a research sprint"},
-        {"name": "research_is_deeper", "description": "Search queries target matching algorithms and cold-start specifically, NOT broad 'study buddy app' landscape again"},
-        {"name": "builds_on_prior", "description": "References prior turn's findings — does not re-present the same 5 approaches"},
-        {"name": "new_citations", "description": "At least 2 URLs not seen in prior turns — fresh research, not recycled"},
-        {"name": "addresses_user_concerns", "description": "Suggestions specifically address matching algorithm complexity AND cold-start problem"},
-        {"name": "challenge_present", "description": "Challenges the user on at least one assumption or pushes back on scope"},
-        {"name": "ends_with_question", "description": "Ends with 1-2 questions that unlock the NEXT research vector"},
-        {"name": "hard_stop", "description": "Does NOT continue past the questions"}
-      ]
-    },
-    {
-      "id": 3,
-      "name": "brain-dump-intake",
-      "prompt": "help me build all the suggestions in this chat: [brain dump about eng-buddy]",
-      "expected_output": "Path B structured extraction with repo context, confirmation gate, no research yet",
-      "files": ["brain-dump-input.md"],
-      "assertions": [
-        {"name": "decided_section", "description": "Contains a 'Decided' or 'Decisions already made' section"},
-        {"name": "open_questions_section", "description": "Contains an 'Open questions' or 'Gaps' section"},
-        {"name": "no_basic_questions", "description": "Does NOT ask basic 'what are you building?' questions already answered by the paste"},
-        {"name": "confirmation_gate", "description": "Ends with a confirmation question before proceeding to research"},
-        {"name": "no_research_sprint", "description": "Does NOT dispatch research agents or present suggestions on this turn"},
-        {"name": "hard_stop", "description": "Stops after asking for confirmation — does not proceed"}
-      ]
-    },
-    {
-      "id": 4,
-      "name": "brain-dump-turn2-research",
-      "prompt": "Yeah that looks right, go ahead and research it",
-      "expected_output": "First research sprint in brain dump mode: novelty map, suggestions with citations, challenges",
-      "files": ["brain-dump-input.md"],
-      "multi_turn_context": "Turn 2 of brain dump. Turn 1: user pasted eng-buddy brain dump, skill extracted structured summary, user now confirms.",
-      "assertions": [
-        {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents searching for specific brain dump claims"},
-        {"name": "novelty_map_present", "description": "Contains a Novelty Map table with solved/partially solved/novel verdicts"},
-        {"name": "real_citations", "description": "At least 5 unique URLs to real repos/articles/products"},
-        {"name": "brain_dump_claims_researched", "description": "Agents searched for specific architectural claims from the dump, not just broad topic searches"},
-        {"name": "challenge_present", "description": "At least one challenge/pushback raised"},
-        {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions"},
-        {"name": "hard_stop", "description": "Does NOT proceed past questions"}
-      ]
-    },
-    {
-      "id": 5,
-      "name": "phase3-gate",
-      "prompt": "Ok I think I know what I want. Let's turn this into a plan.",
-      "expected_output": "Vision summary for approval, NOT the full plan yet",
-      "files": [],
-      "multi_turn_context": "Turn 6+ of brainstorming. Previous turns explored study-buddy app, settled on React Native + Firebase, matching algorithm, cold-start solution. User now wants to move to planning.",
-      "assertions": [
-        {"name": "vision_summary", "description": "Presents a brief 'here's what we've landed on' summary before generating the full plan"},
-        {"name": "approval_gate", "description": "Asks for explicit confirmation before proceeding to full plan generation"},
-        {"name": "does_not_dump_full_plan", "description": "Does NOT generate the entire task breakdown, agent assignments, and wave structure in this message"},
-        {"name": "references_plan_template", "description": "Reads or references references/plan-template.md for plan generation"}
-      ]
-    }
-  ]
-}
+{
+  "skill_name": "ftm-brainstorm",
+  "evals": [
+    {
+      "id": 0,
+      "name": "fresh-idea-intake",
+      "prompt": "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying.",
+      "expected_output": "Phase 0 repo scan launched in background, 1-2 intake questions, hard STOP",
+      "files": [],
+      "assertions": [
+        {"name": "one_or_two_questions", "description": "Asks 1-2 questions max, not a wall of questions"},
+        {"name": "no_research_sprint_turn1", "description": "Does NOT dispatch research agents on the very first turn — intake only"},
+        {"name": "hard_stop", "description": "Ends with a question and waits — does NOT proceed to synthesize or generate suggestions"},
+        {"name": "repo_scan_launched", "description": "Mentions or silently launches a repo/codebase scan agent in background"},
+        {"name": "follows_energy", "description": "Asks about the most interesting/emphasized part of the idea, not a generic checklist question"},
+        {"name": "no_skill_questions", "description": "Does NOT ask about the user's technical experience or skill level"}
+      ]
+    },
+    {
+      "id": 1,
+      "name": "fresh-idea-turn2-research",
+      "prompt": "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability.",
+      "expected_output": "First research sprint dispatched (7 agents + synthesizer), synthesized suggestions with citations, challenge, 1-2 questions, STOP",
+      "files": [],
+      "multi_turn_context": "Turn 2. Turn 1: user said 'building study buddy app like Tinder for studying', skill asked intake questions, user now responds with details.",
+      "assertions": [
+        {"name": "seven_agents_dispatched", "description": "Dispatches 7 parallel research agents (web, github, competitive, stack, architecture, pitfall, UX) plus synthesizer — not fewer in standard mode"},
+        {"name": "real_citations", "description": "At least 3 unique URLs to real repos/articles/products in the synthesis"},
+        {"name": "suggestions_with_evidence", "description": "Presents numbered suggestions (3-5) with real-world evidence backing each one"},
+        {"name": "recommendation_labeled", "description": "Suggestion #1 is labeled RECOMMENDED with rationale"},
+        {"name": "challenge_present", "description": "Includes at least one challenge/pushback after suggestions — as a statement, NOT a question"},
+        {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions via AskUserQuestion to drive next research sprint"},
+        {"name": "hard_stop", "description": "Does NOT continue past the questions — waits for user response"},
+        {"name": "depth_is_broad", "description": "Research queries are landscape-level (major approaches, who's done this) not implementation-specific"},
+        {"name": "synthesizer_output", "description": "Shows consensus/contested/unique findings structure from synthesizer"}
+      ]
+    },
+    {
+      "id": 2,
+      "name": "turn3-deeper-research",
+      "prompt": "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins.",
+      "expected_output": "Second research sprint (deeper, focused on RN+Firebase+matching), new suggestions building on prior, new challenges, new questions",
+      "files": [],
+      "multi_turn_context": "Turn 3. Prior turns: user described study buddy app, first research sprint found 5 approaches, user now picks one and raises two specific concerns.",
+      "assertions": [
+        {"name": "seven_agents_dispatched", "description": "Dispatches 7 parallel research agents again — every turn gets a research sprint"},
+        {"name": "research_is_deeper", "description": "Search queries target matching algorithms and cold-start specifically, NOT broad 'study buddy app' landscape again"},
+        {"name": "builds_on_prior", "description": "References prior turn's findings — does not re-present the same 5 approaches"},
+        {"name": "new_citations", "description": "At least 2 URLs not seen in prior turns — fresh research, not recycled"},
+        {"name": "addresses_user_concerns", "description": "Suggestions specifically address matching algorithm complexity AND cold-start problem"},
+        {"name": "challenge_present", "description": "Challenges the user on at least one assumption or pushes back on scope"},
+        {"name": "ends_with_question", "description": "Ends with 1-2 questions that unlock the NEXT research vector"},
+        {"name": "hard_stop", "description": "Does NOT continue past the questions"},
+        {"name": "prior_decision_logged", "description": "Records 'React Native + Firebase chosen' in prior decisions log — does NOT re-ask about tech stack"}
+      ]
+    },
+    {
+      "id": 3,
+      "name": "brain-dump-intake",
+      "prompt": "help me build all the suggestions in this chat: [brain dump about eng-buddy]",
+      "expected_output": "Path B structured extraction with repo context, confirmation gate, no research yet",
+      "files": ["brain-dump-input.md"],
+      "assertions": [
+        {"name": "decided_section", "description": "Contains a 'Decided' or 'Decisions already made' section"},
+        {"name": "open_questions_section", "description": "Contains an 'Open questions' or 'Gaps' section"},
+        {"name": "no_basic_questions", "description": "Does NOT ask basic 'what are you building?' questions already answered by the paste"},
+        {"name": "confirmation_gate", "description": "Ends with a confirmation question before proceeding to research"},
+        {"name": "no_research_sprint", "description": "Does NOT dispatch research agents or present suggestions on this turn"},
+        {"name": "hard_stop", "description": "Stops after asking for confirmation — does not proceed"}
+      ]
+    },
+    {
+      "id": 4,
+      "name": "brain-dump-turn2-research",
+      "prompt": "Yeah that looks right, go ahead and research it",
+      "expected_output": "First research sprint in brain dump mode: novelty map, suggestions with citations, challenges",
+      "files": ["brain-dump-input.md"],
+      "multi_turn_context": "Turn 2 of brain dump. Turn 1: user pasted eng-buddy brain dump, skill extracted structured summary, user now confirms.",
+      "assertions": [
+        {"name": "seven_agents_dispatched", "description": "Dispatches 7 parallel research agents searching for specific brain dump claims"},
+        {"name": "novelty_map_present", "description": "Contains a Novelty Map table with solved/partially solved/novel verdicts"},
+        {"name": "real_citations", "description": "At least 5 unique URLs to real repos/articles/products"},
+        {"name": "brain_dump_claims_researched", "description": "Agents searched for specific architectural claims from the dump, not just broad topic searches"},
+        {"name": "challenge_present", "description": "At least one challenge/pushback raised as a statement"},
+        {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions via AskUserQuestion"},
+        {"name": "hard_stop", "description": "Does NOT proceed past questions"}
+      ]
+    },
+    {
+      "id": 5,
+      "name": "phase3-gate",
+      "prompt": "Ok I think I know what I want. Let's turn this into a plan.",
+      "expected_output": "Spec self-review + pre-mortem runs, vision summary with risk mitigations for approval, NOT the full plan yet",
+      "files": [],
+      "multi_turn_context": "Turn 6+ of brainstorming. Previous turns explored study-buddy app, settled on React Native + Firebase, matching algorithm, cold-start solution. User now wants to move to planning.",
+      "assertions": [
+        {"name": "spec_self_review", "description": "Runs spec self-review checking for placeholders, contradictions, scope gaps, ambiguity"},
+        {"name": "pre_mortem_runs", "description": "Runs pre-mortem stress test generating failure scenarios with mitigations"},
+        {"name": "vision_summary", "description": "Presents a brief 'here's what we've landed on' summary including key decisions AND top risks"},
+        {"name": "canonical_references", "description": "Includes canonical references section with links to key sources from research"},
+        {"name": "approval_gate", "description": "Asks for explicit confirmation before proceeding to full plan generation"},
+        {"name": "does_not_dump_full_plan", "description": "Does NOT generate the entire task breakdown, agent assignments, and wave structure in this message"},
+        {"name": "references_plan_template", "description": "Reads or references references/plan-template.md for plan generation"}
+      ]
+    },
+    {
+      "id": 6,
+      "name": "assumption-audit-activation",
+      "prompt": "I want to build a real-time collaborative whiteboard where multiple users can draw simultaneously. It needs to sync instantly across devices.",
+      "expected_output": "Intake question, and on turn 2-3 the assumption audit module should activate to challenge 'instantly' and 'real-time' assumptions",
+      "files": [],
+      "multi_turn_context": "Turn 2-3. User has described a collaborative whiteboard with real-time sync. The assumption audit should catch assumptions about 'instantly' (network latency exists), 'real-time' (what tolerance?), and synchronization approach.",
+      "assertions": [
+        {"name": "assumption_audit_activates", "description": "Runs the First Principles Assumption Audit module"},
+        {"name": "crackable_assumptions_found", "description": "Identifies at least one crackable assumption (e.g., 'instantly' is a design choice, not a requirement)"},
+        {"name": "five_levels_analyzed", "description": "Examines assumptions at surface, process, structural, cultural, and fundamental levels"},
+        {"name": "user_can_respond", "description": "Presents assumptions as keep/challenge/ignore choices via AskUserQuestion"}
+      ]
+    },
+    {
+      "id": 7,
+      "name": "discuss-mode-activation",
+      "prompt": "I know exactly what I want. Here's my spec: Build a REST API endpoint POST /api/v1/recordings that accepts multipart form data with an audio file (max 500MB), stores it in S3 with a UUID filename, creates a database record with metadata (duration, format, size, created_at), and returns the record ID with a signed URL for playback. Use Express.js with multer for uploads, pg for Postgres, and aws-sdk for S3.",
+      "expected_output": "Discuss mode activates: parses spec, identifies gray areas using SEE/CALL/RUN/READ/ORGANIZE heuristics, asks targeted questions",
+      "files": [],
+      "assertions": [
+        {"name": "discuss_mode_detected", "description": "Recognizes this is a detailed spec and activates Discuss Mode instead of standard brainstorm"},
+        {"name": "spec_parsed", "description": "Extracts key components: endpoint, storage, database, response format"},
+        {"name": "gray_areas_found", "description": "Identifies gray areas: auth, rate limiting, error handling for S3 failures, max concurrent uploads, file validation"},
+        {"name": "uses_heuristics", "description": "Applies SEE/CALL/RUN/READ/ORGANIZE categorization to find unknowns"},
+        {"name": "targeted_questions", "description": "Asks 3-5 specific questions about gray areas, NOT basic 'what are you building' questions"},
+        {"name": "no_broad_research", "description": "Does NOT launch a broad 7-agent landscape research sprint — goes straight to targeted analysis"}
+      ]
+    },
+    {
+      "id": 8,
+      "name": "scope-guardrail",
+      "prompt": "Oh actually, while we're at it, we should also add a social feed where users can post their study notes and other students can comment on them.",
+      "expected_output": "Acknowledges the idea, adds to Deferred Ideas, redirects to current scope",
+      "files": [],
+      "multi_turn_context": "Turn 4 of study buddy app brainstorm. Core scope is matching + scheduling. User now wants to add a social feed — this is scope creep.",
+      "assertions": [
+        {"name": "acknowledges_idea", "description": "Does NOT dismiss the idea — acknowledges it's worth building"},
+        {"name": "defers_to_future", "description": "Adds the social feed to a Deferred Ideas list"},
+        {"name": "redirects_to_scope", "description": "Steers back to the current core scope (matching + scheduling)"},
+        {"name": "does_not_incorporate", "description": "Does NOT start researching social feeds or adding it to the current plan"}
+      ]
+    },
+    {
+      "id": 9,
+      "name": "anti-rationalization-check",
+      "prompt": "This is pretty straightforward, just generate the plan already. We don't need more research.",
+      "expected_output": "Does NOT skip to plan generation. Checks anti-rationalization table, explains why process matters, continues the brainstorm loop.",
+      "files": [],
+      "multi_turn_context": "Turn 3 of a complex brainstorm. Only 2 research sprints completed. User is impatient but the skill has a hard gate requiring explicit readiness AND research saturation.",
+      "assertions": [
+        {"name": "does_not_skip", "description": "Does NOT immediately generate a plan — honors the hard gate"},
+        {"name": "explains_value", "description": "Briefly explains what remaining research could surface, without being preachy"},
+        {"name": "offers_quick_mode", "description": "Offers Quick Mode as a compromise if the user truly wants speed"},
+        {"name": "continues_loop", "description": "Runs at least one more research sprint to check for saturation"}
+      ]
+    },
+    {
+      "id": 10,
+      "name": "plan-quality-verification",
+      "prompt": "Looks good, save it.",
+      "expected_output": "Runs plan checker agent before saving. Fixes any issues found. Then saves.",
+      "files": [],
+      "multi_turn_context": "Phase 3, final section approved. Plan has been presented incrementally and user approved all 3 sections. Now saving.",
+      "assertions": [
+        {"name": "plan_checker_runs", "description": "Dispatches a plan checker agent to validate the plan before saving"},
+        {"name": "nyquist_validated", "description": "Every task in the plan has an automated verify command"},
+        {"name": "discovery_levels_tagged", "description": "Every task has a discovery level (L0-L3)"},
+        {"name": "canonical_references_present", "description": "Plan includes a canonical references section"},
+        {"name": "deferred_ideas_present", "description": "Plan includes deferred ideas from scope guardrail"},
+        {"name": "handoff_prompt", "description": "Provides /ftm-executor handoff command after saving"}
+      ]
+    }
+  ]
+}

package/ftm-brainstorm/evals/promptfoo.yaml CHANGED Viewed

@@ -1,109 +1,109 @@
-description: "ftm-brainstorm behavior across multi-turn brainstorming sessions"
-prompts:
-  - "{{input}}"
-providers:
-  - id: "exec:claude --print"
-    label: "claude-code"
-defaultTest:
-  options:
-    transformVars: "vars"
-tests:
-  # Eval 0: fresh-idea-intake
-  - description: "fresh idea intake — asks 1-2 questions, no research sprint, hard stop"
-    vars:
-      input: "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying."
-    assert:
-      - type: regex
-        value: "\\?"
-        description: "Response must contain at least one question"
-      - type: not-contains
-        value: "sprint"
-        description: "Does not dispatch research sprint on first turn"
-      - type: not-contains
-        value: "dispatch"
-        description: "Does not dispatch agents on first turn"
-  # Eval 1: fresh-idea-turn2-research
-  - description: "turn 2 — dispatches research agents, citations, labeled recommendation, challenge, ends with question"
-    vars:
-      input: "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability."
-    assert:
-      - type: regex
-        value: "https?://"
-        description: "Response includes at least one URL citation"
-      - type: regex
-        value: "RECOMMENDED|recommended|#1"
-        description: "At least one suggestion is labeled as recommended"
-      - type: regex
-        value: "\\?"
-        description: "Ends with a question"
-  # Eval 2: turn3-deeper-research
-  - description: "turn 3 — deeper research on specific concerns, builds on prior, fresh citations, challenge"
-    vars:
-      input: "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins."
-    assert:
-      - type: regex
-        value: "matching|algorithm|cold.start"
-        description: "Response addresses the specific concerns raised"
-      - type: regex
-        value: "https?://"
-        description: "Response includes at least one URL citation"
-      - type: regex
-        value: "\\?"
-        description: "Ends with a question"
-  # Eval 3: brain-dump-intake
-  - description: "brain dump intake — extracts structure, confirmation gate, no research yet"
-    vars:
-      input: "help me build all the suggestions in this chat: [brain dump about eng-buddy]"
-    assert:
-      - type: regex
-        value: "Decided|decided|Decision|decision"
-        description: "Contains a 'Decided' section"
-      - type: regex
-        value: "open question|Open question|gap|Gap"
-        description: "Contains an open questions or gaps section"
-      - type: regex
-        value: "\\?"
-        description: "Ends with a confirmation question"
-      - type: not-contains
-        value: "dispatch"
-        description: "Does not dispatch research agents on intake turn"
-  # Eval 4: brain-dump-turn2-research
-  - description: "brain dump turn 2 — novelty map, research citations, challenge, question"
-    vars:
-      input: "Yeah that looks right, go ahead and research it"
-    assert:
-      - type: regex
-        value: "Novelty|novelty|novel|Novel"
-        description: "Contains a Novelty Map"
-      - type: regex
-        value: "https?://"
-        description: "Response includes at least one URL citation"
-      - type: regex
-        value: "\\?"
-        description: "Ends with a question"
-  # Eval 5: phase3-gate
-  - description: "phase 3 gate — vision summary and approval before generating full plan"
-    vars:
-      input: "Ok I think I know what I want. Let's turn this into a plan."
-    assert:
-      - type: regex
-        value: "summary|Summary|we've landed|landed on|here's what"
-        description: "Presents a vision summary before generating the full plan"
-      - type: regex
-        value: "confirm|Confirm|proceed|Proceed|ready|Ready|go ahead|approve"
-        description: "Asks for confirmation before proceeding to full plan"
-      - type: not-contains
-        value: "Wave 1"
-        description: "Does not dump the full plan structure prematurely"
-      - type: not-contains
-        value: "Wave 2"
-        description: "Does not dump the full plan structure prematurely"
+description: "ftm-brainstorm behavior across multi-turn brainstorming sessions"
+prompts:
+  - "{{input}}"
+providers:
+  - id: "exec:claude --print"
+    label: "claude-code"
+defaultTest:
+  options:
+    transformVars: "vars"
+tests:
+  # Eval 0: fresh-idea-intake
+  - description: "fresh idea intake — asks 1-2 questions, no research sprint, hard stop"
+    vars:
+      input: "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying."
+    assert:
+      - type: regex
+        value: "\\?"
+        description: "Response must contain at least one question"
+      - type: not-contains
+        value: "sprint"
+        description: "Does not dispatch research sprint on first turn"
+      - type: not-contains
+        value: "dispatch"
+        description: "Does not dispatch agents on first turn"
+  # Eval 1: fresh-idea-turn2-research
+  - description: "turn 2 — dispatches research agents, citations, labeled recommendation, challenge, ends with question"
+    vars:
+      input: "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability."
+    assert:
+      - type: regex
+        value: "https?://"
+        description: "Response includes at least one URL citation"
+      - type: regex
+        value: "RECOMMENDED|recommended|#1"
+        description: "At least one suggestion is labeled as recommended"
+      - type: regex
+        value: "\\?"
+        description: "Response contains at least one question mark (proxy for ending with a question)"
+  # Eval 2: turn3-deeper-research
+  - description: "turn 3 — deeper research on specific concerns, builds on prior, fresh citations, challenge"
+    vars:
+      input: "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins."
+    assert:
+      - type: regex
+        value: "matching|algorithm|cold.start"
+        description: "Response addresses the specific concerns raised"
+      - type: regex
+        value: "https?://"
+        description: "Response includes at least one URL citation"
+      - type: regex
+        value: "\\?"
+        description: "Response contains at least one question mark (proxy for ending with a question)"
+  # Eval 3: brain-dump-intake
+  - description: "brain dump intake — extracts structure, confirmation gate, no research yet"
+    vars:
+      input: "help me build all the suggestions in this chat: [brain dump about eng-buddy]"
+    assert:
+      - type: regex
+        value: "Decided|decided|Decision|decision"
+        description: "Contains a 'Decided' section"
+      - type: regex
+        value: "open question|Open question|gap|Gap"
+        description: "Contains an open questions or gaps section"
+      - type: regex
+        value: "\\?"
+        description: "Response contains at least one question mark (proxy for ending with a confirmation question)"
+      - type: not-contains
+        value: "dispatch"
+        description: "Does not dispatch research agents on intake turn"
+  # Eval 4: brain-dump-turn2-research
+  - description: "brain dump turn 2 — novelty map, research citations, challenge, question"
+    vars:
+      input: "Yeah that looks right, go ahead and research it"
+    assert:
+      - type: regex
+        value: "Novelty|novelty|novel|Novel"
+        description: "Contains a Novelty Map"
+      - type: regex
+        value: "https?://"
+        description: "Response includes at least one URL citation"
+      - type: regex
+        value: "\\?"
+        description: "Response contains at least one question mark (proxy for ending with a question)"
+  # Eval 5: phase3-gate
+  - description: "phase 3 gate — vision summary and approval before generating full plan"
+    vars:
+      input: "Ok I think I know what I want. Let's turn this into a plan."
+    assert:
+      - type: regex
+        value: "summary|Summary|we've landed|landed on|here's what"
+        description: "Presents a vision summary before generating the full plan"
+      - type: regex
+        value: "confirm|Confirm|proceed|Proceed|ready|Ready|go ahead|approve"
+        description: "Asks for confirmation before proceeding to full plan"
+      - type: not-contains
+        value: "Wave 1"
+        description: "Does not dump the full plan structure prematurely"
+      - type: not-contains
+        value: "Wave 2"
+        description: "Does not dump the full plan structure prematurely"