feed-the-machine 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. package/LICENSE +21 -21
  2. package/README.md +170 -170
  3. package/bin/generate-manifest.mjs +463 -463
  4. package/bin/install.mjs +491 -491
  5. package/docs/HOOKS.md +243 -243
  6. package/docs/INBOX.md +233 -233
  7. package/ftm/SKILL.md +122 -122
  8. package/ftm-audit/SKILL.md +623 -541
  9. package/ftm-audit/references/protocols/PROJECT-PATTERNS.md +91 -91
  10. package/ftm-audit/references/protocols/RUNTIME-WIRING.md +66 -66
  11. package/ftm-audit/references/protocols/WIRING-CONTRACTS.md +135 -135
  12. package/ftm-audit/references/strategies/AUTO-FIX-STRATEGIES.md +69 -69
  13. package/ftm-audit/references/templates/REPORT-FORMAT.md +96 -96
  14. package/ftm-audit/scripts/run-knip.sh +23 -23
  15. package/ftm-audit.yml +2 -2
  16. package/ftm-brainstorm/SKILL.md +498 -498
  17. package/ftm-brainstorm/evals/evals.json +100 -100
  18. package/ftm-brainstorm/evals/promptfoo.yaml +109 -109
  19. package/ftm-brainstorm/references/agent-prompts.md +224 -224
  20. package/ftm-brainstorm/references/plan-template.md +121 -121
  21. package/ftm-brainstorm.yml +2 -2
  22. package/ftm-browse/SKILL.md +454 -454
  23. package/ftm-browse/daemon/browser-manager.ts +206 -206
  24. package/ftm-browse/daemon/bun.lock +30 -30
  25. package/ftm-browse/daemon/cli.ts +347 -347
  26. package/ftm-browse/daemon/commands.ts +410 -410
  27. package/ftm-browse/daemon/main.ts +357 -357
  28. package/ftm-browse/daemon/package.json +17 -17
  29. package/ftm-browse/daemon/server.ts +189 -189
  30. package/ftm-browse/daemon/snapshot.ts +519 -519
  31. package/ftm-browse/daemon/tsconfig.json +22 -22
  32. package/ftm-browse.yml +4 -4
  33. package/ftm-capture/SKILL.md +370 -370
  34. package/ftm-capture.yml +4 -4
  35. package/ftm-codex-gate/SKILL.md +361 -361
  36. package/ftm-codex-gate.yml +2 -2
  37. package/ftm-config/SKILL.md +345 -345
  38. package/ftm-config.default.yml +82 -80
  39. package/ftm-config.yml +2 -2
  40. package/ftm-council/SKILL.md +416 -416
  41. package/ftm-council/references/prompts/CLAUDE-INVESTIGATION.md +60 -60
  42. package/ftm-council/references/prompts/CODEX-INVESTIGATION.md +58 -58
  43. package/ftm-council/references/prompts/GEMINI-INVESTIGATION.md +58 -58
  44. package/ftm-council/references/prompts/REBUTTAL-TEMPLATE.md +57 -57
  45. package/ftm-council/references/protocols/PREREQUISITES.md +47 -47
  46. package/ftm-council/references/protocols/STEP-0-FRAMING.md +46 -46
  47. package/ftm-council.yml +2 -2
  48. package/ftm-dashboard/SKILL.md +163 -163
  49. package/ftm-dashboard.yml +4 -4
  50. package/ftm-debug/SKILL.md +1037 -1037
  51. package/ftm-debug/references/phases/PHASE-0-INTAKE.md +58 -58
  52. package/ftm-debug/references/phases/PHASE-1-TRIAGE.md +46 -46
  53. package/ftm-debug/references/phases/PHASE-2-WAR-ROOM-AGENTS.md +279 -279
  54. package/ftm-debug/references/phases/PHASE-3-TO-6-EXECUTION.md +436 -436
  55. package/ftm-debug/references/protocols/BLACKBOARD.md +86 -86
  56. package/ftm-debug/references/protocols/EDGE-CASES.md +103 -103
  57. package/ftm-debug.yml +2 -2
  58. package/ftm-diagram/SKILL.md +277 -277
  59. package/ftm-diagram.yml +2 -2
  60. package/ftm-executor/SKILL.md +777 -767
  61. package/ftm-executor/references/STYLE-TEMPLATE.md +73 -73
  62. package/ftm-executor/references/phases/PHASE-0-VERIFICATION.md +62 -62
  63. package/ftm-executor/references/phases/PHASE-2-AGENT-ASSEMBLY.md +34 -34
  64. package/ftm-executor/references/phases/PHASE-3-WORKTREES.md +38 -38
  65. package/ftm-executor/references/phases/PHASE-4-5-AUDIT.md +72 -72
  66. package/ftm-executor/references/phases/PHASE-4-DISPATCH.md +66 -66
  67. package/ftm-executor/references/phases/PHASE-5-5-CODEX-GATE.md +73 -73
  68. package/ftm-executor/references/protocols/DOCUMENTATION-BOOTSTRAP.md +36 -36
  69. package/ftm-executor/references/protocols/MODEL-PROFILE.md +59 -44
  70. package/ftm-executor/references/protocols/PROGRESS-TRACKING.md +66 -66
  71. package/ftm-executor/runtime/ftm-runtime.mjs +252 -252
  72. package/ftm-executor/runtime/package.json +8 -8
  73. package/ftm-executor.yml +2 -2
  74. package/ftm-git/SKILL.md +441 -441
  75. package/ftm-git/evals/evals.json +26 -26
  76. package/ftm-git/evals/promptfoo.yaml +75 -75
  77. package/ftm-git/hooks/post-commit-experience.sh +92 -92
  78. package/ftm-git/references/patterns/SECRET-PATTERNS.md +104 -104
  79. package/ftm-git/references/protocols/REMEDIATION.md +139 -139
  80. package/ftm-git/scripts/pre-commit-secrets.sh +110 -110
  81. package/ftm-git.yml +2 -2
  82. package/ftm-inbox/backend/adapters/_retry.py +64 -64
  83. package/ftm-inbox/backend/adapters/base.py +230 -230
  84. package/ftm-inbox/backend/adapters/freshservice.py +104 -104
  85. package/ftm-inbox/backend/adapters/gmail.py +125 -125
  86. package/ftm-inbox/backend/adapters/jira.py +136 -136
  87. package/ftm-inbox/backend/adapters/registry.py +192 -192
  88. package/ftm-inbox/backend/adapters/slack.py +110 -110
  89. package/ftm-inbox/backend/db/connection.py +54 -54
  90. package/ftm-inbox/backend/db/schema.py +78 -78
  91. package/ftm-inbox/backend/executor/__init__.py +7 -7
  92. package/ftm-inbox/backend/executor/engine.py +149 -149
  93. package/ftm-inbox/backend/executor/step_runner.py +98 -98
  94. package/ftm-inbox/backend/main.py +103 -103
  95. package/ftm-inbox/backend/models/__init__.py +1 -1
  96. package/ftm-inbox/backend/models/unified_task.py +36 -36
  97. package/ftm-inbox/backend/planner/__init__.py +6 -6
  98. package/ftm-inbox/backend/planner/generator.py +127 -127
  99. package/ftm-inbox/backend/planner/schema.py +34 -34
  100. package/ftm-inbox/backend/requirements.txt +5 -5
  101. package/ftm-inbox/backend/routes/execute.py +186 -186
  102. package/ftm-inbox/backend/routes/health.py +52 -52
  103. package/ftm-inbox/backend/routes/inbox.py +68 -68
  104. package/ftm-inbox/backend/routes/plan.py +271 -271
  105. package/ftm-inbox/bin/launchagent.mjs +91 -91
  106. package/ftm-inbox/bin/setup.mjs +188 -188
  107. package/ftm-inbox/bin/start.sh +10 -10
  108. package/ftm-inbox/bin/status.sh +17 -17
  109. package/ftm-inbox/bin/stop.sh +8 -8
  110. package/ftm-inbox/config.example.yml +55 -55
  111. package/ftm-inbox/package-lock.json +2898 -2898
  112. package/ftm-inbox/package.json +26 -26
  113. package/ftm-inbox/postcss.config.js +6 -6
  114. package/ftm-inbox/src/app.css +199 -199
  115. package/ftm-inbox/src/app.html +18 -18
  116. package/ftm-inbox/src/lib/api.ts +166 -166
  117. package/ftm-inbox/src/lib/components/ExecutionLog.svelte +81 -81
  118. package/ftm-inbox/src/lib/components/InboxFeed.svelte +143 -143
  119. package/ftm-inbox/src/lib/components/PlanStep.svelte +271 -271
  120. package/ftm-inbox/src/lib/components/PlanView.svelte +206 -206
  121. package/ftm-inbox/src/lib/components/StreamPanel.svelte +99 -99
  122. package/ftm-inbox/src/lib/components/TaskCard.svelte +190 -190
  123. package/ftm-inbox/src/lib/components/ui/EmptyState.svelte +63 -63
  124. package/ftm-inbox/src/lib/components/ui/KawaiiCard.svelte +86 -86
  125. package/ftm-inbox/src/lib/components/ui/PillButton.svelte +106 -106
  126. package/ftm-inbox/src/lib/components/ui/StatusBadge.svelte +67 -67
  127. package/ftm-inbox/src/lib/components/ui/StreamDrawer.svelte +149 -149
  128. package/ftm-inbox/src/lib/components/ui/ThemeToggle.svelte +80 -80
  129. package/ftm-inbox/src/lib/theme.ts +47 -47
  130. package/ftm-inbox/src/routes/+layout.svelte +76 -76
  131. package/ftm-inbox/src/routes/+page.svelte +401 -401
  132. package/ftm-inbox/svelte.config.js +12 -12
  133. package/ftm-inbox/tailwind.config.ts +63 -63
  134. package/ftm-inbox/tsconfig.json +13 -13
  135. package/ftm-inbox/vite.config.ts +6 -6
  136. package/ftm-intent/SKILL.md +241 -241
  137. package/ftm-intent.yml +2 -2
  138. package/ftm-manifest.json +3794 -3794
  139. package/ftm-map/SKILL.md +291 -291
  140. package/ftm-map/scripts/db.py +712 -712
  141. package/ftm-map/scripts/index.py +415 -415
  142. package/ftm-map/scripts/parser.py +224 -224
  143. package/ftm-map/scripts/queries/go-tags.scm +20 -20
  144. package/ftm-map/scripts/queries/javascript-tags.scm +35 -35
  145. package/ftm-map/scripts/queries/python-tags.scm +31 -31
  146. package/ftm-map/scripts/queries/ruby-tags.scm +19 -19
  147. package/ftm-map/scripts/queries/rust-tags.scm +37 -37
  148. package/ftm-map/scripts/queries/typescript-tags.scm +41 -41
  149. package/ftm-map/scripts/query.py +301 -301
  150. package/ftm-map/scripts/ranker.py +377 -377
  151. package/ftm-map/scripts/requirements.txt +5 -5
  152. package/ftm-map/scripts/setup-hooks.sh +27 -27
  153. package/ftm-map/scripts/setup.sh +56 -56
  154. package/ftm-map/scripts/test_db.py +364 -364
  155. package/ftm-map/scripts/test_parser.py +174 -174
  156. package/ftm-map/scripts/test_query.py +183 -183
  157. package/ftm-map/scripts/test_ranker.py +199 -199
  158. package/ftm-map/scripts/views.py +591 -591
  159. package/ftm-map.yml +2 -2
  160. package/ftm-mind/SKILL.md +1943 -1943
  161. package/ftm-mind/evals/promptfoo.yaml +142 -142
  162. package/ftm-mind/references/blackboard-schema.md +328 -328
  163. package/ftm-mind/references/complexity-guide.md +110 -110
  164. package/ftm-mind/references/event-registry.md +319 -319
  165. package/ftm-mind/references/mcp-inventory.md +296 -296
  166. package/ftm-mind/references/protocols/COMPLEXITY-SIZING.md +72 -72
  167. package/ftm-mind/references/protocols/MCP-HEURISTICS.md +32 -32
  168. package/ftm-mind/references/protocols/PLAN-APPROVAL.md +80 -80
  169. package/ftm-mind/references/reflexion-protocol.md +249 -249
  170. package/ftm-mind/references/routing/SCENARIOS.md +22 -22
  171. package/ftm-mind/references/routing-scenarios.md +35 -35
  172. package/ftm-mind.yml +2 -2
  173. package/ftm-pause/SKILL.md +395 -395
  174. package/ftm-pause/references/protocols/SKILL-RESTORE-PROTOCOLS.md +186 -186
  175. package/ftm-pause/references/protocols/VALIDATION.md +80 -80
  176. package/ftm-pause.yml +2 -2
  177. package/ftm-researcher/SKILL.md +275 -275
  178. package/ftm-researcher/evals/agent-diversity.yaml +17 -17
  179. package/ftm-researcher/evals/synthesis-quality.yaml +12 -12
  180. package/ftm-researcher/evals/trigger-accuracy.yaml +39 -39
  181. package/ftm-researcher/references/adaptive-search.md +116 -116
  182. package/ftm-researcher/references/agent-prompts.md +193 -193
  183. package/ftm-researcher/references/council-integration.md +193 -193
  184. package/ftm-researcher/references/output-format.md +203 -203
  185. package/ftm-researcher/references/synthesis-pipeline.md +165 -165
  186. package/ftm-researcher/scripts/score_credibility.py +234 -234
  187. package/ftm-researcher/scripts/validate_research.py +92 -92
  188. package/ftm-researcher.yml +2 -2
  189. package/ftm-resume/SKILL.md +518 -518
  190. package/ftm-resume/references/protocols/VALIDATION.md +172 -172
  191. package/ftm-resume.yml +2 -2
  192. package/ftm-retro/SKILL.md +380 -380
  193. package/ftm-retro/references/protocols/SCORING-RUBRICS.md +89 -89
  194. package/ftm-retro/references/templates/REPORT-FORMAT.md +109 -109
  195. package/ftm-retro.yml +2 -2
  196. package/ftm-routine/SKILL.md +170 -170
  197. package/ftm-routine.yml +4 -4
  198. package/ftm-state/blackboard/capabilities.json +5 -5
  199. package/ftm-state/blackboard/capabilities.schema.json +27 -27
  200. package/ftm-state/blackboard/context.json +23 -23
  201. package/ftm-state/blackboard/experiences/index.json +9 -9
  202. package/ftm-state/blackboard/patterns.json +6 -6
  203. package/ftm-state/schemas/context.schema.json +130 -130
  204. package/ftm-state/schemas/experience-index.schema.json +77 -77
  205. package/ftm-state/schemas/experience.schema.json +78 -78
  206. package/ftm-state/schemas/patterns.schema.json +44 -44
  207. package/ftm-upgrade/SKILL.md +194 -194
  208. package/ftm-upgrade/scripts/check-version.sh +76 -76
  209. package/ftm-upgrade/scripts/upgrade.sh +143 -143
  210. package/ftm-upgrade.yml +2 -2
  211. package/ftm-verify.yml +2 -2
  212. package/ftm.yml +2 -2
  213. package/hooks/ftm-blackboard-enforcer.sh +93 -93
  214. package/hooks/ftm-discovery-reminder.sh +90 -90
  215. package/hooks/ftm-drafts-gate.sh +61 -61
  216. package/hooks/ftm-event-logger.mjs +107 -107
  217. package/hooks/ftm-map-autodetect.sh +79 -79
  218. package/hooks/ftm-pending-sync-check.sh +22 -22
  219. package/hooks/ftm-plan-gate.sh +92 -92
  220. package/hooks/ftm-post-commit-trigger.sh +57 -57
  221. package/hooks/settings-template.json +81 -81
  222. package/install.sh +363 -363
  223. package/package.json +84 -84
  224. package/uninstall.sh +25 -25
@@ -1,100 +1,100 @@
1
- {
2
- "skill_name": "ftm-brainstorm",
3
- "evals": [
4
- {
5
- "id": 0,
6
- "name": "fresh-idea-intake",
7
- "prompt": "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying.",
8
- "expected_output": "Phase 0 repo scan launched in background, 1-2 intake questions, hard STOP",
9
- "files": [],
10
- "assertions": [
11
- {"name": "one_or_two_questions", "description": "Asks 1-2 questions max, not a wall of questions"},
12
- {"name": "no_research_sprint_turn1", "description": "Does NOT dispatch research agents on the very first turn — intake only"},
13
- {"name": "hard_stop", "description": "Ends with a question and waits — does NOT proceed to synthesize or generate suggestions"},
14
- {"name": "repo_scan_launched", "description": "Mentions or silently launches a repo/codebase scan agent in background"}
15
- ]
16
- },
17
- {
18
- "id": 1,
19
- "name": "fresh-idea-turn2-research",
20
- "prompt": "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability.",
21
- "expected_output": "First research sprint dispatched (3 agents), synthesized suggestions with citations, challenge, 1-2 questions, STOP",
22
- "files": [],
23
- "multi_turn_context": "Turn 2. Turn 1: user said 'building study buddy app like Tinder for studying', skill asked intake questions, user now responds with details.",
24
- "assertions": [
25
- {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents (web, github, competitive) — not fewer"},
26
- {"name": "real_citations", "description": "At least 3 unique URLs to real repos/articles/products in the synthesis"},
27
- {"name": "suggestions_with_evidence", "description": "Presents numbered suggestions (3-5) with real-world evidence backing each one"},
28
- {"name": "recommendation_labeled", "description": "Suggestion #1 is labeled RECOMMENDED with rationale"},
29
- {"name": "challenge_present", "description": "Includes at least one challenge/pushback after suggestions"},
30
- {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions to drive next research sprint"},
31
- {"name": "hard_stop", "description": "Does NOT continue past the questions — waits for user response"},
32
- {"name": "depth_is_broad", "description": "Research queries are landscape-level (major approaches, who's done this) not implementation-specific"}
33
- ]
34
- },
35
- {
36
- "id": 2,
37
- "name": "turn3-deeper-research",
38
- "prompt": "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins.",
39
- "expected_output": "Second research sprint (deeper, focused on RN+Firebase+matching), new suggestions building on prior, new challenges, new questions",
40
- "files": [],
41
- "multi_turn_context": "Turn 3. Prior turns: user described study buddy app, first research sprint found 5 approaches, user now picks one and raises two specific concerns.",
42
- "assertions": [
43
- {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents again — every turn gets a research sprint"},
44
- {"name": "research_is_deeper", "description": "Search queries target matching algorithms and cold-start specifically, NOT broad 'study buddy app' landscape again"},
45
- {"name": "builds_on_prior", "description": "References prior turn's findings — does not re-present the same 5 approaches"},
46
- {"name": "new_citations", "description": "At least 2 URLs not seen in prior turns — fresh research, not recycled"},
47
- {"name": "addresses_user_concerns", "description": "Suggestions specifically address matching algorithm complexity AND cold-start problem"},
48
- {"name": "challenge_present", "description": "Challenges the user on at least one assumption or pushes back on scope"},
49
- {"name": "ends_with_question", "description": "Ends with 1-2 questions that unlock the NEXT research vector"},
50
- {"name": "hard_stop", "description": "Does NOT continue past the questions"}
51
- ]
52
- },
53
- {
54
- "id": 3,
55
- "name": "brain-dump-intake",
56
- "prompt": "help me build all the suggestions in this chat: [brain dump about eng-buddy]",
57
- "expected_output": "Path B structured extraction with repo context, confirmation gate, no research yet",
58
- "files": ["brain-dump-input.md"],
59
- "assertions": [
60
- {"name": "decided_section", "description": "Contains a 'Decided' or 'Decisions already made' section"},
61
- {"name": "open_questions_section", "description": "Contains an 'Open questions' or 'Gaps' section"},
62
- {"name": "no_basic_questions", "description": "Does NOT ask basic 'what are you building?' questions already answered by the paste"},
63
- {"name": "confirmation_gate", "description": "Ends with a confirmation question before proceeding to research"},
64
- {"name": "no_research_sprint", "description": "Does NOT dispatch research agents or present suggestions on this turn"},
65
- {"name": "hard_stop", "description": "Stops after asking for confirmation — does not proceed"}
66
- ]
67
- },
68
- {
69
- "id": 4,
70
- "name": "brain-dump-turn2-research",
71
- "prompt": "Yeah that looks right, go ahead and research it",
72
- "expected_output": "First research sprint in brain dump mode: novelty map, suggestions with citations, challenges",
73
- "files": ["brain-dump-input.md"],
74
- "multi_turn_context": "Turn 2 of brain dump. Turn 1: user pasted eng-buddy brain dump, skill extracted structured summary, user now confirms.",
75
- "assertions": [
76
- {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents searching for specific brain dump claims"},
77
- {"name": "novelty_map_present", "description": "Contains a Novelty Map table with solved/partially solved/novel verdicts"},
78
- {"name": "real_citations", "description": "At least 5 unique URLs to real repos/articles/products"},
79
- {"name": "brain_dump_claims_researched", "description": "Agents searched for specific architectural claims from the dump, not just broad topic searches"},
80
- {"name": "challenge_present", "description": "At least one challenge/pushback raised"},
81
- {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions"},
82
- {"name": "hard_stop", "description": "Does NOT proceed past questions"}
83
- ]
84
- },
85
- {
86
- "id": 5,
87
- "name": "phase3-gate",
88
- "prompt": "Ok I think I know what I want. Let's turn this into a plan.",
89
- "expected_output": "Vision summary for approval, NOT the full plan yet",
90
- "files": [],
91
- "multi_turn_context": "Turn 6+ of brainstorming. Previous turns explored study-buddy app, settled on React Native + Firebase, matching algorithm, cold-start solution. User now wants to move to planning.",
92
- "assertions": [
93
- {"name": "vision_summary", "description": "Presents a brief 'here's what we've landed on' summary before generating the full plan"},
94
- {"name": "approval_gate", "description": "Asks for explicit confirmation before proceeding to full plan generation"},
95
- {"name": "does_not_dump_full_plan", "description": "Does NOT generate the entire task breakdown, agent assignments, and wave structure in this message"},
96
- {"name": "references_plan_template", "description": "Reads or references references/plan-template.md for plan generation"}
97
- ]
98
- }
99
- ]
100
- }
1
+ {
2
+ "skill_name": "ftm-brainstorm",
3
+ "evals": [
4
+ {
5
+ "id": 0,
6
+ "name": "fresh-idea-intake",
7
+ "prompt": "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying.",
8
+ "expected_output": "Phase 0 repo scan launched in background, 1-2 intake questions, hard STOP",
9
+ "files": [],
10
+ "assertions": [
11
+ {"name": "one_or_two_questions", "description": "Asks 1-2 questions max, not a wall of questions"},
12
+ {"name": "no_research_sprint_turn1", "description": "Does NOT dispatch research agents on the very first turn — intake only"},
13
+ {"name": "hard_stop", "description": "Ends with a question and waits — does NOT proceed to synthesize or generate suggestions"},
14
+ {"name": "repo_scan_launched", "description": "Mentions or silently launches a repo/codebase scan agent in background"}
15
+ ]
16
+ },
17
+ {
18
+ "id": 1,
19
+ "name": "fresh-idea-turn2-research",
20
+ "prompt": "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability.",
21
+ "expected_output": "First research sprint dispatched (3 agents), synthesized suggestions with citations, challenge, 1-2 questions, STOP",
22
+ "files": [],
23
+ "multi_turn_context": "Turn 2. Turn 1: user said 'building study buddy app like Tinder for studying', skill asked intake questions, user now responds with details.",
24
+ "assertions": [
25
+ {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents (web, github, competitive) — not fewer"},
26
+ {"name": "real_citations", "description": "At least 3 unique URLs to real repos/articles/products in the synthesis"},
27
+ {"name": "suggestions_with_evidence", "description": "Presents numbered suggestions (3-5) with real-world evidence backing each one"},
28
+ {"name": "recommendation_labeled", "description": "Suggestion #1 is labeled RECOMMENDED with rationale"},
29
+ {"name": "challenge_present", "description": "Includes at least one challenge/pushback after suggestions"},
30
+ {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions to drive next research sprint"},
31
+ {"name": "hard_stop", "description": "Does NOT continue past the questions — waits for user response"},
32
+ {"name": "depth_is_broad", "description": "Research queries are landscape-level (major approaches, who's done this) not implementation-specific"}
33
+ ]
34
+ },
35
+ {
36
+ "id": 2,
37
+ "name": "turn3-deeper-research",
38
+ "prompt": "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins.",
39
+ "expected_output": "Second research sprint (deeper, focused on RN+Firebase+matching), new suggestions building on prior, new challenges, new questions",
40
+ "files": [],
41
+ "multi_turn_context": "Turn 3. Prior turns: user described study buddy app, first research sprint found 5 approaches, user now picks one and raises two specific concerns.",
42
+ "assertions": [
43
+ {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents again — every turn gets a research sprint"},
44
+ {"name": "research_is_deeper", "description": "Search queries target matching algorithms and cold-start specifically, NOT broad 'study buddy app' landscape again"},
45
+ {"name": "builds_on_prior", "description": "References prior turn's findings — does not re-present the same 5 approaches"},
46
+ {"name": "new_citations", "description": "At least 2 URLs not seen in prior turns — fresh research, not recycled"},
47
+ {"name": "addresses_user_concerns", "description": "Suggestions specifically address matching algorithm complexity AND cold-start problem"},
48
+ {"name": "challenge_present", "description": "Challenges the user on at least one assumption or pushes back on scope"},
49
+ {"name": "ends_with_question", "description": "Ends with 1-2 questions that unlock the NEXT research vector"},
50
+ {"name": "hard_stop", "description": "Does NOT continue past the questions"}
51
+ ]
52
+ },
53
+ {
54
+ "id": 3,
55
+ "name": "brain-dump-intake",
56
+ "prompt": "help me build all the suggestions in this chat: [brain dump about eng-buddy]",
57
+ "expected_output": "Path B structured extraction with repo context, confirmation gate, no research yet",
58
+ "files": ["brain-dump-input.md"],
59
+ "assertions": [
60
+ {"name": "decided_section", "description": "Contains a 'Decided' or 'Decisions already made' section"},
61
+ {"name": "open_questions_section", "description": "Contains an 'Open questions' or 'Gaps' section"},
62
+ {"name": "no_basic_questions", "description": "Does NOT ask basic 'what are you building?' questions already answered by the paste"},
63
+ {"name": "confirmation_gate", "description": "Ends with a confirmation question before proceeding to research"},
64
+ {"name": "no_research_sprint", "description": "Does NOT dispatch research agents or present suggestions on this turn"},
65
+ {"name": "hard_stop", "description": "Stops after asking for confirmation — does not proceed"}
66
+ ]
67
+ },
68
+ {
69
+ "id": 4,
70
+ "name": "brain-dump-turn2-research",
71
+ "prompt": "Yeah that looks right, go ahead and research it",
72
+ "expected_output": "First research sprint in brain dump mode: novelty map, suggestions with citations, challenges",
73
+ "files": ["brain-dump-input.md"],
74
+ "multi_turn_context": "Turn 2 of brain dump. Turn 1: user pasted eng-buddy brain dump, skill extracted structured summary, user now confirms.",
75
+ "assertions": [
76
+ {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents searching for specific brain dump claims"},
77
+ {"name": "novelty_map_present", "description": "Contains a Novelty Map table with solved/partially solved/novel verdicts"},
78
+ {"name": "real_citations", "description": "At least 5 unique URLs to real repos/articles/products"},
79
+ {"name": "brain_dump_claims_researched", "description": "Agents searched for specific architectural claims from the dump, not just broad topic searches"},
80
+ {"name": "challenge_present", "description": "At least one challenge/pushback raised"},
81
+ {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions"},
82
+ {"name": "hard_stop", "description": "Does NOT proceed past questions"}
83
+ ]
84
+ },
85
+ {
86
+ "id": 5,
87
+ "name": "phase3-gate",
88
+ "prompt": "Ok I think I know what I want. Let's turn this into a plan.",
89
+ "expected_output": "Vision summary for approval, NOT the full plan yet",
90
+ "files": [],
91
+ "multi_turn_context": "Turn 6+ of brainstorming. Previous turns explored study-buddy app, settled on React Native + Firebase, matching algorithm, cold-start solution. User now wants to move to planning.",
92
+ "assertions": [
93
+ {"name": "vision_summary", "description": "Presents a brief 'here's what we've landed on' summary before generating the full plan"},
94
+ {"name": "approval_gate", "description": "Asks for explicit confirmation before proceeding to full plan generation"},
95
+ {"name": "does_not_dump_full_plan", "description": "Does NOT generate the entire task breakdown, agent assignments, and wave structure in this message"},
96
+ {"name": "references_plan_template", "description": "Reads or references references/plan-template.md for plan generation"}
97
+ ]
98
+ }
99
+ ]
100
+ }
@@ -1,109 +1,109 @@
1
- description: "ftm-brainstorm behavior across multi-turn brainstorming sessions"
2
-
3
- prompts:
4
- - "{{input}}"
5
-
6
- providers:
7
- - id: "exec:claude --print"
8
- label: "claude-code"
9
-
10
- defaultTest:
11
- options:
12
- transformVars: "vars"
13
-
14
- tests:
15
- # Eval 0: fresh-idea-intake
16
- - description: "fresh idea intake — asks 1-2 questions, no research sprint, hard stop"
17
- vars:
18
- input: "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying."
19
- assert:
20
- - type: regex
21
- value: "\\?"
22
- description: "Response must contain at least one question"
23
- - type: not-contains
24
- value: "sprint"
25
- description: "Does not dispatch research sprint on first turn"
26
- - type: not-contains
27
- value: "dispatch"
28
- description: "Does not dispatch agents on first turn"
29
-
30
- # Eval 1: fresh-idea-turn2-research
31
- - description: "turn 2 — dispatches research agents, citations, labeled recommendation, challenge, ends with question"
32
- vars:
33
- input: "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability."
34
- assert:
35
- - type: regex
36
- value: "https?://"
37
- description: "Response includes at least one URL citation"
38
- - type: regex
39
- value: "RECOMMENDED|recommended|#1"
40
- description: "At least one suggestion is labeled as recommended"
41
- - type: regex
42
- value: "\\?"
43
- description: "Ends with a question"
44
-
45
- # Eval 2: turn3-deeper-research
46
- - description: "turn 3 — deeper research on specific concerns, builds on prior, fresh citations, challenge"
47
- vars:
48
- input: "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins."
49
- assert:
50
- - type: regex
51
- value: "matching|algorithm|cold.start"
52
- description: "Response addresses the specific concerns raised"
53
- - type: regex
54
- value: "https?://"
55
- description: "Response includes at least one URL citation"
56
- - type: regex
57
- value: "\\?"
58
- description: "Ends with a question"
59
-
60
- # Eval 3: brain-dump-intake
61
- - description: "brain dump intake — extracts structure, confirmation gate, no research yet"
62
- vars:
63
- input: "help me build all the suggestions in this chat: [brain dump about eng-buddy]"
64
- assert:
65
- - type: regex
66
- value: "Decided|decided|Decision|decision"
67
- description: "Contains a 'Decided' section"
68
- - type: regex
69
- value: "open question|Open question|gap|Gap"
70
- description: "Contains an open questions or gaps section"
71
- - type: regex
72
- value: "\\?"
73
- description: "Ends with a confirmation question"
74
- - type: not-contains
75
- value: "dispatch"
76
- description: "Does not dispatch research agents on intake turn"
77
-
78
- # Eval 4: brain-dump-turn2-research
79
- - description: "brain dump turn 2 — novelty map, research citations, challenge, question"
80
- vars:
81
- input: "Yeah that looks right, go ahead and research it"
82
- assert:
83
- - type: regex
84
- value: "Novelty|novelty|novel|Novel"
85
- description: "Contains a Novelty Map"
86
- - type: regex
87
- value: "https?://"
88
- description: "Response includes at least one URL citation"
89
- - type: regex
90
- value: "\\?"
91
- description: "Ends with a question"
92
-
93
- # Eval 5: phase3-gate
94
- - description: "phase 3 gate — vision summary and approval before generating full plan"
95
- vars:
96
- input: "Ok I think I know what I want. Let's turn this into a plan."
97
- assert:
98
- - type: regex
99
- value: "summary|Summary|we've landed|landed on|here's what"
100
- description: "Presents a vision summary before generating the full plan"
101
- - type: regex
102
- value: "confirm|Confirm|proceed|Proceed|ready|Ready|go ahead|approve"
103
- description: "Asks for confirmation before proceeding to full plan"
104
- - type: not-contains
105
- value: "Wave 1"
106
- description: "Does not dump the full plan structure prematurely"
107
- - type: not-contains
108
- value: "Wave 2"
109
- description: "Does not dump the full plan structure prematurely"
1
+ description: "ftm-brainstorm behavior across multi-turn brainstorming sessions"
2
+
3
+ prompts:
4
+ - "{{input}}"
5
+
6
+ providers:
7
+ - id: "exec:claude --print"
8
+ label: "claude-code"
9
+
10
+ defaultTest:
11
+ options:
12
+ transformVars: "vars"
13
+
14
+ tests:
15
+ # Eval 0: fresh-idea-intake
16
+ - description: "fresh idea intake — asks 1-2 questions, no research sprint, hard stop"
17
+ vars:
18
+ input: "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying."
19
+ assert:
20
+ - type: regex
21
+ value: "\\?"
22
+ description: "Response must contain at least one question"
23
+ - type: not-contains
24
+ value: "sprint"
25
+ description: "Does not dispatch research sprint on first turn"
26
+ - type: not-contains
27
+ value: "dispatch"
28
+ description: "Does not dispatch agents on first turn"
29
+
30
+ # Eval 1: fresh-idea-turn2-research
31
+ - description: "turn 2 — dispatches research agents, citations, labeled recommendation, challenge, ends with question"
32
+ vars:
33
+ input: "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability."
34
+ assert:
35
+ - type: regex
36
+ value: "https?://"
37
+ description: "Response includes at least one URL citation"
38
+ - type: regex
39
+ value: "RECOMMENDED|recommended|#1"
40
+ description: "At least one suggestion is labeled as recommended"
41
+ - type: regex
42
+ value: "\\?"
43
+ description: "Ends with a question"
44
+
45
+ # Eval 2: turn3-deeper-research
46
+ - description: "turn 3 — deeper research on specific concerns, builds on prior, fresh citations, challenge"
47
+ vars:
48
+ input: "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins."
49
+ assert:
50
+ - type: regex
51
+ value: "matching|algorithm|cold.start"
52
+ description: "Response addresses the specific concerns raised"
53
+ - type: regex
54
+ value: "https?://"
55
+ description: "Response includes at least one URL citation"
56
+ - type: regex
57
+ value: "\\?"
58
+ description: "Ends with a question"
59
+
60
+ # Eval 3: brain-dump-intake
61
+ - description: "brain dump intake — extracts structure, confirmation gate, no research yet"
62
+ vars:
63
+ input: "help me build all the suggestions in this chat: [brain dump about eng-buddy]"
64
+ assert:
65
+ - type: regex
66
+ value: "Decided|decided|Decision|decision"
67
+ description: "Contains a 'Decided' section"
68
+ - type: regex
69
+ value: "open question|Open question|gap|Gap"
70
+ description: "Contains an open questions or gaps section"
71
+ - type: regex
72
+ value: "\\?"
73
+ description: "Ends with a confirmation question"
74
+ - type: not-contains
75
+ value: "dispatch"
76
+ description: "Does not dispatch research agents on intake turn"
77
+
78
+ # Eval 4: brain-dump-turn2-research
79
+ - description: "brain dump turn 2 — novelty map, research citations, challenge, question"
80
+ vars:
81
+ input: "Yeah that looks right, go ahead and research it"
82
+ assert:
83
+ - type: regex
84
+ value: "Novelty|novelty|novel|Novel"
85
+ description: "Contains a Novelty Map"
86
+ - type: regex
87
+ value: "https?://"
88
+ description: "Response includes at least one URL citation"
89
+ - type: regex
90
+ value: "\\?"
91
+ description: "Ends with a question"
92
+
93
+ # Eval 5: phase3-gate
94
+ - description: "phase 3 gate — vision summary and approval before generating full plan"
95
+ vars:
96
+ input: "Ok I think I know what I want. Let's turn this into a plan."
97
+ assert:
98
+ - type: regex
99
+ value: "summary|Summary|we've landed|landed on|here's what"
100
+ description: "Presents a vision summary before generating the full plan"
101
+ - type: regex
102
+ value: "confirm|Confirm|proceed|Proceed|ready|Ready|go ahead|approve"
103
+ description: "Asks for confirmation before proceeding to full plan"
104
+ - type: not-contains
105
+ value: "Wave 1"
106
+ description: "Does not dump the full plan structure prematurely"
107
+ - type: not-contains
108
+ value: "Wave 2"
109
+ description: "Does not dump the full plan structure prematurely"