feed-the-machine 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (269) hide show
  1. package/LICENSE +21 -21
  2. package/README.md +170 -170
  3. package/bin/brain.py +1340 -0
  4. package/bin/convert_claude_skills_to_codex.py +490 -0
  5. package/bin/generate-manifest.mjs +463 -463
  6. package/bin/harden_codex_skills.py +141 -0
  7. package/bin/install.mjs +491 -491
  8. package/bin/migrate-eng-buddy-data.py +875 -0
  9. package/bin/playbook_engine/__init__.py +1 -0
  10. package/bin/playbook_engine/conftest.py +8 -0
  11. package/bin/playbook_engine/extractor.py +33 -0
  12. package/bin/playbook_engine/manager.py +102 -0
  13. package/bin/playbook_engine/models.py +84 -0
  14. package/bin/playbook_engine/registry.py +35 -0
  15. package/bin/playbook_engine/test_extractor.py +72 -0
  16. package/bin/playbook_engine/test_integration.py +129 -0
  17. package/bin/playbook_engine/test_manager.py +85 -0
  18. package/bin/playbook_engine/test_models.py +166 -0
  19. package/bin/playbook_engine/test_registry.py +67 -0
  20. package/bin/playbook_engine/test_tracer.py +86 -0
  21. package/bin/playbook_engine/tracer.py +93 -0
  22. package/bin/tasks_db.py +456 -0
  23. package/docs/HOOKS.md +243 -243
  24. package/docs/INBOX.md +233 -233
  25. package/ftm/SKILL.md +125 -122
  26. package/ftm-audit/SKILL.md +623 -623
  27. package/ftm-audit/references/protocols/PROJECT-PATTERNS.md +91 -91
  28. package/ftm-audit/references/protocols/RUNTIME-WIRING.md +66 -66
  29. package/ftm-audit/references/protocols/WIRING-CONTRACTS.md +135 -135
  30. package/ftm-audit/references/strategies/AUTO-FIX-STRATEGIES.md +69 -69
  31. package/ftm-audit/references/templates/REPORT-FORMAT.md +96 -96
  32. package/ftm-audit/scripts/run-knip.sh +23 -23
  33. package/ftm-audit.yml +2 -2
  34. package/ftm-brainstorm/SKILL.md +1003 -498
  35. package/ftm-brainstorm/evals/evals.json +180 -100
  36. package/ftm-brainstorm/evals/promptfoo.yaml +109 -109
  37. package/ftm-brainstorm/references/agent-prompts.md +552 -224
  38. package/ftm-brainstorm/references/plan-template.md +209 -121
  39. package/ftm-brainstorm.yml +2 -2
  40. package/ftm-browse/SKILL.md +454 -454
  41. package/ftm-browse/daemon/browser-manager.ts +206 -206
  42. package/ftm-browse/daemon/bun.lock +30 -30
  43. package/ftm-browse/daemon/cli.ts +347 -347
  44. package/ftm-browse/daemon/commands.ts +410 -410
  45. package/ftm-browse/daemon/main.ts +357 -357
  46. package/ftm-browse/daemon/package.json +17 -17
  47. package/ftm-browse/daemon/server.ts +189 -189
  48. package/ftm-browse/daemon/snapshot.ts +519 -519
  49. package/ftm-browse/daemon/tsconfig.json +22 -22
  50. package/ftm-browse.yml +4 -4
  51. package/ftm-capture/SKILL.md +370 -370
  52. package/ftm-capture.yml +4 -4
  53. package/ftm-codex-gate/SKILL.md +361 -361
  54. package/ftm-codex-gate.yml +2 -2
  55. package/ftm-config/SKILL.md +422 -345
  56. package/ftm-config.default.yml +125 -82
  57. package/ftm-config.yml +44 -2
  58. package/ftm-council/SKILL.md +416 -416
  59. package/ftm-council/references/prompts/CLAUDE-INVESTIGATION.md +60 -60
  60. package/ftm-council/references/prompts/CODEX-INVESTIGATION.md +58 -58
  61. package/ftm-council/references/prompts/GEMINI-INVESTIGATION.md +58 -58
  62. package/ftm-council/references/prompts/REBUTTAL-TEMPLATE.md +57 -57
  63. package/ftm-council/references/protocols/PREREQUISITES.md +47 -47
  64. package/ftm-council/references/protocols/STEP-0-FRAMING.md +46 -46
  65. package/ftm-council.yml +2 -2
  66. package/ftm-dashboard/SKILL.md +163 -163
  67. package/ftm-dashboard.yml +4 -4
  68. package/ftm-debug/SKILL.md +1037 -1037
  69. package/ftm-debug/references/phases/PHASE-0-INTAKE.md +58 -58
  70. package/ftm-debug/references/phases/PHASE-1-TRIAGE.md +46 -46
  71. package/ftm-debug/references/phases/PHASE-2-WAR-ROOM-AGENTS.md +279 -279
  72. package/ftm-debug/references/phases/PHASE-3-TO-6-EXECUTION.md +436 -436
  73. package/ftm-debug/references/protocols/BLACKBOARD.md +86 -86
  74. package/ftm-debug/references/protocols/EDGE-CASES.md +103 -103
  75. package/ftm-debug.yml +2 -2
  76. package/ftm-diagram/SKILL.md +277 -277
  77. package/ftm-diagram.yml +2 -2
  78. package/ftm-executor/SKILL.md +777 -777
  79. package/ftm-executor/references/STYLE-TEMPLATE.md +73 -73
  80. package/ftm-executor/references/phases/PHASE-0-VERIFICATION.md +62 -62
  81. package/ftm-executor/references/phases/PHASE-2-AGENT-ASSEMBLY.md +34 -34
  82. package/ftm-executor/references/phases/PHASE-3-WORKTREES.md +38 -38
  83. package/ftm-executor/references/phases/PHASE-4-5-AUDIT.md +72 -72
  84. package/ftm-executor/references/phases/PHASE-4-DISPATCH.md +66 -66
  85. package/ftm-executor/references/phases/PHASE-5-5-CODEX-GATE.md +73 -73
  86. package/ftm-executor/references/protocols/DOCUMENTATION-BOOTSTRAP.md +36 -36
  87. package/ftm-executor/references/protocols/MODEL-PROFILE.md +59 -59
  88. package/ftm-executor/references/protocols/PROGRESS-TRACKING.md +66 -66
  89. package/ftm-executor/runtime/ftm-runtime.mjs +252 -252
  90. package/ftm-executor/runtime/package.json +8 -8
  91. package/ftm-executor.yml +2 -2
  92. package/ftm-git/SKILL.md +441 -441
  93. package/ftm-git/evals/evals.json +26 -26
  94. package/ftm-git/evals/promptfoo.yaml +75 -75
  95. package/ftm-git/hooks/post-commit-experience.sh +92 -92
  96. package/ftm-git/references/patterns/SECRET-PATTERNS.md +104 -104
  97. package/ftm-git/references/protocols/REMEDIATION.md +139 -139
  98. package/ftm-git/scripts/pre-commit-secrets.sh +110 -110
  99. package/ftm-git.yml +2 -2
  100. package/ftm-inbox/backend/__pycache__/main.cpython-314.pyc +0 -0
  101. package/ftm-inbox/backend/adapters/_retry.py +64 -64
  102. package/ftm-inbox/backend/adapters/base.py +230 -230
  103. package/ftm-inbox/backend/adapters/freshservice.py +104 -104
  104. package/ftm-inbox/backend/adapters/gmail.py +125 -125
  105. package/ftm-inbox/backend/adapters/jira.py +136 -136
  106. package/ftm-inbox/backend/adapters/registry.py +192 -192
  107. package/ftm-inbox/backend/adapters/slack.py +110 -110
  108. package/ftm-inbox/backend/db/connection.py +54 -54
  109. package/ftm-inbox/backend/db/schema.py +78 -78
  110. package/ftm-inbox/backend/executor/__init__.py +7 -7
  111. package/ftm-inbox/backend/executor/engine.py +149 -149
  112. package/ftm-inbox/backend/executor/step_runner.py +98 -98
  113. package/ftm-inbox/backend/main.py +103 -103
  114. package/ftm-inbox/backend/models/__init__.py +1 -1
  115. package/ftm-inbox/backend/models/unified_task.py +36 -36
  116. package/ftm-inbox/backend/planner/__init__.py +6 -6
  117. package/ftm-inbox/backend/planner/__pycache__/__init__.cpython-314.pyc +0 -0
  118. package/ftm-inbox/backend/planner/__pycache__/generator.cpython-314.pyc +0 -0
  119. package/ftm-inbox/backend/planner/__pycache__/schema.cpython-314.pyc +0 -0
  120. package/ftm-inbox/backend/planner/generator.py +127 -127
  121. package/ftm-inbox/backend/planner/schema.py +34 -34
  122. package/ftm-inbox/backend/requirements.txt +5 -5
  123. package/ftm-inbox/backend/routes/__pycache__/plan.cpython-314.pyc +0 -0
  124. package/ftm-inbox/backend/routes/execute.py +186 -186
  125. package/ftm-inbox/backend/routes/health.py +52 -52
  126. package/ftm-inbox/backend/routes/inbox.py +68 -68
  127. package/ftm-inbox/backend/routes/plan.py +271 -271
  128. package/ftm-inbox/bin/launchagent.mjs +91 -91
  129. package/ftm-inbox/bin/setup.mjs +188 -188
  130. package/ftm-inbox/bin/start.sh +10 -10
  131. package/ftm-inbox/bin/status.sh +17 -17
  132. package/ftm-inbox/bin/stop.sh +8 -8
  133. package/ftm-inbox/config.example.yml +55 -55
  134. package/ftm-inbox/package-lock.json +2898 -2898
  135. package/ftm-inbox/package.json +26 -26
  136. package/ftm-inbox/postcss.config.js +6 -6
  137. package/ftm-inbox/src/app.css +199 -199
  138. package/ftm-inbox/src/app.html +18 -18
  139. package/ftm-inbox/src/lib/api.ts +166 -166
  140. package/ftm-inbox/src/lib/components/ExecutionLog.svelte +81 -81
  141. package/ftm-inbox/src/lib/components/InboxFeed.svelte +143 -143
  142. package/ftm-inbox/src/lib/components/PlanStep.svelte +271 -271
  143. package/ftm-inbox/src/lib/components/PlanView.svelte +206 -206
  144. package/ftm-inbox/src/lib/components/StreamPanel.svelte +99 -99
  145. package/ftm-inbox/src/lib/components/TaskCard.svelte +190 -190
  146. package/ftm-inbox/src/lib/components/ui/EmptyState.svelte +63 -63
  147. package/ftm-inbox/src/lib/components/ui/KawaiiCard.svelte +86 -86
  148. package/ftm-inbox/src/lib/components/ui/PillButton.svelte +106 -106
  149. package/ftm-inbox/src/lib/components/ui/StatusBadge.svelte +67 -67
  150. package/ftm-inbox/src/lib/components/ui/StreamDrawer.svelte +149 -149
  151. package/ftm-inbox/src/lib/components/ui/ThemeToggle.svelte +80 -80
  152. package/ftm-inbox/src/lib/theme.ts +47 -47
  153. package/ftm-inbox/src/routes/+layout.svelte +76 -76
  154. package/ftm-inbox/src/routes/+page.svelte +401 -401
  155. package/ftm-inbox/svelte.config.js +12 -12
  156. package/ftm-inbox/tailwind.config.ts +63 -63
  157. package/ftm-inbox/tsconfig.json +13 -13
  158. package/ftm-inbox/vite.config.ts +6 -6
  159. package/ftm-intent/SKILL.md +241 -241
  160. package/ftm-intent.yml +2 -2
  161. package/ftm-manifest.json +3794 -3794
  162. package/ftm-map/SKILL.md +291 -291
  163. package/ftm-map/scripts/db.py +712 -712
  164. package/ftm-map/scripts/index.py +415 -415
  165. package/ftm-map/scripts/parser.py +224 -224
  166. package/ftm-map/scripts/queries/go-tags.scm +20 -20
  167. package/ftm-map/scripts/queries/javascript-tags.scm +35 -35
  168. package/ftm-map/scripts/queries/python-tags.scm +31 -31
  169. package/ftm-map/scripts/queries/ruby-tags.scm +19 -19
  170. package/ftm-map/scripts/queries/rust-tags.scm +37 -37
  171. package/ftm-map/scripts/queries/typescript-tags.scm +41 -41
  172. package/ftm-map/scripts/query.py +301 -301
  173. package/ftm-map/scripts/ranker.py +377 -377
  174. package/ftm-map/scripts/requirements.txt +5 -5
  175. package/ftm-map/scripts/setup-hooks.sh +27 -27
  176. package/ftm-map/scripts/setup.sh +56 -56
  177. package/ftm-map/scripts/test_db.py +364 -364
  178. package/ftm-map/scripts/test_parser.py +174 -174
  179. package/ftm-map/scripts/test_query.py +183 -183
  180. package/ftm-map/scripts/test_ranker.py +199 -199
  181. package/ftm-map/scripts/views.py +591 -591
  182. package/ftm-map.yml +2 -2
  183. package/ftm-mind/SKILL.md +201 -1943
  184. package/ftm-mind/evals/promptfoo.yaml +142 -142
  185. package/ftm-mind/references/blackboard-protocol.md +110 -0
  186. package/ftm-mind/references/blackboard-schema.md +328 -328
  187. package/ftm-mind/references/complexity-guide.md +110 -110
  188. package/ftm-mind/references/complexity-sizing.md +138 -0
  189. package/ftm-mind/references/decide-act-protocol.md +172 -0
  190. package/ftm-mind/references/direct-execution.md +51 -0
  191. package/ftm-mind/references/environment-discovery.md +77 -0
  192. package/ftm-mind/references/event-registry.md +319 -319
  193. package/ftm-mind/references/mcp-inventory.md +300 -296
  194. package/ftm-mind/references/ops-routing.md +47 -0
  195. package/ftm-mind/references/orient-protocol.md +234 -0
  196. package/ftm-mind/references/personality.md +40 -0
  197. package/ftm-mind/references/protocols/COMPLEXITY-SIZING.md +72 -72
  198. package/ftm-mind/references/protocols/MCP-HEURISTICS.md +32 -32
  199. package/ftm-mind/references/protocols/PLAN-APPROVAL.md +80 -80
  200. package/ftm-mind/references/reflexion-protocol.md +249 -249
  201. package/ftm-mind/references/routing/SCENARIOS.md +22 -22
  202. package/ftm-mind/references/routing-scenarios.md +35 -35
  203. package/ftm-mind.yml +2 -2
  204. package/ftm-ops.yml +4 -0
  205. package/ftm-pause/SKILL.md +395 -395
  206. package/ftm-pause/references/protocols/SKILL-RESTORE-PROTOCOLS.md +186 -186
  207. package/ftm-pause/references/protocols/VALIDATION.md +80 -80
  208. package/ftm-pause.yml +2 -2
  209. package/ftm-researcher/SKILL.md +275 -275
  210. package/ftm-researcher/evals/agent-diversity.yaml +17 -17
  211. package/ftm-researcher/evals/synthesis-quality.yaml +12 -12
  212. package/ftm-researcher/evals/trigger-accuracy.yaml +39 -39
  213. package/ftm-researcher/references/adaptive-search.md +116 -116
  214. package/ftm-researcher/references/agent-prompts.md +193 -193
  215. package/ftm-researcher/references/council-integration.md +193 -193
  216. package/ftm-researcher/references/output-format.md +203 -203
  217. package/ftm-researcher/references/synthesis-pipeline.md +165 -165
  218. package/ftm-researcher/scripts/score_credibility.py +234 -234
  219. package/ftm-researcher/scripts/validate_research.py +92 -92
  220. package/ftm-researcher.yml +2 -2
  221. package/ftm-resume/SKILL.md +518 -518
  222. package/ftm-resume/references/protocols/VALIDATION.md +172 -172
  223. package/ftm-resume.yml +2 -2
  224. package/ftm-retro/SKILL.md +380 -380
  225. package/ftm-retro/references/protocols/SCORING-RUBRICS.md +89 -89
  226. package/ftm-retro/references/templates/REPORT-FORMAT.md +109 -109
  227. package/ftm-retro.yml +2 -2
  228. package/ftm-routine/SKILL.md +170 -170
  229. package/ftm-routine.yml +4 -4
  230. package/ftm-state/blackboard/capabilities.json +5 -5
  231. package/ftm-state/blackboard/capabilities.schema.json +27 -27
  232. package/ftm-state/blackboard/context.json +37 -23
  233. package/ftm-state/blackboard/experiences/doom-statusline-fix.json +26 -0
  234. package/ftm-state/blackboard/experiences/hackathon-pages-site.json +26 -0
  235. package/ftm-state/blackboard/experiences/hindsight-sso-kickoff.json +42 -0
  236. package/ftm-state/blackboard/experiences/index.json +58 -9
  237. package/ftm-state/blackboard/experiences/learning-ragnarok-api-access.json +23 -0
  238. package/ftm-state/blackboard/experiences/nordlayer-members-auto-assign.json +26 -0
  239. package/ftm-state/blackboard/experiences/saml2aws-stale-session-fix.json +41 -0
  240. package/ftm-state/blackboard/patterns.json +6 -6
  241. package/ftm-state/schemas/context.schema.json +130 -130
  242. package/ftm-state/schemas/experience-index.schema.json +77 -77
  243. package/ftm-state/schemas/experience.schema.json +78 -78
  244. package/ftm-state/schemas/patterns.schema.json +44 -44
  245. package/ftm-upgrade/SKILL.md +194 -194
  246. package/ftm-upgrade/scripts/check-version.sh +76 -76
  247. package/ftm-upgrade/scripts/upgrade.sh +143 -143
  248. package/ftm-upgrade.yml +2 -2
  249. package/ftm-verify.yml +2 -2
  250. package/ftm.yml +2 -2
  251. package/hooks/ftm-auto-log.sh +137 -0
  252. package/hooks/ftm-blackboard-enforcer.sh +93 -93
  253. package/hooks/ftm-discovery-reminder.sh +90 -90
  254. package/hooks/ftm-drafts-gate.sh +61 -61
  255. package/hooks/ftm-event-logger.mjs +107 -107
  256. package/hooks/ftm-install-hooks.sh +240 -0
  257. package/hooks/ftm-learning-capture.sh +117 -0
  258. package/hooks/ftm-map-autodetect.sh +79 -79
  259. package/hooks/ftm-pending-sync-check.sh +22 -22
  260. package/hooks/ftm-plan-gate.sh +92 -92
  261. package/hooks/ftm-post-commit-trigger.sh +57 -57
  262. package/hooks/ftm-post-compaction.sh +138 -0
  263. package/hooks/ftm-pre-compaction.sh +147 -0
  264. package/hooks/ftm-session-end.sh +52 -0
  265. package/hooks/ftm-session-snapshot.sh +213 -0
  266. package/hooks/settings-template.json +81 -81
  267. package/install.sh +363 -363
  268. package/package.json +84 -84
  269. package/uninstall.sh +25 -25
@@ -1,100 +1,180 @@
1
- {
2
- "skill_name": "ftm-brainstorm",
3
- "evals": [
4
- {
5
- "id": 0,
6
- "name": "fresh-idea-intake",
7
- "prompt": "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying.",
8
- "expected_output": "Phase 0 repo scan launched in background, 1-2 intake questions, hard STOP",
9
- "files": [],
10
- "assertions": [
11
- {"name": "one_or_two_questions", "description": "Asks 1-2 questions max, not a wall of questions"},
12
- {"name": "no_research_sprint_turn1", "description": "Does NOT dispatch research agents on the very first turn — intake only"},
13
- {"name": "hard_stop", "description": "Ends with a question and waits — does NOT proceed to synthesize or generate suggestions"},
14
- {"name": "repo_scan_launched", "description": "Mentions or silently launches a repo/codebase scan agent in background"}
15
- ]
16
- },
17
- {
18
- "id": 1,
19
- "name": "fresh-idea-turn2-research",
20
- "prompt": "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability.",
21
- "expected_output": "First research sprint dispatched (3 agents), synthesized suggestions with citations, challenge, 1-2 questions, STOP",
22
- "files": [],
23
- "multi_turn_context": "Turn 2. Turn 1: user said 'building study buddy app like Tinder for studying', skill asked intake questions, user now responds with details.",
24
- "assertions": [
25
- {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents (web, github, competitive) not fewer"},
26
- {"name": "real_citations", "description": "At least 3 unique URLs to real repos/articles/products in the synthesis"},
27
- {"name": "suggestions_with_evidence", "description": "Presents numbered suggestions (3-5) with real-world evidence backing each one"},
28
- {"name": "recommendation_labeled", "description": "Suggestion #1 is labeled RECOMMENDED with rationale"},
29
- {"name": "challenge_present", "description": "Includes at least one challenge/pushback after suggestions"},
30
- {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions to drive next research sprint"},
31
- {"name": "hard_stop", "description": "Does NOT continue past the questions — waits for user response"},
32
- {"name": "depth_is_broad", "description": "Research queries are landscape-level (major approaches, who's done this) not implementation-specific"}
33
- ]
34
- },
35
- {
36
- "id": 2,
37
- "name": "turn3-deeper-research",
38
- "prompt": "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins.",
39
- "expected_output": "Second research sprint (deeper, focused on RN+Firebase+matching), new suggestions building on prior, new challenges, new questions",
40
- "files": [],
41
- "multi_turn_context": "Turn 3. Prior turns: user described study buddy app, first research sprint found 5 approaches, user now picks one and raises two specific concerns.",
42
- "assertions": [
43
- {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents again — every turn gets a research sprint"},
44
- {"name": "research_is_deeper", "description": "Search queries target matching algorithms and cold-start specifically, NOT broad 'study buddy app' landscape again"},
45
- {"name": "builds_on_prior", "description": "References prior turn's findings — does not re-present the same 5 approaches"},
46
- {"name": "new_citations", "description": "At least 2 URLs not seen in prior turns — fresh research, not recycled"},
47
- {"name": "addresses_user_concerns", "description": "Suggestions specifically address matching algorithm complexity AND cold-start problem"},
48
- {"name": "challenge_present", "description": "Challenges the user on at least one assumption or pushes back on scope"},
49
- {"name": "ends_with_question", "description": "Ends with 1-2 questions that unlock the NEXT research vector"},
50
- {"name": "hard_stop", "description": "Does NOT continue past the questions"}
51
- ]
52
- },
53
- {
54
- "id": 3,
55
- "name": "brain-dump-intake",
56
- "prompt": "help me build all the suggestions in this chat: [brain dump about eng-buddy]",
57
- "expected_output": "Path B structured extraction with repo context, confirmation gate, no research yet",
58
- "files": ["brain-dump-input.md"],
59
- "assertions": [
60
- {"name": "decided_section", "description": "Contains a 'Decided' or 'Decisions already made' section"},
61
- {"name": "open_questions_section", "description": "Contains an 'Open questions' or 'Gaps' section"},
62
- {"name": "no_basic_questions", "description": "Does NOT ask basic 'what are you building?' questions already answered by the paste"},
63
- {"name": "confirmation_gate", "description": "Ends with a confirmation question before proceeding to research"},
64
- {"name": "no_research_sprint", "description": "Does NOT dispatch research agents or present suggestions on this turn"},
65
- {"name": "hard_stop", "description": "Stops after asking for confirmation — does not proceed"}
66
- ]
67
- },
68
- {
69
- "id": 4,
70
- "name": "brain-dump-turn2-research",
71
- "prompt": "Yeah that looks right, go ahead and research it",
72
- "expected_output": "First research sprint in brain dump mode: novelty map, suggestions with citations, challenges",
73
- "files": ["brain-dump-input.md"],
74
- "multi_turn_context": "Turn 2 of brain dump. Turn 1: user pasted eng-buddy brain dump, skill extracted structured summary, user now confirms.",
75
- "assertions": [
76
- {"name": "three_agents_dispatched", "description": "Dispatches 3 parallel research agents searching for specific brain dump claims"},
77
- {"name": "novelty_map_present", "description": "Contains a Novelty Map table with solved/partially solved/novel verdicts"},
78
- {"name": "real_citations", "description": "At least 5 unique URLs to real repos/articles/products"},
79
- {"name": "brain_dump_claims_researched", "description": "Agents searched for specific architectural claims from the dump, not just broad topic searches"},
80
- {"name": "challenge_present", "description": "At least one challenge/pushback raised"},
81
- {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions"},
82
- {"name": "hard_stop", "description": "Does NOT proceed past questions"}
83
- ]
84
- },
85
- {
86
- "id": 5,
87
- "name": "phase3-gate",
88
- "prompt": "Ok I think I know what I want. Let's turn this into a plan.",
89
- "expected_output": "Vision summary for approval, NOT the full plan yet",
90
- "files": [],
91
- "multi_turn_context": "Turn 6+ of brainstorming. Previous turns explored study-buddy app, settled on React Native + Firebase, matching algorithm, cold-start solution. User now wants to move to planning.",
92
- "assertions": [
93
- {"name": "vision_summary", "description": "Presents a brief 'here's what we've landed on' summary before generating the full plan"},
94
- {"name": "approval_gate", "description": "Asks for explicit confirmation before proceeding to full plan generation"},
95
- {"name": "does_not_dump_full_plan", "description": "Does NOT generate the entire task breakdown, agent assignments, and wave structure in this message"},
96
- {"name": "references_plan_template", "description": "Reads or references references/plan-template.md for plan generation"}
97
- ]
98
- }
99
- ]
100
- }
1
+ {
2
+ "skill_name": "ftm-brainstorm",
3
+ "evals": [
4
+ {
5
+ "id": 0,
6
+ "name": "fresh-idea-intake",
7
+ "prompt": "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying.",
8
+ "expected_output": "Phase 0 repo scan launched in background, 1-2 intake questions, hard STOP",
9
+ "files": [],
10
+ "assertions": [
11
+ {"name": "one_or_two_questions", "description": "Asks 1-2 questions max, not a wall of questions"},
12
+ {"name": "no_research_sprint_turn1", "description": "Does NOT dispatch research agents on the very first turn — intake only"},
13
+ {"name": "hard_stop", "description": "Ends with a question and waits — does NOT proceed to synthesize or generate suggestions"},
14
+ {"name": "repo_scan_launched", "description": "Mentions or silently launches a repo/codebase scan agent in background"},
15
+ {"name": "follows_energy", "description": "Asks about the most interesting/emphasized part of the idea, not a generic checklist question"},
16
+ {"name": "no_skill_questions", "description": "Does NOT ask about the user's technical experience or skill level"}
17
+ ]
18
+ },
19
+ {
20
+ "id": 1,
21
+ "name": "fresh-idea-turn2-research",
22
+ "prompt": "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability.",
23
+ "expected_output": "First research sprint dispatched (7 agents + synthesizer), synthesized suggestions with citations, challenge, 1-2 questions, STOP",
24
+ "files": [],
25
+ "multi_turn_context": "Turn 2. Turn 1: user said 'building study buddy app like Tinder for studying', skill asked intake questions, user now responds with details.",
26
+ "assertions": [
27
+ {"name": "seven_agents_dispatched", "description": "Dispatches 7 parallel research agents (web, github, competitive, stack, architecture, pitfall, UX) plus synthesizer — not fewer in standard mode"},
28
+ {"name": "real_citations", "description": "At least 3 unique URLs to real repos/articles/products in the synthesis"},
29
+ {"name": "suggestions_with_evidence", "description": "Presents numbered suggestions (3-5) with real-world evidence backing each one"},
30
+ {"name": "recommendation_labeled", "description": "Suggestion #1 is labeled RECOMMENDED with rationale"},
31
+ {"name": "challenge_present", "description": "Includes at least one challenge/pushback after suggestions as a statement, NOT a question"},
32
+ {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions via AskUserQuestion to drive next research sprint"},
33
+ {"name": "hard_stop", "description": "Does NOT continue past the questions — waits for user response"},
34
+ {"name": "depth_is_broad", "description": "Research queries are landscape-level (major approaches, who's done this) not implementation-specific"},
35
+ {"name": "synthesizer_output", "description": "Shows consensus/contested/unique findings structure from synthesizer"}
36
+ ]
37
+ },
38
+ {
39
+ "id": 2,
40
+ "name": "turn3-deeper-research",
41
+ "prompt": "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins.",
42
+ "expected_output": "Second research sprint (deeper, focused on RN+Firebase+matching), new suggestions building on prior, new challenges, new questions",
43
+ "files": [],
44
+ "multi_turn_context": "Turn 3. Prior turns: user described study buddy app, first research sprint found 5 approaches, user now picks one and raises two specific concerns.",
45
+ "assertions": [
46
+ {"name": "seven_agents_dispatched", "description": "Dispatches 7 parallel research agents again — every turn gets a research sprint"},
47
+ {"name": "research_is_deeper", "description": "Search queries target matching algorithms and cold-start specifically, NOT broad 'study buddy app' landscape again"},
48
+ {"name": "builds_on_prior", "description": "References prior turn's findings — does not re-present the same 5 approaches"},
49
+ {"name": "new_citations", "description": "At least 2 URLs not seen in prior turns — fresh research, not recycled"},
50
+ {"name": "addresses_user_concerns", "description": "Suggestions specifically address matching algorithm complexity AND cold-start problem"},
51
+ {"name": "challenge_present", "description": "Challenges the user on at least one assumption or pushes back on scope"},
52
+ {"name": "ends_with_question", "description": "Ends with 1-2 questions that unlock the NEXT research vector"},
53
+ {"name": "hard_stop", "description": "Does NOT continue past the questions"},
54
+ {"name": "prior_decision_logged", "description": "Records 'React Native + Firebase chosen' in prior decisions log — does NOT re-ask about tech stack"}
55
+ ]
56
+ },
57
+ {
58
+ "id": 3,
59
+ "name": "brain-dump-intake",
60
+ "prompt": "help me build all the suggestions in this chat: [brain dump about eng-buddy]",
61
+ "expected_output": "Path B structured extraction with repo context, confirmation gate, no research yet",
62
+ "files": ["brain-dump-input.md"],
63
+ "assertions": [
64
+ {"name": "decided_section", "description": "Contains a 'Decided' or 'Decisions already made' section"},
65
+ {"name": "open_questions_section", "description": "Contains an 'Open questions' or 'Gaps' section"},
66
+ {"name": "no_basic_questions", "description": "Does NOT ask basic 'what are you building?' questions already answered by the paste"},
67
+ {"name": "confirmation_gate", "description": "Ends with a confirmation question before proceeding to research"},
68
+ {"name": "no_research_sprint", "description": "Does NOT dispatch research agents or present suggestions on this turn"},
69
+ {"name": "hard_stop", "description": "Stops after asking for confirmation — does not proceed"}
70
+ ]
71
+ },
72
+ {
73
+ "id": 4,
74
+ "name": "brain-dump-turn2-research",
75
+ "prompt": "Yeah that looks right, go ahead and research it",
76
+ "expected_output": "First research sprint in brain dump mode: novelty map, suggestions with citations, challenges",
77
+ "files": ["brain-dump-input.md"],
78
+ "multi_turn_context": "Turn 2 of brain dump. Turn 1: user pasted eng-buddy brain dump, skill extracted structured summary, user now confirms.",
79
+ "assertions": [
80
+ {"name": "seven_agents_dispatched", "description": "Dispatches 7 parallel research agents searching for specific brain dump claims"},
81
+ {"name": "novelty_map_present", "description": "Contains a Novelty Map table with solved/partially solved/novel verdicts"},
82
+ {"name": "real_citations", "description": "At least 5 unique URLs to real repos/articles/products"},
83
+ {"name": "brain_dump_claims_researched", "description": "Agents searched for specific architectural claims from the dump, not just broad topic searches"},
84
+ {"name": "challenge_present", "description": "At least one challenge/pushback raised as a statement"},
85
+ {"name": "ends_with_question", "description": "Ends with 1-2 targeted questions via AskUserQuestion"},
86
+ {"name": "hard_stop", "description": "Does NOT proceed past questions"}
87
+ ]
88
+ },
89
+ {
90
+ "id": 5,
91
+ "name": "phase3-gate",
92
+ "prompt": "Ok I think I know what I want. Let's turn this into a plan.",
93
+ "expected_output": "Spec self-review + pre-mortem runs, vision summary with risk mitigations for approval, NOT the full plan yet",
94
+ "files": [],
95
+ "multi_turn_context": "Turn 6+ of brainstorming. Previous turns explored study-buddy app, settled on React Native + Firebase, matching algorithm, cold-start solution. User now wants to move to planning.",
96
+ "assertions": [
97
+ {"name": "spec_self_review", "description": "Runs spec self-review checking for placeholders, contradictions, scope gaps, ambiguity"},
98
+ {"name": "pre_mortem_runs", "description": "Runs pre-mortem stress test generating failure scenarios with mitigations"},
99
+ {"name": "vision_summary", "description": "Presents a brief 'here's what we've landed on' summary including key decisions AND top risks"},
100
+ {"name": "canonical_references", "description": "Includes canonical references section with links to key sources from research"},
101
+ {"name": "approval_gate", "description": "Asks for explicit confirmation before proceeding to full plan generation"},
102
+ {"name": "does_not_dump_full_plan", "description": "Does NOT generate the entire task breakdown, agent assignments, and wave structure in this message"},
103
+ {"name": "references_plan_template", "description": "Reads or references references/plan-template.md for plan generation"}
104
+ ]
105
+ },
106
+ {
107
+ "id": 6,
108
+ "name": "assumption-audit-activation",
109
+ "prompt": "I want to build a real-time collaborative whiteboard where multiple users can draw simultaneously. It needs to sync instantly across devices.",
110
+ "expected_output": "Intake question, and on turn 2-3 the assumption audit module should activate to challenge 'instantly' and 'real-time' assumptions",
111
+ "files": [],
112
+ "multi_turn_context": "Turn 2-3. User has described a collaborative whiteboard with real-time sync. The assumption audit should catch assumptions about 'instantly' (network latency exists), 'real-time' (what tolerance?), and synchronization approach.",
113
+ "assertions": [
114
+ {"name": "assumption_audit_activates", "description": "Runs the First Principles Assumption Audit module"},
115
+ {"name": "crackable_assumptions_found", "description": "Identifies at least one crackable assumption (e.g., 'instantly' is a design choice, not a requirement)"},
116
+ {"name": "five_levels_analyzed", "description": "Examines assumptions at surface, process, structural, cultural, and fundamental levels"},
117
+ {"name": "user_can_respond", "description": "Presents assumptions as keep/challenge/ignore choices via AskUserQuestion"}
118
+ ]
119
+ },
120
+ {
121
+ "id": 7,
122
+ "name": "discuss-mode-activation",
123
+ "prompt": "I know exactly what I want. Here's my spec: Build a REST API endpoint POST /api/v1/recordings that accepts multipart form data with an audio file (max 500MB), stores it in S3 with a UUID filename, creates a database record with metadata (duration, format, size, created_at), and returns the record ID with a signed URL for playback. Use Express.js with multer for uploads, pg for Postgres, and aws-sdk for S3.",
124
+ "expected_output": "Discuss mode activates: parses spec, identifies gray areas using SEE/CALL/RUN/READ/ORGANIZE heuristics, asks targeted questions",
125
+ "files": [],
126
+ "assertions": [
127
+ {"name": "discuss_mode_detected", "description": "Recognizes this is a detailed spec and activates Discuss Mode instead of standard brainstorm"},
128
+ {"name": "spec_parsed", "description": "Extracts key components: endpoint, storage, database, response format"},
129
+ {"name": "gray_areas_found", "description": "Identifies gray areas: auth, rate limiting, error handling for S3 failures, max concurrent uploads, file validation"},
130
+ {"name": "uses_heuristics", "description": "Applies SEE/CALL/RUN/READ/ORGANIZE categorization to find unknowns"},
131
+ {"name": "targeted_questions", "description": "Asks 3-5 specific questions about gray areas, NOT basic 'what are you building' questions"},
132
+ {"name": "no_broad_research", "description": "Does NOT launch a broad 7-agent landscape research sprint — goes straight to targeted analysis"}
133
+ ]
134
+ },
135
+ {
136
+ "id": 8,
137
+ "name": "scope-guardrail",
138
+ "prompt": "Oh actually, while we're at it, we should also add a social feed where users can post their study notes and other students can comment on them.",
139
+ "expected_output": "Acknowledges the idea, adds to Deferred Ideas, redirects to current scope",
140
+ "files": [],
141
+ "multi_turn_context": "Turn 4 of study buddy app brainstorm. Core scope is matching + scheduling. User now wants to add a social feed — this is scope creep.",
142
+ "assertions": [
143
+ {"name": "acknowledges_idea", "description": "Does NOT dismiss the idea — acknowledges it's worth building"},
144
+ {"name": "defers_to_future", "description": "Adds the social feed to a Deferred Ideas list"},
145
+ {"name": "redirects_to_scope", "description": "Steers back to the current core scope (matching + scheduling)"},
146
+ {"name": "does_not_incorporate", "description": "Does NOT start researching social feeds or adding it to the current plan"}
147
+ ]
148
+ },
149
+ {
150
+ "id": 9,
151
+ "name": "anti-rationalization-check",
152
+ "prompt": "This is pretty straightforward, just generate the plan already. We don't need more research.",
153
+ "expected_output": "Does NOT skip to plan generation. Checks anti-rationalization table, explains why process matters, continues the brainstorm loop.",
154
+ "files": [],
155
+ "multi_turn_context": "Turn 3 of a complex brainstorm. Only 2 research sprints completed. User is impatient but the skill has a hard gate requiring explicit readiness AND research saturation.",
156
+ "assertions": [
157
+ {"name": "does_not_skip", "description": "Does NOT immediately generate a plan — honors the hard gate"},
158
+ {"name": "explains_value", "description": "Briefly explains what remaining research could surface, without being preachy"},
159
+ {"name": "offers_quick_mode", "description": "Offers Quick Mode as a compromise if the user truly wants speed"},
160
+ {"name": "continues_loop", "description": "Runs at least one more research sprint to check for saturation"}
161
+ ]
162
+ },
163
+ {
164
+ "id": 10,
165
+ "name": "plan-quality-verification",
166
+ "prompt": "Looks good, save it.",
167
+ "expected_output": "Runs plan checker agent before saving. Fixes any issues found. Then saves.",
168
+ "files": [],
169
+ "multi_turn_context": "Phase 3, final section approved. Plan has been presented incrementally and user approved all 3 sections. Now saving.",
170
+ "assertions": [
171
+ {"name": "plan_checker_runs", "description": "Dispatches a plan checker agent to validate the plan before saving"},
172
+ {"name": "nyquist_validated", "description": "Every task in the plan has an automated verify command"},
173
+ {"name": "discovery_levels_tagged", "description": "Every task has a discovery level (L0-L3)"},
174
+ {"name": "canonical_references_present", "description": "Plan includes a canonical references section"},
175
+ {"name": "deferred_ideas_present", "description": "Plan includes deferred ideas from scope guardrail"},
176
+ {"name": "handoff_prompt", "description": "Provides /ftm-executor handoff command after saving"}
177
+ ]
178
+ }
179
+ ]
180
+ }
@@ -1,109 +1,109 @@
1
- description: "ftm-brainstorm behavior across multi-turn brainstorming sessions"
2
-
3
- prompts:
4
- - "{{input}}"
5
-
6
- providers:
7
- - id: "exec:claude --print"
8
- label: "claude-code"
9
-
10
- defaultTest:
11
- options:
12
- transformVars: "vars"
13
-
14
- tests:
15
- # Eval 0: fresh-idea-intake
16
- - description: "fresh idea intake — asks 1-2 questions, no research sprint, hard stop"
17
- vars:
18
- input: "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying."
19
- assert:
20
- - type: regex
21
- value: "\\?"
22
- description: "Response must contain at least one question"
23
- - type: not-contains
24
- value: "sprint"
25
- description: "Does not dispatch research sprint on first turn"
26
- - type: not-contains
27
- value: "dispatch"
28
- description: "Does not dispatch agents on first turn"
29
-
30
- # Eval 1: fresh-idea-turn2-research
31
- - description: "turn 2 — dispatches research agents, citations, labeled recommendation, challenge, ends with question"
32
- vars:
33
- input: "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability."
34
- assert:
35
- - type: regex
36
- value: "https?://"
37
- description: "Response includes at least one URL citation"
38
- - type: regex
39
- value: "RECOMMENDED|recommended|#1"
40
- description: "At least one suggestion is labeled as recommended"
41
- - type: regex
42
- value: "\\?"
43
- description: "Ends with a question"
44
-
45
- # Eval 2: turn3-deeper-research
46
- - description: "turn 3 — deeper research on specific concerns, builds on prior, fresh citations, challenge"
47
- vars:
48
- input: "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins."
49
- assert:
50
- - type: regex
51
- value: "matching|algorithm|cold.start"
52
- description: "Response addresses the specific concerns raised"
53
- - type: regex
54
- value: "https?://"
55
- description: "Response includes at least one URL citation"
56
- - type: regex
57
- value: "\\?"
58
- description: "Ends with a question"
59
-
60
- # Eval 3: brain-dump-intake
61
- - description: "brain dump intake — extracts structure, confirmation gate, no research yet"
62
- vars:
63
- input: "help me build all the suggestions in this chat: [brain dump about eng-buddy]"
64
- assert:
65
- - type: regex
66
- value: "Decided|decided|Decision|decision"
67
- description: "Contains a 'Decided' section"
68
- - type: regex
69
- value: "open question|Open question|gap|Gap"
70
- description: "Contains an open questions or gaps section"
71
- - type: regex
72
- value: "\\?"
73
- description: "Ends with a confirmation question"
74
- - type: not-contains
75
- value: "dispatch"
76
- description: "Does not dispatch research agents on intake turn"
77
-
78
- # Eval 4: brain-dump-turn2-research
79
- - description: "brain dump turn 2 — novelty map, research citations, challenge, question"
80
- vars:
81
- input: "Yeah that looks right, go ahead and research it"
82
- assert:
83
- - type: regex
84
- value: "Novelty|novelty|novel|Novel"
85
- description: "Contains a Novelty Map"
86
- - type: regex
87
- value: "https?://"
88
- description: "Response includes at least one URL citation"
89
- - type: regex
90
- value: "\\?"
91
- description: "Ends with a question"
92
-
93
- # Eval 5: phase3-gate
94
- - description: "phase 3 gate — vision summary and approval before generating full plan"
95
- vars:
96
- input: "Ok I think I know what I want. Let's turn this into a plan."
97
- assert:
98
- - type: regex
99
- value: "summary|Summary|we've landed|landed on|here's what"
100
- description: "Presents a vision summary before generating the full plan"
101
- - type: regex
102
- value: "confirm|Confirm|proceed|Proceed|ready|Ready|go ahead|approve"
103
- description: "Asks for confirmation before proceeding to full plan"
104
- - type: not-contains
105
- value: "Wave 1"
106
- description: "Does not dump the full plan structure prematurely"
107
- - type: not-contains
108
- value: "Wave 2"
109
- description: "Does not dump the full plan structure prematurely"
1
+ description: "ftm-brainstorm behavior across multi-turn brainstorming sessions"
2
+
3
+ prompts:
4
+ - "{{input}}"
5
+
6
+ providers:
7
+ - id: "exec:claude --print"
8
+ label: "claude-code"
9
+
10
+ defaultTest:
11
+ options:
12
+ transformVars: "vars"
13
+
14
+ tests:
15
+ # Eval 0: fresh-idea-intake
16
+ - description: "fresh idea intake — asks 1-2 questions, no research sprint, hard stop"
17
+ vars:
18
+ input: "I'm thinking about building an app that helps people find study buddies at their university. Like Tinder but for studying."
19
+ assert:
20
+ - type: regex
21
+ value: "\\?"
22
+ description: "Response must contain at least one question"
23
+ - type: not-contains
24
+ value: "sprint"
25
+ description: "Does not dispatch research sprint on first turn"
26
+ - type: not-contains
27
+ value: "dispatch"
28
+ description: "Does not dispatch agents on first turn"
29
+
30
+ # Eval 1: fresh-idea-turn2-research
31
+ - description: "turn 2 — dispatches research agents, citations, labeled recommendation, challenge, ends with question"
32
+ vars:
33
+ input: "It's for college students who want to find people in the same classes to study with. Matching based on courses, study style, and schedule availability."
34
+ assert:
35
+ - type: regex
36
+ value: "https?://"
37
+ description: "Response includes at least one URL citation"
38
+ - type: regex
39
+ value: "RECOMMENDED|recommended|#1"
40
+ description: "At least one suggestion is labeled as recommended"
41
+ - type: regex
42
+ value: "\\?"
43
+ description: "Response contains at least one question mark (proxy for ending with a question)"
44
+
45
+ # Eval 2: turn3-deeper-research
46
+ - description: "turn 3 — deeper research on specific concerns, builds on prior, fresh citations, challenge"
47
+ vars:
48
+ input: "I like option 2 — the React Native approach with Firebase. But I'm worried about the matching algorithm complexity. Also we need to handle the cold-start problem when a new university joins."
49
+ assert:
50
+ - type: regex
51
+ value: "matching|algorithm|cold.start"
52
+ description: "Response addresses the specific concerns raised"
53
+ - type: regex
54
+ value: "https?://"
55
+ description: "Response includes at least one URL citation"
56
+ - type: regex
57
+ value: "\\?"
58
+ description: "Response contains at least one question mark (proxy for ending with a question)"
59
+
60
+ # Eval 3: brain-dump-intake
61
+ - description: "brain dump intake — extracts structure, confirmation gate, no research yet"
62
+ vars:
63
+ input: "help me build all the suggestions in this chat: [brain dump about eng-buddy]"
64
+ assert:
65
+ - type: regex
66
+ value: "Decided|decided|Decision|decision"
67
+ description: "Contains a 'Decided' section"
68
+ - type: regex
69
+ value: "open question|Open question|gap|Gap"
70
+ description: "Contains an open questions or gaps section"
71
+ - type: regex
72
+ value: "\\?"
73
+ description: "Response contains at least one question mark (proxy for ending with a confirmation question)"
74
+ - type: not-contains
75
+ value: "dispatch"
76
+ description: "Does not dispatch research agents on intake turn"
77
+
78
+ # Eval 4: brain-dump-turn2-research
79
+ - description: "brain dump turn 2 — novelty map, research citations, challenge, question"
80
+ vars:
81
+ input: "Yeah that looks right, go ahead and research it"
82
+ assert:
83
+ - type: regex
84
+ value: "Novelty|novelty|novel|Novel"
85
+ description: "Contains a Novelty Map"
86
+ - type: regex
87
+ value: "https?://"
88
+ description: "Response includes at least one URL citation"
89
+ - type: regex
90
+ value: "\\?"
91
+ description: "Response contains at least one question mark (proxy for ending with a question)"
92
+
93
+ # Eval 5: phase3-gate
94
+ - description: "phase 3 gate — vision summary and approval before generating full plan"
95
+ vars:
96
+ input: "Ok I think I know what I want. Let's turn this into a plan."
97
+ assert:
98
+ - type: regex
99
+ value: "summary|Summary|we've landed|landed on|here's what"
100
+ description: "Presents a vision summary before generating the full plan"
101
+ - type: regex
102
+ value: "confirm|Confirm|proceed|Proceed|ready|Ready|go ahead|approve"
103
+ description: "Asks for confirmation before proceeding to full plan"
104
+ - type: not-contains
105
+ value: "Wave 1"
106
+ description: "Does not dump the full plan structure prematurely"
107
+ - type: not-contains
108
+ value: "Wave 2"
109
+ description: "Does not dump the full plan structure prematurely"