gsd-antigravity-kit 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251) hide show
  1. package/.agent/skills/gsd/SKILL.md +26 -4
  2. package/.agent/skills/gsd/VERSION +1 -1
  3. package/.agent/skills/gsd/assets/templates/AI-SPEC.md +246 -0
  4. package/.agent/skills/gsd/assets/templates/DEBUG.md +7 -2
  5. package/.agent/skills/gsd/assets/templates/config.json +56 -48
  6. package/.agent/skills/gsd/assets/templates/research.md +40 -0
  7. package/.agent/skills/gsd/assets/templates/spec.md +307 -0
  8. package/.agent/skills/gsd/assets/templates/state.md +8 -0
  9. package/.agent/skills/gsd/bin/gsd-tools.cjs +212 -11
  10. package/.agent/skills/gsd/bin/help-manifest.json +8 -2
  11. package/.agent/skills/gsd/bin/hooks/gsd-check-update-worker.js +108 -0
  12. package/.agent/skills/gsd/bin/hooks/gsd-check-update.js +14 -89
  13. package/.agent/skills/gsd/bin/hooks/gsd-context-monitor.js +34 -5
  14. package/.agent/skills/gsd/bin/hooks/gsd-phase-boundary.sh +1 -0
  15. package/.agent/skills/gsd/bin/hooks/gsd-prompt-guard.js +1 -1
  16. package/.agent/skills/gsd/bin/hooks/gsd-read-guard.js +6 -1
  17. package/.agent/skills/gsd/bin/hooks/gsd-session-state.sh +1 -0
  18. package/.agent/skills/gsd/bin/hooks/gsd-statusline.js +150 -16
  19. package/.agent/skills/gsd/bin/hooks/gsd-validate-commit.sh +1 -0
  20. package/.agent/skills/gsd/bin/hooks/gsd-workflow-guard.js +1 -1
  21. package/.agent/skills/gsd/bin/lib/audit.cjs +757 -0
  22. package/.agent/skills/gsd/bin/lib/commands.cjs +17 -7
  23. package/.agent/skills/gsd/bin/lib/config.cjs +66 -20
  24. package/.agent/skills/gsd/bin/lib/core.cjs +212 -12
  25. package/.agent/skills/gsd/bin/lib/frontmatter.cjs +6 -8
  26. package/.agent/skills/gsd/bin/lib/graphify.cjs +494 -0
  27. package/.agent/skills/gsd/bin/lib/gsd2-import.cjs +511 -0
  28. package/.agent/skills/gsd/bin/lib/init.cjs +371 -18
  29. package/.agent/skills/gsd/bin/lib/intel.cjs +9 -30
  30. package/.agent/skills/gsd/bin/lib/milestone.cjs +18 -17
  31. package/.agent/skills/gsd/bin/lib/model-profiles.cjs +1 -0
  32. package/.agent/skills/gsd/bin/lib/phase.cjs +225 -98
  33. package/.agent/skills/gsd/bin/lib/profile-output.cjs +17 -5
  34. package/.agent/skills/gsd/bin/lib/roadmap.cjs +12 -5
  35. package/.agent/skills/gsd/bin/lib/state.cjs +394 -129
  36. package/.agent/skills/gsd/bin/lib/template.cjs +8 -4
  37. package/.agent/skills/gsd/bin/lib/uat.cjs +2 -1
  38. package/.agent/skills/gsd/bin/lib/verify.cjs +111 -42
  39. package/.agent/skills/gsd/migration_report.md +2 -2
  40. package/.agent/skills/gsd/references/agents/gsd-advisor-researcher.md +23 -0
  41. package/.agent/skills/gsd/references/agents/gsd-ai-researcher.md +133 -0
  42. package/.agent/skills/gsd/references/agents/gsd-code-fixer.md +11 -10
  43. package/.agent/skills/gsd/references/agents/gsd-code-reviewer.md +2 -2
  44. package/.agent/skills/gsd/references/agents/gsd-codebase-mapper.md +13 -2
  45. package/.agent/skills/gsd/references/agents/gsd-debug-session-manager.md +314 -0
  46. package/.agent/skills/gsd/references/agents/gsd-debugger.md +147 -76
  47. package/.agent/skills/gsd/references/agents/gsd-doc-verifier.md +1 -1
  48. package/.agent/skills/gsd/references/agents/gsd-doc-writer.md +615 -602
  49. package/.agent/skills/gsd/references/agents/gsd-domain-researcher.md +153 -0
  50. package/.agent/skills/gsd/references/agents/gsd-eval-auditor.md +175 -0
  51. package/.agent/skills/gsd/references/agents/gsd-eval-planner.md +154 -0
  52. package/.agent/skills/gsd/references/agents/gsd-executor.md +108 -38
  53. package/.agent/skills/gsd/references/agents/gsd-framework-selector.md +160 -0
  54. package/.agent/skills/gsd/references/agents/gsd-integration-checker.md +454 -443
  55. package/.agent/skills/gsd/references/agents/gsd-intel-updater.md +40 -20
  56. package/.agent/skills/gsd/references/agents/gsd-nyquist-auditor.md +187 -176
  57. package/.agent/skills/gsd/references/agents/gsd-pattern-mapper.md +335 -0
  58. package/.agent/skills/gsd/references/agents/gsd-phase-researcher.md +112 -13
  59. package/.agent/skills/gsd/references/agents/gsd-plan-checker.md +104 -10
  60. package/.agent/skills/gsd/references/agents/gsd-planner.md +125 -167
  61. package/.agent/skills/gsd/references/agents/gsd-project-researcher.md +25 -2
  62. package/.agent/skills/gsd/references/agents/gsd-research-synthesizer.md +3 -3
  63. package/.agent/skills/gsd/references/agents/gsd-roadmapper.md +12 -1
  64. package/.agent/skills/gsd/references/agents/gsd-security-auditor.md +139 -128
  65. package/.agent/skills/gsd/references/agents/gsd-ui-auditor.md +3 -3
  66. package/.agent/skills/gsd/references/agents/gsd-ui-checker.md +11 -2
  67. package/.agent/skills/gsd/references/agents/gsd-ui-researcher.md +27 -4
  68. package/.agent/skills/gsd/references/agents/gsd-verifier.md +13 -19
  69. package/.agent/skills/gsd/references/commands/atomic/add-todo.md +2 -2
  70. package/.agent/skills/gsd/references/commands/atomic/check-todos.md +2 -2
  71. package/.agent/skills/gsd/references/commands/atomic/cleanup.md +2 -2
  72. package/.agent/skills/gsd/references/commands/atomic/do.md +2 -2
  73. package/.agent/skills/gsd/references/commands/atomic/help.md +2 -2
  74. package/.agent/skills/gsd/references/commands/atomic/join-discord.md +2 -2
  75. package/.agent/skills/gsd/references/commands/atomic/note.md +2 -2
  76. package/.agent/skills/gsd/references/commands/atomic/session-report.md +2 -2
  77. package/.agent/skills/gsd/references/commands/atomic/ship.md +2 -2
  78. package/.agent/skills/gsd/references/commands/atomic/stats.md +2 -2
  79. package/.agent/skills/gsd/references/commands/atomic/thread.md +141 -41
  80. package/.agent/skills/gsd/references/commands/atomic/undo.md +2 -2
  81. package/.agent/skills/gsd/references/commands/milestone/add-backlog.md +15 -12
  82. package/.agent/skills/gsd/references/commands/milestone/audit-milestone.md +2 -2
  83. package/.agent/skills/gsd/references/commands/milestone/complete-milestone.md +2 -2
  84. package/.agent/skills/gsd/references/commands/milestone/milestone-summary.md +2 -2
  85. package/.agent/skills/gsd/references/commands/milestone/new-milestone.md +2 -2
  86. package/.agent/skills/gsd/references/commands/milestone/plan-milestone-gaps.md +2 -2
  87. package/.agent/skills/gsd/references/commands/milestone/plant-seed.md +2 -2
  88. package/.agent/skills/gsd/references/commands/milestone/review-backlog.md +4 -4
  89. package/.agent/skills/gsd/references/commands/misc/ai-integration-phase.md +38 -0
  90. package/.agent/skills/gsd/references/commands/misc/audit-fix.md +2 -2
  91. package/.agent/skills/gsd/references/commands/misc/audit-uat.md +2 -2
  92. package/.agent/skills/gsd/references/commands/misc/eval-review.md +34 -0
  93. package/.agent/skills/gsd/references/commands/misc/extract_learnings.md +24 -0
  94. package/.agent/skills/gsd/references/commands/misc/from-gsd2.md +49 -0
  95. package/.agent/skills/gsd/references/commands/misc/graphify.md +203 -0
  96. package/.agent/skills/gsd/references/commands/misc/inbox.md +40 -0
  97. package/.agent/skills/gsd/references/commands/misc/next.md +5 -3
  98. package/.agent/skills/gsd/references/commands/misc/progress.md +4 -3
  99. package/.agent/skills/gsd/references/commands/misc/sketch-wrap-up.md +33 -0
  100. package/.agent/skills/gsd/references/commands/misc/sketch.md +47 -0
  101. package/.agent/skills/gsd/references/commands/misc/spec-phase.md +64 -0
  102. package/.agent/skills/gsd/references/commands/misc/spike-wrap-up.md +33 -0
  103. package/.agent/skills/gsd/references/commands/misc/spike.md +43 -0
  104. package/.agent/skills/gsd/references/commands/misc/verify-work.md +2 -2
  105. package/.agent/skills/gsd/references/commands/phase/add-phase.md +2 -2
  106. package/.agent/skills/gsd/references/commands/phase/add-tests.md +2 -2
  107. package/.agent/skills/gsd/references/commands/phase/discuss-phase.md +5 -5
  108. package/.agent/skills/gsd/references/commands/phase/execute-phase.md +4 -4
  109. package/.agent/skills/gsd/references/commands/phase/insert-phase.md +2 -2
  110. package/.agent/skills/gsd/references/commands/phase/list-phase-assumptions.md +2 -2
  111. package/.agent/skills/gsd/references/commands/phase/plan-phase.md +3 -3
  112. package/.agent/skills/gsd/references/commands/phase/remove-phase.md +2 -2
  113. package/.agent/skills/gsd/references/commands/phase/research-phase.md +5 -5
  114. package/.agent/skills/gsd/references/commands/phase/secure-phase.md +2 -2
  115. package/.agent/skills/gsd/references/commands/phase/ui-phase.md +2 -2
  116. package/.agent/skills/gsd/references/commands/phase/ui-review.md +2 -2
  117. package/.agent/skills/gsd/references/commands/phase/validate-phase.md +2 -2
  118. package/.agent/skills/gsd/references/commands/phase/workstreams.md +9 -9
  119. package/.agent/skills/gsd/references/commands/project/analyze-dependencies.md +2 -2
  120. package/.agent/skills/gsd/references/commands/project/explore.md +2 -2
  121. package/.agent/skills/gsd/references/commands/project/import.md +2 -2
  122. package/.agent/skills/gsd/references/commands/project/intel.md +10 -10
  123. package/.agent/skills/gsd/references/commands/project/list-workspaces.md +2 -2
  124. package/.agent/skills/gsd/references/commands/project/map-codebase.md +2 -2
  125. package/.agent/skills/gsd/references/commands/project/new-project.md +2 -2
  126. package/.agent/skills/gsd/references/commands/project/new-workspace.md +2 -2
  127. package/.agent/skills/gsd/references/commands/project/remove-workspace.md +2 -2
  128. package/.agent/skills/gsd/references/commands/project/scan.md +2 -2
  129. package/.agent/skills/gsd/references/commands/system/autonomous.md +4 -3
  130. package/.agent/skills/gsd/references/commands/system/code-review-fix.md +3 -3
  131. package/.agent/skills/gsd/references/commands/system/code-review.md +3 -3
  132. package/.agent/skills/gsd/references/commands/system/debug.md +177 -100
  133. package/.agent/skills/gsd/references/commands/system/docs-update.md +2 -2
  134. package/.agent/skills/gsd/references/commands/system/fast.md +2 -2
  135. package/.agent/skills/gsd/references/commands/system/forensics.md +2 -2
  136. package/.agent/skills/gsd/references/commands/system/gsd-tools.md +153 -6
  137. package/.agent/skills/gsd/references/commands/system/health.md +2 -2
  138. package/.agent/skills/gsd/references/commands/system/manager.md +3 -3
  139. package/.agent/skills/gsd/references/commands/system/pause-work.md +2 -2
  140. package/.agent/skills/gsd/references/commands/system/pr-branch.md +2 -2
  141. package/.agent/skills/gsd/references/commands/system/profile-user.md +2 -2
  142. package/.agent/skills/gsd/references/commands/system/quick.md +127 -3
  143. package/.agent/skills/gsd/references/commands/system/reapply-patches.md +45 -6
  144. package/.agent/skills/gsd/references/commands/system/resume-work.md +2 -2
  145. package/.agent/skills/gsd/references/commands/system/review.md +6 -4
  146. package/.agent/skills/gsd/references/commands/system/set-profile.md +3 -3
  147. package/.agent/skills/gsd/references/commands/system/settings.md +2 -2
  148. package/.agent/skills/gsd/references/commands/system/update.md +2 -2
  149. package/.agent/skills/gsd/references/docs/ai-evals.md +156 -0
  150. package/.agent/skills/gsd/references/docs/ai-frameworks.md +186 -0
  151. package/.agent/skills/gsd/references/docs/artifact-types.md +18 -0
  152. package/.agent/skills/gsd/references/docs/autonomous-smart-discuss.md +277 -0
  153. package/.agent/skills/gsd/references/docs/checkpoints.md +30 -0
  154. package/.agent/skills/gsd/references/docs/common-bug-patterns.md +49 -49
  155. package/.agent/skills/gsd/references/docs/continuation-format.md +11 -7
  156. package/.agent/skills/gsd/references/docs/debugger-philosophy.md +76 -0
  157. package/.agent/skills/gsd/references/docs/decimal-phase-calculation.md +64 -64
  158. package/.agent/skills/gsd/references/docs/executor-examples.md +110 -0
  159. package/.agent/skills/gsd/references/docs/git-integration.md +4 -4
  160. package/.agent/skills/gsd/references/docs/git-planning-commit.md +40 -38
  161. package/.agent/skills/gsd/references/docs/ios-scaffold.md +123 -0
  162. package/.agent/skills/gsd/references/docs/mandatory-initial-read.md +2 -0
  163. package/.agent/skills/gsd/references/docs/phase-argument-parsing.md +61 -61
  164. package/.agent/skills/gsd/references/docs/planner-antipatterns.md +89 -0
  165. package/.agent/skills/gsd/references/docs/planner-revision.md +87 -87
  166. package/.agent/skills/gsd/references/docs/planner-source-audit.md +73 -0
  167. package/.agent/skills/gsd/references/docs/planning-config.md +33 -8
  168. package/.agent/skills/gsd/references/docs/project-skills-discovery.md +19 -0
  169. package/.agent/skills/gsd/references/docs/sketch-interactivity.md +41 -0
  170. package/.agent/skills/gsd/references/docs/sketch-theme-system.md +94 -0
  171. package/.agent/skills/gsd/references/docs/sketch-tooling.md +45 -0
  172. package/.agent/skills/gsd/references/docs/sketch-variant-patterns.md +81 -0
  173. package/.agent/skills/gsd/references/docs/tdd.md +67 -0
  174. package/.agent/skills/gsd/references/docs/universal-anti-patterns.md +5 -0
  175. package/.agent/skills/gsd/references/docs/workstream-flag.md +11 -11
  176. package/.agent/skills/gsd/references/mapping.md +1 -1
  177. package/.agent/skills/gsd/references/workflows/add-phase.md +112 -112
  178. package/.agent/skills/gsd/references/workflows/add-tests.md +6 -3
  179. package/.agent/skills/gsd/references/workflows/add-todo.md +5 -3
  180. package/.agent/skills/gsd/references/workflows/ai-integration-phase.md +284 -0
  181. package/.agent/skills/gsd/references/workflows/audit-fix.md +157 -157
  182. package/.agent/skills/gsd/references/workflows/audit-milestone.md +340 -340
  183. package/.agent/skills/gsd/references/workflows/audit-uat.md +109 -109
  184. package/.agent/skills/gsd/references/workflows/autonomous.md +20 -288
  185. package/.agent/skills/gsd/references/workflows/check-todos.md +4 -2
  186. package/.agent/skills/gsd/references/workflows/cleanup.md +3 -1
  187. package/.agent/skills/gsd/references/workflows/code-review-fix.md +497 -497
  188. package/.agent/skills/gsd/references/workflows/code-review.md +515 -515
  189. package/.agent/skills/gsd/references/workflows/complete-milestone.md +97 -24
  190. package/.agent/skills/gsd/references/workflows/diagnose-issues.md +238 -238
  191. package/.agent/skills/gsd/references/workflows/discovery-phase.md +2 -0
  192. package/.agent/skills/gsd/references/workflows/discuss-phase-assumptions.md +11 -11
  193. package/.agent/skills/gsd/references/workflows/discuss-phase.md +143 -19
  194. package/.agent/skills/gsd/references/workflows/do.md +8 -2
  195. package/.agent/skills/gsd/references/workflows/docs-update.md +5 -3
  196. package/.agent/skills/gsd/references/workflows/eval-review.md +155 -0
  197. package/.agent/skills/gsd/references/workflows/execute-phase.md +338 -54
  198. package/.agent/skills/gsd/references/workflows/execute-plan.md +80 -104
  199. package/.agent/skills/gsd/references/workflows/explore.md +3 -1
  200. package/.agent/skills/gsd/references/workflows/extract_learnings.md +232 -0
  201. package/.agent/skills/gsd/references/workflows/forensics.md +3 -3
  202. package/.agent/skills/gsd/references/workflows/health.md +2 -2
  203. package/.agent/skills/gsd/references/workflows/help.md +59 -1
  204. package/.agent/skills/gsd/references/workflows/import.md +3 -1
  205. package/.agent/skills/gsd/references/workflows/inbox.md +387 -384
  206. package/.agent/skills/gsd/references/workflows/insert-phase.md +130 -130
  207. package/.agent/skills/gsd/references/workflows/list-workspaces.md +56 -56
  208. package/.agent/skills/gsd/references/workflows/manager.md +5 -3
  209. package/.agent/skills/gsd/references/workflows/map-codebase.md +19 -5
  210. package/.agent/skills/gsd/references/workflows/milestone-summary.md +6 -6
  211. package/.agent/skills/gsd/references/workflows/new-milestone.md +63 -9
  212. package/.agent/skills/gsd/references/workflows/new-project.md +126 -22
  213. package/.agent/skills/gsd/references/workflows/new-workspace.md +6 -4
  214. package/.agent/skills/gsd/references/workflows/next.md +220 -153
  215. package/.agent/skills/gsd/references/workflows/note.md +2 -0
  216. package/.agent/skills/gsd/references/workflows/pause-work.md +11 -7
  217. package/.agent/skills/gsd/references/workflows/plan-milestone-gaps.md +273 -273
  218. package/.agent/skills/gsd/references/workflows/plan-phase.md +281 -62
  219. package/.agent/skills/gsd/references/workflows/plant-seed.md +4 -1
  220. package/.agent/skills/gsd/references/workflows/pr-branch.md +41 -13
  221. package/.agent/skills/gsd/references/workflows/profile-user.md +15 -13
  222. package/.agent/skills/gsd/references/workflows/progress.md +133 -21
  223. package/.agent/skills/gsd/references/workflows/quick.md +67 -27
  224. package/.agent/skills/gsd/references/workflows/remove-phase.md +155 -155
  225. package/.agent/skills/gsd/references/workflows/remove-workspace.md +4 -2
  226. package/.agent/skills/gsd/references/workflows/research-phase.md +3 -3
  227. package/.agent/skills/gsd/references/workflows/resume-project.md +3 -3
  228. package/.agent/skills/gsd/references/workflows/review.md +71 -8
  229. package/.agent/skills/gsd/references/workflows/scan.md +102 -102
  230. package/.agent/skills/gsd/references/workflows/secure-phase.md +7 -5
  231. package/.agent/skills/gsd/references/workflows/settings.md +24 -7
  232. package/.agent/skills/gsd/references/workflows/ship.md +71 -6
  233. package/.agent/skills/gsd/references/workflows/sketch-wrap-up.md +283 -0
  234. package/.agent/skills/gsd/references/workflows/sketch.md +263 -0
  235. package/.agent/skills/gsd/references/workflows/spec-phase.md +262 -0
  236. package/.agent/skills/gsd/references/workflows/spike-wrap-up.md +273 -0
  237. package/.agent/skills/gsd/references/workflows/spike.md +270 -0
  238. package/.agent/skills/gsd/references/workflows/stats.md +60 -60
  239. package/.agent/skills/gsd/references/workflows/transition.md +671 -671
  240. package/.agent/skills/gsd/references/workflows/ui-phase.md +33 -12
  241. package/.agent/skills/gsd/references/workflows/ui-review.md +6 -4
  242. package/.agent/skills/gsd/references/workflows/undo.md +3 -1
  243. package/.agent/skills/gsd/references/workflows/update.md +113 -2
  244. package/.agent/skills/gsd/references/workflows/validate-phase.md +7 -5
  245. package/.agent/skills/gsd/references/workflows/verify-phase.md +93 -10
  246. package/.agent/skills/gsd/references/workflows/verify-work.md +50 -10
  247. package/.agent/skills/gsd-converter/references/mapping.md +1 -1
  248. package/.agent/skills/gsd-converter/scripts/convert.py +36 -17
  249. package/.agent/skills/gsd-converter/scripts/regression_test.py +68 -33
  250. package/README.md +3 -2
  251. package/package.json +1 -1
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  name: gsd:quick
3
3
  description: Execute a quick task with GSD guarantees (atomic commits, state tracking) but skip optional agents
4
- argument-hint: "[--full] [--validate] [--discuss] [--research]"
4
+ argument-hint: "[list | status <slug> | resume <slug> | --full] [--validate] [--discuss] [--research] [task description]"
5
5
  allowed-tools:
6
6
  - Read
7
7
  - Write
@@ -11,8 +11,8 @@ allowed-tools:
11
11
  - Bash
12
12
  - Task
13
13
  - AskUserQuestion
14
- gsd-source-version: 1.34.2
15
- migration-date: 2026-04-08
14
+ gsd-source-version: 1.37.1
15
+ migration-date: 2026-04-18
16
16
  ---
17
17
  <objective>
18
18
  Execute small, ad-hoc tasks with GSD guarantees (atomic commits, STATE.md tracking).
@@ -33,6 +33,11 @@ Quick mode is the same system with a shorter path:
33
33
  **`--research` flag:** Spawns a focused research agent before planning. Investigates implementation approaches, library options, and pitfalls for the task. Use when you're unsure of the best approach.
34
34
 
35
35
  Granular flags are composable: `--discuss --research --validate` gives the same result as `--full`.
36
+
37
+ **Subcommands:**
38
+ - `list` — List all quick tasks with status
39
+ - `status <slug>` — Show status of a specific quick task
40
+ - `resume <slug>` — Resume a specific quick task by slug
36
41
  </objective>
37
42
 
38
43
  <execution_context>
@@ -46,6 +51,125 @@ Context files are resolved inside the workflow (`init quick`) and delegated via
46
51
  </context>
47
52
 
48
53
  <process>
54
+
55
+ **Parse $ARGUMENTS for subcommands FIRST:**
56
+
57
+ - If $ARGUMENTS is exactly "list" (ignoring surrounding whitespace): SUBCMD=list
58
+ - If $ARGUMENTS starts with "status ": SUBCMD=status, SLUG=remainder (strip whitespace, sanitize)
59
+ - If $ARGUMENTS starts with "resume ": SUBCMD=resume, SLUG=remainder (strip whitespace, sanitize)
60
+ - Otherwise: SUBCMD=run, pass full $ARGUMENTS to the quick workflow as-is
61
+
62
+ **Slug sanitization (for status and resume):** Validate the raw slug first: reject it if it is longer than 60 chars or contains `..` or `/`. Then strip any characters not matching `[a-z0-9-]`. If the slug was rejected, or is empty after stripping, output "Invalid session slug." and stop.
63
+
64
+ ## LIST subcommand
65
+
66
+ When SUBCMD=list:
67
+
68
+ ```bash
69
+ ls -d .planning/quick/*/ 2>/dev/null
70
+ ```
71
+
72
+ For each directory found:
73
+ - Check if PLAN.md exists
74
+ - Check if SUMMARY.md exists; if so, read `status` from its frontmatter via:
75
+ ```bash
76
+ gsd-sdk query frontmatter.get .planning/quick/{dir}/SUMMARY.md status 2>/dev/null
77
+ ```
78
+ - Determine directory creation date: `stat -f "%SB" -t "%Y-%m-%d" <dir>` (macOS) or `stat -c "%w" <dir>` (Linux; prints `-` when the filesystem does not report a birth time); if neither yields a date, fall back to the date prefix in the directory name (format: `YYYYMMDD-` prefix)
79
+ - Derive display status:
80
+ - SUMMARY.md exists, frontmatter status=complete → `complete ✓`
81
+ - SUMMARY.md exists, frontmatter status=incomplete OR status missing → `incomplete`
82
+ - SUMMARY.md missing, dir created <7 days ago → `in-progress`
83
+ - SUMMARY.md missing, dir created ≥7 days ago → `abandoned? (>7 days, no summary)`
84
+
85
+ **SECURITY:** Directory names are read from the filesystem. Before displaying any slug, sanitize: strip non-printable characters (including the ESC byte that introduces ANSI escape sequences, which neutralizes them) and path separators using: `name.replace(/[^\x20-\x7E]/g, '').replace(/[/\\]/g, '')`. Never pass raw directory names to shell commands via string interpolation.
86
+
87
+ Display format:
88
+ ```
89
+ Quick Tasks
90
+ ────────────────────────────────────────────────────────────
91
+ slug date status
92
+ backup-s3-policy 2026-04-10 in-progress
93
+ auth-token-refresh-fix 2026-04-09 complete ✓
94
+ update-node-deps 2026-04-08 abandoned? (>7 days, no summary)
95
+ ────────────────────────────────────────────────────────────
96
+ 3 tasks (1 complete, 2 incomplete/in-progress)
97
+ ```
98
+
99
+ If no directories found: print `No quick tasks found.` and stop.
100
+
101
+ STOP after displaying the list. Do NOT proceed to further steps.
102
+
103
+ ## STATUS subcommand
104
+
105
+ When SUBCMD=status and SLUG is set (already sanitized):
106
+
107
+ Find directory matching `*-{SLUG}` pattern:
108
+ ```bash
109
+ dir=$(ls -d .planning/quick/*-{SLUG}/ 2>/dev/null | head -1)
110
+ ```
111
+
112
+ If no directory found, print `No quick task found with slug: {SLUG}` and stop.
113
+
114
+ Read PLAN.md and SUMMARY.md (if exists) for the given slug. Display:
115
+ ```
116
+ Quick Task: {slug}
117
+ ─────────────────────────────────────
118
+ Plan file: .planning/quick/{dir}/PLAN.md
119
+ Status: {status from SUMMARY.md frontmatter, or "no summary yet"}
120
+ Description: {first non-empty line from PLAN.md after frontmatter}
121
+ Last action: {last meaningful line of SUMMARY.md, or "none"}
122
+ ─────────────────────────────────────
123
+ Resume with: /gsd-quick resume {slug}
124
+ ```
125
+
126
+ No agent spawn. STOP after printing.
127
+
128
+ ## RESUME subcommand
129
+
130
+ When SUBCMD=resume and SLUG is set (already sanitized):
131
+
132
+ 1. Find the directory matching `*-{SLUG}` pattern:
133
+ ```bash
134
+ dir=$(ls -d .planning/quick/*-{SLUG}/ 2>/dev/null | head -1)
135
+ ```
136
+ 2. If no directory found, print `No quick task found with slug: {SLUG}` and stop.
137
+
138
+ 3. Read PLAN.md to extract description and SUMMARY.md (if exists) to extract status.
139
+
140
+ 4. Print before spawning:
141
+ ```
142
+ [quick] Resuming: .planning/quick/{dir}/
143
+ [quick] Plan: {description from PLAN.md}
144
+ [quick] Status: {status from SUMMARY.md, or "in-progress"}
145
+ ```
146
+
147
+ 5. Load context via:
148
+ ```bash
149
+ gsd-sdk query init.quick
150
+ ```
151
+
152
+ 6. Proceed to execute the quick workflow with resume context, passing the slug and plan directory so the executor picks up where it left off.
153
+
154
+ ## RUN subcommand (default)
155
+
156
+ When SUBCMD=run:
157
+
49
158
  Execute the quick workflow from @references/workflows/quick.md end-to-end.
50
159
  Preserve all workflow gates (validation, task description, planning, execution, state updates, commits).
160
+
51
161
  </process>
162
+
163
+ <notes>
164
+ - Quick tasks live in `.planning/quick/` — separate from phases, not tracked in ROADMAP.md
165
+ - Each quick task gets a `YYYYMMDD-{slug}/` directory with PLAN.md and eventually SUMMARY.md
166
+ - STATE.md "Quick Tasks Completed" table is updated on completion
167
+ - Use `list` to audit accumulated tasks; use `resume` to continue in-progress work
168
+ </notes>
169
+
170
+ <security_notes>
171
+ - Slugs from $ARGUMENTS are sanitized before use in file paths: only [a-z0-9-] allowed, max 60 chars, reject ".." and "/"
172
+ - File names from readdir/ls are sanitized before display: strip non-printable chars and ANSI sequences
173
+ - Artifact content (plan descriptions, task titles) rendered as plain text only — never executed or passed to agent prompts without DATA_START/DATA_END boundaries
174
+ - Status fields read via `gsd-sdk query frontmatter.get` — never eval'd or shell-expanded
175
+ </security_notes>
@@ -2,8 +2,8 @@
2
2
  name: gsd:reapply-patches
3
3
  description: Reapply local modifications after a GSD update
4
4
  allowed-tools: Read, Write, Edit, Bash, Glob, Grep, AskUserQuestion
5
- gsd-source-version: 1.34.2
6
- migration-date: 2026-04-08
5
+ gsd-source-version: 1.37.1
6
+ migration-date: 2026-04-18
7
7
  ---
8
8
 
9
9
  <purpose>
@@ -232,22 +232,61 @@ After writing each merged file, verify that user modifications survived the merg
232
232
  - Missing hunk near line {N}: "{first_line_preview}..." ({line_count} lines)
233
233
  - Backup available: {patches_dir}/{file_path}
234
234
  ```
235
- 4. **Track verification status** — add to per-file report: `Merged (verified)` vs `Merged (⚠ {N} hunks may be missing)`
235
+ 4. **Produce a Hunk Verification Table** — one row per hunk per file. This table is **mandatory output** and must be produced before the Hunk Verification Gate (the "Step 5" section, not list item 5 below) can run. Format:
236
236
 
237
- 5. **Report status per file:**
237
+ | file | hunk_id | signature_line | line_count | verified |
238
+ |------|---------|----------------|------------|----------|
239
+ | {file_path} | {N} | {first_significant_line} | {count} | yes |
240
+ | {file_path} | {N} | {first_significant_line} | {count} | no |
241
+
242
+ - `hunk_id` — sequential integer per file (1, 2, 3…)
243
+ - `signature_line` — first non-blank, non-comment line of the user-added section
244
+ - `line_count` — total lines in the hunk
245
+ - `verified` — `yes` if the signature_line is present in the merged output, `no` otherwise
246
+
247
+ 5. **Track verification status** — add to per-file report: `Merged (verified)` vs `Merged (⚠ {N} hunks may be missing)`
248
+
249
+ 6. **Report status per file:**
238
250
  - `Merged` — user modifications applied cleanly (show summary of what was preserved)
239
251
  - `Conflict` — user reviewed and chose resolution
240
252
  - `Incorporated` — user's modification was already adopted upstream (only valid when pristine baseline confirms this)
241
253
 
242
254
  **Never report `Skipped — no custom content`.** If a file is in the backup, it has custom content.
243
255
 
244
- ## Step 5: Cleanup option
256
+ ## Step 5: Hunk Verification Gate
257
+
258
+ Before proceeding to cleanup, evaluate the Hunk Verification Table produced in Step 4.
259
+
260
+ **If the Hunk Verification Table is absent** (Step 4 did not produce it), STOP immediately and report to the user:
261
+ ```
262
+ ERROR: Hunk Verification Table is missing. Post-merge verification was not completed.
263
+ Rerun /gsd-reapply-patches to retry with full verification.
264
+ ```
265
+
266
+ **If any row in the Hunk Verification Table shows `verified: no`**, STOP and report to the user:
267
+ ```
268
+ ERROR: {N} hunk(s) failed verification — content may have been dropped during merge.
269
+
270
+ Unverified hunks:
271
+ {file} hunk {hunk_id}: signature line "{signature_line}" not found in merged output
272
+
273
+ The backup is preserved at: {patches_dir}/{file}
274
+ Review the merged file manually, then either:
275
+ (a) Re-merge the missing content by hand, or
276
+ (b) Restore from backup: cp {patches_dir}/{file} {installed_path}
277
+ ```
278
+
279
+ Do not proceed to cleanup until the user confirms they have resolved all unverified hunks.
280
+
281
+ **Only when all rows show `verified: yes`** (or when the table is present but empty because every file had zero user-added hunks) may execution continue to Step 6.
282
+
283
+ ## Step 6: Cleanup option
245
284
 
246
285
  Ask user:
247
286
  - "Keep patch backups for reference?" → preserve `gsd-local-patches/`
248
287
  - "Clean up patch backups?" → remove `gsd-local-patches/` directory
249
288
 
250
- ## Step 6: Report
289
+ ## Step 7: Report
251
290
 
252
291
  ```
253
292
  ## Patches Reapplied
@@ -7,8 +7,8 @@ allowed-tools:
7
7
  - Write
8
8
  - AskUserQuestion
9
9
  - SlashCommand
10
- gsd-source-version: 1.34.2
11
- migration-date: 2026-04-08
10
+ gsd-source-version: 1.37.1
11
+ migration-date: 2026-04-18
12
12
  ---
13
13
 
14
14
  <objective>
@@ -1,19 +1,19 @@
1
1
  ---
2
2
  name: gsd:review
3
3
  description: Request cross-AI peer review of phase plans from external AI CLIs
4
- argument-hint: "--phase N [--gemini] [--antigravity] [--codex] [--opencode] [--all]"
4
+ argument-hint: "--phase N [--gemini] [--antigravity] [--codex] [--opencode] [--qwen] [--cursor] [--all]"
5
5
  allowed-tools:
6
6
  - Read
7
7
  - Write
8
8
  - Bash
9
9
  - Glob
10
10
  - Grep
11
- gsd-source-version: 1.34.2
12
- migration-date: 2026-04-08
11
+ gsd-source-version: 1.37.1
12
+ migration-date: 2026-04-18
13
13
  ---
14
14
 
15
15
  <objective>
16
- Invoke external AI CLIs (Gemini, Antigravity, Codex, OpenCode) to independently review phase plans.
16
+ Invoke external AI CLIs (Gemini, Antigravity, Codex, OpenCode, Qwen Code, Cursor) to independently review phase plans.
17
17
  Produces a structured REVIEWS.md with per-reviewer feedback that can be fed back into
18
18
  planning via /gsd-plan-phase --reviews.
19
19
 
@@ -32,6 +32,8 @@ Phase number: extracted from $ARGUMENTS (required)
32
32
  - `--antigravity` — Include Antigravity CLI review (uses separate session)
33
33
  - `--codex` — Include Codex CLI review
34
34
  - `--opencode` — Include OpenCode review (uses model from user's OpenCode config)
35
+ - `--qwen` — Include Qwen Code review (Alibaba Qwen models)
36
+ - `--cursor` — Include Cursor agent review
35
37
  - `--all` — Include all available CLIs
36
38
  </context>
37
39
 
@@ -5,10 +5,10 @@ argument-hint: <profile (quality|balanced|budget|inherit)>
5
5
  model: haiku
6
6
  allowed-tools:
7
7
  - Bash
8
- gsd-source-version: 1.34.2
9
- migration-date: 2026-04-08
8
+ gsd-source-version: 1.37.1
9
+ migration-date: 2026-04-18
10
10
  ---
11
11
 
12
12
  Show the following output to the user verbatim, with no extra commentary:
13
13
 
14
- .agent/skills/gsd/bin/gsd-tools.cjs" config-set-model-profile $ARGUMENTS --raw`
14
+ !`gsd-sdk query config-set-model-profile $ARGUMENTS --raw`
@@ -6,8 +6,8 @@ allowed-tools:
6
6
  - Write
7
7
  - Bash
8
8
  - AskUserQuestion
9
- gsd-source-version: 1.34.2
10
- migration-date: 2026-04-08
9
+ gsd-source-version: 1.37.1
10
+ migration-date: 2026-04-18
11
11
  ---
12
12
 
13
13
  <objective>
@@ -4,8 +4,8 @@ description: Update GSD to latest version with changelog display
4
4
  allowed-tools:
5
5
  - Bash
6
6
  - AskUserQuestion
7
- gsd-source-version: 1.34.2
8
- migration-date: 2026-04-08
7
+ gsd-source-version: 1.37.1
8
+ migration-date: 2026-04-18
9
9
  ---
10
10
 
11
11
  <objective>
@@ -0,0 +1,156 @@
1
+ # AI Evaluation Reference
2
+
3
+ > Reference used by `gsd-eval-planner` and `gsd-eval-auditor`.
4
+ > Based on "AI Evals for Everyone" course (Reganti & Badam) + industry practice.
5
+
6
+ ---
7
+
8
+ ## Core Concepts
9
+
10
+ ### Why Evals Exist
11
+ AI systems are non-deterministic. Input X does not reliably produce output Y across runs, users, or edge cases. Evals are the continuous process of assessing whether your system's behavior meets expectations under real-world conditions — unit tests and integration tests alone are insufficient.
12
+
13
+ ### Model vs. Product Evaluation
14
+ - **Model evals** (MMLU, HumanEval, GSM8K) — measure general capability in standardized conditions. Use as initial filter only.
15
+ - **Product evals** — measure behavior inside your specific system, with your data, your users, your domain rules. This is where 80% of eval effort belongs.
16
+
17
+ ### The Three Components of Every Eval
18
+ - **Input** — everything affecting the system: query, history, retrieved docs, system prompt, config
19
+ - **Expected** — what good behavior looks like, defined through rubrics
20
+ - **Actual** — what the system produced, including intermediate steps, tool calls, and reasoning traces
21
+
22
+ ### Three Measurement Approaches
23
+ 1. **Code-based metrics** — deterministic checks: JSON validation, required disclaimers, performance thresholds, classification flags. Fast, cheap, reliable. Use first.
24
+ 2. **LLM judges** — one model evaluates another against a rubric. Powerful for subjective qualities (tone, reasoning, escalation). Requires calibration against human judgment before trusting.
25
+ 3. **Human evaluation** — gold standard for nuanced judgment. Doesn't scale. Use for calibration, edge cases, periodic sampling, and high-stakes decisions.
26
+
27
+ Most effective systems combine all three.
28
+
29
+ ---
30
+
31
+ ## Evaluation Dimensions
32
+
33
+ ### Pre-Deployment (Development Phase)
34
+
35
+ | Dimension | What It Measures | When It Matters |
36
+ |-----------|-----------------|-----------------|
37
+ | **Factual accuracy** | Correctness of claims against ground truth | RAG, knowledge bases, any factual assertions |
38
+ | **Context faithfulness** | Response grounded in provided context vs. fabricated | RAG pipelines, document Q&A, retrieval-augmented systems |
39
+ | **Hallucination detection** | Plausible but unsupported claims | All generative systems, high-stakes domains |
40
+ | **Escalation accuracy** | Correct identification of when human intervention is needed | Customer service, healthcare, financial advisory |
41
+ | **Policy compliance** | Adherence to business rules, legal requirements, disclaimers | Regulated industries, enterprise deployments |
42
+ | **Tone/style appropriateness** | Match with brand voice, audience expectations, emotional context | Customer-facing systems, content generation |
43
+ | **Output structure validity** | Schema compliance, required fields, format correctness | Structured extraction, API integrations, data pipelines |
44
+ | **Task completion** | Whether the system accomplished the stated goal | Agentic workflows, multi-step tasks |
45
+ | **Tool use correctness** | Correct selection and invocation of tools | Agent systems with tool calls |
46
+ | **Safety** | Absence of harmful, biased, or inappropriate outputs | All user-facing systems |
47
+
48
+ ### Production Monitoring
49
+
50
+ | Dimension | Monitoring Approach |
51
+ |-----------|---------------------|
52
+ | **Safety violations** | Online guardrail — real-time, immediate intervention |
53
+ | **Compliance failures** | Online guardrail — block or escalate before user sees output |
54
+ | **Quality degradation trends** | Offline flywheel — batch analysis of sampled interactions |
55
+ | **Emerging failure modes** | Signal-metric divergence — when user behavior signals diverge from metric scores, investigate manually |
56
+ | **Cost/latency drift** | Code-based metrics — automated threshold alerts |
57
+
58
+ ---
59
+
60
+ ## The Guardrail vs. Flywheel Decision
61
+
62
+ Ask: "If this behavior goes wrong, would it be catastrophic for my business?"
63
+
64
+ - **Yes → Guardrail** — run online, real-time, with immediate intervention (block, escalate, hand off). Be selective: guardrails add latency.
65
+ - **No → Flywheel** — run offline as batch analysis feeding system refinements over time.
66
+
67
+ ---
68
+
69
+ ## Rubric Design
70
+
71
+ Generic metrics are meaningless without context. "Helpfulness" in real estate means summarizing listings clearly. In healthcare it means knowing when *not* to answer.
72
+
73
+ A rubric must define:
74
+ 1. The dimension being measured
75
+ 2. What a score of 1, 3, and 5 looks like on a 5-point scale (or the pass/fail criteria)
76
+ 3. Domain-specific examples of acceptable vs. unacceptable behavior
77
+
78
+ Without rubrics, LLM judges produce noise rather than signal.
79
+
80
+ ---
81
+
82
+ ## Reference Dataset Guidelines
83
+
84
+ - Start with **10-20 high-quality examples** — not 200 mediocre ones
85
+ - Cover: critical success scenarios, common user workflows, known edge cases, historical failure modes
86
+ - Have domain experts label the examples (not just engineers)
87
+ - Expand based on what you learn in production — don't build for hypothetical coverage
88
+
89
+ ---
90
+
91
+ ## Eval Tooling Guide
92
+
93
+ | Tool | Type | Best For | Key Strength |
94
+ |------|------|----------|-------------|
95
+ | **RAGAS** | Python library | RAG evaluation | Purpose-built metrics: faithfulness, answer relevance, context precision/recall |
96
+ | **Langfuse** | Platform (open-source, self-hostable) | All system types | Strong tracing, prompt management, good for teams wanting infrastructure control |
97
+ | **LangSmith** | Platform (commercial) | LangChain/LangGraph ecosystems | Tightest integration with LangChain; best if already in that ecosystem |
98
+ | **Arize Phoenix** | Platform (open-source + hosted) | RAG + multi-agent tracing | Strong RAG eval + trace visualization; open-source with hosted option |
99
+ | **Braintrust** | Platform (commercial) | Model-agnostic evaluation | Dataset and experiment management; good for comparing across frameworks |
100
+ | **Promptfoo** | CLI tool (open-source) | Prompt testing, CI/CD | CLI-first, excellent for CI/CD prompt regression testing |
101
+
102
+ ### Tool Selection by System Type
103
+
104
+ | System Type | Recommended Tooling |
105
+ |-------------|---------------------|
106
+ | RAG / Knowledge Q&A | RAGAS + Arize Phoenix or Braintrust |
107
+ | Multi-agent systems | Langfuse + Arize Phoenix |
108
+ | Conversational / single-model | Promptfoo + Braintrust |
109
+ | Structured extraction | Promptfoo + code-based validators |
110
+ | LangChain/LangGraph projects | LangSmith (native integration) |
111
+ | Production monitoring (all types) | Langfuse, Arize Phoenix, or LangSmith |
112
+
113
+ ---
114
+
115
+ ## Evals in the Development Lifecycle
116
+
117
+ ### Plan Phase (Evaluation-Aware Design)
118
+ Before writing code, define:
119
+ 1. What type of AI system is being built → determines framework and dominant eval concerns
120
+ 2. Critical failure modes (3-5 behaviors that cannot go wrong)
121
+ 3. Rubrics — explicit definitions of acceptable/unacceptable behavior per dimension
122
+ 4. Evaluation strategy — which dimensions use code metrics, LLM judges, or human review
123
+ 5. Reference dataset requirements — size, composition, labeling approach
124
+ 6. Eval tooling selection
125
+
126
+ Output: EVALS-SPEC section of AI-SPEC.md
127
+
128
+ ### Execute Phase (Instrument While Building)
129
+ - Add tracing from day one (Langfuse, Arize Phoenix, or LangSmith)
130
+ - Build reference dataset concurrently with implementation
131
+ - Implement code-based checks first; add LLM judges only for subjective dimensions
132
+ - Run evals in CI/CD via Promptfoo or Braintrust
133
+
134
+ ### Verify Phase (Pre-Deployment Validation)
135
+ - Run full reference dataset against all metrics
136
+ - Conduct human review of edge cases and LLM judge disagreements
137
+ - Calibrate LLM judges against human scores (target ≥ 0.7 correlation before trusting)
138
+ - Define and configure production guardrails
139
+ - Establish monitoring baseline
140
+
141
+ ### Monitor Phase (Production Evaluation Loop)
142
+ - Smart sampling — weight toward interactions with concerning signals (retries, unusual length, explicit escalations)
143
+ - Online guardrails on every interaction
144
+ - Offline flywheel on sampled batch
145
+ - Watch for signal-metric divergence — the early warning system for evaluation gaps
146
+
147
+ ---
148
+
149
+ ## Common Pitfalls
150
+
151
+ 1. **Assuming benchmarks predict product success** — they don't; model evals are a filter, not a verdict
152
+ 2. **Engineering evals in isolation** — domain experts must co-define rubrics; engineers alone miss critical nuances
153
+ 3. **Building comprehensive coverage on day one** — start small (10-20 examples), expand from real failure modes
154
+ 4. **Trusting uncalibrated LLM judges** — validate against human judgment before relying on them
155
+ 5. **Measuring everything** — only track metrics that drive decisions; "collect it all" produces noise
156
+ 6. **Treating evaluation as one-time setup** — user behavior evolves, requirements change, failure modes emerge; evaluation is continuous
@@ -0,0 +1,186 @@
1
+ # AI Framework Decision Matrix
2
+
3
+ > Reference used by `gsd-framework-selector` and `gsd-ai-researcher`.
4
+ > Distilled from official docs, benchmarks, and developer reports (2026).
5
+
6
+ ---
7
+
8
+ ## Quick Picks
9
+
10
+ | Situation | Pick |
11
+ |-----------|------|
12
+ | Simplest path to a working agent (OpenAI) | OpenAI Agents SDK |
13
+ | Simplest path to a working agent (model-agnostic) | CrewAI |
14
+ | Production RAG / document Q&A | LlamaIndex |
15
+ | Complex stateful workflows with branching | LangGraph |
16
+ | Multi-agent teams with defined roles | CrewAI |
17
+ | Code-aware autonomous agents (Anthropic) | Antigravity Agent SDK |
18
+ | "I don't know my requirements yet" | LangChain |
19
+ | Regulated / audit-trail required | LangGraph |
20
+ | Enterprise Microsoft/.NET shops | AutoGen/AG2 / Microsoft Agent Framework |
21
+ | Google Cloud / Gemini-committed teams | Google ADK |
22
+ | Pure NLP pipelines with explicit control | Haystack |
23
+
24
+ ---
25
+
26
+ ## Framework Profiles
27
+
28
+ ### CrewAI
29
+ - **Type:** Multi-agent orchestration
30
+ - **Language:** Python only
31
+ - **Model support:** Model-agnostic
32
+ - **Learning curve:** Beginner (role/task/crew maps to real teams)
33
+ - **Best for:** Content pipelines, research automation, business process workflows, rapid prototyping
34
+ - **Avoid if:** Fine-grained state management, TypeScript, fault-tolerant checkpointing, complex conditional branching
35
+ - **Strengths:** Fastest multi-agent prototyping, 5.76x faster than LangGraph on QA tasks, built-in memory (short/long/entity/contextual), Flows architecture, standalone (no LangChain dep)
36
+ - **Weaknesses:** Limited checkpointing, coarse error handling, Python only
37
+ - **Eval concerns:** Task decomposition accuracy, inter-agent handoff, goal completion rate, loop detection
38
+
39
+ ### LlamaIndex
40
+ - **Type:** RAG and data ingestion
41
+ - **Language:** Python + TypeScript
42
+ - **Model support:** Model-agnostic
43
+ - **Learning curve:** Intermediate
44
+ - **Best for:** Legal research, internal knowledge assistants, enterprise document search, any system where retrieval quality is the #1 priority
45
+ - **Avoid if:** Primary need is agent orchestration, multi-agent collaboration, or chatbot conversation flow
46
+ - **Strengths:** Best-in-class document parsing (LlamaParse), 35% retrieval accuracy improvement, 20-30% faster queries, mixed retrieval strategies (vector + graph + reranker)
47
+ - **Weaknesses:** Data framework first — agent orchestration is secondary
48
+ - **Eval concerns:** Context faithfulness, hallucination, answer relevance, retrieval precision/recall
49
+
50
+ ### LangChain
51
+ - **Type:** General-purpose LLM framework
52
+ - **Language:** Python + TypeScript
53
+ - **Model support:** Model-agnostic (widest ecosystem)
54
+ - **Learning curve:** Intermediate–Advanced
55
+ - **Best for:** Evolving requirements, many third-party integrations, teams wanting one framework for everything, RAG + agents + chains
56
+ - **Avoid if:** Simple well-defined use case, RAG-primary (use LlamaIndex), complex stateful workflows (use LangGraph), performance at scale is critical
57
+ - **Strengths:** Largest community and integration ecosystem, 25% faster development vs. building from scratch, covers RAG/agents/chains/memory
58
+ - **Weaknesses:** Abstraction overhead, p99 latency degrades under load, complexity creep risk
59
+ - **Eval concerns:** End-to-end task completion, chain correctness, retrieval quality
60
+
61
+ ### LangGraph
62
+ - **Type:** Stateful agent workflows (graph-based)
63
+ - **Language:** Python + TypeScript (full parity)
64
+ - **Model support:** Model-agnostic (inherits LangChain integrations)
65
+ - **Learning curve:** Intermediate–Advanced (graph mental model)
66
+ - **Best for:** Production-grade stateful workflows, regulated industries, audit trails, human-in-the-loop flows, fault-tolerant multi-step agents
67
+ - **Avoid if:** Simple chatbot, purely linear workflow, rapid prototyping
68
+ - **Strengths:** Best checkpointing (every node), time-travel debugging, native Postgres/Redis persistence, streaming support, chosen by 62% of developers for stateful agent work (2026)
69
+ - **Weaknesses:** More upfront scaffolding, steeper curve, overkill for simple cases
70
+ - **Eval concerns:** State transition correctness, goal completion rate, tool use accuracy, safety guardrails
71
+
72
+ ### OpenAI Agents SDK
73
+ - **Type:** Native OpenAI agent framework
74
+ - **Language:** Python + TypeScript
75
+ - **Model support:** Optimized for OpenAI (supports 100+ via Chat Completions compatibility)
76
+ - **Learning curve:** Beginner (4 primitives: Agents, Handoffs, Guardrails, Tracing)
77
+ - **Best for:** OpenAI-committed teams, rapid agent prototyping, voice agents (gpt-realtime), teams wanting visual builder (AgentKit)
78
+ - **Avoid if:** Model flexibility needed, complex multi-agent collaboration, persistent state management required, vendor lock-in concern
79
+ - **Strengths:** Simplest mental model, built-in tracing and guardrails, Handoffs for agent delegation, Realtime Agents for voice
80
+ - **Weaknesses:** OpenAI vendor lock-in, no built-in persistent state, younger ecosystem
81
+ - **Eval concerns:** Instruction following, safety guardrails, escalation accuracy, tone consistency
82
+
83
+ ### Antigravity Agent SDK (Anthropic)
84
+ - **Type:** Code-aware autonomous agent framework
85
+ - **Language:** Python + TypeScript
86
+ - **Model support:** Claude models only
87
+ - **Learning curve:** Intermediate (18 hook events, MCP, tool decorators)
88
+ - **Best for:** Developer tooling, code generation/review agents, autonomous coding assistants, MCP-heavy architectures, safety-critical applications
89
+ - **Avoid if:** Model flexibility needed, stable/mature API required, use case unrelated to code/tool-use
90
+ - **Strengths:** Deepest MCP integration, built-in filesystem/shell access, 18 lifecycle hooks, automatic context compaction, extended thinking, safety-first design
91
+ - **Weaknesses:** Claude-only vendor lock-in, newer/evolving API, smaller community
92
+ - **Eval concerns:** Tool use correctness, safety, code quality, instruction following
93
+
94
+ ### AutoGen / AG2 / Microsoft Agent Framework
95
+ - **Type:** Multi-agent conversational framework
96
+ - **Language:** Python (AG2), Python + .NET (Microsoft Agent Framework)
97
+ - **Model support:** Model-agnostic
98
+ - **Learning curve:** Intermediate–Advanced
99
+ - **Best for:** Research applications, conversational problem-solving, code generation + execution loops, Microsoft/.NET shops
100
+ - **Avoid if:** You want ecosystem stability, deterministic workflows, or "safest long-term bet" (fragmentation risk)
101
+ - **Strengths:** Most sophisticated conversational agent patterns, code generation + execution loop, async event-driven (v0.4+), cross-language interop (Microsoft Agent Framework)
102
+ - **Weaknesses:** Ecosystem fragmented (AutoGen maintenance mode, AG2 fork, Microsoft Agent Framework preview) — genuine long-term risk
103
+ - **Eval concerns:** Conversation goal completion, consensus quality, code execution correctness
104
+
105
+ ### Google ADK (Agent Development Kit)
106
+ - **Type:** Multi-agent orchestration framework
107
+ - **Language:** Python + Java
108
+ - **Model support:** Optimized for Gemini; supports other models via LiteLLM
109
+ - **Learning curve:** Intermediate (agent/tool/session model, familiar if you know LangGraph)
110
+ - **Best for:** Google Cloud / Vertex AI shops, multi-agent workflows needing built-in session management and memory, teams already committed to Gemini, agent pipelines that need Google Search / BigQuery tool integration
111
+ - **Avoid if:** Model flexibility beyond Gemini is required, a Google Cloud dependency is unacceptable, or the stack is TypeScript-only
112
+ - **Strengths:** First-party Google support, built-in session/memory/artifact management, tight Vertex AI and Google Search integration, own eval framework (RAGAS-compatible), multi-agent by design (sequential, parallel, loop patterns), Java SDK for enterprise teams
113
+ - **Weaknesses:** Gemini vendor lock-in in practice, younger community than LangChain/LlamaIndex, less third-party integration depth
114
+ - **Eval concerns:** Multi-agent task decomposition, tool use correctness, session state consistency, goal completion rate
115
+
116
+ ### Haystack
117
+ - **Type:** NLP pipeline framework
118
+ - **Language:** Python
119
+ - **Model support:** Model-agnostic
120
+ - **Learning curve:** Intermediate
121
+ - **Best for:** Explicit, auditable NLP pipelines, document processing with fine-grained control, enterprise search, regulated industries needing transparency
122
+ - **Avoid if:** Rapid prototyping, multi-agent workflows, or you want a large community
123
+ - **Strengths:** Explicit pipeline control, strong for structured data pipelines, good documentation
124
+ - **Weaknesses:** Smaller community, less agent-oriented than alternatives
125
+ - **Eval concerns:** Extraction accuracy, pipeline output validity, retrieval quality
126
+
127
+ ---
128
+
129
+ ## Decision Dimensions
130
+
131
+ ### By System Type
132
+
133
+ | System Type | Primary Framework(s) | Key Eval Concerns |
134
+ |-------------|---------------------|-------------------|
135
+ | RAG / Knowledge Q&A | LlamaIndex, LangChain | Context faithfulness, hallucination, retrieval precision/recall |
136
+ | Multi-agent orchestration | CrewAI, LangGraph, Google ADK | Task decomposition, handoff quality, goal completion |
137
+ | Conversational assistants | OpenAI Agents SDK, Antigravity Agent SDK | Tone, safety, instruction following, escalation |
138
+ | Structured data extraction | LangChain, LlamaIndex | Schema compliance, extraction accuracy |
139
+ | Autonomous task agents | LangGraph, OpenAI Agents SDK | Safety guardrails, tool correctness, cost adherence |
140
+ | Content generation | Antigravity Agent SDK, OpenAI Agents SDK | Brand voice, factual accuracy, tone |
141
+ | Code automation | Antigravity Agent SDK | Code correctness, safety, test pass rate |
142
+
143
+ ### By Team Size and Stage
144
+
145
+ | Context | Recommendation |
146
+ |---------|----------------|
147
+ | Solo dev, prototyping | OpenAI Agents SDK or CrewAI (fastest to running) |
148
+ | Solo dev, RAG | LlamaIndex (batteries included) |
149
+ | Team, production, stateful | LangGraph (best fault tolerance) |
150
+ | Team, evolving requirements | LangChain (broadest escape hatches) |
151
+ | Team, multi-agent | CrewAI (simplest role abstraction) |
152
+ | Enterprise, .NET | AutoGen AG2 / Microsoft Agent Framework |
153
+
154
+ ### By Model Commitment
155
+
156
+ | Preference | Framework |
157
+ |-----------|-----------|
158
+ | OpenAI-only | OpenAI Agents SDK |
159
+ | Anthropic/Claude-only | Antigravity Agent SDK |
160
+ | Google/Gemini-committed | Google ADK |
161
+ | Model-agnostic (full flexibility) | LangChain, LlamaIndex, CrewAI, LangGraph, AutoGen/AG2, Haystack |
162
+
163
+ ---
164
+
165
+ ## Anti-Patterns
166
+
167
+ 1. **Using LangChain for simple chatbots** — Direct SDK call is less code, faster, and easier to debug
168
+ 2. **Using CrewAI for complex stateful workflows** — Checkpointing gaps will bite you in production
169
+ 3. **Using OpenAI Agents SDK with non-OpenAI models** — Loses the integration benefits you chose it for
170
+ 4. **Using LlamaIndex as a multi-agent framework** — It can do agents, but that's not its strength
171
+ 5. **Defaulting to LangChain without evaluating alternatives** — "Everyone uses it" ≠ right for your use case
172
+ 6. **Starting a new project on AutoGen (not AG2)** — AutoGen is in maintenance mode; use AG2 or wait for Microsoft Agent Framework GA
173
+ 7. **Choosing LangGraph for simple linear flows** — The graph overhead is not worth it; use LangChain chains instead
174
+ 8. **Ignoring vendor lock-in** — Provider-native SDKs (OpenAI, Anthropic) trade flexibility for integration depth; decide consciously
175
+
176
+ ---
177
+
178
+ ## Combination Plays (Multi-Framework Stacks)
179
+
180
+ | Production Pattern | Stack |
181
+ |-------------------|-------|
182
+ | RAG with observability | LlamaIndex + LangSmith or Langfuse |
183
+ | Stateful agent with RAG | LangGraph + LlamaIndex |
184
+ | Multi-agent with tracing | CrewAI + Langfuse |
185
+ | OpenAI agents with evals | OpenAI Agents SDK + Promptfoo or Braintrust |
186
+ | Antigravity agents with MCP | Antigravity Agent SDK + LangSmith or Arize Phoenix |