@runchr/gstack-antigravity 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of @runchr/gstack-antigravity might be problematic. Click here for more details.

Files changed (231) hide show
  1. package/.agents/skills/gstack/.agents/skills/gstack/SKILL.md +651 -0
  2. package/.agents/skills/gstack/.agents/skills/gstack-autoplan/SKILL.md +678 -0
  3. package/.agents/skills/gstack/.agents/skills/gstack-benchmark/SKILL.md +482 -0
  4. package/.agents/skills/gstack/.agents/skills/gstack-browse/SKILL.md +511 -0
  5. package/.agents/skills/gstack/.agents/skills/gstack-canary/SKILL.md +486 -0
  6. package/.agents/skills/gstack/.agents/skills/gstack-careful/SKILL.md +50 -0
  7. package/.agents/skills/gstack/.agents/skills/gstack-cso/SKILL.md +607 -0
  8. package/.agents/skills/gstack/.agents/skills/gstack-design-consultation/SKILL.md +615 -0
  9. package/.agents/skills/gstack/.agents/skills/gstack-design-review/SKILL.md +988 -0
  10. package/.agents/skills/gstack/.agents/skills/gstack-document-release/SKILL.md +604 -0
  11. package/.agents/skills/gstack/.agents/skills/gstack-freeze/SKILL.md +67 -0
  12. package/.agents/skills/gstack/.agents/skills/gstack-guard/SKILL.md +62 -0
  13. package/.agents/skills/gstack/.agents/skills/gstack-investigate/SKILL.md +415 -0
  14. package/.agents/skills/gstack/.agents/skills/gstack-land-and-deploy/SKILL.md +873 -0
  15. package/.agents/skills/gstack/.agents/skills/gstack-office-hours/SKILL.md +986 -0
  16. package/.agents/skills/gstack/.agents/skills/gstack-plan-ceo-review/SKILL.md +1268 -0
  17. package/.agents/skills/gstack/.agents/skills/gstack-plan-design-review/SKILL.md +668 -0
  18. package/.agents/skills/gstack/.agents/skills/gstack-plan-eng-review/SKILL.md +826 -0
  19. package/.agents/skills/gstack/.agents/skills/gstack-qa/SKILL.md +1006 -0
  20. package/.agents/skills/gstack/.agents/skills/gstack-qa-only/SKILL.md +626 -0
  21. package/.agents/skills/gstack/.agents/skills/gstack-retro/SKILL.md +1065 -0
  22. package/.agents/skills/gstack/.agents/skills/gstack-review/SKILL.md +704 -0
  23. package/.agents/skills/gstack/.agents/skills/gstack-setup-browser-cookies/SKILL.md +325 -0
  24. package/.agents/skills/gstack/.agents/skills/gstack-setup-deploy/SKILL.md +450 -0
  25. package/.agents/skills/gstack/.agents/skills/gstack-ship/SKILL.md +1312 -0
  26. package/.agents/skills/gstack/.agents/skills/gstack-unfreeze/SKILL.md +36 -0
  27. package/.agents/skills/gstack/.agents/skills/gstack-upgrade/SKILL.md +220 -0
  28. package/.agents/skills/gstack/.env.example +5 -0
  29. package/.agents/skills/gstack/.github/workflows/skill-docs.yml +17 -0
  30. package/.agents/skills/gstack/AGENTS.md +49 -0
  31. package/.agents/skills/gstack/ARCHITECTURE.md +359 -0
  32. package/.agents/skills/gstack/BROWSER.md +271 -0
  33. package/.agents/skills/gstack/CHANGELOG.md +800 -0
  34. package/.agents/skills/gstack/CLAUDE.md +284 -0
  35. package/.agents/skills/gstack/CONTRIBUTING.md +370 -0
  36. package/.agents/skills/gstack/ETHOS.md +129 -0
  37. package/.agents/skills/gstack/LICENSE +21 -0
  38. package/.agents/skills/gstack/README.md +228 -0
  39. package/.agents/skills/gstack/SKILL.md +657 -0
  40. package/.agents/skills/gstack/SKILL.md.tmpl +281 -0
  41. package/.agents/skills/gstack/TODOS.md +564 -0
  42. package/.agents/skills/gstack/VERSION +1 -0
  43. package/.agents/skills/gstack/autoplan/SKILL.md +689 -0
  44. package/.agents/skills/gstack/autoplan/SKILL.md.tmpl +416 -0
  45. package/.agents/skills/gstack/benchmark/SKILL.md +489 -0
  46. package/.agents/skills/gstack/benchmark/SKILL.md.tmpl +233 -0
  47. package/.agents/skills/gstack/bin/dev-setup +68 -0
  48. package/.agents/skills/gstack/bin/dev-teardown +56 -0
  49. package/.agents/skills/gstack/bin/gstack-analytics +191 -0
  50. package/.agents/skills/gstack/bin/gstack-community-dashboard +113 -0
  51. package/.agents/skills/gstack/bin/gstack-config +38 -0
  52. package/.agents/skills/gstack/bin/gstack-diff-scope +71 -0
  53. package/.agents/skills/gstack/bin/gstack-global-discover.ts +591 -0
  54. package/.agents/skills/gstack/bin/gstack-repo-mode +93 -0
  55. package/.agents/skills/gstack/bin/gstack-review-log +9 -0
  56. package/.agents/skills/gstack/bin/gstack-review-read +12 -0
  57. package/.agents/skills/gstack/bin/gstack-slug +15 -0
  58. package/.agents/skills/gstack/bin/gstack-telemetry-log +158 -0
  59. package/.agents/skills/gstack/bin/gstack-telemetry-sync +127 -0
  60. package/.agents/skills/gstack/bin/gstack-update-check +196 -0
  61. package/.agents/skills/gstack/browse/SKILL.md +517 -0
  62. package/.agents/skills/gstack/browse/SKILL.md.tmpl +141 -0
  63. package/.agents/skills/gstack/browse/bin/find-browse +21 -0
  64. package/.agents/skills/gstack/browse/bin/remote-slug +14 -0
  65. package/.agents/skills/gstack/browse/scripts/build-node-server.sh +48 -0
  66. package/.agents/skills/gstack/browse/src/browser-manager.ts +634 -0
  67. package/.agents/skills/gstack/browse/src/buffers.ts +137 -0
  68. package/.agents/skills/gstack/browse/src/bun-polyfill.cjs +109 -0
  69. package/.agents/skills/gstack/browse/src/cli.ts +420 -0
  70. package/.agents/skills/gstack/browse/src/commands.ts +111 -0
  71. package/.agents/skills/gstack/browse/src/config.ts +150 -0
  72. package/.agents/skills/gstack/browse/src/cookie-import-browser.ts +417 -0
  73. package/.agents/skills/gstack/browse/src/cookie-picker-routes.ts +207 -0
  74. package/.agents/skills/gstack/browse/src/cookie-picker-ui.ts +541 -0
  75. package/.agents/skills/gstack/browse/src/find-browse.ts +61 -0
  76. package/.agents/skills/gstack/browse/src/meta-commands.ts +269 -0
  77. package/.agents/skills/gstack/browse/src/platform.ts +17 -0
  78. package/.agents/skills/gstack/browse/src/read-commands.ts +335 -0
  79. package/.agents/skills/gstack/browse/src/server.ts +369 -0
  80. package/.agents/skills/gstack/browse/src/snapshot.ts +398 -0
  81. package/.agents/skills/gstack/browse/src/url-validation.ts +91 -0
  82. package/.agents/skills/gstack/browse/src/write-commands.ts +352 -0
  83. package/.agents/skills/gstack/browse/test/bun-polyfill.test.ts +72 -0
  84. package/.agents/skills/gstack/browse/test/commands.test.ts +1836 -0
  85. package/.agents/skills/gstack/browse/test/config.test.ts +250 -0
  86. package/.agents/skills/gstack/browse/test/cookie-import-browser.test.ts +397 -0
  87. package/.agents/skills/gstack/browse/test/cookie-picker-routes.test.ts +205 -0
  88. package/.agents/skills/gstack/browse/test/find-browse.test.ts +50 -0
  89. package/.agents/skills/gstack/browse/test/fixtures/basic.html +33 -0
  90. package/.agents/skills/gstack/browse/test/fixtures/cursor-interactive.html +22 -0
  91. package/.agents/skills/gstack/browse/test/fixtures/dialog.html +15 -0
  92. package/.agents/skills/gstack/browse/test/fixtures/empty.html +2 -0
  93. package/.agents/skills/gstack/browse/test/fixtures/forms.html +55 -0
  94. package/.agents/skills/gstack/browse/test/fixtures/qa-eval-checkout.html +108 -0
  95. package/.agents/skills/gstack/browse/test/fixtures/qa-eval-spa.html +98 -0
  96. package/.agents/skills/gstack/browse/test/fixtures/qa-eval.html +51 -0
  97. package/.agents/skills/gstack/browse/test/fixtures/responsive.html +49 -0
  98. package/.agents/skills/gstack/browse/test/fixtures/snapshot.html +55 -0
  99. package/.agents/skills/gstack/browse/test/fixtures/spa.html +24 -0
  100. package/.agents/skills/gstack/browse/test/fixtures/states.html +17 -0
  101. package/.agents/skills/gstack/browse/test/fixtures/upload.html +25 -0
  102. package/.agents/skills/gstack/browse/test/gstack-config.test.ts +125 -0
  103. package/.agents/skills/gstack/browse/test/gstack-update-check.test.ts +467 -0
  104. package/.agents/skills/gstack/browse/test/handoff.test.ts +235 -0
  105. package/.agents/skills/gstack/browse/test/path-validation.test.ts +63 -0
  106. package/.agents/skills/gstack/browse/test/platform.test.ts +37 -0
  107. package/.agents/skills/gstack/browse/test/snapshot.test.ts +467 -0
  108. package/.agents/skills/gstack/browse/test/test-server.ts +57 -0
  109. package/.agents/skills/gstack/browse/test/url-validation.test.ts +72 -0
  110. package/.agents/skills/gstack/canary/SKILL.md +493 -0
  111. package/.agents/skills/gstack/canary/SKILL.md.tmpl +220 -0
  112. package/.agents/skills/gstack/careful/SKILL.md +59 -0
  113. package/.agents/skills/gstack/careful/SKILL.md.tmpl +57 -0
  114. package/.agents/skills/gstack/careful/bin/check-careful.sh +112 -0
  115. package/.agents/skills/gstack/codex/SKILL.md +677 -0
  116. package/.agents/skills/gstack/codex/SKILL.md.tmpl +356 -0
  117. package/.agents/skills/gstack/conductor.json +6 -0
  118. package/.agents/skills/gstack/cso/SKILL.md +615 -0
  119. package/.agents/skills/gstack/cso/SKILL.md.tmpl +376 -0
  120. package/.agents/skills/gstack/design-consultation/SKILL.md +625 -0
  121. package/.agents/skills/gstack/design-consultation/SKILL.md.tmpl +369 -0
  122. package/.agents/skills/gstack/design-review/SKILL.md +998 -0
  123. package/.agents/skills/gstack/design-review/SKILL.md.tmpl +262 -0
  124. package/.agents/skills/gstack/docs/images/github-2013.png +0 -0
  125. package/.agents/skills/gstack/docs/images/github-2026.png +0 -0
  126. package/.agents/skills/gstack/docs/skills.md +877 -0
  127. package/.agents/skills/gstack/document-release/SKILL.md +613 -0
  128. package/.agents/skills/gstack/document-release/SKILL.md.tmpl +357 -0
  129. package/.agents/skills/gstack/freeze/SKILL.md +82 -0
  130. package/.agents/skills/gstack/freeze/SKILL.md.tmpl +80 -0
  131. package/.agents/skills/gstack/freeze/bin/check-freeze.sh +68 -0
  132. package/.agents/skills/gstack/gstack-upgrade/SKILL.md +226 -0
  133. package/.agents/skills/gstack/gstack-upgrade/SKILL.md.tmpl +224 -0
  134. package/.agents/skills/gstack/guard/SKILL.md +82 -0
  135. package/.agents/skills/gstack/guard/SKILL.md.tmpl +80 -0
  136. package/.agents/skills/gstack/investigate/SKILL.md +435 -0
  137. package/.agents/skills/gstack/investigate/SKILL.md.tmpl +196 -0
  138. package/.agents/skills/gstack/land-and-deploy/SKILL.md +880 -0
  139. package/.agents/skills/gstack/land-and-deploy/SKILL.md.tmpl +575 -0
  140. package/.agents/skills/gstack/office-hours/SKILL.md +996 -0
  141. package/.agents/skills/gstack/office-hours/SKILL.md.tmpl +624 -0
  142. package/.agents/skills/gstack/package.json +55 -0
  143. package/.agents/skills/gstack/plan-ceo-review/SKILL.md +1277 -0
  144. package/.agents/skills/gstack/plan-ceo-review/SKILL.md.tmpl +838 -0
  145. package/.agents/skills/gstack/plan-design-review/SKILL.md +676 -0
  146. package/.agents/skills/gstack/plan-design-review/SKILL.md.tmpl +314 -0
  147. package/.agents/skills/gstack/plan-eng-review/SKILL.md +836 -0
  148. package/.agents/skills/gstack/plan-eng-review/SKILL.md.tmpl +279 -0
  149. package/.agents/skills/gstack/qa/SKILL.md +1016 -0
  150. package/.agents/skills/gstack/qa/SKILL.md.tmpl +316 -0
  151. package/.agents/skills/gstack/qa/references/issue-taxonomy.md +85 -0
  152. package/.agents/skills/gstack/qa/templates/qa-report-template.md +126 -0
  153. package/.agents/skills/gstack/qa-only/SKILL.md +633 -0
  154. package/.agents/skills/gstack/qa-only/SKILL.md.tmpl +101 -0
  155. package/.agents/skills/gstack/retro/SKILL.md +1072 -0
  156. package/.agents/skills/gstack/retro/SKILL.md.tmpl +833 -0
  157. package/.agents/skills/gstack/review/SKILL.md +849 -0
  158. package/.agents/skills/gstack/review/SKILL.md.tmpl +259 -0
  159. package/.agents/skills/gstack/review/TODOS-format.md +62 -0
  160. package/.agents/skills/gstack/review/checklist.md +190 -0
  161. package/.agents/skills/gstack/review/design-checklist.md +132 -0
  162. package/.agents/skills/gstack/review/greptile-triage.md +220 -0
  163. package/.agents/skills/gstack/scripts/analytics.ts +190 -0
  164. package/.agents/skills/gstack/scripts/dev-skill.ts +82 -0
  165. package/.agents/skills/gstack/scripts/eval-compare.ts +96 -0
  166. package/.agents/skills/gstack/scripts/eval-list.ts +116 -0
  167. package/.agents/skills/gstack/scripts/eval-select.ts +86 -0
  168. package/.agents/skills/gstack/scripts/eval-summary.ts +187 -0
  169. package/.agents/skills/gstack/scripts/eval-watch.ts +172 -0
  170. package/.agents/skills/gstack/scripts/gen-skill-docs.ts +2414 -0
  171. package/.agents/skills/gstack/scripts/skill-check.ts +167 -0
  172. package/.agents/skills/gstack/setup +269 -0
  173. package/.agents/skills/gstack/setup-browser-cookies/SKILL.md +330 -0
  174. package/.agents/skills/gstack/setup-browser-cookies/SKILL.md.tmpl +74 -0
  175. package/.agents/skills/gstack/setup-deploy/SKILL.md +459 -0
  176. package/.agents/skills/gstack/setup-deploy/SKILL.md.tmpl +220 -0
  177. package/.agents/skills/gstack/ship/SKILL.md +1457 -0
  178. package/.agents/skills/gstack/ship/SKILL.md.tmpl +528 -0
  179. package/.agents/skills/gstack/supabase/config.sh +10 -0
  180. package/.agents/skills/gstack/supabase/functions/community-pulse/index.ts +59 -0
  181. package/.agents/skills/gstack/supabase/functions/telemetry-ingest/index.ts +135 -0
  182. package/.agents/skills/gstack/supabase/functions/update-check/index.ts +37 -0
  183. package/.agents/skills/gstack/supabase/migrations/001_telemetry.sql +89 -0
  184. package/.agents/skills/gstack/test/analytics.test.ts +277 -0
  185. package/.agents/skills/gstack/test/codex-e2e.test.ts +197 -0
  186. package/.agents/skills/gstack/test/fixtures/coverage-audit-fixture.ts +76 -0
  187. package/.agents/skills/gstack/test/fixtures/eval-baselines.json +7 -0
  188. package/.agents/skills/gstack/test/fixtures/qa-eval-checkout-ground-truth.json +43 -0
  189. package/.agents/skills/gstack/test/fixtures/qa-eval-ground-truth.json +43 -0
  190. package/.agents/skills/gstack/test/fixtures/qa-eval-spa-ground-truth.json +43 -0
  191. package/.agents/skills/gstack/test/fixtures/review-eval-design-slop.css +86 -0
  192. package/.agents/skills/gstack/test/fixtures/review-eval-design-slop.html +41 -0
  193. package/.agents/skills/gstack/test/fixtures/review-eval-enum-diff.rb +30 -0
  194. package/.agents/skills/gstack/test/fixtures/review-eval-enum.rb +27 -0
  195. package/.agents/skills/gstack/test/fixtures/review-eval-vuln.rb +14 -0
  196. package/.agents/skills/gstack/test/gemini-e2e.test.ts +173 -0
  197. package/.agents/skills/gstack/test/gen-skill-docs.test.ts +1049 -0
  198. package/.agents/skills/gstack/test/global-discover.test.ts +187 -0
  199. package/.agents/skills/gstack/test/helpers/codex-session-runner.ts +282 -0
  200. package/.agents/skills/gstack/test/helpers/e2e-helpers.ts +239 -0
  201. package/.agents/skills/gstack/test/helpers/eval-store.test.ts +548 -0
  202. package/.agents/skills/gstack/test/helpers/eval-store.ts +689 -0
  203. package/.agents/skills/gstack/test/helpers/gemini-session-runner.test.ts +104 -0
  204. package/.agents/skills/gstack/test/helpers/gemini-session-runner.ts +201 -0
  205. package/.agents/skills/gstack/test/helpers/llm-judge.ts +130 -0
  206. package/.agents/skills/gstack/test/helpers/observability.test.ts +283 -0
  207. package/.agents/skills/gstack/test/helpers/session-runner.test.ts +96 -0
  208. package/.agents/skills/gstack/test/helpers/session-runner.ts +357 -0
  209. package/.agents/skills/gstack/test/helpers/skill-parser.ts +206 -0
  210. package/.agents/skills/gstack/test/helpers/touchfiles.ts +260 -0
  211. package/.agents/skills/gstack/test/hook-scripts.test.ts +373 -0
  212. package/.agents/skills/gstack/test/skill-e2e-browse.test.ts +293 -0
  213. package/.agents/skills/gstack/test/skill-e2e-deploy.test.ts +279 -0
  214. package/.agents/skills/gstack/test/skill-e2e-design.test.ts +614 -0
  215. package/.agents/skills/gstack/test/skill-e2e-plan.test.ts +538 -0
  216. package/.agents/skills/gstack/test/skill-e2e-qa-bugs.test.ts +194 -0
  217. package/.agents/skills/gstack/test/skill-e2e-qa-workflow.test.ts +412 -0
  218. package/.agents/skills/gstack/test/skill-e2e-review.test.ts +535 -0
  219. package/.agents/skills/gstack/test/skill-e2e-workflow.test.ts +586 -0
  220. package/.agents/skills/gstack/test/skill-e2e.test.ts +3325 -0
  221. package/.agents/skills/gstack/test/skill-llm-eval.test.ts +787 -0
  222. package/.agents/skills/gstack/test/skill-parser.test.ts +179 -0
  223. package/.agents/skills/gstack/test/skill-routing-e2e.test.ts +605 -0
  224. package/.agents/skills/gstack/test/skill-validation.test.ts +1520 -0
  225. package/.agents/skills/gstack/test/telemetry.test.ts +278 -0
  226. package/.agents/skills/gstack/test/touchfiles.test.ts +262 -0
  227. package/.agents/skills/gstack/unfreeze/SKILL.md +40 -0
  228. package/.agents/skills/gstack/unfreeze/SKILL.md.tmpl +38 -0
  229. package/README.md +12 -7
  230. package/README_KO.md +12 -6
  231. package/package.json +3 -2
@@ -0,0 +1,357 @@
1
+ /**
2
+ * Claude CLI subprocess runner for skill E2E testing.
3
+ *
4
+ * Spawns `claude -p` as a completely independent process (not via Agent SDK),
5
+ * so it works inside Claude Code sessions. Pipes prompt via stdin, streams
6
+ * NDJSON output for real-time progress, scans for browse errors.
7
+ */
8
+
9
+ import * as fs from 'fs';
10
+ import * as path from 'path';
11
+ import * as os from 'os';
12
+
13
/** Directory for gstack developer artifacts, located inside the current user's home directory. */
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');

/** JSON file inside GSTACK_DEV_DIR that receives the live E2E heartbeat. */
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
15
+
16
/**
 * Turn a test name into something usable as a filename.
 *
 * Leading slashes are dropped; every remaining `/` becomes `-`.
 */
export function sanitizeTestName(name: string): string {
  const withoutLeadingSlashes = name.replace(/^\/+/, '');
  return withoutLeadingSlashes.split('/').join('-');
}
20
+
21
/**
 * Atomic write: write `data` to `<filePath>.tmp`, then rename it over `filePath`.
 *
 * Errors from either step are rethrown — callers that treat the write as
 * best-effort must wrap the call in try/catch. When a step fails, the
 * temporary file is removed (best-effort) so an aborted write does not leave
 * a stale `.tmp` file behind.
 *
 * @param filePath Destination path.
 * @param data     Text to write.
 * @throws Whatever `fs.writeFileSync` / `fs.renameSync` throws.
 */
function atomicWriteSync(filePath: string, data: string): void {
  const tmp = filePath + '.tmp';
  try {
    fs.writeFileSync(tmp, data);
    fs.renameSync(tmp, filePath);
  } catch (err) {
    // Clean up the partial temp file; ignore failures (it may never have been created).
    try { fs.unlinkSync(tmp); } catch { /* nothing to clean up */ }
    throw err;
  }
}
27
+
28
/** Size, token and cost figures reported for one skill test run. */
export interface CostEstimate {
  /** Number of characters in the prompt. */
  inputChars: number;
  /** Number of characters in the final result text. */
  outputChars: number;
  /** Token total for the run. */
  estimatedTokens: number;
  /** Cost of the run in USD. */
  estimatedCost: number;
  /** Number of turns consumed. */
  turnsUsed: number;
}
35
+
36
/** Everything `runSkillTest` reports about a single run. */
export interface SkillTestResult {
  /** Tool invocations seen in the transcript, in order. */
  toolCalls: Array<{ tool: string; input: any; output: string }>;
  /** Browse-related error strings found in the transcript or stderr. */
  browseErrors: string[];
  /** Why the run ended, e.g. `success`, `timeout`, `error_api`, `exit_code_<n>`, or a result subtype. */
  exitReason: string;
  /** Wall-clock duration of the run, in ms. */
  duration: number;
  /** Final result text (empty string when no result event was received). */
  output: string;
  /** Size, token and cost figures for the run. */
  costEstimate: CostEstimate;
  /** Every NDJSON event that was parsed from stdout. */
  transcript: any[];
  /** Which model was used for this test (added for Sonnet/Opus split diagnostics) */
  model: string;
  /**
   * Time from run start to the first observed tool call, in ms; 0 when no
   * tool call was seen (added for rate-limit diagnostics).
   */
  firstResponseMs: number;
  /** Peak latency between consecutive tool calls, in ms */
  maxInterTurnMs: number;
}
51
+
52
/**
 * Patterns searched for in the transcript and stderr after a run; each match
 * is reported as a browse error.
 */
const BROWSE_ERROR_PATTERNS: RegExp[] = [
  // Command name not recognized
  /Unknown command: \w+/,
  // Snapshot flag not recognized
  /Unknown snapshot flag: .+/,
  // Browse binary could not be located
  /ERROR: browse binary not found/,
  // Server did not come up
  /Server failed to start/,
  // Missing file whose message mentions browse (case-insensitive)
  /no such file or directory.*browse/i,
];
59
+
60
+ // --- Testable NDJSON parser ---
61
+
62
/** Structured view of an NDJSON event stream, as produced by `parseNDJSON`. */
export interface ParsedNDJSON {
  /** Parsed events, in input order. */
  transcript: any[];
  /** The `result` event, or null when none was present. */
  resultLine: any | null;
  /** Number of `assistant` events. */
  turnCount: number;
  /** Number of `tool_use` items across all assistant events. */
  toolCallCount: number;
  /** One entry per `tool_use` item, in order of appearance. */
  toolCalls: Array<{ tool: string; input: any; output: string }>;
}
69
+
70
/**
 * Parse an array of NDJSON lines into structured transcript data.
 * Pure function — no I/O, no side effects. Used by both the streaming
 * reader and unit tests.
 *
 * - Blank lines, lines that are not valid JSON, and JSON values that are not
 *   objects (`null`, numbers, strings, arrays) are skipped, so every entry in
 *   `transcript` is an event object.
 * - Each `assistant` event counts as one turn; every `tool_use` item in its
 *   `message.content` array is recorded as a tool call. Malformed content
 *   (non-array, or null/primitive items) is ignored without discarding the
 *   remaining items of the event.
 * - The last event with `type === 'result'` is returned as `resultLine`.
 *
 * The `output` of each recorded tool call is always the empty string; tool
 * results are not correlated back to their calls here.
 *
 * @param lines Raw NDJSON lines, one JSON document per entry.
 * @returns The parsed transcript together with turn and tool-call tallies.
 */
export function parseNDJSON(lines: string[]): ParsedNDJSON {
  const transcript: any[] = [];
  let resultLine: any = null;
  let turnCount = 0;
  let toolCallCount = 0;
  const toolCalls: ParsedNDJSON['toolCalls'] = [];

  for (const line of lines) {
    if (!line.trim()) continue;

    let event: any;
    try {
      event = JSON.parse(line);
    } catch {
      continue; // skip malformed lines
    }

    // Valid JSON that is not an object (e.g. `null`, `42`) is not an event.
    if (typeof event !== 'object' || event === null || Array.isArray(event)) continue;

    transcript.push(event);

    // Track turns and tool calls from assistant events
    if (event.type === 'assistant') {
      turnCount++;
      const content = event.message?.content;
      if (Array.isArray(content)) {
        for (const item of content) {
          if (item?.type !== 'tool_use') continue;
          toolCallCount++;
          toolCalls.push({
            tool: item.name || 'unknown',
            input: item.input || {},
            output: '',
          });
        }
      }
    }

    if (event.type === 'result') resultLine = event;
  }

  return { transcript, resultLine, turnCount, toolCallCount, toolCalls };
}
110
+
111
/** Cap `s` at `max` characters (UTF-16 code units), appending `…` when anything was cut off. */
function truncate(s: string, max: number): string {
  if (s.length <= max) {
    return s;
  }
  return `${s.slice(0, max)}…`;
}
114
+
115
+ // --- Main runner ---
116
+
117
/**
 * Run one skill E2E test by spawning `claude -p` as a subprocess.
 *
 * The prompt is written to a temp file under `os.tmpdir()` and piped to the
 * CLI through stdin. stdout is consumed as streaming NDJSON: every tool call
 * is echoed to stderr as a progress line and, when `runId` is given, progress,
 * raw transcript lines and a heartbeat file are persisted (all best-effort).
 * The process is killed once `timeout` ms have elapsed.
 *
 * The temp prompt file and the timeout timer are released in a `finally`
 * block, so they do not leak when spawning or reading fails. CLI arguments
 * are single-quote escaped so shell metacharacters in a model or tool name
 * are passed through literally.
 *
 * @param options.prompt           Prompt text sent on stdin.
 * @param options.workingDirectory cwd for the spawned process.
 * @param options.maxTurns         Passed as `--max-turns` (default 15).
 * @param options.allowedTools     Passed after `--allowed-tools` (default Bash, Read, Write).
 * @param options.timeout          Kill the process after this many ms (default 120000).
 * @param options.testName         Used for log/transcript filenames and the heartbeat.
 * @param options.runId            Enables the per-run log directory and the heartbeat.
 * @param options.model            Model name; falls back to `EVALS_MODEL`, then `claude-sonnet-4-6`.
 * @returns Tool calls, browse errors, exit reason, timing and cost data for the run.
 * @throws If the prompt file cannot be written or the process cannot be spawned.
 */
export async function runSkillTest(options: {
  prompt: string;
  workingDirectory: string;
  maxTurns?: number;
  allowedTools?: string[];
  timeout?: number;
  testName?: string;
  runId?: string;
  /** Model to use. Defaults to claude-sonnet-4-6 (overridable via EVALS_MODEL env). */
  model?: string;
}): Promise<SkillTestResult> {
  const {
    prompt,
    workingDirectory,
    maxTurns = 15,
    allowedTools = ['Bash', 'Read', 'Write'],
    timeout = 120_000,
    testName,
    runId,
  } = options;
  const model = options.model ?? process.env.EVALS_MODEL ?? 'claude-sonnet-4-6';

  const startTime = Date.now();
  const startedAt = new Date().toISOString();

  // Set up per-run log directory if runId is provided
  let runDir: string | null = null;
  const safeName = testName ? sanitizeTestName(testName) : null;
  if (runId) {
    try {
      runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId);
      fs.mkdirSync(runDir, { recursive: true });
    } catch { /* non-fatal */ }
  }

  // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
  // avoid shell escaping issues. --verbose is required for stream-json mode.
  const args = [
    '-p',
    '--model', model,
    '--output-format', 'stream-json',
    '--verbose',
    '--dangerously-skip-permissions',
    '--max-turns', String(maxTurns),
    '--allowed-tools', ...allowedTools,
  ];

  // POSIX single-quote escaping: nothing inside '…' is interpreted by sh, and
  // an embedded ' is written as '\''.
  const shellQuote = (arg: string): string => `'${arg.replace(/'/g, "'\\''")}'`;

  // Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions
  // where afterAll cleanup deletes the dir before cat reads the file (especially
  // with --concurrent --retry). Using os.tmpdir() + unique suffix keeps it stable.
  const promptFile = path.join(os.tmpdir(), `.prompt-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`);

  let stderr = '';
  let exitReason = 'unknown';
  let timedOut = false;
  let exitCode: number | undefined;
  let timeoutId: ReturnType<typeof setTimeout> | undefined;

  // Stream NDJSON from stdout for real-time progress
  const collectedLines: string[] = [];
  let liveTurnCount = 0;
  let liveToolCount = 0;
  let firstResponseMs = 0;
  let lastToolTime = 0;
  let maxInterTurnMs = 0;

  try {
    fs.writeFileSync(promptFile, prompt);

    const proc = Bun.spawn(['sh', '-c', `cat ${shellQuote(promptFile)} | claude ${args.map(shellQuote).join(' ')}`], {
      cwd: workingDirectory,
      stdout: 'pipe',
      stderr: 'pipe',
    });

    // Race against timeout
    timeoutId = setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, timeout);

    const stderrPromise = new Response(proc.stderr).text();

    const reader = proc.stdout.getReader();
    const decoder = new TextDecoder();
    let buf = '';

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buf += decoder.decode(value, { stream: true });
        const lines = buf.split('\n');
        buf = lines.pop() || '';
        for (const line of lines) {
          if (!line.trim()) continue;
          collectedLines.push(line);

          // Real-time progress to stderr + persistent logs
          try {
            const event = JSON.parse(line);
            if (event.type === 'assistant') {
              liveTurnCount++;
              const content = event.message?.content || [];
              for (const item of content) {
                if (item.type === 'tool_use') {
                  liveToolCount++;
                  const now = Date.now();
                  const elapsed = Math.round((now - startTime) / 1000);
                  // Track timing telemetry.
                  // NOTE(review): firstResponseMs is captured at the first tool call,
                  // not at the first NDJSON line — confirm which one is intended.
                  if (firstResponseMs === 0) firstResponseMs = now - startTime;
                  if (lastToolTime > 0) {
                    const interTurn = now - lastToolTime;
                    if (interTurn > maxInterTurnMs) maxInterTurnMs = interTurn;
                  }
                  lastToolTime = now;
                  const progressLine = ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
                  process.stderr.write(progressLine);

                  // Persist progress.log
                  if (runDir) {
                    try { fs.appendFileSync(path.join(runDir, 'progress.log'), progressLine); } catch { /* non-fatal */ }
                  }

                  // Write heartbeat (atomic)
                  if (runId && testName) {
                    try {
                      const toolDesc = `${item.name}(${truncate(JSON.stringify(item.input || {}), 60)})`;
                      atomicWriteSync(HEARTBEAT_PATH, JSON.stringify({
                        runId,
                        pid: proc.pid,
                        startedAt,
                        currentTest: testName,
                        status: 'running',
                        turn: liveTurnCount,
                        toolCount: liveToolCount,
                        lastTool: toolDesc,
                        lastToolAt: new Date().toISOString(),
                        elapsedSec: elapsed,
                      }, null, 2) + '\n');
                    } catch { /* non-fatal */ }
                  }
                }
              }
            }
          } catch { /* skip — parseNDJSON will handle it later */ }

          // Append raw NDJSON line to per-test transcript file
          if (runDir && safeName) {
            try { fs.appendFileSync(path.join(runDir, `${safeName}.ndjson`), line + '\n'); } catch { /* non-fatal */ }
          }
        }
      }
    } catch { /* stream read error — fall through to exit code handling */ }

    // Flush remaining buffer
    if (buf.trim()) {
      collectedLines.push(buf);
    }

    stderr = await stderrPromise;
    exitCode = await proc.exited;
  } finally {
    // Always release the timer and the temp prompt file, even when spawning
    // or reading failed part-way through.
    if (timeoutId !== undefined) clearTimeout(timeoutId);
    try { fs.unlinkSync(promptFile); } catch { /* non-fatal */ }
  }

  if (timedOut) {
    exitReason = 'timeout';
  } else if (exitCode === 0) {
    exitReason = 'success';
  } else {
    exitReason = `exit_code_${exitCode}`;
  }

  const duration = Date.now() - startTime;

  // Parse all collected NDJSON lines
  const parsed = parseNDJSON(collectedLines);
  const { transcript, resultLine, toolCalls } = parsed;
  const browseErrors: string[] = [];

  // Scan transcript + stderr for browse errors (first match per pattern, capped at 200 chars)
  const allText = transcript.map(e => JSON.stringify(e)).join('\n') + '\n' + stderr;
  for (const pattern of BROWSE_ERROR_PATTERNS) {
    const match = allText.match(pattern);
    if (match) {
      browseErrors.push(match[0].slice(0, 200));
    }
  }

  // Use resultLine for structured result data; it overrides the exit-code based reason
  if (resultLine) {
    if (resultLine.is_error) {
      // claude -p can return subtype=success with is_error=true (e.g. API connection failure)
      exitReason = 'error_api';
    } else if (resultLine.subtype === 'success') {
      exitReason = 'success';
    } else if (resultLine.subtype) {
      exitReason = resultLine.subtype;
    }
  }

  // Save failure transcript to persistent run directory (or fallback to workingDirectory)
  if (browseErrors.length > 0 || exitReason !== 'success') {
    try {
      const failureDir = runDir || path.join(workingDirectory, '.gstack', 'test-transcripts');
      fs.mkdirSync(failureDir, { recursive: true });
      const failureName = safeName
        ? `${safeName}-failure.json`
        : `e2e-${new Date().toISOString().replace(/[:.]/g, '-')}.json`;
      fs.writeFileSync(
        path.join(failureDir, failureName),
        JSON.stringify({
          prompt: prompt.slice(0, 500),
          testName: testName || 'unknown',
          exitReason,
          browseErrors,
          duration,
          turnAtTimeout: timedOut ? liveTurnCount : undefined,
          lastToolCall: liveToolCount > 0 ? `tool #${liveToolCount}` : undefined,
          stderr: stderr.slice(0, 2000),
          result: resultLine ? { type: resultLine.type, subtype: resultLine.subtype, result: resultLine.result?.slice?.(0, 500) } : null,
        }, null, 2),
      );
    } catch { /* non-fatal */ }
  }

  // Cost and token figures are taken from the result line; all fall back to 0 when it is absent
  const turnsUsed = resultLine?.num_turns || 0;
  const estimatedCost = resultLine?.total_cost_usd || 0;
  const inputChars = prompt.length;
  const outputChars = (resultLine?.result || '').length;
  const estimatedTokens = (resultLine?.usage?.input_tokens || 0)
    + (resultLine?.usage?.output_tokens || 0)
    + (resultLine?.usage?.cache_read_input_tokens || 0);

  const costEstimate: CostEstimate = {
    inputChars,
    outputChars,
    estimatedTokens,
    // Rounded to whole cents
    estimatedCost: Math.round((estimatedCost) * 100) / 100,
    turnsUsed,
  };

  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, model, firstResponseMs, maxInterTurnMs };
}
@@ -0,0 +1,206 @@
1
+ /**
2
+ * SKILL.md parser and validator.
3
+ *
4
+ * Extracts $B commands from code blocks, validates them against
5
+ * the command registry and snapshot flags.
6
+ *
7
+ * Used by:
8
+ * - test/skill-validation.test.ts (Tier 1 static tests)
9
+ * - scripts/skill-check.ts (health summary)
10
+ * - scripts/dev-skill.ts (watch mode)
11
+ */
12
+
13
+ import { ALL_COMMANDS } from '../../browse/src/commands';
14
+ import { parseSnapshotArgs } from '../../browse/src/snapshot';
15
+ import * as fs from 'fs';
16
+ import * as path from 'path';
17
+
18
/** One `$B` invocation found in a SKILL.md bash code block. */
export interface BrowseCommand {
  /** The word immediately following `$B`. */
  command: string;
  /** Arguments after the command, with quoted strings kept as single entries. */
  args: string[];
  /** 1-based line number within the file. */
  line: number;
  /** The matched invocation text, trimmed. */
  raw: string;
}
24
+
25
/** Outcome of validating every `$B` command in a SKILL.md file. */
export interface ValidationResult {
  /** Commands that passed every check. */
  valid: BrowseCommand[];
  /** Commands whose name is not in the command registry. */
  invalid: BrowseCommand[];
  /** `snapshot` commands whose arguments were rejected, with the error message. */
  snapshotFlagErrors: Array<{ command: BrowseCommand; error: string }>;
  /** Non-fatal observations about the file. */
  warnings: string[];
}
31
+
32
/**
 * Remove a trailing shell comment from an argument string.
 *
 * A `#` starts a comment only when it is outside single/double quotes and at
 * the beginning of a word (start of string or preceded by whitespace), so
 * selectors such as `'#main'` and URL fragments such as `/page#top` survive.
 */
function stripInlineComment(argsStr: string): string {
  let quote: string | null = null;
  for (let j = 0; j < argsStr.length; j++) {
    const ch = argsStr[j];
    if (quote !== null) {
      if (ch === quote) quote = null;
    } else if (ch === '"' || ch === "'") {
      quote = ch;
    } else if (ch === '#' && (j === 0 || /\s/.test(argsStr[j - 1]))) {
      return argsStr.slice(0, j).trim();
    }
  }
  return argsStr;
}

/**
 * Extract all $B invocations from bash code blocks in a SKILL.md file.
 *
 * Only lines inside fenced blocks opened with ```bash are scanned; other
 * fenced blocks and surrounding prose are ignored. A line may contain several
 * `$B` invocations; the arguments of each one end at the next `$` character.
 * Inline comments are stripped, and both "double" and 'single' quoted
 * arguments are kept as one entry with the quotes removed.
 *
 * @param skillPath Path of the SKILL.md file to read.
 * @returns One entry per invocation, in file order, with 1-based line numbers.
 * @throws If the file cannot be read.
 */
export function extractBrowseCommands(skillPath: string): BrowseCommand[] {
  const content = fs.readFileSync(skillPath, 'utf-8');
  const lines = content.split('\n');
  const commands: BrowseCommand[] = [];

  let inBashBlock = false;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];

    // Detect code block boundaries
    const fence = line.trimStart();
    if (fence.startsWith('```')) {
      if (inBashBlock) {
        inBashBlock = false;
      } else if (fence.startsWith('```bash')) {
        inBashBlock = true;
      }
      // Non-bash code blocks (```json, ```, ```js, etc.) are skipped
      continue;
    }

    if (!inBashBlock) continue;

    // Match lines with $B command invocations
    // Handle multiple $B commands on one line (e.g., "$B click @e3 $B fill @e4 "value"")
    const matches = line.matchAll(/\$B\s+(\S+)(?:\s+([^\$]*))?/g);
    for (const match of matches) {
      const command = match[1];
      const argsStr = stripInlineComment((match[2] || '').trim());

      // Parse args — quoted strings (either quote style) become a single argument
      const args: string[] = [];
      for (const am of argsStr.matchAll(/"([^"]*)"|'([^']*)'|(\S+)/g)) {
        args.push(am[1] ?? am[2] ?? am[3]);
      }

      commands.push({
        command,
        args,
        line: i + 1, // 1-based
        raw: match[0].trim(),
      });
    }
  }

  return commands;
}
96
+
97
/**
 * Validate every $B command found in a SKILL.md file.
 *
 * Commands whose name is missing from ALL_COMMANDS go to `invalid`. `snapshot`
 * commands with arguments are additionally run through parseSnapshotArgs; a
 * thrown error puts them in `snapshotFlagErrors`. Everything else is `valid`.
 * A file without any $B command yields a single warning and empty lists.
 */
export function validateSkill(skillPath: string): ValidationResult {
  const report: ValidationResult = {
    valid: [],
    invalid: [],
    snapshotFlagErrors: [],
    warnings: [],
  };

  const commands = extractBrowseCommands(skillPath);
  if (commands.length === 0) {
    report.warnings.push('no $B commands found');
    return report;
  }

  for (const cmd of commands) {
    if (!ALL_COMMANDS.has(cmd.command)) {
      report.invalid.push(cmd);
      continue;
    }

    // Only snapshot invocations that actually carry arguments need a flag check
    const needsFlagCheck = cmd.command === 'snapshot' && cmd.args.length > 0;
    if (needsFlagCheck) {
      let flagFailure: { command: BrowseCommand; error: string } | null = null;
      try {
        parseSnapshotArgs(cmd.args);
      } catch (err: any) {
        flagFailure = { command: cmd, error: err.message };
      }
      if (flagFailure !== null) {
        report.snapshotFlagErrors.push(flagFailure);
        continue;
      }
    }

    report.valid.push(cmd);
  }

  return report;
}
135
+
136
/**
 * Collect `REMOTE_SLUG=$(...)` assignment lines from the .md files that sit
 * directly inside each of the given subdirectories of `rootDir`.
 *
 * Subdirectories that do not exist are skipped. Lines are trimmed before
 * matching, and the trimmed text is what gets returned.
 *
 * @returns Map keyed by `<subdir>/<file>`; files without a match are omitted.
 */
export function extractRemoteSlugPatterns(rootDir: string, subdirs: string[]): Map<string, string[]> {
  const assignment = /^REMOTE_SLUG=\$\(.*\)$/;
  const found = new Map<string, string[]>();

  subdirs.forEach((subdir) => {
    const dir = path.join(rootDir, subdir);
    if (!fs.existsSync(dir)) return;

    const markdownFiles = fs.readdirSync(dir).filter((entry) => entry.endsWith('.md'));
    for (const file of markdownFiles) {
      const text = fs.readFileSync(path.join(dir, file), 'utf-8');
      const hits = text
        .split('\n')
        .map((rawLine) => rawLine.trim())
        .filter((trimmed) => assignment.test(trimmed));

      if (hits.length > 0) {
        found.set(`${subdir}/${file}`, hits);
      }
    }
  });

  return found;
}
169
+
170
/**
 * Read the percentage table that follows the first "### Weights" occurrence.
 *
 * Rows look like `| Category | 15% |`. Scanning starts on the line after the
 * heading and stops at the next `#`, `##` or `### ` heading. A row whose first
 * cell is exactly `Category` is treated as the header and skipped; rows that
 * do not fit the pattern are ignored.
 *
 * @returns Map from category name to its percentage as a number (15 for `15%`);
 *          empty when the heading is absent.
 */
export function extractWeightsFromTable(content: string): Map<string, number> {
  const weights = new Map<string, number>();

  const headingAt = content.indexOf('### Weights');
  if (headingAt < 0) {
    return weights;
  }

  const rowPattern = /^\|\s*(\w[\w\s]*\w|\w+)\s*\|\s*(\d+)%\s*\|$/;
  // Drop the heading line itself; everything after it is a candidate row.
  const candidateLines = content.slice(headingAt).split('\n').slice(1);

  for (const rawLine of candidateLines) {
    const line = rawLine.trim();

    const isShallowerHeading = line.startsWith('#') && !line.startsWith('###');
    const isSiblingHeading = line.startsWith('### ');
    if (isShallowerHeading || isSiblingHeading) break;

    const row = rowPattern.exec(line);
    if (row === null) continue;

    const category = row[1].trim();
    const pct = parseInt(row[2], 10);
    if (category === 'Category' || isNaN(pct)) continue;

    weights.set(category, pct);
  }

  return weights;
}