@runchr/gstack-antigravity 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @runchr/gstack-antigravity might be problematic. Click here for more details.
- package/.agents/skills/gstack/.agents/skills/gstack/SKILL.md +651 -0
- package/.agents/skills/gstack/.agents/skills/gstack-autoplan/SKILL.md +678 -0
- package/.agents/skills/gstack/.agents/skills/gstack-benchmark/SKILL.md +482 -0
- package/.agents/skills/gstack/.agents/skills/gstack-browse/SKILL.md +511 -0
- package/.agents/skills/gstack/.agents/skills/gstack-canary/SKILL.md +486 -0
- package/.agents/skills/gstack/.agents/skills/gstack-careful/SKILL.md +50 -0
- package/.agents/skills/gstack/.agents/skills/gstack-cso/SKILL.md +607 -0
- package/.agents/skills/gstack/.agents/skills/gstack-design-consultation/SKILL.md +615 -0
- package/.agents/skills/gstack/.agents/skills/gstack-design-review/SKILL.md +988 -0
- package/.agents/skills/gstack/.agents/skills/gstack-document-release/SKILL.md +604 -0
- package/.agents/skills/gstack/.agents/skills/gstack-freeze/SKILL.md +67 -0
- package/.agents/skills/gstack/.agents/skills/gstack-guard/SKILL.md +62 -0
- package/.agents/skills/gstack/.agents/skills/gstack-investigate/SKILL.md +415 -0
- package/.agents/skills/gstack/.agents/skills/gstack-land-and-deploy/SKILL.md +873 -0
- package/.agents/skills/gstack/.agents/skills/gstack-office-hours/SKILL.md +986 -0
- package/.agents/skills/gstack/.agents/skills/gstack-plan-ceo-review/SKILL.md +1268 -0
- package/.agents/skills/gstack/.agents/skills/gstack-plan-design-review/SKILL.md +668 -0
- package/.agents/skills/gstack/.agents/skills/gstack-plan-eng-review/SKILL.md +826 -0
- package/.agents/skills/gstack/.agents/skills/gstack-qa/SKILL.md +1006 -0
- package/.agents/skills/gstack/.agents/skills/gstack-qa-only/SKILL.md +626 -0
- package/.agents/skills/gstack/.agents/skills/gstack-retro/SKILL.md +1065 -0
- package/.agents/skills/gstack/.agents/skills/gstack-review/SKILL.md +704 -0
- package/.agents/skills/gstack/.agents/skills/gstack-setup-browser-cookies/SKILL.md +325 -0
- package/.agents/skills/gstack/.agents/skills/gstack-setup-deploy/SKILL.md +450 -0
- package/.agents/skills/gstack/.agents/skills/gstack-ship/SKILL.md +1312 -0
- package/.agents/skills/gstack/.agents/skills/gstack-unfreeze/SKILL.md +36 -0
- package/.agents/skills/gstack/.agents/skills/gstack-upgrade/SKILL.md +220 -0
- package/.agents/skills/gstack/.env.example +5 -0
- package/.agents/skills/gstack/.github/workflows/skill-docs.yml +17 -0
- package/.agents/skills/gstack/AGENTS.md +49 -0
- package/.agents/skills/gstack/ARCHITECTURE.md +359 -0
- package/.agents/skills/gstack/BROWSER.md +271 -0
- package/.agents/skills/gstack/CHANGELOG.md +800 -0
- package/.agents/skills/gstack/CLAUDE.md +284 -0
- package/.agents/skills/gstack/CONTRIBUTING.md +370 -0
- package/.agents/skills/gstack/ETHOS.md +129 -0
- package/.agents/skills/gstack/LICENSE +21 -0
- package/.agents/skills/gstack/README.md +228 -0
- package/.agents/skills/gstack/SKILL.md +657 -0
- package/.agents/skills/gstack/SKILL.md.tmpl +281 -0
- package/.agents/skills/gstack/TODOS.md +564 -0
- package/.agents/skills/gstack/VERSION +1 -0
- package/.agents/skills/gstack/autoplan/SKILL.md +689 -0
- package/.agents/skills/gstack/autoplan/SKILL.md.tmpl +416 -0
- package/.agents/skills/gstack/benchmark/SKILL.md +489 -0
- package/.agents/skills/gstack/benchmark/SKILL.md.tmpl +233 -0
- package/.agents/skills/gstack/bin/dev-setup +68 -0
- package/.agents/skills/gstack/bin/dev-teardown +56 -0
- package/.agents/skills/gstack/bin/gstack-analytics +191 -0
- package/.agents/skills/gstack/bin/gstack-community-dashboard +113 -0
- package/.agents/skills/gstack/bin/gstack-config +38 -0
- package/.agents/skills/gstack/bin/gstack-diff-scope +71 -0
- package/.agents/skills/gstack/bin/gstack-global-discover.ts +591 -0
- package/.agents/skills/gstack/bin/gstack-repo-mode +93 -0
- package/.agents/skills/gstack/bin/gstack-review-log +9 -0
- package/.agents/skills/gstack/bin/gstack-review-read +12 -0
- package/.agents/skills/gstack/bin/gstack-slug +15 -0
- package/.agents/skills/gstack/bin/gstack-telemetry-log +158 -0
- package/.agents/skills/gstack/bin/gstack-telemetry-sync +127 -0
- package/.agents/skills/gstack/bin/gstack-update-check +196 -0
- package/.agents/skills/gstack/browse/SKILL.md +517 -0
- package/.agents/skills/gstack/browse/SKILL.md.tmpl +141 -0
- package/.agents/skills/gstack/browse/bin/find-browse +21 -0
- package/.agents/skills/gstack/browse/bin/remote-slug +14 -0
- package/.agents/skills/gstack/browse/scripts/build-node-server.sh +48 -0
- package/.agents/skills/gstack/browse/src/browser-manager.ts +634 -0
- package/.agents/skills/gstack/browse/src/buffers.ts +137 -0
- package/.agents/skills/gstack/browse/src/bun-polyfill.cjs +109 -0
- package/.agents/skills/gstack/browse/src/cli.ts +420 -0
- package/.agents/skills/gstack/browse/src/commands.ts +111 -0
- package/.agents/skills/gstack/browse/src/config.ts +150 -0
- package/.agents/skills/gstack/browse/src/cookie-import-browser.ts +417 -0
- package/.agents/skills/gstack/browse/src/cookie-picker-routes.ts +207 -0
- package/.agents/skills/gstack/browse/src/cookie-picker-ui.ts +541 -0
- package/.agents/skills/gstack/browse/src/find-browse.ts +61 -0
- package/.agents/skills/gstack/browse/src/meta-commands.ts +269 -0
- package/.agents/skills/gstack/browse/src/platform.ts +17 -0
- package/.agents/skills/gstack/browse/src/read-commands.ts +335 -0
- package/.agents/skills/gstack/browse/src/server.ts +369 -0
- package/.agents/skills/gstack/browse/src/snapshot.ts +398 -0
- package/.agents/skills/gstack/browse/src/url-validation.ts +91 -0
- package/.agents/skills/gstack/browse/src/write-commands.ts +352 -0
- package/.agents/skills/gstack/browse/test/bun-polyfill.test.ts +72 -0
- package/.agents/skills/gstack/browse/test/commands.test.ts +1836 -0
- package/.agents/skills/gstack/browse/test/config.test.ts +250 -0
- package/.agents/skills/gstack/browse/test/cookie-import-browser.test.ts +397 -0
- package/.agents/skills/gstack/browse/test/cookie-picker-routes.test.ts +205 -0
- package/.agents/skills/gstack/browse/test/find-browse.test.ts +50 -0
- package/.agents/skills/gstack/browse/test/fixtures/basic.html +33 -0
- package/.agents/skills/gstack/browse/test/fixtures/cursor-interactive.html +22 -0
- package/.agents/skills/gstack/browse/test/fixtures/dialog.html +15 -0
- package/.agents/skills/gstack/browse/test/fixtures/empty.html +2 -0
- package/.agents/skills/gstack/browse/test/fixtures/forms.html +55 -0
- package/.agents/skills/gstack/browse/test/fixtures/qa-eval-checkout.html +108 -0
- package/.agents/skills/gstack/browse/test/fixtures/qa-eval-spa.html +98 -0
- package/.agents/skills/gstack/browse/test/fixtures/qa-eval.html +51 -0
- package/.agents/skills/gstack/browse/test/fixtures/responsive.html +49 -0
- package/.agents/skills/gstack/browse/test/fixtures/snapshot.html +55 -0
- package/.agents/skills/gstack/browse/test/fixtures/spa.html +24 -0
- package/.agents/skills/gstack/browse/test/fixtures/states.html +17 -0
- package/.agents/skills/gstack/browse/test/fixtures/upload.html +25 -0
- package/.agents/skills/gstack/browse/test/gstack-config.test.ts +125 -0
- package/.agents/skills/gstack/browse/test/gstack-update-check.test.ts +467 -0
- package/.agents/skills/gstack/browse/test/handoff.test.ts +235 -0
- package/.agents/skills/gstack/browse/test/path-validation.test.ts +63 -0
- package/.agents/skills/gstack/browse/test/platform.test.ts +37 -0
- package/.agents/skills/gstack/browse/test/snapshot.test.ts +467 -0
- package/.agents/skills/gstack/browse/test/test-server.ts +57 -0
- package/.agents/skills/gstack/browse/test/url-validation.test.ts +72 -0
- package/.agents/skills/gstack/canary/SKILL.md +493 -0
- package/.agents/skills/gstack/canary/SKILL.md.tmpl +220 -0
- package/.agents/skills/gstack/careful/SKILL.md +59 -0
- package/.agents/skills/gstack/careful/SKILL.md.tmpl +57 -0
- package/.agents/skills/gstack/careful/bin/check-careful.sh +112 -0
- package/.agents/skills/gstack/codex/SKILL.md +677 -0
- package/.agents/skills/gstack/codex/SKILL.md.tmpl +356 -0
- package/.agents/skills/gstack/conductor.json +6 -0
- package/.agents/skills/gstack/cso/SKILL.md +615 -0
- package/.agents/skills/gstack/cso/SKILL.md.tmpl +376 -0
- package/.agents/skills/gstack/design-consultation/SKILL.md +625 -0
- package/.agents/skills/gstack/design-consultation/SKILL.md.tmpl +369 -0
- package/.agents/skills/gstack/design-review/SKILL.md +998 -0
- package/.agents/skills/gstack/design-review/SKILL.md.tmpl +262 -0
- package/.agents/skills/gstack/docs/images/github-2013.png +0 -0
- package/.agents/skills/gstack/docs/images/github-2026.png +0 -0
- package/.agents/skills/gstack/docs/skills.md +877 -0
- package/.agents/skills/gstack/document-release/SKILL.md +613 -0
- package/.agents/skills/gstack/document-release/SKILL.md.tmpl +357 -0
- package/.agents/skills/gstack/freeze/SKILL.md +82 -0
- package/.agents/skills/gstack/freeze/SKILL.md.tmpl +80 -0
- package/.agents/skills/gstack/freeze/bin/check-freeze.sh +68 -0
- package/.agents/skills/gstack/gstack-upgrade/SKILL.md +226 -0
- package/.agents/skills/gstack/gstack-upgrade/SKILL.md.tmpl +224 -0
- package/.agents/skills/gstack/guard/SKILL.md +82 -0
- package/.agents/skills/gstack/guard/SKILL.md.tmpl +80 -0
- package/.agents/skills/gstack/investigate/SKILL.md +435 -0
- package/.agents/skills/gstack/investigate/SKILL.md.tmpl +196 -0
- package/.agents/skills/gstack/land-and-deploy/SKILL.md +880 -0
- package/.agents/skills/gstack/land-and-deploy/SKILL.md.tmpl +575 -0
- package/.agents/skills/gstack/office-hours/SKILL.md +996 -0
- package/.agents/skills/gstack/office-hours/SKILL.md.tmpl +624 -0
- package/.agents/skills/gstack/package.json +55 -0
- package/.agents/skills/gstack/plan-ceo-review/SKILL.md +1277 -0
- package/.agents/skills/gstack/plan-ceo-review/SKILL.md.tmpl +838 -0
- package/.agents/skills/gstack/plan-design-review/SKILL.md +676 -0
- package/.agents/skills/gstack/plan-design-review/SKILL.md.tmpl +314 -0
- package/.agents/skills/gstack/plan-eng-review/SKILL.md +836 -0
- package/.agents/skills/gstack/plan-eng-review/SKILL.md.tmpl +279 -0
- package/.agents/skills/gstack/qa/SKILL.md +1016 -0
- package/.agents/skills/gstack/qa/SKILL.md.tmpl +316 -0
- package/.agents/skills/gstack/qa/references/issue-taxonomy.md +85 -0
- package/.agents/skills/gstack/qa/templates/qa-report-template.md +126 -0
- package/.agents/skills/gstack/qa-only/SKILL.md +633 -0
- package/.agents/skills/gstack/qa-only/SKILL.md.tmpl +101 -0
- package/.agents/skills/gstack/retro/SKILL.md +1072 -0
- package/.agents/skills/gstack/retro/SKILL.md.tmpl +833 -0
- package/.agents/skills/gstack/review/SKILL.md +849 -0
- package/.agents/skills/gstack/review/SKILL.md.tmpl +259 -0
- package/.agents/skills/gstack/review/TODOS-format.md +62 -0
- package/.agents/skills/gstack/review/checklist.md +190 -0
- package/.agents/skills/gstack/review/design-checklist.md +132 -0
- package/.agents/skills/gstack/review/greptile-triage.md +220 -0
- package/.agents/skills/gstack/scripts/analytics.ts +190 -0
- package/.agents/skills/gstack/scripts/dev-skill.ts +82 -0
- package/.agents/skills/gstack/scripts/eval-compare.ts +96 -0
- package/.agents/skills/gstack/scripts/eval-list.ts +116 -0
- package/.agents/skills/gstack/scripts/eval-select.ts +86 -0
- package/.agents/skills/gstack/scripts/eval-summary.ts +187 -0
- package/.agents/skills/gstack/scripts/eval-watch.ts +172 -0
- package/.agents/skills/gstack/scripts/gen-skill-docs.ts +2414 -0
- package/.agents/skills/gstack/scripts/skill-check.ts +167 -0
- package/.agents/skills/gstack/setup +269 -0
- package/.agents/skills/gstack/setup-browser-cookies/SKILL.md +330 -0
- package/.agents/skills/gstack/setup-browser-cookies/SKILL.md.tmpl +74 -0
- package/.agents/skills/gstack/setup-deploy/SKILL.md +459 -0
- package/.agents/skills/gstack/setup-deploy/SKILL.md.tmpl +220 -0
- package/.agents/skills/gstack/ship/SKILL.md +1457 -0
- package/.agents/skills/gstack/ship/SKILL.md.tmpl +528 -0
- package/.agents/skills/gstack/supabase/config.sh +10 -0
- package/.agents/skills/gstack/supabase/functions/community-pulse/index.ts +59 -0
- package/.agents/skills/gstack/supabase/functions/telemetry-ingest/index.ts +135 -0
- package/.agents/skills/gstack/supabase/functions/update-check/index.ts +37 -0
- package/.agents/skills/gstack/supabase/migrations/001_telemetry.sql +89 -0
- package/.agents/skills/gstack/test/analytics.test.ts +277 -0
- package/.agents/skills/gstack/test/codex-e2e.test.ts +197 -0
- package/.agents/skills/gstack/test/fixtures/coverage-audit-fixture.ts +76 -0
- package/.agents/skills/gstack/test/fixtures/eval-baselines.json +7 -0
- package/.agents/skills/gstack/test/fixtures/qa-eval-checkout-ground-truth.json +43 -0
- package/.agents/skills/gstack/test/fixtures/qa-eval-ground-truth.json +43 -0
- package/.agents/skills/gstack/test/fixtures/qa-eval-spa-ground-truth.json +43 -0
- package/.agents/skills/gstack/test/fixtures/review-eval-design-slop.css +86 -0
- package/.agents/skills/gstack/test/fixtures/review-eval-design-slop.html +41 -0
- package/.agents/skills/gstack/test/fixtures/review-eval-enum-diff.rb +30 -0
- package/.agents/skills/gstack/test/fixtures/review-eval-enum.rb +27 -0
- package/.agents/skills/gstack/test/fixtures/review-eval-vuln.rb +14 -0
- package/.agents/skills/gstack/test/gemini-e2e.test.ts +173 -0
- package/.agents/skills/gstack/test/gen-skill-docs.test.ts +1049 -0
- package/.agents/skills/gstack/test/global-discover.test.ts +187 -0
- package/.agents/skills/gstack/test/helpers/codex-session-runner.ts +282 -0
- package/.agents/skills/gstack/test/helpers/e2e-helpers.ts +239 -0
- package/.agents/skills/gstack/test/helpers/eval-store.test.ts +548 -0
- package/.agents/skills/gstack/test/helpers/eval-store.ts +689 -0
- package/.agents/skills/gstack/test/helpers/gemini-session-runner.test.ts +104 -0
- package/.agents/skills/gstack/test/helpers/gemini-session-runner.ts +201 -0
- package/.agents/skills/gstack/test/helpers/llm-judge.ts +130 -0
- package/.agents/skills/gstack/test/helpers/observability.test.ts +283 -0
- package/.agents/skills/gstack/test/helpers/session-runner.test.ts +96 -0
- package/.agents/skills/gstack/test/helpers/session-runner.ts +357 -0
- package/.agents/skills/gstack/test/helpers/skill-parser.ts +206 -0
- package/.agents/skills/gstack/test/helpers/touchfiles.ts +260 -0
- package/.agents/skills/gstack/test/hook-scripts.test.ts +373 -0
- package/.agents/skills/gstack/test/skill-e2e-browse.test.ts +293 -0
- package/.agents/skills/gstack/test/skill-e2e-deploy.test.ts +279 -0
- package/.agents/skills/gstack/test/skill-e2e-design.test.ts +614 -0
- package/.agents/skills/gstack/test/skill-e2e-plan.test.ts +538 -0
- package/.agents/skills/gstack/test/skill-e2e-qa-bugs.test.ts +194 -0
- package/.agents/skills/gstack/test/skill-e2e-qa-workflow.test.ts +412 -0
- package/.agents/skills/gstack/test/skill-e2e-review.test.ts +535 -0
- package/.agents/skills/gstack/test/skill-e2e-workflow.test.ts +586 -0
- package/.agents/skills/gstack/test/skill-e2e.test.ts +3325 -0
- package/.agents/skills/gstack/test/skill-llm-eval.test.ts +787 -0
- package/.agents/skills/gstack/test/skill-parser.test.ts +179 -0
- package/.agents/skills/gstack/test/skill-routing-e2e.test.ts +605 -0
- package/.agents/skills/gstack/test/skill-validation.test.ts +1520 -0
- package/.agents/skills/gstack/test/telemetry.test.ts +278 -0
- package/.agents/skills/gstack/test/touchfiles.test.ts +262 -0
- package/.agents/skills/gstack/unfreeze/SKILL.md +40 -0
- package/.agents/skills/gstack/unfreeze/SKILL.md.tmpl +38 -0
- package/README.md +12 -7
- package/README_KO.md +12 -6
- package/package.json +3 -2
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
|
2
|
+
import { runSkillTest } from './helpers/session-runner';
|
|
3
|
+
import {
|
|
4
|
+
ROOT, browseBin, runId, evalsEnabled,
|
|
5
|
+
describeIfSelected, testConcurrentIfSelected,
|
|
6
|
+
copyDirSync, setupBrowseShims, logCost, recordE2E,
|
|
7
|
+
createEvalCollector, finalizeEvalCollector,
|
|
8
|
+
} from './helpers/e2e-helpers';
|
|
9
|
+
import { startTestServer } from '../browse/test/test-server';
|
|
10
|
+
import { spawnSync } from 'child_process';
|
|
11
|
+
import * as fs from 'fs';
|
|
12
|
+
import * as path from 'path';
|
|
13
|
+
import * as os from 'os';
|
|
14
|
+
|
|
15
|
+
// Collector for this file's E2E results; it is passed to recordE2E by each
// test and finalized in the module-level afterAll at the bottom of the file.
const evalCollector = createEvalCollector('e2e-browse');

// Shared fixtures: assigned in the suite's beforeAll, released in its afterAll.
let testServer: ReturnType<typeof startTestServer>;
let tmpDir: string;
|
|
19
|
+
|
|
20
|
+
describeIfSelected('Skill E2E tests', [
|
|
21
|
+
'browse-basic', 'browse-snapshot', 'skillmd-setup-discovery',
|
|
22
|
+
'skillmd-no-local-binary', 'skillmd-outside-git', 'session-awareness',
|
|
23
|
+
], () => {
|
|
24
|
+
// Suite setup: start the local test server, create a fresh scratch directory
// under the OS temp dir, and hand that directory to setupBrowseShims.
beforeAll(() => {
  testServer = startTestServer();

  const scratchPrefix = path.join(os.tmpdir(), 'skill-e2e-');
  tmpDir = fs.mkdtempSync(scratchPrefix);

  setupBrowseShims(tmpDir);
});
|
|
29
|
+
|
|
30
|
+
// Suite teardown: stop the test server when one exists, then remove the
// scratch directory. Removal is best-effort so a cleanup error never fails the run.
afterAll(() => {
  testServer?.server?.stop();

  try {
    fs.rmSync(tmpDir, { force: true, recursive: true });
  } catch {
    // Intentionally ignored: a leftover temp directory is harmless.
  }
});
|
|
34
|
+
|
|
35
|
+
// Asks the agent to run goto / snapshot / text / screenshot against the local
// test server with the browse binary, and requires a successful run that
// reported no browse errors.
testConcurrentIfSelected('browse-basic', async () => {
  const prompt = `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
1. $B goto ${testServer.url}
2. $B snapshot -i
3. $B text
4. $B screenshot /tmp/skill-e2e-test.png
Report the results of each command.`;

  const outcome = await runSkillTest({
    prompt,
    workingDirectory: tmpDir,
    maxTurns: 10,
    timeout: 60_000,
    testName: 'browse-basic',
    runId,
  });

  // Log cost and record the run before asserting, so a failing assertion
  // still leaves a recorded result.
  logCost('browse basic', outcome);
  recordE2E(evalCollector, 'browse basic commands', 'Skill E2E tests', outcome);

  const { browseErrors, exitReason } = outcome;
  expect(browseErrors).toHaveLength(0);
  expect(exitReason).toBe('success');
}, 90_000);
|
|
55
|
+
|
|
56
|
+
// Asks the agent to exercise the snapshot flags (-i, -c, -D, -i -a -o) against
// the local test server. Only the exit reason is asserted; browse errors are
// reported as warnings.
testConcurrentIfSelected('browse-snapshot', async () => {
  const prompt = `You have a browse binary at ${browseBin}. Assign it to B variable and run:
1. $B goto ${testServer.url}
2. $B snapshot -i
3. $B snapshot -c
4. $B snapshot -D
5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
Report what each command returned.`;

  const outcome = await runSkillTest({
    prompt,
    workingDirectory: tmpDir,
    maxTurns: 10,
    timeout: 60_000,
    testName: 'browse-snapshot',
    runId,
  });

  logCost('browse snapshot', outcome);
  recordE2E(evalCollector, 'browse snapshot flags', 'Skill E2E tests', outcome);

  // browseErrors can include false positives from hallucinated paths
  // (e.g. "baltimore" vs "bangalore"), so they are surfaced but not fatal.
  const sawBrowseErrors = outcome.browseErrors.length > 0;
  if (sawBrowseErrors) {
    console.warn('Browse errors (non-fatal):', outcome.browseErrors);
  }

  expect(outcome.exitReason).toBe('success');
}, 90_000);
|
|
80
|
+
|
|
81
|
+
// Feeds the SETUP section of SKILL.md to the agent and checks that following
// it lets the agent locate the browse binary and run two basic commands.
testConcurrentIfSelected('skillmd-setup-discovery', async () => {
  const skillDoc = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');

  // The setup section is the text from the "## SETUP" heading up to
  // (not including) the "## IMPORTANT" heading.
  const sectionFrom = skillDoc.indexOf('## SETUP');
  const sectionTo = skillDoc.indexOf('## IMPORTANT');
  const setupBlock = skillDoc.slice(sectionFrom, sectionTo);

  // Guard: verify we extracted a valid setup block
  expect(setupBlock).toContain('browse/dist/browse');

  const prompt = `Follow these instructions to find the browse binary and run a basic command.

${setupBlock}

After finding the binary, run: $B goto ${testServer.url}
Then run: $B text
Report whether it worked.`;

  const outcome = await runSkillTest({
    prompt,
    workingDirectory: tmpDir,
    maxTurns: 10,
    timeout: 60_000,
    testName: 'skillmd-setup-discovery',
    runId,
  });

  recordE2E(evalCollector, 'SKILL.md setup block discovery', 'Skill E2E tests', outcome);

  const { browseErrors, exitReason } = outcome;
  expect(browseErrors).toHaveLength(0);
  expect(exitReason).toBe('success');
}, 90_000);
|
|
109
|
+
|
|
110
|
+
// Runs the SKILL.md SETUP block from a directory that contains no local browse
// binary and checks that the agent reports either READY or NEEDS_SETUP.
testConcurrentIfSelected('skillmd-no-local-binary', async () => {
  // Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse
  const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));

  // try/finally so the temp directory is removed even when runSkillTest
  // throws or one of the expectations below fails.
  try {
    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const setupStart = skillMd.indexOf('## SETUP');
    const setupEnd = skillMd.indexOf('## IMPORTANT');
    const setupBlock = skillMd.slice(setupStart, setupEnd);

    const result = await runSkillTest({
      prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.

${setupBlock}

Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
      workingDirectory: emptyDir,
      maxTurns: 5,
      timeout: 30_000,
      testName: 'skillmd-no-local-binary',
      runId,
    });

    // Setup block should either find the global binary (READY) or show NEEDS_SETUP.
    // On dev machines with gstack installed globally, the fallback path
    // ~/.claude/skills/gstack/browse/dist/browse exists, so we get READY.
    // The important thing is it doesn't crash or give a confusing error.
    const allText = result.output || '';
    recordE2E(evalCollector, 'SKILL.md setup block (no local binary)', 'Skill E2E tests', result);
    expect(allText).toMatch(/READY|NEEDS_SETUP/);
    expect(result.exitReason).toBe('success');
  } finally {
    // Clean up (best-effort: a cleanup failure must not mask the test result)
    try {
      fs.rmSync(emptyDir, { recursive: true, force: true });
    } catch {
      // Intentionally ignored.
    }
  }
}, 60_000);
|
|
144
|
+
|
|
145
|
+
// Runs the SKILL.md SETUP block from a fresh temp directory that is not a git
// repository and checks that the agent reports either READY or NEEDS_SETUP.
testConcurrentIfSelected('skillmd-outside-git', async () => {
  // Create a tmpdir outside any git repo
  const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));

  // try/finally so the temp directory is removed even when runSkillTest
  // throws or the expectation below fails.
  try {
    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const setupStart = skillMd.indexOf('## SETUP');
    const setupEnd = skillMd.indexOf('## IMPORTANT');
    const setupBlock = skillMd.slice(setupStart, setupEnd);

    const result = await runSkillTest({
      prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.

${setupBlock}

Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
      workingDirectory: nonGitDir,
      maxTurns: 5,
      timeout: 30_000,
      testName: 'skillmd-outside-git',
      runId,
    });

    // Should either find global binary (READY) or show NEEDS_SETUP — not crash
    const allText = result.output || '';
    recordE2E(evalCollector, 'SKILL.md outside git repo', 'Skill E2E tests', result);
    expect(allText).toMatch(/READY|NEEDS_SETUP/);
  } finally {
    // Clean up (best-effort: a cleanup failure must not mask the test result)
    try {
      fs.rmSync(nonGitDir, { recursive: true, force: true });
    } catch {
      // Intentionally ignored.
    }
  }
}, 60_000);
|
|
175
|
+
|
|
176
|
+
// Tells the agent a browse command failed while in contributor mode and checks
// that it writes a field report (markdown) with the expected structural parts.
// NOTE(review): 'contributor-mode' does not appear in the selection list passed
// to the enclosing describeIfSelected call — confirm whether that is intentional.
testConcurrentIfSelected('contributor-mode', async () => {
  const contribDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-contrib-'));

  // try/finally so the temp directory is removed even when runSkillTest
  // throws or one of the expectations below fails.
  try {
    const logsDir = path.join(contribDir, 'contributor-logs');
    fs.mkdirSync(logsDir, { recursive: true });

    const result = await runSkillTest({
      prompt: `You are in contributor mode (gstack_contributor=true). You just ran this browse command and it failed:

$ /nonexistent/browse goto https://example.com
/nonexistent/browse: No such file or directory

Per the contributor mode instructions, file a field report to ${logsDir}/browse-missing-binary.md using the Write tool. Include all required sections: title, what you tried, what happened, rating, repro steps, raw output, what would make it a 10, and the date/version footer.`,
      workingDirectory: contribDir,
      maxTurns: 5,
      timeout: 30_000,
      testName: 'contributor-mode',
      runId,
    });

    logCost('contributor mode', result);
    // Override passed: this test intentionally triggers a browse error (nonexistent binary)
    // so browseErrors will be non-empty — that's expected, not a failure
    recordE2E(evalCollector, 'contributor mode report', 'Skill E2E tests', result, {
      passed: result.exitReason === 'success',
    });

    // Verify a contributor log was created with expected format
    const logFiles = fs.readdirSync(logsDir).filter(f => f.endsWith('.md'));
    expect(logFiles.length).toBeGreaterThan(0);

    // Verify report has key structural sections (agent may phrase differently)
    const logContent = fs.readFileSync(path.join(logsDir, logFiles[0]), 'utf-8');
    // Must have a title (# heading)
    expect(logContent).toMatch(/^#\s/m);
    // Must mention the failed command or browse
    expect(logContent).toMatch(/browse|nonexistent|not found|no such file/i);
    // Must have some kind of rating
    expect(logContent).toMatch(/rating|\/10/i);
    // Must have steps or reproduction info
    expect(logContent).toMatch(/step|repro|reproduce/i);
  } finally {
    // Clean up (best-effort: a cleanup failure must not mask the test result)
    try {
      fs.rmSync(contribDir, { recursive: true, force: true });
    } catch {
      // Intentionally ignored.
    }
  }
}, 90_000);
|
|
220
|
+
|
|
221
|
+
// Builds a throwaway git repo, gives the agent the AskUserQuestion section of
// SKILL.md plus a multi-session scenario, and checks that the question text it
// writes re-grounds the user (project, branch, topic) and has a RECOMMENDATION.
testConcurrentIfSelected('session-awareness', async () => {
  const sessionDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-session-'));

  // try/finally so the temp directory is removed even when git setup,
  // runSkillTest, or one of the expectations below throws.
  try {
    // Set up a git repo so there's project/branch context to reference
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: sessionDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(sessionDir, 'app.rb'), '# my app\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'init']);
    run('git', ['checkout', '-b', 'feature/add-payments']);
    // Add a remote so the agent can derive a project name
    run('git', ['remote', 'add', 'origin', 'https://github.com/acme/billing-app.git']);

    // Extract AskUserQuestion format instructions from generated SKILL.md:
    // from the section heading up to the next "## " heading, or to the end of
    // the file when no later heading exists.
    // NOTE(review): a missing heading (indexOf === -1) is not guarded here —
    // confirm SKILL.md always contains this section.
    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const aqStart = skillMd.indexOf('## AskUserQuestion Format');
    const aqEnd = skillMd.indexOf('\n## ', aqStart + 1);
    const aqBlock = skillMd.slice(aqStart, aqEnd > 0 ? aqEnd : undefined);

    const outputPath = path.join(sessionDir, 'question-output.md');

    const result = await runSkillTest({
      prompt: `You are running a gstack skill. The session preamble detected _SESSIONS=4 (the user has 4 gstack windows open).

${aqBlock}

You are on branch feature/add-payments in the billing-app project. You were reviewing a plan to add Stripe integration.

You've hit a decision point: the plan doesn't specify whether to use Stripe Checkout (hosted) or Stripe Elements (embedded). You need to ask the user which approach to use.

Since this is non-interactive, DO NOT actually call AskUserQuestion. Instead, write the EXACT text you would display to the user (the full AskUserQuestion content) to the file: ${outputPath}

Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple windows and may not remember what this conversation is about. Re-ground them.`,
      workingDirectory: sessionDir,
      maxTurns: 8,
      timeout: 60_000,
      testName: 'session-awareness',
      runId,
    });

    logCost('session awareness', result);
    recordE2E(evalCollector, 'session awareness ELI16', 'Skill E2E tests', result);

    // Verify the output contains ELI16 re-grounding context
    if (fs.existsSync(outputPath)) {
      const output = fs.readFileSync(outputPath, 'utf-8');
      const lower = output.toLowerCase();
      // Must mention project name
      expect(lower.includes('billing') || lower.includes('acme')).toBe(true);
      // Must mention branch
      expect(lower.includes('payment') || lower.includes('feature')).toBe(true);
      // Must mention what we're working on
      expect(lower.includes('stripe') || lower.includes('checkout') || lower.includes('payment')).toBe(true);
      // Must have a RECOMMENDATION
      expect(output).toContain('RECOMMENDATION');
    } else {
      // Check agent output as fallback
      const output = result.output || '';
      expect(output).toContain('RECOMMENDATION');
    }
  } finally {
    // Clean up (best-effort: a cleanup failure must not mask the test result)
    try {
      fs.rmSync(sessionDir, { recursive: true, force: true });
    } catch {
      // Intentionally ignored.
    }
  }
}, 90_000);
|
|
288
|
+
});
|
|
289
|
+
|
|
290
|
+
// Module-level afterAll — runs once every test in this file has completed and
// finalizes the eval collector that the tests recorded into.
afterAll(async function finalizeBrowseEvals() {
  await finalizeEvalCollector(evalCollector);
});
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
|
2
|
+
import { runSkillTest } from './helpers/session-runner';
|
|
3
|
+
import {
|
|
4
|
+
ROOT, browseBin, runId, evalsEnabled,
|
|
5
|
+
describeIfSelected, testConcurrentIfSelected,
|
|
6
|
+
copyDirSync, setupBrowseShims, logCost, recordE2E,
|
|
7
|
+
createEvalCollector, finalizeEvalCollector,
|
|
8
|
+
} from './helpers/e2e-helpers';
|
|
9
|
+
import { spawnSync } from 'child_process';
|
|
10
|
+
import * as fs from 'fs';
|
|
11
|
+
import * as path from 'path';
|
|
12
|
+
import * as os from 'os';
|
|
13
|
+
|
|
14
|
+
// Collector for this file's E2E results, created under the 'e2e-deploy' name.
const evalCollector = createEvalCollector('e2e-deploy');
|
|
15
|
+
|
|
16
|
+
// --- Land-and-Deploy E2E ---
|
|
17
|
+
|
|
18
|
+
describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], () => {
|
|
19
|
+
let landDir: string;
|
|
20
|
+
|
|
21
|
+
// Suite setup: build a throwaway git repo with a Fly.io config on `main`, add
// one commit on a feature branch, then copy the land-and-deploy skill into it.
beforeAll(() => {
  landDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-deploy-'));

  // Runs one git command inside the temp repo; the result is not inspected.
  const git = (...gitArgs: string[]) =>
    spawnSync('git', gitArgs, { cwd: landDir, stdio: 'pipe', timeout: 5000 });
  const appFile = path.join(landDir, 'app.ts');
  const flyConfig = path.join(landDir, 'fly.toml');

  git('init', '-b', 'main');
  git('config', 'user.email', 'test@test.com');
  git('config', 'user.name', 'Test');

  // Initial commit on main: the app plus the fly.toml.
  fs.writeFileSync(appFile, 'export function hello() { return "world"; }\n');
  fs.writeFileSync(flyConfig, 'app = "test-app"\n\n[http_service]\n internal_port = 3000\n');
  git('add', '.');
  git('commit', '-m', 'initial');

  // Feature branch with one change against main.
  git('checkout', '-b', 'feat/add-deploy');
  fs.writeFileSync(appFile, 'export function hello() { return "deployed"; }\n');
  git('add', '.');
  git('commit', '-m', 'feat: update hello');

  const skillSource = path.join(ROOT, 'land-and-deploy');
  const skillTarget = path.join(landDir, 'land-and-deploy');
  copyDirSync(skillSource, skillTarget);
});
|
|
42
|
+
|
|
43
|
+
afterAll(() => {
|
|
44
|
+
try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
|
|
48
|
+
const result = await runSkillTest({
|
|
49
|
+
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
|
|
50
|
+
|
|
51
|
+
You are on branch feat/add-deploy with changes against main. This repo has a fly.toml
|
|
52
|
+
with app = "test-app", indicating a Fly.io deployment.
|
|
53
|
+
|
|
54
|
+
IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
|
|
55
|
+
Instead, simulate the workflow:
|
|
56
|
+
1. Detect the deploy platform from fly.toml (should find Fly.io, app = test-app)
|
|
57
|
+
2. Infer the production URL (https://test-app.fly.dev)
|
|
58
|
+
3. Note the merge method would be squash
|
|
59
|
+
4. Write the deploy configuration to CLAUDE.md
|
|
60
|
+
5. Write a deploy report skeleton to .gstack/deploy-reports/report.md showing the
|
|
61
|
+
expected report structure (PR number: simulated, timing: simulated, verdict: simulated)
|
|
62
|
+
|
|
63
|
+
Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
|
|
64
|
+
workingDirectory: landDir,
|
|
65
|
+
maxTurns: 20,
|
|
66
|
+
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
|
|
67
|
+
timeout: 120_000,
|
|
68
|
+
testName: 'land-and-deploy-workflow',
|
|
69
|
+
runId,
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
logCost('/land-and-deploy', result);
|
|
73
|
+
recordE2E(evalCollector, '/land-and-deploy workflow', 'Land-and-Deploy skill E2E', result);
|
|
74
|
+
expect(result.exitReason).toBe('success');
|
|
75
|
+
|
|
76
|
+
const claudeMd = path.join(landDir, 'CLAUDE.md');
|
|
77
|
+
if (fs.existsSync(claudeMd)) {
|
|
78
|
+
const content = fs.readFileSync(claudeMd, 'utf-8');
|
|
79
|
+
const hasFly = content.toLowerCase().includes('fly') || content.toLowerCase().includes('test-app');
|
|
80
|
+
expect(hasFly).toBe(true);
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const reportDir = path.join(landDir, '.gstack', 'deploy-reports');
|
|
84
|
+
expect(fs.existsSync(reportDir)).toBe(true);
|
|
85
|
+
}, 180_000);
|
|
86
|
+
});
|
|
87
|
+
|
|
88
|
+
// --- Canary skill E2E ---
|
|
89
|
+
|
|
90
|
+
/**
 * E2E suite for the /canary skill.
 *
 * Creates a temporary git repository with a single index.html, copies ROOT/canary
 * into it, and asks the agent to write simulated canary report files (the prompt
 * forbids browse commands).
 */
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
  let canaryDir: string;

  beforeAll(() => {
    canaryDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-canary-'));

    // Run a fixture command inside canaryDir and fail fast when it cannot be spawned,
    // times out, or exits non-zero, instead of continuing with a broken repository.
    const run = (cmd: string, args: string[]) => {
      const result = spawnSync(cmd, args, { cwd: canaryDir, stdio: 'pipe', timeout: 5000 });
      if (result.error || result.status !== 0) {
        const reason = result.error
          ? result.error.message
          : result.signal
            ? `signal ${result.signal}`
            : `exit code ${result.status}`;
        const stderr = result.stderr ? result.stderr.toString().trim() : '';
        throw new Error(
          `Fixture setup failed: ${cmd} ${args.join(' ')} (${reason})${stderr ? `\n${stderr}` : ''}`,
        );
      }
      return result;
    };

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(canaryDir, 'index.html'), '<h1>Hello</h1>\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    copyDirSync(path.join(ROOT, 'canary'), path.join(canaryDir, 'canary'));
  });

  afterAll(() => {
    // Best-effort cleanup; a leftover temp dir must not fail the suite.
    try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
  });

  test('/canary skill produces monitoring report structure', async () => {
    const result = await runSkillTest({
      prompt: `Read canary/SKILL.md for the /canary skill instructions.

You are simulating a canary check. There is NO browse daemon available and NO production URL.

Instead, demonstrate you understand the workflow:
1. Create the .gstack/canary-reports/ directory structure
2. Write a simulated baseline.json to .gstack/canary-reports/baseline.json with the
schema described in Phase 2 of the skill (url, timestamp, branch, pages with
screenshot path, console_errors count, and load_time_ms)
3. Write a simulated canary report to .gstack/canary-reports/canary-report.md following
the Phase 6 Health Report format (CANARY REPORT header, duration, pages, status,
per-page results table, verdict)

Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
Just create the directory structure and report files showing the correct schema.`,
      workingDirectory: canaryDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
      timeout: 120_000,
      testName: 'canary-workflow',
      runId,
    });

    logCost('/canary', result);
    recordE2E(evalCollector, '/canary workflow', 'Canary skill E2E', result);
    expect(result.exitReason).toBe('success');

    // The report directory must exist and contain at least one entry at any depth.
    const reportDir = path.join(canaryDir, '.gstack', 'canary-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
    const files = fs.readdirSync(reportDir, { recursive: true }) as string[];
    expect(files.length).toBeGreaterThan(0);
  }, 180_000);
});
|
|
148
|
+
|
|
149
|
+
// --- Benchmark skill E2E ---
|
|
150
|
+
|
|
151
|
+
/**
 * E2E suite for the /benchmark skill.
 *
 * Creates a temporary git repository with a single index.html, copies ROOT/benchmark
 * into it, and asks the agent to write simulated baseline and report files (the prompt
 * forbids browse commands).
 */
describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
  let benchDir: string;

  beforeAll(() => {
    benchDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benchmark-'));

    // Run a fixture command inside benchDir and fail fast when it cannot be spawned,
    // times out, or exits non-zero, instead of continuing with a broken repository.
    const run = (cmd: string, args: string[]) => {
      const result = spawnSync(cmd, args, { cwd: benchDir, stdio: 'pipe', timeout: 5000 });
      if (result.error || result.status !== 0) {
        const reason = result.error
          ? result.error.message
          : result.signal
            ? `signal ${result.signal}`
            : `exit code ${result.status}`;
        const stderr = result.stderr ? result.stderr.toString().trim() : '';
        throw new Error(
          `Fixture setup failed: ${cmd} ${args.join(' ')} (${reason})${stderr ? `\n${stderr}` : ''}`,
        );
      }
      return result;
    };

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(benchDir, 'index.html'), '<h1>Hello</h1>\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    copyDirSync(path.join(ROOT, 'benchmark'), path.join(benchDir, 'benchmark'));
  });

  afterAll(() => {
    // Best-effort cleanup; a leftover temp dir must not fail the suite.
    try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
  });

  test('/benchmark skill produces performance report structure', async () => {
    const result = await runSkillTest({
      prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.

You are simulating a benchmark run. There is NO browse daemon available and NO production URL.

Instead, demonstrate you understand the workflow:
1. Create the .gstack/benchmark-reports/ directory structure including baselines/
2. Write a simulated baseline.json to .gstack/benchmark-reports/baselines/baseline.json
with the schema from Phase 4 (url, timestamp, branch, pages with ttfb_ms, fcp_ms,
lcp_ms, dom_interactive_ms, dom_complete_ms, full_load_ms, total_requests,
total_transfer_bytes, js_bundle_bytes, css_bundle_bytes, largest_resources)
3. Write a simulated benchmark report to .gstack/benchmark-reports/benchmark-report.md
following the Phase 5 comparison format (PERFORMANCE REPORT header, page comparison
table with Baseline/Current/Delta/Status columns, regression thresholds applied)
4. Include the Phase 7 Performance Budget section in the report

Do NOT use AskUserQuestion. Do NOT run browse ($B) commands.
Just create the files showing the correct schema and report format.`,
      workingDirectory: benchDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob'],
      timeout: 120_000,
      testName: 'benchmark-workflow',
      runId,
    });

    logCost('/benchmark', result);
    recordE2E(evalCollector, '/benchmark workflow', 'Benchmark skill E2E', result);
    expect(result.exitReason).toBe('success');

    // The report directory is required.
    expect(fs.existsSync(path.join(benchDir, '.gstack', 'benchmark-reports'))).toBe(true);

    // baselines/ is only inspected when the agent created it; if present it must not be empty.
    const baselineDir = path.join(benchDir, '.gstack', 'benchmark-reports', 'baselines');
    if (fs.existsSync(baselineDir)) {
      const files = fs.readdirSync(baselineDir);
      expect(files.length).toBeGreaterThan(0);
    }
  }, 180_000);
});
|
|
213
|
+
|
|
214
|
+
// --- Setup-Deploy skill E2E ---
|
|
215
|
+
|
|
216
|
+
/**
 * E2E suite for the /setup-deploy skill.
 *
 * Creates a temporary git repository containing a fly.toml for "my-cool-app", copies
 * ROOT/setup-deploy into it, and asks the agent to detect the platform and write a
 * Deploy Configuration section to CLAUDE.md (the prompt forbids fly and gh commands).
 */
describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
  let setupDir: string;

  beforeAll(() => {
    setupDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-setup-deploy-'));

    // Run a fixture command inside setupDir and fail fast when it cannot be spawned,
    // times out, or exits non-zero, instead of continuing with a broken repository.
    const run = (cmd: string, args: string[]) => {
      const result = spawnSync(cmd, args, { cwd: setupDir, stdio: 'pipe', timeout: 5000 });
      if (result.error || result.status !== 0) {
        const reason = result.error
          ? result.error.message
          : result.signal
            ? `signal ${result.signal}`
            : `exit code ${result.status}`;
        const stderr = result.stderr ? result.stderr.toString().trim() : '';
        throw new Error(
          `Fixture setup failed: ${cmd} ${args.join(' ')} (${reason})${stderr ? `\n${stderr}` : ''}`,
        );
      }
      return result;
    };

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(setupDir, 'app.ts'), 'export default { port: 3000 };\n');
    fs.writeFileSync(path.join(setupDir, 'fly.toml'), 'app = "my-cool-app"\n\n[http_service]\n internal_port = 3000\n force_https = true\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    copyDirSync(path.join(ROOT, 'setup-deploy'), path.join(setupDir, 'setup-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup; a leftover temp dir must not fail the suite.
    try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
  });

  test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
    const result = await runSkillTest({
      prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.

This repo has a fly.toml with app = "my-cool-app". Run the /setup-deploy workflow:
1. Detect the platform from fly.toml (should be Fly.io)
2. Extract the app name: my-cool-app
3. Infer production URL: https://my-cool-app.fly.dev
4. Set deploy status command: fly status --app my-cool-app
5. Write the Deploy Configuration section to CLAUDE.md

Do NOT use AskUserQuestion. Do NOT run fly or gh commands.
Do NOT try to verify the health check URL (there is no network).
Just detect the platform and write the config.`,
      workingDirectory: setupDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'setup-deploy-workflow',
      runId,
    });

    logCost('/setup-deploy', result);
    recordE2E(evalCollector, '/setup-deploy workflow', 'Setup-Deploy skill E2E', result);
    expect(result.exitReason).toBe('success');

    // Unlike the land-and-deploy suite, CLAUDE.md is mandatory here.
    const claudeMd = path.join(setupDir, 'CLAUDE.md');
    expect(fs.existsSync(claudeMd)).toBe(true);

    const content = fs.readFileSync(claudeMd, 'utf-8');
    expect(content.toLowerCase()).toContain('fly');
    expect(content).toContain('my-cool-app');
    expect(content).toContain('Deploy Configuration');
  }, 180_000);
});
|
|
275
|
+
|
|
276
|
+
// Module-level afterAll — finalize eval collector after all tests complete
|
|
277
|
+
// Runs once after every suite in this module: hand the shared collector to
// finalizeEvalCollector and wait for it to settle before the hook returns.
afterAll(async function finalizeCollectedEvals() {
  await finalizeEvalCollector(evalCollector);
});
|