@runchr/gstack-antigravity 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/rules/ETHOS.md +129 -0
- package/.agents/rules/global-gstack.md +117 -0
- package/.agents/rules/persona-gstack-autoplan.md +14 -0
- package/.agents/rules/persona-gstack-benchmark.md +14 -0
- package/.agents/rules/persona-gstack-browse.md +14 -0
- package/.agents/rules/persona-gstack-canary.md +14 -0
- package/.agents/rules/persona-gstack-careful.md +14 -0
- package/.agents/rules/persona-gstack-codex.md +14 -0
- package/.agents/rules/persona-gstack-cso.md +14 -0
- package/.agents/rules/persona-gstack-design-consultation.md +14 -0
- package/.agents/rules/persona-gstack-design-review.md +14 -0
- package/.agents/rules/persona-gstack-document-release.md +14 -0
- package/.agents/rules/persona-gstack-freeze.md +14 -0
- package/.agents/rules/persona-gstack-gstack-upgrade.md +14 -0
- package/.agents/rules/persona-gstack-guard.md +14 -0
- package/.agents/rules/persona-gstack-investigate.md +14 -0
- package/.agents/rules/persona-gstack-land-and-deploy.md +14 -0
- package/.agents/rules/persona-gstack-office-hours.md +14 -0
- package/.agents/rules/persona-gstack-plan-ceo-review.md +14 -0
- package/.agents/rules/persona-gstack-plan-design-review.md +14 -0
- package/.agents/rules/persona-gstack-plan-eng-review.md +14 -0
- package/.agents/rules/persona-gstack-qa-only.md +14 -0
- package/.agents/rules/persona-gstack-qa.md +14 -0
- package/.agents/rules/persona-gstack-retro.md +14 -0
- package/.agents/rules/persona-gstack-review.md +14 -0
- package/.agents/rules/persona-gstack-setup-browser-cookies.md +14 -0
- package/.agents/rules/persona-gstack-setup-deploy.md +14 -0
- package/.agents/rules/persona-gstack-ship.md +14 -0
- package/.agents/rules/persona-gstack-unfreeze.md +14 -0
- package/.agents/rules/persona-gstack.md +40 -0
- package/.agents/rules/recursive-identities.md +22 -0
- package/.agents/workflows/autoplan.md +30 -0
- package/.agents/workflows/benchmark.md +31 -0
- package/.agents/workflows/browse.md +26 -0
- package/.agents/workflows/canary.md +33 -0
- package/.agents/workflows/careful.md +22 -0
- package/.agents/workflows/codex.md +36 -0
- package/.agents/workflows/cso.md +29 -0
- package/.agents/workflows/design-consultation.md +28 -0
- package/.agents/workflows/design-review.md +28 -0
- package/.agents/workflows/document-release.md +32 -0
- package/.agents/workflows/freeze.md +17 -0
- package/.agents/workflows/gstack-upgrade.md +54 -0
- package/.agents/workflows/gstack.md +56 -0
- package/.agents/workflows/guard.md +18 -0
- package/.agents/workflows/investigate.md +37 -0
- package/.agents/workflows/land-and-deploy.md +35 -0
- package/.agents/workflows/office-hours.md +27 -0
- package/.agents/workflows/plan-ceo-review.md +34 -0
- package/.agents/workflows/plan-design-review.md +31 -0
- package/.agents/workflows/plan-eng-review.md +28 -0
- package/.agents/workflows/qa-only.md +28 -0
- package/.agents/workflows/qa.md +73 -0
- package/.agents/workflows/retro.md +34 -0
- package/.agents/workflows/review.md +30 -0
- package/.agents/workflows/setup-browser-cookies.md +15 -0
- package/.agents/workflows/setup-cookies.md +8 -0
- package/.agents/workflows/setup-deploy.md +21 -0
- package/.agents/workflows/ship.md +93 -0
- package/.agents/workflows/unfreeze.md +12 -0
- package/LICENSE +22 -0
- package/README.md +189 -0
- package/README_KO.md +191 -0
- package/bin/install.js +105 -0
- package/gstack-origin/.agents/skills/gstack/SKILL.md +651 -0
- package/gstack-origin/.agents/skills/gstack-autoplan/SKILL.md +678 -0
- package/gstack-origin/.agents/skills/gstack-benchmark/SKILL.md +482 -0
- package/gstack-origin/.agents/skills/gstack-browse/SKILL.md +511 -0
- package/gstack-origin/.agents/skills/gstack-canary/SKILL.md +486 -0
- package/gstack-origin/.agents/skills/gstack-careful/SKILL.md +50 -0
- package/gstack-origin/.agents/skills/gstack-cso/SKILL.md +607 -0
- package/gstack-origin/.agents/skills/gstack-design-consultation/SKILL.md +615 -0
- package/gstack-origin/.agents/skills/gstack-design-review/SKILL.md +988 -0
- package/gstack-origin/.agents/skills/gstack-document-release/SKILL.md +604 -0
- package/gstack-origin/.agents/skills/gstack-freeze/SKILL.md +67 -0
- package/gstack-origin/.agents/skills/gstack-guard/SKILL.md +62 -0
- package/gstack-origin/.agents/skills/gstack-investigate/SKILL.md +415 -0
- package/gstack-origin/.agents/skills/gstack-land-and-deploy/SKILL.md +873 -0
- package/gstack-origin/.agents/skills/gstack-office-hours/SKILL.md +986 -0
- package/gstack-origin/.agents/skills/gstack-plan-ceo-review/SKILL.md +1268 -0
- package/gstack-origin/.agents/skills/gstack-plan-design-review/SKILL.md +668 -0
- package/gstack-origin/.agents/skills/gstack-plan-eng-review/SKILL.md +826 -0
- package/gstack-origin/.agents/skills/gstack-qa/SKILL.md +1006 -0
- package/gstack-origin/.agents/skills/gstack-qa-only/SKILL.md +626 -0
- package/gstack-origin/.agents/skills/gstack-retro/SKILL.md +1065 -0
- package/gstack-origin/.agents/skills/gstack-review/SKILL.md +704 -0
- package/gstack-origin/.agents/skills/gstack-setup-browser-cookies/SKILL.md +325 -0
- package/gstack-origin/.agents/skills/gstack-setup-deploy/SKILL.md +450 -0
- package/gstack-origin/.agents/skills/gstack-ship/SKILL.md +1312 -0
- package/gstack-origin/.agents/skills/gstack-unfreeze/SKILL.md +36 -0
- package/gstack-origin/.agents/skills/gstack-upgrade/SKILL.md +220 -0
- package/gstack-origin/.env.example +5 -0
- package/gstack-origin/.github/workflows/skill-docs.yml +17 -0
- package/gstack-origin/AGENTS.md +49 -0
- package/gstack-origin/ARCHITECTURE.md +359 -0
- package/gstack-origin/BROWSER.md +271 -0
- package/gstack-origin/CHANGELOG.md +800 -0
- package/gstack-origin/CLAUDE.md +284 -0
- package/gstack-origin/CONTRIBUTING.md +370 -0
- package/gstack-origin/ETHOS.md +129 -0
- package/gstack-origin/LICENSE +21 -0
- package/gstack-origin/README.md +228 -0
- package/gstack-origin/SKILL.md +657 -0
- package/gstack-origin/SKILL.md.tmpl +281 -0
- package/gstack-origin/TODOS.md +564 -0
- package/gstack-origin/VERSION +1 -0
- package/gstack-origin/autoplan/SKILL.md +689 -0
- package/gstack-origin/autoplan/SKILL.md.tmpl +416 -0
- package/gstack-origin/benchmark/SKILL.md +489 -0
- package/gstack-origin/benchmark/SKILL.md.tmpl +233 -0
- package/gstack-origin/bin/dev-setup +68 -0
- package/gstack-origin/bin/dev-teardown +56 -0
- package/gstack-origin/bin/gstack-analytics +191 -0
- package/gstack-origin/bin/gstack-community-dashboard +113 -0
- package/gstack-origin/bin/gstack-config +38 -0
- package/gstack-origin/bin/gstack-diff-scope +71 -0
- package/gstack-origin/bin/gstack-global-discover.ts +591 -0
- package/gstack-origin/bin/gstack-repo-mode +93 -0
- package/gstack-origin/bin/gstack-review-log +9 -0
- package/gstack-origin/bin/gstack-review-read +12 -0
- package/gstack-origin/bin/gstack-slug +15 -0
- package/gstack-origin/bin/gstack-telemetry-log +158 -0
- package/gstack-origin/bin/gstack-telemetry-sync +127 -0
- package/gstack-origin/bin/gstack-update-check +196 -0
- package/gstack-origin/browse/SKILL.md +517 -0
- package/gstack-origin/browse/SKILL.md.tmpl +141 -0
- package/gstack-origin/browse/bin/find-browse +21 -0
- package/gstack-origin/browse/bin/remote-slug +14 -0
- package/gstack-origin/browse/scripts/build-node-server.sh +48 -0
- package/gstack-origin/browse/src/browser-manager.ts +634 -0
- package/gstack-origin/browse/src/buffers.ts +137 -0
- package/gstack-origin/browse/src/bun-polyfill.cjs +109 -0
- package/gstack-origin/browse/src/cli.ts +420 -0
- package/gstack-origin/browse/src/commands.ts +111 -0
- package/gstack-origin/browse/src/config.ts +150 -0
- package/gstack-origin/browse/src/cookie-import-browser.ts +417 -0
- package/gstack-origin/browse/src/cookie-picker-routes.ts +207 -0
- package/gstack-origin/browse/src/cookie-picker-ui.ts +541 -0
- package/gstack-origin/browse/src/find-browse.ts +61 -0
- package/gstack-origin/browse/src/meta-commands.ts +269 -0
- package/gstack-origin/browse/src/platform.ts +17 -0
- package/gstack-origin/browse/src/read-commands.ts +335 -0
- package/gstack-origin/browse/src/server.ts +369 -0
- package/gstack-origin/browse/src/snapshot.ts +398 -0
- package/gstack-origin/browse/src/url-validation.ts +91 -0
- package/gstack-origin/browse/src/write-commands.ts +352 -0
- package/gstack-origin/browse/test/bun-polyfill.test.ts +72 -0
- package/gstack-origin/browse/test/commands.test.ts +1836 -0
- package/gstack-origin/browse/test/config.test.ts +250 -0
- package/gstack-origin/browse/test/cookie-import-browser.test.ts +397 -0
- package/gstack-origin/browse/test/cookie-picker-routes.test.ts +205 -0
- package/gstack-origin/browse/test/find-browse.test.ts +50 -0
- package/gstack-origin/browse/test/fixtures/basic.html +33 -0
- package/gstack-origin/browse/test/fixtures/cursor-interactive.html +22 -0
- package/gstack-origin/browse/test/fixtures/dialog.html +15 -0
- package/gstack-origin/browse/test/fixtures/empty.html +2 -0
- package/gstack-origin/browse/test/fixtures/forms.html +55 -0
- package/gstack-origin/browse/test/fixtures/qa-eval-checkout.html +108 -0
- package/gstack-origin/browse/test/fixtures/qa-eval-spa.html +98 -0
- package/gstack-origin/browse/test/fixtures/qa-eval.html +51 -0
- package/gstack-origin/browse/test/fixtures/responsive.html +49 -0
- package/gstack-origin/browse/test/fixtures/snapshot.html +55 -0
- package/gstack-origin/browse/test/fixtures/spa.html +24 -0
- package/gstack-origin/browse/test/fixtures/states.html +17 -0
- package/gstack-origin/browse/test/fixtures/upload.html +25 -0
- package/gstack-origin/browse/test/gstack-config.test.ts +125 -0
- package/gstack-origin/browse/test/gstack-update-check.test.ts +467 -0
- package/gstack-origin/browse/test/handoff.test.ts +235 -0
- package/gstack-origin/browse/test/path-validation.test.ts +63 -0
- package/gstack-origin/browse/test/platform.test.ts +37 -0
- package/gstack-origin/browse/test/snapshot.test.ts +467 -0
- package/gstack-origin/browse/test/test-server.ts +57 -0
- package/gstack-origin/browse/test/url-validation.test.ts +72 -0
- package/gstack-origin/canary/SKILL.md +493 -0
- package/gstack-origin/canary/SKILL.md.tmpl +220 -0
- package/gstack-origin/careful/SKILL.md +59 -0
- package/gstack-origin/careful/SKILL.md.tmpl +57 -0
- package/gstack-origin/careful/bin/check-careful.sh +112 -0
- package/gstack-origin/codex/SKILL.md +677 -0
- package/gstack-origin/codex/SKILL.md.tmpl +356 -0
- package/gstack-origin/conductor.json +6 -0
- package/gstack-origin/cso/SKILL.md +615 -0
- package/gstack-origin/cso/SKILL.md.tmpl +376 -0
- package/gstack-origin/design-consultation/SKILL.md +625 -0
- package/gstack-origin/design-consultation/SKILL.md.tmpl +369 -0
- package/gstack-origin/design-review/SKILL.md +998 -0
- package/gstack-origin/design-review/SKILL.md.tmpl +262 -0
- package/gstack-origin/docs/images/github-2013.png +0 -0
- package/gstack-origin/docs/images/github-2026.png +0 -0
- package/gstack-origin/docs/skills.md +877 -0
- package/gstack-origin/document-release/SKILL.md +613 -0
- package/gstack-origin/document-release/SKILL.md.tmpl +357 -0
- package/gstack-origin/freeze/SKILL.md +82 -0
- package/gstack-origin/freeze/SKILL.md.tmpl +80 -0
- package/gstack-origin/freeze/bin/check-freeze.sh +68 -0
- package/gstack-origin/gstack-upgrade/SKILL.md +226 -0
- package/gstack-origin/gstack-upgrade/SKILL.md.tmpl +224 -0
- package/gstack-origin/guard/SKILL.md +82 -0
- package/gstack-origin/guard/SKILL.md.tmpl +80 -0
- package/gstack-origin/investigate/SKILL.md +435 -0
- package/gstack-origin/investigate/SKILL.md.tmpl +196 -0
- package/gstack-origin/land-and-deploy/SKILL.md +880 -0
- package/gstack-origin/land-and-deploy/SKILL.md.tmpl +575 -0
- package/gstack-origin/office-hours/SKILL.md +996 -0
- package/gstack-origin/office-hours/SKILL.md.tmpl +624 -0
- package/gstack-origin/package.json +55 -0
- package/gstack-origin/plan-ceo-review/SKILL.md +1277 -0
- package/gstack-origin/plan-ceo-review/SKILL.md.tmpl +838 -0
- package/gstack-origin/plan-design-review/SKILL.md +676 -0
- package/gstack-origin/plan-design-review/SKILL.md.tmpl +314 -0
- package/gstack-origin/plan-eng-review/SKILL.md +836 -0
- package/gstack-origin/plan-eng-review/SKILL.md.tmpl +279 -0
- package/gstack-origin/qa/SKILL.md +1016 -0
- package/gstack-origin/qa/SKILL.md.tmpl +316 -0
- package/gstack-origin/qa/references/issue-taxonomy.md +85 -0
- package/gstack-origin/qa/templates/qa-report-template.md +126 -0
- package/gstack-origin/qa-only/SKILL.md +633 -0
- package/gstack-origin/qa-only/SKILL.md.tmpl +101 -0
- package/gstack-origin/retro/SKILL.md +1072 -0
- package/gstack-origin/retro/SKILL.md.tmpl +833 -0
- package/gstack-origin/review/SKILL.md +849 -0
- package/gstack-origin/review/SKILL.md.tmpl +259 -0
- package/gstack-origin/review/TODOS-format.md +62 -0
- package/gstack-origin/review/checklist.md +190 -0
- package/gstack-origin/review/design-checklist.md +132 -0
- package/gstack-origin/review/greptile-triage.md +220 -0
- package/gstack-origin/scripts/analytics.ts +190 -0
- package/gstack-origin/scripts/dev-skill.ts +82 -0
- package/gstack-origin/scripts/eval-compare.ts +96 -0
- package/gstack-origin/scripts/eval-list.ts +116 -0
- package/gstack-origin/scripts/eval-select.ts +86 -0
- package/gstack-origin/scripts/eval-summary.ts +187 -0
- package/gstack-origin/scripts/eval-watch.ts +172 -0
- package/gstack-origin/scripts/gen-skill-docs.ts +2414 -0
- package/gstack-origin/scripts/skill-check.ts +167 -0
- package/gstack-origin/setup +269 -0
- package/gstack-origin/setup-browser-cookies/SKILL.md +330 -0
- package/gstack-origin/setup-browser-cookies/SKILL.md.tmpl +74 -0
- package/gstack-origin/setup-deploy/SKILL.md +459 -0
- package/gstack-origin/setup-deploy/SKILL.md.tmpl +220 -0
- package/gstack-origin/ship/SKILL.md +1457 -0
- package/gstack-origin/ship/SKILL.md.tmpl +528 -0
- package/gstack-origin/supabase/config.sh +10 -0
- package/gstack-origin/supabase/functions/community-pulse/index.ts +59 -0
- package/gstack-origin/supabase/functions/telemetry-ingest/index.ts +135 -0
- package/gstack-origin/supabase/functions/update-check/index.ts +37 -0
- package/gstack-origin/supabase/migrations/001_telemetry.sql +89 -0
- package/gstack-origin/test/analytics.test.ts +277 -0
- package/gstack-origin/test/codex-e2e.test.ts +197 -0
- package/gstack-origin/test/fixtures/coverage-audit-fixture.ts +76 -0
- package/gstack-origin/test/fixtures/eval-baselines.json +7 -0
- package/gstack-origin/test/fixtures/qa-eval-checkout-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/qa-eval-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/qa-eval-spa-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/review-eval-design-slop.css +86 -0
- package/gstack-origin/test/fixtures/review-eval-design-slop.html +41 -0
- package/gstack-origin/test/fixtures/review-eval-enum-diff.rb +30 -0
- package/gstack-origin/test/fixtures/review-eval-enum.rb +27 -0
- package/gstack-origin/test/fixtures/review-eval-vuln.rb +14 -0
- package/gstack-origin/test/gemini-e2e.test.ts +173 -0
- package/gstack-origin/test/gen-skill-docs.test.ts +1049 -0
- package/gstack-origin/test/global-discover.test.ts +187 -0
- package/gstack-origin/test/helpers/codex-session-runner.ts +282 -0
- package/gstack-origin/test/helpers/e2e-helpers.ts +239 -0
- package/gstack-origin/test/helpers/eval-store.test.ts +548 -0
- package/gstack-origin/test/helpers/eval-store.ts +689 -0
- package/gstack-origin/test/helpers/gemini-session-runner.test.ts +104 -0
- package/gstack-origin/test/helpers/gemini-session-runner.ts +201 -0
- package/gstack-origin/test/helpers/llm-judge.ts +130 -0
- package/gstack-origin/test/helpers/observability.test.ts +283 -0
- package/gstack-origin/test/helpers/session-runner.test.ts +96 -0
- package/gstack-origin/test/helpers/session-runner.ts +357 -0
- package/gstack-origin/test/helpers/skill-parser.ts +206 -0
- package/gstack-origin/test/helpers/touchfiles.ts +260 -0
- package/gstack-origin/test/hook-scripts.test.ts +373 -0
- package/gstack-origin/test/skill-e2e-browse.test.ts +293 -0
- package/gstack-origin/test/skill-e2e-deploy.test.ts +279 -0
- package/gstack-origin/test/skill-e2e-design.test.ts +614 -0
- package/gstack-origin/test/skill-e2e-plan.test.ts +538 -0
- package/gstack-origin/test/skill-e2e-qa-bugs.test.ts +194 -0
- package/gstack-origin/test/skill-e2e-qa-workflow.test.ts +412 -0
- package/gstack-origin/test/skill-e2e-review.test.ts +535 -0
- package/gstack-origin/test/skill-e2e-workflow.test.ts +586 -0
- package/gstack-origin/test/skill-e2e.test.ts +3325 -0
- package/gstack-origin/test/skill-llm-eval.test.ts +787 -0
- package/gstack-origin/test/skill-parser.test.ts +179 -0
- package/gstack-origin/test/skill-routing-e2e.test.ts +605 -0
- package/gstack-origin/test/skill-validation.test.ts +1520 -0
- package/gstack-origin/test/telemetry.test.ts +278 -0
- package/gstack-origin/test/touchfiles.test.ts +262 -0
- package/gstack-origin/unfreeze/SKILL.md +40 -0
- package/gstack-origin/unfreeze/SKILL.md.tmpl +38 -0
- package/package.json +38 -0
- package/scripts/install-antigravity-skill.ps1 +33 -0
- package/scripts/install-antigravity-skill.sh +41 -0
- package/scripts/sync-gstack-origin.ps1 +37 -0
- package/scripts/sync-gstack-origin.sh +35 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test';
|
|
2
|
+
import { parseGeminiJSONL } from './gemini-session-runner';
|
|
3
|
+
|
|
4
|
+
// Fixture: Gemini CLI stream-json output for a session that makes one tool call.
// Each entry is one NDJSON line exactly as the parser receives it.
const FIXTURE_LINES = [
  '{"type":"init","timestamp":"2026-03-20T15:14:46.455Z","session_id":"test-session-123","model":"auto-gemini-3"}',
  '{"type":"message","timestamp":"2026-03-20T15:14:46.456Z","role":"user","content":"list the files"}',
  '{"type":"message","timestamp":"2026-03-20T15:14:49.650Z","role":"assistant","content":"I will list the files.","delta":true}',
  '{"type":"tool_use","timestamp":"2026-03-20T15:14:49.690Z","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
  '{"type":"tool_result","timestamp":"2026-03-20T15:14:49.931Z","tool_id":"cmd_1","status":"success","output":"file1.ts\\nfile2.ts"}',
  '{"type":"message","timestamp":"2026-03-20T15:14:51.945Z","role":"assistant","content":"Here are the files.","delta":true}',
  '{"type":"result","timestamp":"2026-03-20T15:14:52.030Z","status":"success","stats":{"total_tokens":27147,"input_tokens":26928,"output_tokens":87,"cached":0,"duration_ms":5575,"tool_calls":1}}',
];

describe('parseGeminiJSONL', () => {
  // --- Happy path against the full fixture ---

  test('extracts session ID from init event', () => {
    const { sessionId } = parseGeminiJSONL(FIXTURE_LINES);
    expect(sessionId).toBe('test-session-123');
  });

  test('concatenates assistant message deltas into output', () => {
    const { output } = parseGeminiJSONL(FIXTURE_LINES);
    expect(output).toBe('I will list the files.Here are the files.');
  });

  test('ignores user messages', () => {
    const { output } = parseGeminiJSONL([
      '{"type":"message","role":"user","content":"this should be ignored"}',
      '{"type":"message","role":"assistant","content":"this should be kept","delta":true}',
    ]);
    expect(output).toBe('this should be kept');
  });

  test('extracts tool names from tool_use events', () => {
    const { toolCalls } = parseGeminiJSONL(FIXTURE_LINES);
    expect(toolCalls).toHaveLength(1);
    expect(toolCalls[0]).toBe('run_shell_command');
  });

  test('extracts total tokens from result stats', () => {
    const { tokens } = parseGeminiJSONL(FIXTURE_LINES);
    expect(tokens).toBe(27147);
  });

  // --- Tolerance for noisy or incomplete input ---

  test('skips malformed lines without throwing', () => {
    const { sessionId, output, tokens } = parseGeminiJSONL([
      '{"type":"init","session_id":"ok"}',
      'this is not json',
      '{"type":"message","role":"assistant","content":"hello","delta":true}',
      '{incomplete json',
      '{"type":"result","status":"success","stats":{"total_tokens":100}}',
    ]);
    expect(sessionId).toBe('ok');
    expect(output).toBe('hello');
    expect(tokens).toBe(100);
  });

  test('skips empty and whitespace-only lines', () => {
    const { sessionId, tokens } = parseGeminiJSONL([
      '',
      ' ',
      '{"type":"init","session_id":"s1"}',
      '\t',
      '{"type":"result","status":"success","stats":{"total_tokens":50}}',
    ]);
    expect(sessionId).toBe('s1');
    expect(tokens).toBe(50);
  });

  test('handles empty input', () => {
    const { output, toolCalls, tokens, sessionId } = parseGeminiJSONL([]);
    expect(output).toBe('');
    expect(toolCalls).toHaveLength(0);
    expect(tokens).toBe(0);
    expect(sessionId).toBeNull();
  });

  test('handles missing fields gracefully', () => {
    const { sessionId, output, toolCalls, tokens } = parseGeminiJSONL([
      '{"type":"init"}', // no session_id
      '{"type":"message","role":"assistant"}', // no content
      '{"type":"tool_use"}', // no tool_name
      '{"type":"result","status":"success"}', // no stats
    ]);
    expect(sessionId).toBeNull();
    expect(output).toBe('');
    expect(toolCalls).toHaveLength(0);
    expect(tokens).toBe(0);
  });

  test('handles multiple tool_use events', () => {
    const { toolCalls } = parseGeminiJSONL([
      '{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
      '{"type":"tool_use","tool_name":"read_file","tool_id":"cmd_2","parameters":{"path":"foo.ts"}}',
      '{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_3","parameters":{"command":"cat bar.ts"}}',
    ]);
    // Order and duplicates are both preserved.
    expect(toolCalls).toEqual(['run_shell_command', 'read_file', 'run_shell_command']);
  });
});
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gemini CLI subprocess runner for skill E2E testing.
|
|
3
|
+
*
|
|
4
|
+
* Spawns `gemini -p` as an independent process, parses its stream-json
|
|
5
|
+
* output, and returns structured results. Follows the same pattern as
|
|
6
|
+
* codex-session-runner.ts but adapted for the Gemini CLI.
|
|
7
|
+
*
|
|
8
|
+
* Key differences from Codex session-runner:
|
|
9
|
+
* - Uses `gemini -p` instead of `codex exec`
|
|
10
|
+
* - Output is NDJSON with event types: init, message, tool_use, tool_result, result
|
|
11
|
+
* - Uses `--output-format stream-json --yolo` instead of `--json -s read-only`
|
|
12
|
+
* - No temp HOME needed — Gemini discovers skills from `.agents/skills/` in cwd
|
|
13
|
+
* - Message events are streamed with `delta: true` — must concatenate
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import * as path from 'path';
|
|
17
|
+
|
|
18
|
+
// --- Interfaces ---
|
|
19
|
+
|
|
20
|
+
export interface GeminiResult {
  /** Full assistant message text (concatenated deltas). */
  output: string;
  /** Tool names from tool_use events. */
  toolCalls: string[];
  /** Total tokens used. */
  tokens: number;
  /** Process exit code. */
  exitCode: number;
  /** Wall clock time in milliseconds. */
  durationMs: number;
  /** Session ID from the init event, or null when none was seen. */
  sessionId: string | null;
  /** Raw JSONL lines, kept for debugging. */
  rawLines: string[];
}
|
|
29
|
+
|
|
30
|
+
// --- JSONL parser ---
|
|
31
|
+
|
|
32
|
+
/** Fields extracted from a Gemini stream-json transcript by {@link parseGeminiJSONL}. */
export interface ParsedGeminiJSONL {
  /** Assistant message contents joined in arrival order, with no separator. */
  output: string;
  /** `tool_name` of every tool_use event, in order, duplicates kept. */
  toolCalls: string[];
  /** `stats.total_tokens` from the last result event, or 0. */
  tokens: number;
  /** `session_id` from the last init event that carried one, or null. */
  sessionId: string | null;
}

/**
 * Parse an array of JSONL lines from `gemini -p --output-format stream-json`.
 * Pure function — no I/O, no side effects.
 *
 * Handles these Gemini event types:
 * - init → extract session_id
 * - message (role=assistant) → concatenate content into output
 *   (the `delta` flag is not inspected; every assistant message is appended)
 * - tool_use → extract tool_name
 * - result → extract token usage from stats
 *
 * Every other event type (tool_result, for instance) is ignored, as are
 * blank lines, lines that are not valid JSON, and JSON values that are not
 * objects.
 *
 * A field is only accepted when it has the expected primitive type
 * (string for session_id / content / tool_name, finite number for
 * total_tokens). Anything else is treated as missing, so the returned
 * object always matches its declared types.
 *
 * @param lines - Raw NDJSON lines, one event per entry.
 * @returns The extracted output, tool names, token count and session id.
 */
export function parseGeminiJSONL(lines: string[]): ParsedGeminiJSONL {
  const outputParts: string[] = [];
  const toolCalls: string[] = [];
  let tokens = 0;
  let sessionId: string | null = null;

  for (const line of lines) {
    if (!line.trim()) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue; // malformed line — skip it
    }
    // Valid JSON that is not an object ("null", "42", ...) carries no event.
    if (typeof parsed !== 'object' || parsed === null) continue;
    const event = parsed as Record<string, unknown>;

    switch (event.type) {
      case 'init': {
        const sid = event.session_id;
        if (typeof sid === 'string' && sid) sessionId = sid;
        break;
      }
      case 'message': {
        const content = event.content;
        if (event.role === 'assistant' && typeof content === 'string' && content) {
          outputParts.push(content);
        }
        break;
      }
      case 'tool_use': {
        const name = event.tool_name;
        if (typeof name === 'string' && name) toolCalls.push(name);
        break;
      }
      case 'result': {
        const stats = event.stats;
        const total =
          typeof stats === 'object' && stats !== null
            ? (stats as Record<string, unknown>).total_tokens
            : undefined;
        // A result event always overwrites the count; unusable values become 0.
        tokens = typeof total === 'number' && Number.isFinite(total) ? total : 0;
        break;
      }
      default:
        break;
    }
  }

  return {
    output: outputParts.join(''),
    toolCalls,
    tokens,
    sessionId,
  };
}
|
|
86
|
+
|
|
87
|
+
// --- Main runner ---
|
|
88
|
+
|
|
89
|
+
/**
 * Run a prompt via `gemini -p` and return structured results.
 *
 * Spawns gemini with `--output-format stream-json --yolo`, collects the
 * non-blank stdout lines, echoes tool calls and assistant messages to this
 * process's stderr as they arrive, and parses the collected lines with
 * parseGeminiJSONL once the process has exited.
 *
 * @param opts.prompt - What to ask Gemini.
 * @param opts.timeoutMs - Kill the process after this many milliseconds.
 *   Default 300000 (5 min).
 * @param opts.cwd - Working directory (where .agents/skills/ lives).
 *   Defaults to the current working directory.
 * @returns A GeminiResult. When no `gemini` binary is found on PATH nothing is
 *   spawned and the result has `exitCode: -1` and an output starting with
 *   `SKIP:`. When the timeout fires, `exitCode` is reported as 124.
 */
export async function runGeminiSkill(opts: {
  prompt: string;
  timeoutMs?: number;
  cwd?: string;
}): Promise<GeminiResult> {
  const { prompt, timeoutMs = 300_000, cwd } = opts;

  const startTime = Date.now();

  // Bun.which resolves the binary against PATH in-process, so the check does
  // not depend on an external `which` executable (absent on Windows).
  if (Bun.which('gemini') === null) {
    return {
      output: 'SKIP: gemini binary not found',
      toolCalls: [],
      tokens: 0,
      exitCode: -1,
      durationMs: Date.now() - startTime,
      sessionId: null,
      rawLines: [],
    };
  }

  // Build gemini command
  const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo'];

  // Spawn gemini — inherits this process's environment; cwd selects where it runs
  const proc = Bun.spawn(['gemini', ...args], {
    cwd: cwd || process.cwd(),
    stdout: 'pipe',
    stderr: 'pipe',
  });

  // Race against timeout
  let timedOut = false;
  const timeoutId = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, timeoutMs);

  const collectedLines: string[] = [];
  let stderr = '';
  let exitCode: number;

  try {
    // Start draining stderr right away, alongside the stdout loop below.
    const stderrPromise = new Response(proc.stderr).text();

    const reader = proc.stdout.getReader();
    const decoder = new TextDecoder();
    let buf = '';

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buf += decoder.decode(value, { stream: true });

        // Everything before the last newline is complete; keep the tail for
        // the next chunk.
        const lines = buf.split('\n');
        buf = lines.pop() || '';

        for (const line of lines) {
          if (!line.trim()) continue;
          collectedLines.push(line);

          // Real-time progress to stderr
          try {
            const event = JSON.parse(line);
            if (event.type === 'tool_use' && event.tool_name) {
              const elapsed = Math.round((Date.now() - startTime) / 1000);
              process.stderr.write(`  [gemini ${elapsed}s] tool: ${event.tool_name}\n`);
            } else if (event.type === 'message' && event.role === 'assistant' && event.content) {
              const elapsed = Math.round((Date.now() - startTime) / 1000);
              process.stderr.write(`  [gemini ${elapsed}s] message: ${event.content.slice(0, 100)}\n`);
            }
          } catch { /* skip — parseGeminiJSONL will handle it later */ }
        }
      }
      // Flush bytes the streaming decoder is still holding (a multi-byte
      // character split across the final chunk boundary).
      buf += decoder.decode();
    } catch { /* stream read error — fall through to exit code handling */ }

    // Flush remaining buffer (final line without a trailing newline)
    if (buf.trim()) {
      collectedLines.push(buf);
    }

    stderr = await stderrPromise;
    exitCode = await proc.exited;
  } finally {
    // Always disarm the timer so it cannot outlive this call, even when
    // awaiting the process rejects.
    clearTimeout(timeoutId);
  }

  const durationMs = Date.now() - startTime;

  // Parse all collected JSONL lines
  const parsed = parseGeminiJSONL(collectedLines);

  // Log stderr if non-empty (may contain auth errors, etc.)
  if (stderr.trim()) {
    process.stderr.write(`  [gemini stderr] ${stderr.trim().slice(0, 200)}\n`);
  }

  return {
    output: parsed.output,
    toolCalls: parsed.toolCalls,
    tokens: parsed.tokens,
    exitCode: timedOut ? 124 : exitCode,
    durationMs,
    sessionId: parsed.sessionId,
    rawLines: collectedLines,
  };
}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared LLM-as-judge helpers for eval and E2E tests.
|
|
3
|
+
*
|
|
4
|
+
* Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
|
|
5
|
+
* and outcomeJudge (planted-bug detection scorer).
|
|
6
|
+
*
|
|
7
|
+
* Requires: ANTHROPIC_API_KEY env var
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import Anthropic from '@anthropic-ai/sdk';
|
|
11
|
+
|
|
12
|
+
/** Three 1-5 ratings plus the judge's free-text explanation. */
export interface JudgeScore {
  /** 1-5 */
  clarity: number;
  /** 1-5 */
  completeness: number;
  /** 1-5 */
  actionability: number;
  reasoning: string;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Result shape for the outcome (planted-bug detection) judge.
 *
 * NOTE(review): the prompt that defines these fields is not in this part of
 * the file; units and ranges of the numeric fields should be confirmed
 * against it.
 */
export interface OutcomeJudgeResult {
  detected: string[];
  missed: string[];
  false_positives: number;
  detection_rate: number;
  evidence_quality: number;
  reasoning: string;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Call claude-sonnet-4-6 with a prompt, extract JSON response.
 *
 * The request is retried once, after a one-second pause, when the first
 * attempt fails with an error whose `status` is 429 (rate limit). Any other
 * error, and a failure of the retry itself, propagates to the caller.
 *
 * The reply text is taken from every text block of the response. The JSON
 * payload is the span from the first `{` to the last `}` in that text.
 *
 * @param prompt - Sent as the single user message.
 * @returns The parsed JSON, cast to `T`; its shape is not validated at runtime.
 * @throws Error when the reply contains no `{...}` span.
 * @throws SyntaxError when that span is not valid JSON.
 */
export async function callJudge<T>(prompt: string): Promise<T> {
  const client = new Anthropic();

  const makeRequest = () => client.messages.create({
    model: 'claude-sonnet-4-6',
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  let response: Awaited<ReturnType<typeof makeRequest>>;
  try {
    response = await makeRequest();
  } catch (err: unknown) {
    const status =
      typeof err === 'object' && err !== null && 'status' in err
        ? (err as { status?: unknown }).status
        : undefined;
    if (status !== 429) throw err;
    await new Promise(r => setTimeout(r, 1000));
    response = await makeRequest();
  }

  // Collect text from all text blocks. Indexing content[0] directly throws a
  // TypeError on an empty content array and loses the reply when the first
  // block is not a text block.
  let text = '';
  for (const block of response.content) {
    if (block.type === 'text') text += block.text;
  }

  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);

  try {
    return JSON.parse(jsonMatch[0]) as T;
  } catch (err: unknown) {
    // Still a SyntaxError, but now it says which reply failed to parse.
    const reason = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(`Judge returned malformed JSON (${reason}): ${jsonMatch[0].slice(0, 200)}`);
  }
}
|
|
58
|
+
|
|
59
|
+
/**
 * Ask the LLM judge to rate a piece of CLI reference documentation.
 *
 * Builds a rubric prompt requesting clarity, completeness and actionability
 * scores (1-5 each) plus a short reasoning string, then delegates to
 * `callJudge` for the request and JSON extraction.
 *
 * @param section Label for what is being rated; interpolated into the prompt text.
 * @param content The documentation text itself, appended at the end of the prompt.
 * @returns The judge's parsed reply. The scores are not range-checked here.
 */
export async function judge(section: string, content: string): Promise<JudgeScore> {
  const rubricPrompt = `You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):

- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:

${content}`;

  return callJudge<JudgeScore>(rubricPrompt);
}
|
|
91
|
+
|
|
92
|
+
/**
 * Ask the LLM judge to score a QA report against planted-bug ground truth.
 *
 * The prompt embeds the ground-truth bug list as pretty-printed JSON, the
 * report text, and the list of valid bug IDs, then delegates to `callJudge`
 * for the request and JSON extraction.
 *
 * @param groundTruth Object read for `total_bugs` and `bugs`; each bug's `id`
 *   is read to build the list of allowed IDs.
 * @param report The QA report text to evaluate.
 * @returns The judge's parsed reply. Its fields are not validated here.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  // Read in the same order the prompt uses them.
  const plantedCount = groundTruth.total_bugs;
  const plantedBugsJson = JSON.stringify(groundTruth.bugs, null, 2);
  const allowedIds = groundTruth.bugs.map((bug: any) => bug.id).join(', ');

  const verdictPrompt = `You are evaluating a QA testing report against known ground truth bugs.

GROUND TRUTH (${plantedCount} planted bugs):
${plantedBugsJson}

QA REPORT (generated by an AI agent):
${report}

For each planted bug, determine if the report identified it. A bug counts as
"detected" if the report describes the same defect, even if the wording differs.
Use the detection_hint keywords as guidance.

Also count false positives: issues in the report that don't correspond to any
planted bug AND aren't legitimate issues with the page.

Respond with ONLY valid JSON:
{
"detected": ["bug-id-1", "bug-id-2"],
"missed": ["bug-id-3"],
"false_positives": 0,
"detection_rate": 2,
"evidence_quality": 4,
"reasoning": "brief explanation"
}

Rules:
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${allowedIds}
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
5 = excellent evidence for every bug, 1 = no evidence at all`;

  return callJudge<OutcomeJudgeResult>(verdictPrompt);
}
|