@runchr/gstack-antigravity 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/rules/ETHOS.md +129 -0
- package/.agents/rules/global-gstack.md +117 -0
- package/.agents/rules/persona-gstack-autoplan.md +14 -0
- package/.agents/rules/persona-gstack-benchmark.md +14 -0
- package/.agents/rules/persona-gstack-browse.md +14 -0
- package/.agents/rules/persona-gstack-canary.md +14 -0
- package/.agents/rules/persona-gstack-careful.md +14 -0
- package/.agents/rules/persona-gstack-codex.md +14 -0
- package/.agents/rules/persona-gstack-cso.md +14 -0
- package/.agents/rules/persona-gstack-design-consultation.md +14 -0
- package/.agents/rules/persona-gstack-design-review.md +14 -0
- package/.agents/rules/persona-gstack-document-release.md +14 -0
- package/.agents/rules/persona-gstack-freeze.md +14 -0
- package/.agents/rules/persona-gstack-gstack-upgrade.md +14 -0
- package/.agents/rules/persona-gstack-guard.md +14 -0
- package/.agents/rules/persona-gstack-investigate.md +14 -0
- package/.agents/rules/persona-gstack-land-and-deploy.md +14 -0
- package/.agents/rules/persona-gstack-office-hours.md +14 -0
- package/.agents/rules/persona-gstack-plan-ceo-review.md +14 -0
- package/.agents/rules/persona-gstack-plan-design-review.md +14 -0
- package/.agents/rules/persona-gstack-plan-eng-review.md +14 -0
- package/.agents/rules/persona-gstack-qa-only.md +14 -0
- package/.agents/rules/persona-gstack-qa.md +14 -0
- package/.agents/rules/persona-gstack-retro.md +14 -0
- package/.agents/rules/persona-gstack-review.md +14 -0
- package/.agents/rules/persona-gstack-setup-browser-cookies.md +14 -0
- package/.agents/rules/persona-gstack-setup-deploy.md +14 -0
- package/.agents/rules/persona-gstack-ship.md +14 -0
- package/.agents/rules/persona-gstack-unfreeze.md +14 -0
- package/.agents/rules/persona-gstack.md +40 -0
- package/.agents/rules/recursive-identities.md +22 -0
- package/.agents/workflows/autoplan.md +30 -0
- package/.agents/workflows/benchmark.md +31 -0
- package/.agents/workflows/browse.md +26 -0
- package/.agents/workflows/canary.md +33 -0
- package/.agents/workflows/careful.md +22 -0
- package/.agents/workflows/codex.md +36 -0
- package/.agents/workflows/cso.md +29 -0
- package/.agents/workflows/design-consultation.md +28 -0
- package/.agents/workflows/design-review.md +28 -0
- package/.agents/workflows/document-release.md +32 -0
- package/.agents/workflows/freeze.md +17 -0
- package/.agents/workflows/gstack-upgrade.md +54 -0
- package/.agents/workflows/gstack.md +56 -0
- package/.agents/workflows/guard.md +18 -0
- package/.agents/workflows/investigate.md +37 -0
- package/.agents/workflows/land-and-deploy.md +35 -0
- package/.agents/workflows/office-hours.md +27 -0
- package/.agents/workflows/plan-ceo-review.md +34 -0
- package/.agents/workflows/plan-design-review.md +31 -0
- package/.agents/workflows/plan-eng-review.md +28 -0
- package/.agents/workflows/qa-only.md +28 -0
- package/.agents/workflows/qa.md +73 -0
- package/.agents/workflows/retro.md +34 -0
- package/.agents/workflows/review.md +30 -0
- package/.agents/workflows/setup-browser-cookies.md +15 -0
- package/.agents/workflows/setup-cookies.md +8 -0
- package/.agents/workflows/setup-deploy.md +21 -0
- package/.agents/workflows/ship.md +93 -0
- package/.agents/workflows/unfreeze.md +12 -0
- package/LICENSE +22 -0
- package/README.md +189 -0
- package/README_KO.md +191 -0
- package/bin/install.js +105 -0
- package/gstack-origin/.agents/skills/gstack/SKILL.md +651 -0
- package/gstack-origin/.agents/skills/gstack-autoplan/SKILL.md +678 -0
- package/gstack-origin/.agents/skills/gstack-benchmark/SKILL.md +482 -0
- package/gstack-origin/.agents/skills/gstack-browse/SKILL.md +511 -0
- package/gstack-origin/.agents/skills/gstack-canary/SKILL.md +486 -0
- package/gstack-origin/.agents/skills/gstack-careful/SKILL.md +50 -0
- package/gstack-origin/.agents/skills/gstack-cso/SKILL.md +607 -0
- package/gstack-origin/.agents/skills/gstack-design-consultation/SKILL.md +615 -0
- package/gstack-origin/.agents/skills/gstack-design-review/SKILL.md +988 -0
- package/gstack-origin/.agents/skills/gstack-document-release/SKILL.md +604 -0
- package/gstack-origin/.agents/skills/gstack-freeze/SKILL.md +67 -0
- package/gstack-origin/.agents/skills/gstack-guard/SKILL.md +62 -0
- package/gstack-origin/.agents/skills/gstack-investigate/SKILL.md +415 -0
- package/gstack-origin/.agents/skills/gstack-land-and-deploy/SKILL.md +873 -0
- package/gstack-origin/.agents/skills/gstack-office-hours/SKILL.md +986 -0
- package/gstack-origin/.agents/skills/gstack-plan-ceo-review/SKILL.md +1268 -0
- package/gstack-origin/.agents/skills/gstack-plan-design-review/SKILL.md +668 -0
- package/gstack-origin/.agents/skills/gstack-plan-eng-review/SKILL.md +826 -0
- package/gstack-origin/.agents/skills/gstack-qa/SKILL.md +1006 -0
- package/gstack-origin/.agents/skills/gstack-qa-only/SKILL.md +626 -0
- package/gstack-origin/.agents/skills/gstack-retro/SKILL.md +1065 -0
- package/gstack-origin/.agents/skills/gstack-review/SKILL.md +704 -0
- package/gstack-origin/.agents/skills/gstack-setup-browser-cookies/SKILL.md +325 -0
- package/gstack-origin/.agents/skills/gstack-setup-deploy/SKILL.md +450 -0
- package/gstack-origin/.agents/skills/gstack-ship/SKILL.md +1312 -0
- package/gstack-origin/.agents/skills/gstack-unfreeze/SKILL.md +36 -0
- package/gstack-origin/.agents/skills/gstack-upgrade/SKILL.md +220 -0
- package/gstack-origin/.env.example +5 -0
- package/gstack-origin/.github/workflows/skill-docs.yml +17 -0
- package/gstack-origin/AGENTS.md +49 -0
- package/gstack-origin/ARCHITECTURE.md +359 -0
- package/gstack-origin/BROWSER.md +271 -0
- package/gstack-origin/CHANGELOG.md +800 -0
- package/gstack-origin/CLAUDE.md +284 -0
- package/gstack-origin/CONTRIBUTING.md +370 -0
- package/gstack-origin/ETHOS.md +129 -0
- package/gstack-origin/LICENSE +21 -0
- package/gstack-origin/README.md +228 -0
- package/gstack-origin/SKILL.md +657 -0
- package/gstack-origin/SKILL.md.tmpl +281 -0
- package/gstack-origin/TODOS.md +564 -0
- package/gstack-origin/VERSION +1 -0
- package/gstack-origin/autoplan/SKILL.md +689 -0
- package/gstack-origin/autoplan/SKILL.md.tmpl +416 -0
- package/gstack-origin/benchmark/SKILL.md +489 -0
- package/gstack-origin/benchmark/SKILL.md.tmpl +233 -0
- package/gstack-origin/bin/dev-setup +68 -0
- package/gstack-origin/bin/dev-teardown +56 -0
- package/gstack-origin/bin/gstack-analytics +191 -0
- package/gstack-origin/bin/gstack-community-dashboard +113 -0
- package/gstack-origin/bin/gstack-config +38 -0
- package/gstack-origin/bin/gstack-diff-scope +71 -0
- package/gstack-origin/bin/gstack-global-discover.ts +591 -0
- package/gstack-origin/bin/gstack-repo-mode +93 -0
- package/gstack-origin/bin/gstack-review-log +9 -0
- package/gstack-origin/bin/gstack-review-read +12 -0
- package/gstack-origin/bin/gstack-slug +15 -0
- package/gstack-origin/bin/gstack-telemetry-log +158 -0
- package/gstack-origin/bin/gstack-telemetry-sync +127 -0
- package/gstack-origin/bin/gstack-update-check +196 -0
- package/gstack-origin/browse/SKILL.md +517 -0
- package/gstack-origin/browse/SKILL.md.tmpl +141 -0
- package/gstack-origin/browse/bin/find-browse +21 -0
- package/gstack-origin/browse/bin/remote-slug +14 -0
- package/gstack-origin/browse/scripts/build-node-server.sh +48 -0
- package/gstack-origin/browse/src/browser-manager.ts +634 -0
- package/gstack-origin/browse/src/buffers.ts +137 -0
- package/gstack-origin/browse/src/bun-polyfill.cjs +109 -0
- package/gstack-origin/browse/src/cli.ts +420 -0
- package/gstack-origin/browse/src/commands.ts +111 -0
- package/gstack-origin/browse/src/config.ts +150 -0
- package/gstack-origin/browse/src/cookie-import-browser.ts +417 -0
- package/gstack-origin/browse/src/cookie-picker-routes.ts +207 -0
- package/gstack-origin/browse/src/cookie-picker-ui.ts +541 -0
- package/gstack-origin/browse/src/find-browse.ts +61 -0
- package/gstack-origin/browse/src/meta-commands.ts +269 -0
- package/gstack-origin/browse/src/platform.ts +17 -0
- package/gstack-origin/browse/src/read-commands.ts +335 -0
- package/gstack-origin/browse/src/server.ts +369 -0
- package/gstack-origin/browse/src/snapshot.ts +398 -0
- package/gstack-origin/browse/src/url-validation.ts +91 -0
- package/gstack-origin/browse/src/write-commands.ts +352 -0
- package/gstack-origin/browse/test/bun-polyfill.test.ts +72 -0
- package/gstack-origin/browse/test/commands.test.ts +1836 -0
- package/gstack-origin/browse/test/config.test.ts +250 -0
- package/gstack-origin/browse/test/cookie-import-browser.test.ts +397 -0
- package/gstack-origin/browse/test/cookie-picker-routes.test.ts +205 -0
- package/gstack-origin/browse/test/find-browse.test.ts +50 -0
- package/gstack-origin/browse/test/fixtures/basic.html +33 -0
- package/gstack-origin/browse/test/fixtures/cursor-interactive.html +22 -0
- package/gstack-origin/browse/test/fixtures/dialog.html +15 -0
- package/gstack-origin/browse/test/fixtures/empty.html +2 -0
- package/gstack-origin/browse/test/fixtures/forms.html +55 -0
- package/gstack-origin/browse/test/fixtures/qa-eval-checkout.html +108 -0
- package/gstack-origin/browse/test/fixtures/qa-eval-spa.html +98 -0
- package/gstack-origin/browse/test/fixtures/qa-eval.html +51 -0
- package/gstack-origin/browse/test/fixtures/responsive.html +49 -0
- package/gstack-origin/browse/test/fixtures/snapshot.html +55 -0
- package/gstack-origin/browse/test/fixtures/spa.html +24 -0
- package/gstack-origin/browse/test/fixtures/states.html +17 -0
- package/gstack-origin/browse/test/fixtures/upload.html +25 -0
- package/gstack-origin/browse/test/gstack-config.test.ts +125 -0
- package/gstack-origin/browse/test/gstack-update-check.test.ts +467 -0
- package/gstack-origin/browse/test/handoff.test.ts +235 -0
- package/gstack-origin/browse/test/path-validation.test.ts +63 -0
- package/gstack-origin/browse/test/platform.test.ts +37 -0
- package/gstack-origin/browse/test/snapshot.test.ts +467 -0
- package/gstack-origin/browse/test/test-server.ts +57 -0
- package/gstack-origin/browse/test/url-validation.test.ts +72 -0
- package/gstack-origin/canary/SKILL.md +493 -0
- package/gstack-origin/canary/SKILL.md.tmpl +220 -0
- package/gstack-origin/careful/SKILL.md +59 -0
- package/gstack-origin/careful/SKILL.md.tmpl +57 -0
- package/gstack-origin/careful/bin/check-careful.sh +112 -0
- package/gstack-origin/codex/SKILL.md +677 -0
- package/gstack-origin/codex/SKILL.md.tmpl +356 -0
- package/gstack-origin/conductor.json +6 -0
- package/gstack-origin/cso/SKILL.md +615 -0
- package/gstack-origin/cso/SKILL.md.tmpl +376 -0
- package/gstack-origin/design-consultation/SKILL.md +625 -0
- package/gstack-origin/design-consultation/SKILL.md.tmpl +369 -0
- package/gstack-origin/design-review/SKILL.md +998 -0
- package/gstack-origin/design-review/SKILL.md.tmpl +262 -0
- package/gstack-origin/docs/images/github-2013.png +0 -0
- package/gstack-origin/docs/images/github-2026.png +0 -0
- package/gstack-origin/docs/skills.md +877 -0
- package/gstack-origin/document-release/SKILL.md +613 -0
- package/gstack-origin/document-release/SKILL.md.tmpl +357 -0
- package/gstack-origin/freeze/SKILL.md +82 -0
- package/gstack-origin/freeze/SKILL.md.tmpl +80 -0
- package/gstack-origin/freeze/bin/check-freeze.sh +68 -0
- package/gstack-origin/gstack-upgrade/SKILL.md +226 -0
- package/gstack-origin/gstack-upgrade/SKILL.md.tmpl +224 -0
- package/gstack-origin/guard/SKILL.md +82 -0
- package/gstack-origin/guard/SKILL.md.tmpl +80 -0
- package/gstack-origin/investigate/SKILL.md +435 -0
- package/gstack-origin/investigate/SKILL.md.tmpl +196 -0
- package/gstack-origin/land-and-deploy/SKILL.md +880 -0
- package/gstack-origin/land-and-deploy/SKILL.md.tmpl +575 -0
- package/gstack-origin/office-hours/SKILL.md +996 -0
- package/gstack-origin/office-hours/SKILL.md.tmpl +624 -0
- package/gstack-origin/package.json +55 -0
- package/gstack-origin/plan-ceo-review/SKILL.md +1277 -0
- package/gstack-origin/plan-ceo-review/SKILL.md.tmpl +838 -0
- package/gstack-origin/plan-design-review/SKILL.md +676 -0
- package/gstack-origin/plan-design-review/SKILL.md.tmpl +314 -0
- package/gstack-origin/plan-eng-review/SKILL.md +836 -0
- package/gstack-origin/plan-eng-review/SKILL.md.tmpl +279 -0
- package/gstack-origin/qa/SKILL.md +1016 -0
- package/gstack-origin/qa/SKILL.md.tmpl +316 -0
- package/gstack-origin/qa/references/issue-taxonomy.md +85 -0
- package/gstack-origin/qa/templates/qa-report-template.md +126 -0
- package/gstack-origin/qa-only/SKILL.md +633 -0
- package/gstack-origin/qa-only/SKILL.md.tmpl +101 -0
- package/gstack-origin/retro/SKILL.md +1072 -0
- package/gstack-origin/retro/SKILL.md.tmpl +833 -0
- package/gstack-origin/review/SKILL.md +849 -0
- package/gstack-origin/review/SKILL.md.tmpl +259 -0
- package/gstack-origin/review/TODOS-format.md +62 -0
- package/gstack-origin/review/checklist.md +190 -0
- package/gstack-origin/review/design-checklist.md +132 -0
- package/gstack-origin/review/greptile-triage.md +220 -0
- package/gstack-origin/scripts/analytics.ts +190 -0
- package/gstack-origin/scripts/dev-skill.ts +82 -0
- package/gstack-origin/scripts/eval-compare.ts +96 -0
- package/gstack-origin/scripts/eval-list.ts +116 -0
- package/gstack-origin/scripts/eval-select.ts +86 -0
- package/gstack-origin/scripts/eval-summary.ts +187 -0
- package/gstack-origin/scripts/eval-watch.ts +172 -0
- package/gstack-origin/scripts/gen-skill-docs.ts +2414 -0
- package/gstack-origin/scripts/skill-check.ts +167 -0
- package/gstack-origin/setup +269 -0
- package/gstack-origin/setup-browser-cookies/SKILL.md +330 -0
- package/gstack-origin/setup-browser-cookies/SKILL.md.tmpl +74 -0
- package/gstack-origin/setup-deploy/SKILL.md +459 -0
- package/gstack-origin/setup-deploy/SKILL.md.tmpl +220 -0
- package/gstack-origin/ship/SKILL.md +1457 -0
- package/gstack-origin/ship/SKILL.md.tmpl +528 -0
- package/gstack-origin/supabase/config.sh +10 -0
- package/gstack-origin/supabase/functions/community-pulse/index.ts +59 -0
- package/gstack-origin/supabase/functions/telemetry-ingest/index.ts +135 -0
- package/gstack-origin/supabase/functions/update-check/index.ts +37 -0
- package/gstack-origin/supabase/migrations/001_telemetry.sql +89 -0
- package/gstack-origin/test/analytics.test.ts +277 -0
- package/gstack-origin/test/codex-e2e.test.ts +197 -0
- package/gstack-origin/test/fixtures/coverage-audit-fixture.ts +76 -0
- package/gstack-origin/test/fixtures/eval-baselines.json +7 -0
- package/gstack-origin/test/fixtures/qa-eval-checkout-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/qa-eval-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/qa-eval-spa-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/review-eval-design-slop.css +86 -0
- package/gstack-origin/test/fixtures/review-eval-design-slop.html +41 -0
- package/gstack-origin/test/fixtures/review-eval-enum-diff.rb +30 -0
- package/gstack-origin/test/fixtures/review-eval-enum.rb +27 -0
- package/gstack-origin/test/fixtures/review-eval-vuln.rb +14 -0
- package/gstack-origin/test/gemini-e2e.test.ts +173 -0
- package/gstack-origin/test/gen-skill-docs.test.ts +1049 -0
- package/gstack-origin/test/global-discover.test.ts +187 -0
- package/gstack-origin/test/helpers/codex-session-runner.ts +282 -0
- package/gstack-origin/test/helpers/e2e-helpers.ts +239 -0
- package/gstack-origin/test/helpers/eval-store.test.ts +548 -0
- package/gstack-origin/test/helpers/eval-store.ts +689 -0
- package/gstack-origin/test/helpers/gemini-session-runner.test.ts +104 -0
- package/gstack-origin/test/helpers/gemini-session-runner.ts +201 -0
- package/gstack-origin/test/helpers/llm-judge.ts +130 -0
- package/gstack-origin/test/helpers/observability.test.ts +283 -0
- package/gstack-origin/test/helpers/session-runner.test.ts +96 -0
- package/gstack-origin/test/helpers/session-runner.ts +357 -0
- package/gstack-origin/test/helpers/skill-parser.ts +206 -0
- package/gstack-origin/test/helpers/touchfiles.ts +260 -0
- package/gstack-origin/test/hook-scripts.test.ts +373 -0
- package/gstack-origin/test/skill-e2e-browse.test.ts +293 -0
- package/gstack-origin/test/skill-e2e-deploy.test.ts +279 -0
- package/gstack-origin/test/skill-e2e-design.test.ts +614 -0
- package/gstack-origin/test/skill-e2e-plan.test.ts +538 -0
- package/gstack-origin/test/skill-e2e-qa-bugs.test.ts +194 -0
- package/gstack-origin/test/skill-e2e-qa-workflow.test.ts +412 -0
- package/gstack-origin/test/skill-e2e-review.test.ts +535 -0
- package/gstack-origin/test/skill-e2e-workflow.test.ts +586 -0
- package/gstack-origin/test/skill-e2e.test.ts +3325 -0
- package/gstack-origin/test/skill-llm-eval.test.ts +787 -0
- package/gstack-origin/test/skill-parser.test.ts +179 -0
- package/gstack-origin/test/skill-routing-e2e.test.ts +605 -0
- package/gstack-origin/test/skill-validation.test.ts +1520 -0
- package/gstack-origin/test/telemetry.test.ts +278 -0
- package/gstack-origin/test/touchfiles.test.ts +262 -0
- package/gstack-origin/unfreeze/SKILL.md +40 -0
- package/gstack-origin/unfreeze/SKILL.md.tmpl +38 -0
- package/package.json +38 -0
- package/scripts/install-antigravity-skill.ps1 +33 -0
- package/scripts/install-antigravity-skill.sh +41 -0
- package/scripts/sync-gstack-origin.ps1 +37 -0
- package/scripts/sync-gstack-origin.sh +35 -0
|
@@ -0,0 +1,787 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-as-a-Judge evals for generated SKILL.md quality.
|
|
3
|
+
*
|
|
4
|
+
* Uses the Anthropic API directly (not Agent SDK) to evaluate whether
|
|
5
|
+
* generated command docs are clear, complete, and actionable for an AI agent.
|
|
6
|
+
*
|
|
7
|
+
* Requires: EVALS=1 to enable the suite, plus ANTHROPIC_API_KEY set in the environment
|
|
8
|
+
* Run: EVALS=1 bun run test:eval
|
|
9
|
+
*
|
|
10
|
+
* Cost: ~$0.05-0.15 per run (sonnet)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { describe, test, expect, afterAll } from 'bun:test';
|
|
14
|
+
import Anthropic from '@anthropic-ai/sdk';
|
|
15
|
+
import * as fs from 'fs';
|
|
16
|
+
import * as path from 'path';
|
|
17
|
+
import { callJudge, judge } from './helpers/llm-judge';
|
|
18
|
+
import type { JudgeScore } from './helpers/llm-judge';
|
|
19
|
+
import { EvalCollector } from './helpers/eval-store';
|
|
20
|
+
import { selectTests, detectBaseBranch, getChangedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
|
|
21
|
+
|
|
22
|
+
// Parent of the directory that contains this test file.
const ROOT = path.resolve(import.meta.dir, '..');

// The suite is opt-in: any non-empty EVALS value turns it on
// (ANTHROPIC_API_KEY must also be present in the environment).
const evalsEnabled = Boolean(process.env.EVALS);

// `describe` when evals are enabled, otherwise the skipping variant.
const describeEval = evalsEnabled ? describe : describe.skip;

// Collects per-test eval results; stays null when evals are disabled.
const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
|
|
29
|
+
|
|
30
|
+
// --- Diff-based test selection ---
// `null` means "no filtering": every test is eligible to run. It is only
// replaced with a concrete list when evals are enabled, EVALS_ALL is unset,
// and at least one changed file was detected against the base branch.
let selectedTests: string[] | null = null;

if (evalsEnabled && !process.env.EVALS_ALL) {
  // Base branch precedence: explicit EVALS_BASE, then auto-detection, then 'main'.
  const base = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changed = getChangedFiles(base, ROOT);

  if (changed.length > 0) {
    const picked = selectTests(changed, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = picked.selected;

    // Report the selection on stderr.
    process.stderr.write(`\nLLM-judge selection (${picked.reason}): ${picked.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests\n`);
    if (picked.skipped.length > 0) {
      process.stderr.write(` Skipped: ${picked.skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}
|
|
49
|
+
|
|
50
|
+
/**
 * Register a describe block, skipping it when none of its tests are selected.
 *
 * @param name      Suite name passed through to the describe function.
 * @param testNames Names of the tests the suite contains; the suite is
 *                  registered via `describeEval` if at least one of them is in
 *                  `selectedTests` (or if no selection is active).
 * @param fn        Suite body.
 */
function describeIfSelected(name: string, testNames: string[], fn: () => void) {
  const selection = selectedTests;
  const runnable =
    selection === null || testNames.some((candidate) => selection.includes(candidate));

  if (runnable) {
    describeEval(name, fn);
  } else {
    describe.skip(name, fn);
  }
}
|
|
55
|
+
|
|
56
|
+
/**
 * Register a single test, skipping it when a selection is active and the
 * test is not part of it (for multi-test describe blocks).
 *
 * @param testName Name used both for registration and for the selection lookup.
 * @param fn       Async test body.
 * @param timeout  Timeout forwarded to the test runner.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  if (selectedTests !== null && !selectedTests.includes(testName)) {
    test.skip(testName, fn, timeout);
    return;
  }
  test(testName, fn, timeout);
}
|
|
61
|
+
|
|
62
|
+
describeIfSelected('LLM-as-judge quality evals', [
|
|
63
|
+
'command reference table', 'snapshot flags reference',
|
|
64
|
+
'browse/SKILL.md reference', 'setup block', 'regression vs baseline',
|
|
65
|
+
], () => {
|
|
66
|
+
// Judges the text between the "## Command Reference" and "## Tips" headings of
// the root SKILL.md and requires clarity, completeness and actionability >= 4.
testIfSelected('command reference table', async () => {
  const t0 = Date.now();
  const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
  const start = content.indexOf('## Command Reference');
  const end = content.indexOf('## Tips');

  // indexOf() yields -1 for a missing heading and slice() interprets a negative
  // index as an offset from the end of the string, so without this guard a
  // renamed heading would silently send the wrong (or empty) text to the judge.
  if (start === -1 || end <= start) {
    throw new Error(
      `SKILL.md: expected "## Command Reference" followed by "## Tips" (start=${start}, end=${end})`,
    );
  }
  const section = content.slice(start, end);

  const scores = await judge('command reference table', section);
  console.log('Command reference scores:', JSON.stringify(scores, null, 2));

  evalCollector?.addTest({
    name: 'command reference table',
    suite: 'LLM-as-judge quality evals',
    tier: 'llm-judge',
    passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02,
    judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
    judge_reasoning: scores.reasoning,
  });

  expect(scores.clarity).toBeGreaterThanOrEqual(4);
  expect(scores.completeness).toBeGreaterThanOrEqual(4);
  expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
|
|
91
|
+
|
|
92
|
+
// Judges the text between the "## Snapshot System" and "## Command Reference"
// headings of the root SKILL.md and requires all three scores to be >= 4.
testIfSelected('snapshot flags reference', async () => {
  const t0 = Date.now();
  const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
  const start = content.indexOf('## Snapshot System');
  const end = content.indexOf('## Command Reference');

  // A missing heading makes indexOf() return -1, which slice() would treat as
  // an offset from the end of the string; fail fast instead of judging the
  // wrong text.
  if (start === -1 || end <= start) {
    throw new Error(
      `SKILL.md: expected "## Snapshot System" followed by "## Command Reference" (start=${start}, end=${end})`,
    );
  }
  const section = content.slice(start, end);

  const scores = await judge('snapshot flags reference', section);
  console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));

  evalCollector?.addTest({
    name: 'snapshot flags reference',
    suite: 'LLM-as-judge quality evals',
    tier: 'llm-judge',
    passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02,
    judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
    judge_reasoning: scores.reasoning,
  });

  expect(scores.clarity).toBeGreaterThanOrEqual(4);
  expect(scores.completeness).toBeGreaterThanOrEqual(4);
  expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
|
|
117
|
+
|
|
118
|
+
// Judges everything from the "## Snapshot Flags" heading to the end of
// browse/SKILL.md and requires all three scores to be >= 4.
testIfSelected('browse/SKILL.md reference', async () => {
  const t0 = Date.now();
  const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
  const start = content.indexOf('## Snapshot Flags');

  // Without this guard a missing heading (-1) would make slice() return only
  // the final character of the file, and the judge would score that.
  if (start === -1) {
    throw new Error('browse/SKILL.md: heading "## Snapshot Flags" not found');
  }
  const section = content.slice(start);

  const scores = await judge('browse skill reference (flags + commands)', section);
  console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));

  evalCollector?.addTest({
    name: 'browse/SKILL.md reference',
    suite: 'LLM-as-judge quality evals',
    tier: 'llm-judge',
    passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02,
    judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
    judge_reasoning: scores.reasoning,
  });

  expect(scores.clarity).toBeGreaterThanOrEqual(4);
  expect(scores.completeness).toBeGreaterThanOrEqual(4);
  expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
|
|
142
|
+
|
|
143
|
+
// Judges the text between the "## SETUP" and "## IMPORTANT" headings of the
// root SKILL.md. Only actionability and clarity are asserted, both at >= 3.
testIfSelected('setup block', async () => {
  const t0 = Date.now();
  const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
  const setupStart = content.indexOf('## SETUP');
  const setupEnd = content.indexOf('## IMPORTANT');

  // indexOf() returns -1 for a missing heading and slice() treats negative
  // indices as offsets from the end; fail fast rather than judge the wrong text.
  if (setupStart === -1 || setupEnd <= setupStart) {
    throw new Error(
      `SKILL.md: expected "## SETUP" followed by "## IMPORTANT" (start=${setupStart}, end=${setupEnd})`,
    );
  }
  const section = content.slice(setupStart, setupEnd);

  const scores = await judge('setup/binary discovery instructions', section);
  console.log('Setup block scores:', JSON.stringify(scores, null, 2));

  evalCollector?.addTest({
    name: 'setup block',
    suite: 'LLM-as-judge quality evals',
    tier: 'llm-judge',
    passed: scores.actionability >= 3 && scores.clarity >= 3,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02,
    judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
    judge_reasoning: scores.reasoning,
  });

  // Setup block is intentionally minimal (binary discovery only).
  // SKILL_DIR is inferred from context, so judge sometimes scores 3.
  expect(scores.actionability).toBeGreaterThanOrEqual(3);
  expect(scores.clarity).toBeGreaterThanOrEqual(3);
}, 30_000);
|
|
169
|
+
|
|
170
|
+
// Asks the model to compare a fixed baseline command table (version A) with
// the "## Command Reference" section of the root SKILL.md (version B), and
// requires B's score to be at least A's.
testIfSelected('regression vs baseline', async () => {
  const t0 = Date.now();
  const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
  const genStart = generated.indexOf('## Command Reference');
  const genEnd = generated.indexOf('## Tips');

  // indexOf() returns -1 for a missing heading and slice() treats negative
  // indices as offsets from the end; fail before spending an API call on the
  // wrong text.
  if (genStart === -1 || genEnd <= genStart) {
    throw new Error(
      `SKILL.md: expected "## Command Reference" followed by "## Tips" (start=${genStart}, end=${genEnd})`,
    );
  }
  const genSection = generated.slice(genStart, genEnd);

  const baseline = `## Command Reference

### Navigation
| Command | Description |
|---------|-------------|
| \`goto <url>\` | Navigate to URL |
| \`back\` / \`forward\` | History navigation |
| \`reload\` | Reload page |
| \`url\` | Print current URL |

### Interaction
| Command | Description |
|---------|-------------|
| \`click <sel>\` | Click element |
| \`fill <sel> <val>\` | Fill input |
| \`select <sel> <val>\` | Select dropdown |
| \`hover <sel>\` | Hover element |
| \`type <text>\` | Type into focused element |
| \`press <key>\` | Press key (Enter, Tab, Escape) |
| \`scroll [sel]\` | Scroll element into view |
| \`wait <sel>\` | Wait for element (max 10s) |
| \`wait --networkidle\` | Wait for network to be idle |
| \`wait --load\` | Wait for page load event |

### Inspection
| Command | Description |
|---------|-------------|
| \`js <expr>\` | Run JavaScript |
| \`css <sel> <prop>\` | Computed CSS |
| \`attrs <sel>\` | Element attributes |
| \`is <prop> <sel>\` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
| \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`;

  const client = new Anthropic();
  const response = await client.messages.create({
    model: 'claude-sonnet-4-6',
    max_tokens: 1024,
    messages: [{
      role: 'user',
      content: `You are comparing two versions of CLI documentation for an AI coding agent.

VERSION A (baseline — hand-maintained):
${baseline}

VERSION B (auto-generated from source):
${genSection}

Which version is better for an AI agent trying to use these commands? Consider:
- Completeness (more commands documented? all args shown?)
- Clarity (descriptions helpful?)
- Coverage (missing commands in either version?)

Respond with ONLY valid JSON:
{"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N}

Scores are 1-5 overall quality.`,
    }],
  });

  // An empty content array would otherwise raise a TypeError on `.type`;
  // treat it like any other non-text reply so the error below explains it.
  const firstBlock = response.content[0];
  const text = firstBlock?.type === 'text' ? firstBlock.text : '';
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
  const result = JSON.parse(jsonMatch[0]);

  // The comparison below is meaningless unless both scores are numbers.
  if (typeof result.a_score !== 'number' || typeof result.b_score !== 'number') {
    throw new Error(`Judge response is missing numeric a_score/b_score: ${jsonMatch[0].slice(0, 200)}`);
  }
  console.log('Regression comparison:', JSON.stringify(result, null, 2));

  evalCollector?.addTest({
    name: 'regression vs baseline',
    suite: 'LLM-as-judge quality evals',
    tier: 'llm-judge',
    passed: result.b_score >= result.a_score,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02,
    judge_scores: { a_score: result.a_score, b_score: result.b_score },
    judge_reasoning: result.reasoning,
  });

  expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
}, 30_000);
|
|
255
|
+
});
|
|
256
|
+
|
|
257
|
+
// --- Part 7: QA skill quality evals (C6) ---
|
|
258
|
+
|
|
259
|
+
describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric', 'qa/SKILL.md anti-refusal'], () => {
|
|
260
|
+
const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
|
|
261
|
+
|
|
262
|
+
// Judges the text between the "## Workflow" and "## Health Score Rubric"
// headings of `qaContent` (read by the enclosing describe block) using a
// workflow-specific prompt. Requires clarity and actionability >= 4 and
// completeness >= 3.
testIfSelected('qa/SKILL.md workflow', async () => {
  const t0 = Date.now();
  const start = qaContent.indexOf('## Workflow');
  const end = qaContent.indexOf('## Health Score Rubric');

  // indexOf() returns -1 for a missing heading and slice() treats negative
  // indices as offsets from the end; fail fast rather than judge the wrong text.
  if (start === -1 || end <= start) {
    throw new Error(
      `qa/SKILL.md: expected "## Workflow" followed by "## Health Score Rubric" (start=${start}, end=${end})`,
    );
  }
  const section = qaContent.slice(start, end);

  const scores = await callJudge<JudgeScore>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent.

The agent reads this document to learn how to systematically QA test a web application. The workflow references
a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions.
Instead, evaluate whether the workflow itself is clear, complete, and actionable.

Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent follow the step-by-step phases without ambiguity?
- **completeness** (1-5): Are all phases, decision points, and outputs well-defined?
- **actionability** (1-5): Can an agent execute the workflow and produce the expected deliverables?

Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the QA workflow to evaluate:

${section}`);
  console.log('QA workflow scores:', JSON.stringify(scores, null, 2));

  evalCollector?.addTest({
    name: 'qa/SKILL.md workflow',
    suite: 'QA skill quality evals',
    tier: 'llm-judge',
    passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02,
    judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
    judge_reasoning: scores.reasoning,
  });

  expect(scores.clarity).toBeGreaterThanOrEqual(4);
  // Completeness scores 3 when judge notes the health rubric is in a separate
  // section (the eval only passes the Workflow section, not the full document).
  expect(scores.completeness).toBeGreaterThanOrEqual(3);
  expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
|
|
304
|
+
|
|
305
|
+
// Judges the "Health Score Rubric" section of the QA skill content (from its
// heading through the end of the content) for clarity, completeness and
// actionability, records the result on the eval collector, then asserts the
// minimum scores (clarity >= 4, completeness >= 3, actionability >= 4).
testIfSelected('qa/SKILL.md health rubric', async () => {
  const t0 = Date.now();
  const startMarker = '## Health Score Rubric';
  const start = qaContent.indexOf(startMarker);
  // Fail loudly when the heading is missing: slice(-1) would otherwise hand
  // the judge only the final character and produce a meaningless score.
  if (start === -1) throw new Error(`Start marker not found in QA skill content: "${startMarker}"`);
  const section = qaContent.slice(start);

  const scores = await callJudge<JudgeScore>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score.

The agent uses this rubric after QA testing a website. It needs to:
1. Understand each scoring category and what counts as a deduction
2. Apply the weights correctly to compute a final score out of 100
3. Produce a consistent, reproducible score

Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Are the categories, deduction criteria, and weights unambiguous?
- **completeness** (1-5): Are all edge cases and scoring boundaries defined?
- **actionability** (1-5): Can an agent compute a correct score from this rubric alone?

Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the rubric to evaluate:

${section}`);
  console.log('QA health rubric scores:', JSON.stringify(scores, null, 2));

  // Record the outcome before asserting so a failing run is still reported.
  evalCollector?.addTest({
    name: 'qa/SKILL.md health rubric',
    suite: 'QA skill quality evals',
    tier: 'llm-judge',
    passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02,
    judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
    judge_reasoning: scores.reasoning,
  });

  expect(scores.clarity).toBeGreaterThanOrEqual(4);
  expect(scores.completeness).toBeGreaterThanOrEqual(3);
  expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
|
|
345
|
+
|
|
346
|
+
// Asks the judge whether the QA skill's diff-aware instructions and Important
// Rules would make an agent open the browser even when the diff shows no UI
// changes. Passes only when the judge answers would_browse === true with
// confidence >= 4.
testIfSelected('qa/SKILL.md anti-refusal', async () => {
  const t0 = Date.now();

  // Returns the text from startMarker up to (not including) the next endMarker.
  // Throws when either heading is missing: an unchecked -1 from indexOf would
  // silently yield an empty or over-long excerpt and a misleading verdict.
  const extractSection = (startMarker: string, endMarker: string): string => {
    const from = qaContent.indexOf(startMarker);
    if (from === -1) throw new Error(`Start marker not found in QA skill content: "${startMarker}"`);
    // Search forward from the start marker so an earlier occurrence of the
    // end heading cannot produce an empty slice.
    const to = qaContent.indexOf(endMarker, from);
    if (to === -1) throw new Error(`End marker not found in QA skill content: "${endMarker}"`);
    return qaContent.slice(from, to);
  };

  // Extract both the diff-aware mode section and Important Rules section
  const diffAwareSection = extractSection('### Diff-aware', '### Full');
  const rulesSection = extractSection('## Important Rules', '## Framework-Specific');

  const result = await callJudge<{ would_browse: boolean; fallback_behavior: string; confidence: number; reasoning: string }>(`You are evaluating whether a QA testing skill document would cause an AI agent to USE THE BROWSER or REFUSE to use the browser in a specific scenario.

SCENARIO:
A user runs /qa (a browser-based QA testing skill). The branch diff shows ONLY prompt template files and config file changes — no routes, views, controllers, components, or CSS were changed. The changes are "purely backend" with no obvious UI surface.

QUESTION: Based on the document excerpts below, would the agent open the browser and test the application, or would it decline/refuse to browse and suggest running evals or unit tests instead?

DOCUMENT EXCERPT 1 (Diff-aware mode instructions):
${diffAwareSection}

DOCUMENT EXCERPT 2 (Important Rules):
${rulesSection}

Respond with ONLY valid JSON:
{
"would_browse": true or false,
"fallback_behavior": "description of what the agent would do when no UI pages are identified from the diff",
"confidence": N (1-5, how confident you are in your answer),
"reasoning": "brief explanation"
}

Rules:
- would_browse should be true if the document instructs the agent to always use the browser regardless of diff content
- would_browse should be false if the document allows the agent to skip browser testing for non-UI changes
- confidence: 5 = document is unambiguous, 1 = document is unclear or contradictory`);

  console.log('QA anti-refusal result:', JSON.stringify(result, null, 2));

  // Record the outcome before asserting so a failing run is still reported.
  evalCollector?.addTest({
    name: 'qa/SKILL.md anti-refusal',
    suite: 'QA skill quality evals',
    tier: 'llm-judge',
    passed: result.would_browse === true && result.confidence >= 4,
    duration_ms: Date.now() - t0,
    cost_usd: 0.02,
    // The boolean verdict is stored as 1/0 alongside the numeric confidence.
    judge_scores: { would_browse: result.would_browse ? 1 : 0, confidence: result.confidence },
    judge_reasoning: result.reasoning,
  });

  expect(result.would_browse).toBe(true);
  expect(result.confidence).toBeGreaterThanOrEqual(4);
}, 30_000);
|
|
398
|
+
});
|
|
399
|
+
|
|
400
|
+
// --- Part 7: Cross-skill consistency judge (C7) ---
|
|
401
|
+
|
|
402
|
+
// Collects every greptile-related line from the review, ship, triage and retro
// skill files and asks the judge whether they agree with the intended
// greptile-history architecture. Passes when the judge reports
// consistent === true with score >= 4.
describeIfSelected('Cross-skill consistency evals', ['cross-skill greptile consistency'], () => {
  testIfSelected('cross-skill greptile consistency', async () => {
    const t0 = Date.now();

    // Lines mentioning any of these terms are forwarded to the judge.
    const relevantLine = /greptile|history\.md|REMOTE_SLUG/i;

    // Builds a "--- <filename> ---" header followed by the trimmed matching lines.
    const extractGrepLines = (content: string, filename: string) => {
      const matches: string[] = [];
      for (const rawLine of content.split('\n')) {
        if (relevantLine.test(rawLine)) {
          matches.push(rawLine.trim());
        }
      }
      return `--- ${filename} ---\n${matches.join('\n')}`;
    };

    // [path segments under ROOT, label shown to the judge], in prompt order.
    const sources: Array<[string[], string]> = [
      [['review', 'SKILL.md'], 'review/SKILL.md'],
      [['ship', 'SKILL.md'], 'ship/SKILL.md'],
      [['review', 'greptile-triage.md'], 'review/greptile-triage.md'],
      [['retro', 'SKILL.md'], 'retro/SKILL.md'],
    ];

    const collected = sources
      .map(([segments, label]) => extractGrepLines(fs.readFileSync(path.join(ROOT, ...segments), 'utf-8'), label))
      .join('\n\n');

    const result = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently.

INTENDED ARCHITECTURE:
- greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md)
- /review and /ship WRITE to BOTH paths (per-project for suppressions, global for retro aggregation)
- /review and /ship delegate write mechanics to greptile-triage.md
- /retro READS from the GLOBAL path only (it aggregates across all projects)
- REMOTE_SLUG derivation should be consistent across files that use it

Below are greptile-related lines extracted from each skill file:

${collected}

Evaluate consistency. Respond with ONLY valid JSON:
{
"consistent": true/false,
"issues": ["issue 1", "issue 2"],
"score": N,
"reasoning": "brief explanation"
}

score (1-5): 5 = perfectly consistent, 1 = contradictory`);

    console.log('Cross-skill consistency:', JSON.stringify(result, null, 2));

    // Report to the collector first; the assertions below may throw.
    evalCollector?.addTest({
      name: 'cross-skill greptile consistency',
      suite: 'Cross-skill consistency evals',
      tier: 'llm-judge',
      passed: result.consistent && result.score >= 4,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { consistency_score: result.score },
      judge_reasoning: result.reasoning,
    });

    expect(result.consistent).toBe(true);
    expect(result.score).toBeGreaterThanOrEqual(4);
  }, 30_000);
});
|
|
464
|
+
|
|
465
|
+
// --- Part 7: Baseline score pinning (C9) ---
|
|
466
|
+
|
|
467
|
+
// Re-judges the "Command Reference" section of the root SKILL.md and fails if
// any dimension scores below the value pinned in
// test/fixtures/eval-baselines.json. When the UPDATE_BASELINES environment
// variable is set, the fresh scores are written back to that file.
describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => {
  const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');

  testIfSelected('baseline score pinning', async () => {
    const t0 = Date.now();
    if (!fs.existsSync(baselinesPath)) {
      console.log('No baseline file found — skipping pinning check');
      return;
    }

    const baselines = JSON.parse(fs.readFileSync(baselinesPath, 'utf-8'));
    const regressions: string[] = [];

    const skillContent = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const startMarker = '## Command Reference';
    const endMarker = '## Tips';
    const cmdStart = skillContent.indexOf(startMarker);
    // An unchecked -1 would make slice() judge the wrong text, so fail loudly.
    if (cmdStart === -1) throw new Error(`Start marker not found in SKILL.md: "${startMarker}"`);
    const cmdEnd = skillContent.indexOf(endMarker, cmdStart);
    if (cmdEnd === -1) throw new Error(`End marker not found in SKILL.md: "${endMarker}"`);
    const cmdSection = skillContent.slice(cmdStart, cmdEnd);
    const cmdScores = await judge('command reference table', cmdSection);

    // A baseline file without a command_reference entry has nothing pinned
    // yet; treat that as "no regression" instead of throwing a TypeError, so
    // UPDATE_BASELINES can seed the entry.
    const pinned = baselines.command_reference ?? {};
    for (const dim of ['clarity', 'completeness', 'actionability'] as const) {
      const baseline = pinned[dim];
      if (typeof baseline === 'number' && cmdScores[dim] < baseline) {
        regressions.push(`command_reference.${dim}: ${cmdScores[dim]} < baseline ${baseline}`);
      }
    }

    if (process.env.UPDATE_BASELINES) {
      baselines.command_reference = {
        clarity: cmdScores.clarity,
        completeness: cmdScores.completeness,
        actionability: cmdScores.actionability,
      };
      fs.writeFileSync(baselinesPath, JSON.stringify(baselines, null, 2) + '\n');
      console.log('Updated eval baselines');
    }

    const passed = regressions.length === 0;
    // Record the outcome before throwing so a regression is still reported.
    evalCollector?.addTest({
      name: 'baseline score pinning',
      suite: 'Baseline score pinning',
      tier: 'llm-judge',
      passed,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability },
      judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '),
    });

    if (!passed) {
      throw new Error(`Score regressions detected:\n${regressions.join('\n')}`);
    }
  }, 60_000);
});
|
|
519
|
+
|
|
520
|
+
// --- Workflow SKILL.md quality evals (14 tests across the five blocks below) ---
|
|
521
|
+
|
|
522
|
+
/**
 * Shared driver for the workflow SKILL.md judge tests.
 *
 * Reads `opts.skillPath` (relative to ROOT), cuts out the text from
 * `startMarker` up to the next `endMarker` (or to the end of the file when
 * `endMarker` is null/empty), asks the judge to score it, records the result
 * on the eval collector, and finally asserts each score against its threshold.
 *
 * Thresholds default to clarity 4, completeness 3, actionability 4;
 * `opts.thresholds` overrides them.
 *
 * Throws when the start marker, or a requested end marker, is not found.
 */
async function runWorkflowJudge(opts: {
  testName: string;
  suite: string;
  skillPath: string;
  startMarker: string;
  endMarker: string | null;
  judgeContext: string;
  judgeGoal: string;
  thresholds?: { clarity: number; completeness: number; actionability: number };
}) {
  const startedAt = Date.now();
  const { testName, suite, skillPath, startMarker, endMarker, judgeContext, judgeGoal } = opts;
  const minimums = { clarity: 4, completeness: 3, actionability: 4, ...opts.thresholds };
  const dimensions = ['clarity', 'completeness', 'actionability'] as const;

  const doc = fs.readFileSync(path.join(ROOT, skillPath), 'utf-8');
  const from = doc.indexOf(startMarker);
  if (from === -1) {
    throw new Error(`Start marker not found in ${skillPath}: "${startMarker}"`);
  }

  // Leaving `to` undefined makes slice() run to the end of the document.
  let to: number | undefined;
  if (endMarker) {
    to = doc.indexOf(endMarker, from);
    if (to === -1) {
      throw new Error(`End marker not found in ${skillPath}: "${endMarker}"`);
    }
  }
  const section = doc.slice(from, to);

  const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${judgeContext} for an AI coding agent.

The agent reads this document to learn ${judgeGoal}. It references external tools and files
that are documented separately — do NOT penalize for missing external definitions.

Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent follow the instructions without ambiguity?
- **completeness** (1-5): Are all steps, decision points, and outputs well-defined?
- **actionability** (1-5): Can an agent execute this workflow and produce the expected deliverables?

Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the document to evaluate:

${section}`);

  console.log(`${testName} scores:`, JSON.stringify(scores, null, 2));

  // Report to the collector first; the assertions below may throw.
  evalCollector?.addTest({
    name: testName,
    suite,
    tier: 'llm-judge',
    passed: dimensions.every((dim) => scores[dim] >= minimums[dim]),
    duration_ms: Date.now() - startedAt,
    cost_usd: 0.02,
    judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
    judge_reasoning: scores.reasoning,
  });

  for (const dim of dimensions) {
    expect(scores[dim]).toBeGreaterThanOrEqual(minimums[dim]);
  }
}
|
|
587
|
+
|
|
588
|
+
// Block 1: Ship & Release skills
|
|
589
|
+
// Judge evals for the ship and document-release workflow documents. Each entry
// is handed unchanged to runWorkflowJudge (default thresholds) and registered
// as its own test with a 30-second timeout.
describeIfSelected('Ship & Release skill evals', ['ship/SKILL.md workflow', 'document-release/SKILL.md workflow'], () => {
  const specs = [
    {
      testName: 'ship/SKILL.md workflow',
      suite: 'Ship & Release skill evals',
      skillPath: 'ship/SKILL.md',
      startMarker: '# Ship:',
      endMarker: '## Important Rules',
      judgeContext: 'a ship/release workflow document',
      judgeGoal: 'how to create a PR: merge base branch, run tests, review diff, bump version, update changelog, push, and open PR',
    },
    {
      testName: 'document-release/SKILL.md workflow',
      suite: 'Ship & Release skill evals',
      skillPath: 'document-release/SKILL.md',
      startMarker: '# Document Release:',
      endMarker: '## Important Rules',
      judgeContext: 'a post-ship documentation update workflow',
      judgeGoal: 'how to audit and update project documentation after code ships: README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, CHANGELOG, TODOS',
    },
  ];

  for (const spec of specs) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge(spec);
    }, 30_000);
  }
});
|
|
614
|
+
|
|
615
|
+
// Block 2: Plan Review skills
|
|
616
|
+
// Judge evals for the three plan-review skill documents (CEO, engineering,
// design). Each entry is handed unchanged to runWorkflowJudge (default
// thresholds) and registered as its own test with a 30-second timeout.
describeIfSelected('Plan Review skill evals', [
  'plan-ceo-review/SKILL.md modes', 'plan-eng-review/SKILL.md sections', 'plan-design-review/SKILL.md passes',
], () => {
  const specs = [
    {
      testName: 'plan-ceo-review/SKILL.md modes',
      suite: 'Plan Review skill evals',
      skillPath: 'plan-ceo-review/SKILL.md',
      startMarker: '## Step 0: Nuclear Scope Challenge',
      endMarker: '## Review Sections',
      judgeContext: 'a CEO/founder plan review framework with 4 scope modes',
      judgeGoal: 'how to conduct a CEO-perspective plan review: challenge scope, select a mode (Expansion, Selective Expansion, Hold Scope, Reduction), then review sections interactively',
    },
    {
      testName: 'plan-eng-review/SKILL.md sections',
      suite: 'Plan Review skill evals',
      skillPath: 'plan-eng-review/SKILL.md',
      startMarker: '## BEFORE YOU START:',
      endMarker: '## CRITICAL RULE',
      judgeContext: 'an engineering plan review framework with 4 review sections',
      judgeGoal: 'how to review a plan for architecture quality, code quality, test coverage, and performance — walking through each section interactively with AskUserQuestion',
    },
    {
      testName: 'plan-design-review/SKILL.md passes',
      suite: 'Plan Review skill evals',
      skillPath: 'plan-design-review/SKILL.md',
      startMarker: '## Review Sections',
      endMarker: '## CRITICAL RULE',
      judgeContext: 'a design plan review framework with 7 review passes',
      judgeGoal: 'how to review a plan for design quality using a 0-10 rating method: rate each dimension, explain what a 10 looks like, edit the plan to fix gaps, then re-rate',
    },
  ];

  for (const spec of specs) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge(spec);
    }, 30_000);
  }
});
|
|
655
|
+
|
|
656
|
+
// Block 3: Design skills
|
|
657
|
+
// Judge evals for the design-review fix loop and the design-consultation
// research phases. Each entry is handed unchanged to runWorkflowJudge (default
// thresholds) and registered as its own test with a 30-second timeout.
describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'design-consultation/SKILL.md research'], () => {
  const specs = [
    {
      testName: 'design-review/SKILL.md fix loop',
      suite: 'Design skill evals',
      skillPath: 'design-review/SKILL.md',
      startMarker: '## Phase 7:',
      endMarker: '## Additional Rules',
      judgeContext: 'a design audit triage and fix loop workflow',
      judgeGoal: 'how to triage design issues by severity, fix them atomically in source code, commit each fix, and re-verify with before/after screenshots',
    },
    {
      testName: 'design-consultation/SKILL.md research',
      suite: 'Design skill evals',
      skillPath: 'design-consultation/SKILL.md',
      startMarker: '## Phase 1:',
      endMarker: '## Phase 4:',
      judgeContext: 'a design consultation research and proposal workflow',
      judgeGoal: 'how to gather product context, research the competitive landscape, and produce a complete design system proposal with typography, color, spacing, and motion specifications',
    },
  ];

  for (const spec of specs) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge(spec);
    }, 30_000);
  }
});
|
|
682
|
+
|
|
683
|
+
// Block 4: Deploy skills
|
|
684
|
+
// Judge evals for the deploy-related skill documents (land-and-deploy, canary,
// benchmark, setup-deploy). Each entry is handed unchanged to runWorkflowJudge
// (default thresholds) and registered as its own test with a 30-second timeout.
describeIfSelected('Deploy skill evals', [
  'land-and-deploy/SKILL.md workflow', 'canary/SKILL.md monitoring loop',
  'benchmark/SKILL.md perf collection', 'setup-deploy/SKILL.md platform setup',
], () => {
  const specs = [
    {
      testName: 'land-and-deploy/SKILL.md workflow',
      suite: 'Deploy skill evals',
      skillPath: 'land-and-deploy/SKILL.md',
      startMarker: '## Step 1: Pre-flight',
      endMarker: '## Important Rules',
      judgeContext: 'a merge-deploy-verify workflow for landing PRs to production',
      judgeGoal: 'how to merge a PR via GitHub CLI, wait for CI and deploy workflows (with platform-specific strategies for Fly.io/Render/Vercel/Netlify), run canary health checks on production, and offer revert if something breaks — with timing data logged for retrospectives',
    },
    {
      testName: 'canary/SKILL.md monitoring loop',
      suite: 'Deploy skill evals',
      skillPath: 'canary/SKILL.md',
      startMarker: '### Phase 2: Baseline Capture',
      endMarker: '## Important Rules',
      judgeContext: 'a post-deploy canary monitoring workflow using a headless browser daemon',
      judgeGoal: 'how to capture baseline screenshots and metrics before deploy, run a continuous monitoring loop checking each page every 60 seconds for console errors and performance regressions, fire alerts with evidence (screenshots), and produce a health report with per-page status and verdict',
    },
    {
      testName: 'benchmark/SKILL.md perf collection',
      suite: 'Deploy skill evals',
      skillPath: 'benchmark/SKILL.md',
      startMarker: '### Phase 3: Performance Data Collection',
      endMarker: '## Important Rules',
      judgeContext: 'a performance regression detection workflow using browser-based Web Vitals measurement',
      judgeGoal: 'how to collect real performance metrics (TTFB, FCP, LCP, bundle sizes, request counts) via performance.getEntries(), compare against baselines with regression thresholds, produce a performance report with delta analysis, and track trends over time',
    },
    {
      testName: 'setup-deploy/SKILL.md platform setup',
      suite: 'Deploy skill evals',
      skillPath: 'setup-deploy/SKILL.md',
      startMarker: '### Step 2: Detect platform',
      endMarker: '## Important Rules',
      judgeContext: 'a deployment configuration setup workflow that detects deploy platforms and writes config to CLAUDE.md',
      judgeGoal: 'how to detect deploy platforms (Fly.io, Render, Vercel, Netlify, Heroku, GitHub Actions, custom), gather platform-specific configuration (URLs, status commands, health checks, custom hooks), and persist everything to CLAUDE.md for future automated use',
    },
  ];

  for (const spec of specs) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge(spec);
    }, 30_000);
  }
});
|
|
736
|
+
|
|
737
|
+
// Block 5: Other skills
|
|
738
|
+
// Judge evals for the remaining skill documents (retro, qa-only,
// gstack-upgrade). Each entry is handed unchanged to runWorkflowJudge (default
// thresholds) and registered as its own test with a 30-second timeout.
describeIfSelected('Other skill evals', [
  'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow',
], () => {
  const specs = [
    {
      testName: 'retro/SKILL.md instructions',
      suite: 'Other skill evals',
      skillPath: 'retro/SKILL.md',
      startMarker: '## Instructions',
      endMarker: '## Compare Mode',
      judgeContext: 'an engineering retrospective data gathering and analysis workflow',
      judgeGoal: 'how to gather git metrics (commit history, test counts, work patterns), analyze them, produce a structured retro report with praise, growth areas, and trend tracking',
    },
    {
      testName: 'qa-only/SKILL.md workflow',
      suite: 'Other skill evals',
      skillPath: 'qa-only/SKILL.md',
      startMarker: '## Workflow',
      endMarker: '## Important Rules',
      judgeContext: 'a report-only QA testing workflow',
      judgeGoal: 'how to systematically QA test a web application and produce a structured report with health score, screenshots, and repro steps — without fixing anything',
    },
    {
      testName: 'gstack-upgrade/SKILL.md upgrade flow',
      suite: 'Other skill evals',
      skillPath: 'gstack-upgrade/SKILL.md',
      startMarker: '## Inline upgrade flow',
      endMarker: '## Standalone usage',
      judgeContext: 'a version upgrade detection and execution workflow',
      judgeGoal: 'how to detect install type, compare versions, back up current install, upgrade via git or fresh clone, run setup, and show what changed',
    },
  ];

  for (const spec of specs) {
    testIfSelected(spec.testName, async () => {
      await runWorkflowJudge(spec);
    }, 30_000);
  }
});
|
|
777
|
+
|
|
778
|
+
// Module-level afterAll — finalize eval collector after all tests complete
|
|
779
|
+
// Runs once after every test in the module: flushes the eval collector when
// one is present. A failure to save results is logged and not rethrown.
afterAll(async () => {
  if (!evalCollector) return;

  try {
    await evalCollector.finalize();
  } catch (err) {
    console.error('Failed to save eval results:', err);
  }
});
|