@geminix/gxpm 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +148 -0
- package/CANON.md +53 -0
- package/CLAUDE.md +60 -0
- package/CONTEXT.md +49 -0
- package/DEBUG.md +59 -0
- package/ISSUE_CONTEXT.md +25 -0
- package/README.md +143 -0
- package/VERSION +1 -0
- package/agents/cleanup-auditor/cleanup-auditor.md +56 -0
- package/agents/grill-master.md +26 -0
- package/agents/implementer.md +32 -0
- package/agents/review-army/accessibility-reviewer.md +54 -0
- package/agents/review-army/code-quality-reviewer.md +54 -0
- package/agents/review-army/security-reviewer.md +56 -0
- package/agents/review-army/spec-compliance-reviewer.md +51 -0
- package/agents/review-army/test-reviewer.md +55 -0
- package/agents/reviewer.md +59 -0
- package/agents/ship-audit-army/docs-auditor.md +53 -0
- package/agents/ship-audit-army/performance-auditor.md +52 -0
- package/agents/ship-audit-army/security-auditor.md +52 -0
- package/agents/specifier.md +55 -0
- package/agents/triage-officer.md +27 -0
- package/bin/gxpm +17 -0
- package/bin/gxpm-browser +17 -0
- package/bin/gxpm-config +15 -0
- package/bin/gxpm-eval +13 -0
- package/bin/gxpm-global-discover +15 -0
- package/bin/gxpm-init +38 -0
- package/bin/gxpm-investigate +194 -0
- package/bin/gxpm-uninstall +15 -0
- package/bin/gxpm-update-check +165 -0
- package/commands/build.md +40 -0
- package/commands/help.md +53 -0
- package/commands/plan.md +34 -0
- package/commands/refine.md +46 -0
- package/commands/review.md +34 -0
- package/commands/ship.md +37 -0
- package/core/ac-check.ts +20 -0
- package/core/agent-runtime.ts +363 -0
- package/core/artifact-validator.ts +151 -0
- package/core/artifacts.ts +313 -0
- package/core/autopilot.ts +250 -0
- package/core/capabilities.ts +779 -0
- package/core/checkpoint.ts +370 -0
- package/core/cleanup.ts +32 -0
- package/core/command-probe.ts +82 -0
- package/core/config.ts +533 -0
- package/core/contracts/behavior-spec.schema.ts +38 -0
- package/core/contracts/converter.ts +61 -0
- package/core/contracts/host.ts +43 -0
- package/core/converters/converter.ts +93 -0
- package/core/converters/index.ts +8 -0
- package/core/converters/managed-artifact.ts +119 -0
- package/core/converters/parser.ts +159 -0
- package/core/converters/template-renderer.ts +35 -0
- package/core/converters/writer.ts +61 -0
- package/core/dag-executor.ts +426 -0
- package/core/dag-loader.ts +292 -0
- package/core/dag-schemas.ts +150 -0
- package/core/dispatch.ts +125 -0
- package/core/evidence.ts +148 -0
- package/core/gate.ts +269 -0
- package/core/hook-engine.ts +566 -0
- package/core/host-probe.ts +64 -0
- package/core/implement.ts +16 -0
- package/core/isolation-errors.ts +174 -0
- package/core/isolation-resolver.ts +921 -0
- package/core/issue-context.ts +381 -0
- package/core/issue-readiness.ts +457 -0
- package/core/issue-sync.ts +427 -0
- package/core/issues.ts +132 -0
- package/core/land.ts +108 -0
- package/core/orchestrator.ts +54 -0
- package/core/phase-artifact.ts +32 -0
- package/core/phase-gates.ts +130 -0
- package/core/phase-rewind.ts +94 -0
- package/core/plan-lint.ts +61 -0
- package/core/plan.ts +77 -0
- package/core/port-allocation.ts +50 -0
- package/core/pr-check.ts +15 -0
- package/core/preset-system/preset-resolver.ts +221 -0
- package/core/project-init-status.ts +127 -0
- package/core/qa.ts +15 -0
- package/core/resilience.ts +165 -0
- package/core/runs.ts +288 -0
- package/core/safe-path.test.ts +80 -0
- package/core/safe-path.ts +60 -0
- package/core/sdd-gate.test.ts +98 -0
- package/core/sdd-gate.ts +134 -0
- package/core/self-review.ts +62 -0
- package/core/session.ts +70 -0
- package/core/ship.ts +86 -0
- package/core/specify.ts +173 -0
- package/core/state.ts +1002 -0
- package/core/template-engine.ts +152 -0
- package/core/template-resolver.test.ts +70 -0
- package/core/template-resolver.ts +156 -0
- package/core/triage.ts +26 -0
- package/core/verify.ts +15 -0
- package/core/wiki-native.ts +2423 -0
- package/core/wiki.ts +27 -0
- package/core/workflow-event-emitter.ts +163 -0
- package/core/workflows/engine.ts +273 -0
- package/core/workflows/expressions.ts +76 -0
- package/core/workflows/index.ts +38 -0
- package/core/workflows/steps/command.ts +43 -0
- package/core/workflows/steps/gate.ts +47 -0
- package/core/workflows/steps/gxpm.ts +44 -0
- package/core/workflows/steps/linear.ts +31 -0
- package/core/workflows/steps/shell.ts +65 -0
- package/core/workflows/types.ts +62 -0
- package/core/workspace-runtime.ts +227 -0
- package/core/worktree-init-steps.ts +647 -0
- package/core/worktree-init.ts +330 -0
- package/core/worktree-owner.ts +143 -0
- package/docs/GXPM_VERIFY.md +98 -0
- package/docs/INSTALL_FOR_AGENTS.md +113 -0
- package/docs/README.md +57 -0
- package/docs/adr/adr-005-multi-platform-skill-converter.md +72 -0
- package/docs/agents/domain.md +30 -0
- package/docs/agents/issue-tracker.md +30 -0
- package/docs/agents/triage-labels.md +32 -0
- package/docs/architecture/gxpm-architecture-diagram.md +265 -0
- package/docs/architecture/gxpm-current-architecture.md +175 -0
- package/docs/architecture/gxpm-current-flow.md +278 -0
- package/docs/architecture/gxpm-replacement-architecture.md +211 -0
- package/docs/architecture/gxpm-target-architecture.md +449 -0
- package/docs/architecture/gxpm-v0-contract.md +311 -0
- package/docs/architecture/layered-workflow-boundaries.md +193 -0
- package/docs/architecture/preset-system.md +126 -0
- package/docs/architecture/scaffold-northstar.md +23 -0
- package/docs/brainstorms/2026-05-14-bdd-then-tdd-design.md +320 -0
- package/docs/brainstorms/README.md +22 -0
- package/docs/brainstorms/docs-knowledge-system-requirements.md +29 -0
- package/docs/governance/beta-skill-promotion.md +39 -0
- package/docs/governance/development-contract.md +144 -0
- package/docs/governance/gherkin-style.md +90 -0
- package/docs/governance/host-adapter.md +56 -0
- package/docs/governance/skill-authoring.md +87 -0
- package/docs/governance/skill-testing.md +356 -0
- package/docs/governance/template-authoring.md +53 -0
- package/docs/migrations/v0.2.md +51 -0
- package/docs/plans/README.md +23 -0
- package/docs/plans/bdd-then-tdd-plan.md +1767 -0
- package/docs/plans/docs-knowledge-system-plan.md +31 -0
- package/docs/plans/spec-kit-sdd-adoption-plan.md +305 -0
- package/docs/research/agents-md-best-practices.md +207 -0
- package/docs/research/archon-study.md +351 -0
- package/docs/research/claude-hooks-study.md +440 -0
- package/docs/research/codex-hooks-study.md +624 -0
- package/docs/research/everything-claude-code-study.md +252 -0
- package/docs/research/from-skills-to-layered-workflow.md +322 -0
- package/docs/research/gsd-study.md +69 -0
- package/docs/research/kimi-hooks-study.md +274 -0
- package/docs/research/mattpocock-skills-comparison.md +429 -0
- package/docs/research/mattpocock-skills-study.md +275 -0
- package/docs/research/oh-my-codex-study.md +279 -0
- package/docs/research/perplexity-agent-skills-design.md +168 -0
- package/docs/research/pmc-gstack-skill-study.md +122 -0
- package/docs/research/spec-kit-study.md +224 -0
- package/docs/research/superpowers-study.md +209 -0
- package/docs/roadmap/initial-roadmap.md +53 -0
- package/docs/solutions/README.md +45 -0
- package/docs/solutions/artifact-nesting-recovery.md +58 -0
- package/docs/solutions/session-context-restore-practice.md +67 -0
- package/docs/solutions/workflow/version-drift-recovery.md +49 -0
- package/docs/solutions/worktree-gate-recovery.md +62 -0
- package/docs/specs/README.md +28 -0
- package/docs/specs/claude.md +45 -0
- package/docs/specs/codex.md +44 -0
- package/docs/specs/cursor.md +44 -0
- package/hosts/adapters/claude.ts +29 -0
- package/hosts/adapters/codex.ts +27 -0
- package/hosts/adapters/cursor.ts +27 -0
- package/hosts/adapters/kimi.ts +27 -0
- package/hosts/claude.ts +23 -0
- package/hosts/codex.ts +26 -0
- package/hosts/cursor.ts +19 -0
- package/hosts/index.ts +33 -0
- package/hosts/registry.test.ts +52 -0
- package/hosts/registry.ts +57 -0
- package/hosts/schema.ts +58 -0
- package/package.json +52 -0
- package/scripts/browser.ts +185 -0
- package/scripts/cleanup.ts +142 -0
- package/scripts/commands/artifact.ts +115 -0
- package/scripts/commands/autopilot.ts +143 -0
- package/scripts/commands/capability.ts +57 -0
- package/scripts/commands/config.ts +69 -0
- package/scripts/commands/dag.ts +126 -0
- package/scripts/commands/feedback.ts +123 -0
- package/scripts/commands/gate.ts +291 -0
- package/scripts/commands/helpers.ts +126 -0
- package/scripts/commands/hook.ts +66 -0
- package/scripts/commands/init.ts +515 -0
- package/scripts/commands/issue.ts +825 -0
- package/scripts/commands/phase.ts +61 -0
- package/scripts/commands/preset.ts +159 -0
- package/scripts/commands/runtime.ts +199 -0
- package/scripts/commands/specify.ts +71 -0
- package/scripts/commands/upgrade.ts +243 -0
- package/scripts/commands/verify.ts +183 -0
- package/scripts/commands/wiki.ts +242 -0
- package/scripts/commands/workflow.ts +131 -0
- package/scripts/dev-skill.ts +55 -0
- package/scripts/discover-skills.ts +116 -0
- package/scripts/doctor.ts +410 -0
- package/scripts/dogfood-check.ts +125 -0
- package/scripts/eval-functional.ts +218 -0
- package/scripts/eval.ts +246 -0
- package/scripts/gen-skill-docs.ts +201 -0
- package/scripts/global-discover.ts +217 -0
- package/scripts/governance-check.ts +75 -0
- package/scripts/gxpm-check.ts +12 -0
- package/scripts/gxpm.ts +216 -0
- package/scripts/host-config.ts +62 -0
- package/scripts/install-claude-hooks.ts +138 -0
- package/scripts/install-codex-hooks.ts +271 -0
- package/scripts/install-hooks.ts +128 -0
- package/scripts/install-kimi-hooks.ts +92 -0
- package/scripts/install-skill.ts +184 -0
- package/scripts/phase-artifact-commands.ts +100 -0
- package/scripts/post-land-sync.ts +46 -0
- package/scripts/scaffold-check.ts +85 -0
- package/scripts/skill-naming-check.ts +78 -0
- package/scripts/skill-structure-check.ts +157 -0
- package/scripts/skills-lock-check.ts +60 -0
- package/scripts/sync-markdown-artifacts.ts +172 -0
- package/scripts/uninstall.ts +162 -0
- package/scripts/version.ts +47 -0
- package/scripts/wait-pr-ready.ts +407 -0
- package/skills/gxpm/SKILL.md +485 -0
- package/skills/gxpm/SKILL.md.tmpl +422 -0
- package/skills/gxpm/references/CANON.md +53 -0
- package/skills/gxpm/references/key-rules.md +130 -0
- package/skills/gxpm-architecture/SKILL.md +106 -0
- package/skills/gxpm-architecture/references/DEEPENING.md +37 -0
- package/skills/gxpm-architecture/references/INTERFACE-DESIGN.md +44 -0
- package/skills/gxpm-autopilot/SKILL.md +116 -0
- package/skills/gxpm-autopilot/SKILL.md.tmpl +107 -0
- package/skills/gxpm-browser/SKILL.md +105 -0
- package/skills/gxpm-browser/SKILL.md.tmpl +41 -0
- package/skills/gxpm-browser/references/commands.md +43 -0
- package/skills/gxpm-browser/references/evidence-path.md +20 -0
- package/skills/gxpm-build/SKILL.md +78 -0
- package/skills/gxpm-cleanup/SKILL.md +76 -0
- package/skills/gxpm-debug-issue/SKILL.md +39 -0
- package/skills/gxpm-diagnose/SKILL.md +220 -0
- package/skills/gxpm-diagnose/SKILL.md.tmpl +31 -0
- package/skills/gxpm-diagnose/references/feedback-loop.md +34 -0
- package/skills/gxpm-diagnose/references/feedback-loops.md +43 -0
- package/skills/gxpm-diagnose/references/phases.md +60 -0
- package/skills/gxpm-eval/SKILL.md +78 -0
- package/skills/gxpm-explore-codebase/SKILL.md +36 -0
- package/skills/gxpm-explore-codebase/scripts/summarize-communities.ts +51 -0
- package/skills/gxpm-feedback/SKILL.md +122 -0
- package/skills/gxpm-grill/SKILL.md +159 -0
- package/skills/gxpm-grill/SKILL.md.tmpl +77 -0
- package/skills/gxpm-grill/references/documentation-templates.md +56 -0
- package/skills/gxpm-grill/references/process.md +25 -0
- package/skills/gxpm-handoff/SKILL.md +112 -0
- package/skills/gxpm-hygiene/SKILL.md +69 -0
- package/skills/gxpm-implementer/SKILL.md +142 -0
- package/skills/gxpm-implementer/SKILL.md.tmpl +141 -0
- package/skills/gxpm-linear/SKILL.md +282 -0
- package/skills/gxpm-linear/SKILL.md.tmpl +86 -0
- package/skills/gxpm-linear/references/commands.md +75 -0
- package/skills/gxpm-linear/references/workflows.md +120 -0
- package/skills/gxpm-planning/SKILL.md +134 -0
- package/skills/gxpm-prototype/SKILL.md +64 -0
- package/skills/gxpm-refactor-safely/SKILL.md +62 -0
- package/skills/gxpm-review-army/SKILL.md +117 -0
- package/skills/gxpm-review-changes/SKILL.md +36 -0
- package/skills/gxpm-setup/SKILL.md +101 -0
- package/skills/gxpm-specifier/SKILL.md +135 -0
- package/skills/gxpm-tdd/SKILL.md +187 -0
- package/skills/gxpm-tdd/references/interface-design.md +23 -0
- package/skills/gxpm-tdd/references/mocking.md +27 -0
- package/skills/gxpm-tdd/references/red-green-refactor.md +61 -0
- package/skills/gxpm-tdd/references/troubleshooting.md +28 -0
- package/skills/gxpm-tdd/references/workflow.md +50 -0
- package/skills/gxpm-tdd/testing-anti-patterns.tmpl +304 -0
- package/skills/gxpm-triage/SKILL.md +160 -0
- package/skills/gxpm-verify/SKILL.md +107 -0
- package/skills/gxpm-write-skill/SKILL.md +131 -0
- package/skills/gxpm-zoom-out/SKILL.md +69 -0
- package/skills/maintain-hygiene-skills-lock/SKILL.md +54 -0
- package/skills/maintain-hygiene-skills-lock/SKILL.md.tmpl +53 -0
- package/templates/constitution-template.md +63 -0
- package/templates/hooks/gxpm-commit-msg +16 -0
- package/templates/hooks/gxpm-post-checkout +19 -0
- package/templates/hooks/gxpm-post-commit +7 -0
- package/templates/hooks/gxpm-post-merge +29 -0
- package/templates/hooks/gxpm-pre-commit +39 -0
- package/templates/hooks/gxpm-pre-push +33 -0
- package/templates/plan-template.md.tmpl +46 -0
- package/templates/spec-template.md.tmpl +63 -0
- package/templates/specify-stub.tmpl +22 -0
- package/templates/tasks-template.md.tmpl +32 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dogfood compliance check: verifies that every gxpm issue created
|
|
3
|
+
* after SPECIFY_PHASE_CUTOFF has gone through the specify phase before
|
|
4
|
+
* reaching implement (or any phase past it).
|
|
5
|
+
*
|
|
6
|
+
* Default mode is informational (exit 0, print warnings).
|
|
7
|
+
* Pass `--strict` to make non-compliant issues fail the check (exit 1).
|
|
8
|
+
*
|
|
9
|
+
* Run: bun run scripts/dogfood-check.ts [--strict]
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { existsSync, readFileSync, readdirSync } from "node:fs";
|
|
13
|
+
import { join, resolve } from "node:path";
|
|
14
|
+
import { SPECIFY_PHASE_CUTOFF } from "../core/state";
|
|
15
|
+
|
|
16
|
+
/**
 * Phase names treated as "implement or later". An issue whose current phase
 * is in this set is checked for a `specify` entry in its phase history.
 */
const PAST_SPECIFY_PHASES = new Set([
  "implement", "local-verify", "ac-check",
  "self-review", "ship", "pr-check",
  "verify", "qa", "land",
]);
|
|
27
|
+
|
|
28
|
+
/** One non-compliant issue found by the dogfood check. */
interface IssueViolation {
  /** Name of the issue directory under `.gxpm/issues`. */
  issueId: string;
  /** Phase read from `state.json`, or a placeholder such as `<unreadable>`. */
  currentPhase: string;
  /** Human-readable explanation of why the issue was flagged. */
  reason: string;
}
|
|
33
|
+
|
|
34
|
+
/** Aggregate result returned by `runDogfoodCheck`. */
export interface DogfoodReport {
  /** Number of issues whose `state.json` was read and parsed. */
  scanned: number;
  /** Issues that entered `implement` before the specify cutoff. */
  legacyExempt: number;
  /** Issues that are not yet at implement, or that have a `specify` entry. */
  compliant: number;
  /** Issues flagged as non-compliant (including unreadable state files). */
  violations: IssueViolation[];
}
|
|
40
|
+
|
|
41
|
+
/**
 * Scans `<root>/.gxpm/issues/<issueId>/state.json` and classifies every issue
 * as compliant, legacy-exempt, or in violation of the "specify before
 * implement" rule.
 *
 * @param root Directory containing the `.gxpm` folder (defaults to the cwd).
 * @returns Counters plus the list of violations. When the issues directory
 *          does not exist the report is all zeros.
 */
export function runDogfoodCheck(root: string = process.cwd()): DogfoodReport {
  const issuesDir = join(root, ".gxpm", "issues");
  const report: DogfoodReport = {
    scanned: 0,
    legacyExempt: 0,
    compliant: 0,
    violations: [],
  };

  if (!existsSync(issuesDir)) {
    return report;
  }

  for (const entry of readdirSync(issuesDir, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue;
    const issueId = entry.name;
    const statePath = join(issuesDir, issueId, "state.json");
    if (!existsSync(statePath)) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(readFileSync(statePath, "utf8"));
    } catch {
      report.violations.push({
        issueId,
        currentPhase: "<unreadable>",
        reason: "state.json is not valid JSON",
      });
      continue;
    }

    // JSON.parse accepts `null`, numbers, strings and arrays; none of those is
    // a usable state record, and `null` would crash the property reads below.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      report.violations.push({
        issueId,
        currentPhase: "<unreadable>",
        reason: "state.json is not a JSON object",
      });
      continue;
    }
    const state = parsed as {
      currentPhase?: string;
      phaseHistory?: Array<{ phase: string; enteredAt: string }>;
    };

    report.scanned++;
    const phase = state.currentPhase ?? "<missing>";

    if (!PAST_SPECIFY_PHASES.has(phase)) {
      // Issue not yet at implement; specify is not strictly required yet.
      report.compliant++;
      continue;
    }

    // Tolerate a missing or malformed history instead of throwing on `.find`.
    const history = Array.isArray(state.phaseHistory) ? state.phaseHistory : [];
    const implementEntry = history.find((h) => h?.phase === "implement");
    if (implementEntry && implementEntry.enteredAt < SPECIFY_PHASE_CUTOFF) {
      // Legacy issue — exempt per the same rule the phase-gate enforces.
      report.legacyExempt++;
      continue;
    }

    const specifyEntry = history.find((h) => h?.phase === "specify");
    if (!specifyEntry) {
      report.violations.push({
        issueId,
        currentPhase: phase,
        reason: "post-cutoff issue reached implement without entering specify",
      });
      continue;
    }

    report.compliant++;
  }

  return report;
}
|
|
104
|
+
|
|
105
|
+
/**
 * Renders a dogfood report as plain text: a four-line summary followed by
 * one bullet line per violation.
 */
function formatReport(report: DogfoodReport): string {
  const { scanned, compliant, legacyExempt, violations } = report;
  const summary = [
    `dogfood-check: scanned ${scanned} issue(s)`,
    ` compliant: ${compliant}`,
    ` legacy exempt: ${legacyExempt}`,
    ` violations: ${violations.length}`,
  ];
  const details = violations.map(
    ({ issueId, currentPhase, reason }) => ` - ${issueId} (in ${currentPhase}): ${reason}`,
  );
  return [...summary, ...details].join("\n");
}
|
|
116
|
+
|
|
117
|
+
// CLI entry point: runs only when this file is executed directly.
if (import.meta.main) {
  const report = runDogfoodCheck(resolve(process.cwd()));
  console.log(formatReport(report));

  // Informational by default; `--strict` turns violations into a failing exit code.
  const hasViolations = report.violations.length > 0;
  if (hasViolations && process.argv.includes("--strict")) {
    process.exit(1);
  }
}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* gxpm-eval-functional — functional eval runner for discipline skills.
|
|
4
|
+
*
|
|
5
|
+
* Reads pressure scenarios from test/functional/<skill-name>/,
|
|
6
|
+
* runs them WITH and WITHOUT the skill, and reports compliance rates.
|
|
7
|
+
*
|
|
8
|
+
* Skeleton (P1-4): scenario parsing, CLI, report format.
|
|
9
|
+
* Full LLM-based agent evaluation to be wired in Phase-3.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { readFileSync, readdirSync, existsSync, statSync } from "node:fs";
|
|
13
|
+
import { join } from "node:path";
|
|
14
|
+
|
|
15
|
+
// Parent of the directory containing this script (via Bun's `import.meta.dir`); presumably the package root — confirm.
const ROOT = join(import.meta.dir, "..");
|
|
16
|
+
|
|
17
|
+
/** A pressure scenario parsed from one markdown file. */
interface PressureScenario {
  /** Scenario file name without the `.md` extension. */
  name: string;
  /** Bullet items found under the `## Pressures` heading. */
  pressures: string[];
  /** Option lines (`A)`, `B)`, `C)`) found under the `## Options` heading. */
  options: string[];
  /** Trimmed text of the `## Expected` section. */
  expected: string;
  /** Trimmed text of the `## Rationale` section, or `""` when absent. */
  rationale: string;
}
|
|
24
|
+
|
|
25
|
+
/** Outcome of running one scenario with and without the skill. */
interface ScenarioResult {
  /** Name of the scenario this result belongs to. */
  scenario: string;
  /** Outcome of the run with the skill. */
  withSkill: {
    choice: string;
    compliant: boolean;
    citedSections: string[];
  };
  /** Outcome of the run without the skill. */
  withoutSkill: {
    choice: string;
    compliant: boolean;
    rationalizations: string[];
  };
  /** Optional meta-test question, response and improvement. */
  metaTest?: {
    question: string;
    response: string;
    improvement: string;
  };
}
|
|
43
|
+
|
|
44
|
+
/** Per-skill functional eval report. */
interface EvalReport {
  /** Name of the evaluated skill. */
  skill: string;
  /** ISO-8601 timestamp of when the report was built. */
  evaluatedAt: string;
  /** Aggregates computed over `results`. */
  summary: {
    /** Number of scenario results. */
    total: number;
    /** Results whose with-skill run was compliant. */
    withSkillCompliant: number;
    /** Results whose without-skill run was NOT compliant. */
    withoutSkillNonCompliant: number;
    /** Total cited sections divided by the number of results (0 when empty). */
    citationAccuracy: number;
    /** All without-skill rationalizations, concatenated in result order. */
    newRationalizations: string[];
  };
  /** The individual scenario results the summary was derived from. */
  results: ScenarioResult[];
}
|
|
56
|
+
|
|
57
|
+
/**
 * Loads every parseable `*.md` scenario from `test/functional/<skillName>/`
 * under ROOT. Returns an empty list when that directory is missing; files
 * that `parseScenario` rejects are skipped.
 */
function discoverScenarios(skillName: string): PressureScenario[] {
  const scenarioDir = join(ROOT, "test", "functional", skillName);
  const isDirectory = existsSync(scenarioDir) && statSync(scenarioDir).isDirectory();
  if (!isDirectory) {
    return [];
  }

  return readdirSync(scenarioDir, { withFileTypes: true })
    .filter((file) => file.isFile() && file.name.endsWith(".md"))
    .flatMap((file) => {
      const text = readFileSync(join(scenarioDir, file.name), "utf8");
      const body = parseScenario(text);
      // The scenario name is the file name minus its `.md` suffix.
      return body ? [{ ...body, name: file.name.replace(/\.md$/, "") }] : [];
    });
}
|
|
73
|
+
|
|
74
|
+
/**
 * Parses a pressure-scenario markdown document.
 *
 * Recognized sections are `## Pressures` (bullet list), `## Options`
 * (lines starting with `A)`, `B)` or `C)`), `## Expected` and the optional
 * `## Rationale`. Each section runs until the next `## ` heading or the end
 * of the document.
 *
 * @param content Raw markdown text; LF, CRLF and CR line endings are accepted.
 * @returns The parsed fields (without `name`), or `null` when any of the
 *          Pressures, Options or Expected sections is missing.
 */
function parseScenario(content: string): Omit<PressureScenario, "name"> | null {
  // The section patterns anchor on "\n", so normalize line endings first;
  // otherwise CRLF files never match and are silently treated as invalid.
  const text = content.replace(/\r\n?/g, "\n");

  const section = (heading: string): string | null => {
    const found = text.match(new RegExp(`## ${heading}\\n([\\s\\S]*?)(?=\\n## |$)`));
    return found ? found[1] : null;
  };

  const pressures = section("Pressures");
  const options = section("Options");
  const expected = section("Expected");
  const rationale = section("Rationale");

  if (pressures === null || options === null || expected === null) return null;

  return {
    pressures: pressures
      .trim()
      .split("\n")
      .filter((line) => line.startsWith("- "))
      .map((line) => line.slice(2).trim()),
    options: options
      .trim()
      .split("\n")
      .filter((line) => /^[A-C]\)/.test(line))
      .map((line) => line.trim()),
    expected: expected.trim(),
    rationale: rationale === null ? "" : rationale.trim(),
  };
}
|
|
89
|
+
|
|
90
|
+
/**
 * Skeleton scenario runner: no agent is executed yet. The with-skill run is
 * reported as choosing the expected option, and the without-skill run as a
 * non-compliant "B". `skillName` is accepted but currently unused.
 */
function runScenario(skillName: string, scenario: PressureScenario): ScenarioResult {
  // Skeleton: placeholder for LLM agent execution.
  // Phase-3 will wire actual subagent calls via core/dag-executor.ts.
  const withSkill: ScenarioResult["withSkill"] = {
    choice: scenario.expected,
    compliant: true,
    citedSections: [],
  };
  const withoutSkill: ScenarioResult["withoutSkill"] = {
    choice: "B",
    compliant: false,
    rationalizations: [],
  };
  return { scenario: scenario.name, withSkill, withoutSkill };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Aggregates scenario results into an EvalReport stamped with the current
 * time. `citationAccuracy` is the total number of cited sections divided by
 * the number of results, or 0 for an empty result list.
 */
function buildReport(skillName: string, results: ScenarioResult[]): EvalReport {
  let withSkillCompliant = 0;
  let withoutSkillNonCompliant = 0;
  let totalCitations = 0;
  const newRationalizations: string[] = [];

  for (const { withSkill, withoutSkill } of results) {
    if (withSkill.compliant) withSkillCompliant += 1;
    if (!withoutSkill.compliant) withoutSkillNonCompliant += 1;
    totalCitations += withSkill.citedSections.length;
    newRationalizations.push(...withoutSkill.rationalizations);
  }

  const total = results.length;
  const citationAccuracy = total > 0 ? totalCitations / total : 0;

  return {
    skill: skillName,
    evaluatedAt: new Date().toISOString(),
    summary: {
      total,
      withSkillCompliant,
      withoutSkillNonCompliant,
      citationAccuracy,
      newRationalizations,
    },
    results,
  };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Renders an EvalReport either as pretty-printed JSON (`asJson`) or as a
 * short plain-text summary, with any rationalizations listed at the end.
 */
function formatReport(report: EvalReport, asJson: boolean): string {
  if (asJson) {
    return JSON.stringify(report, null, 2);
  }

  const { skill, evaluatedAt, summary } = report;
  const { total, newRationalizations } = summary;
  const percent = (summary.citationAccuracy * 100).toFixed(0);

  const out: string[] = [
    `Functional eval: ${skill}`,
    `Evaluated at: ${evaluatedAt}`,
    "",
    `Scenarios: ${total}`,
    `With skill compliant: ${summary.withSkillCompliant}/${total}`,
    `Without skill non-compliant: ${summary.withoutSkillNonCompliant}/${total}`,
    `Citation accuracy: ${percent}%`,
  ];

  if (newRationalizations.length > 0) {
    out.push("", "New rationalizations found:");
    out.push(...newRationalizations.map((text) => ` - ${text}`));
  }

  return out.join("\n");
}
|
|
151
|
+
|
|
152
|
+
/**
 * Returns the skills evaluated by `run --all`.
 *
 * Skeleton: the list is hardcoded for now rather than discovered from
 * skills with type=discipline or discipline signals.
 */
function discoverDisciplineSkills(): string[] {
  const knownDisciplineSkills = ["gxpm-tdd", "gxpm-triage"];
  return knownDisciplineSkills;
}
|
|
157
|
+
|
|
158
|
+
/** Prints the CLI help text to stdout. */
function usage() {
  const helpLines = [
    "gxpm-eval-functional — functional eval runner for discipline skills",
    "",
    "Usage:",
    "gxpm-eval-functional run --all [--json]",
    "gxpm-eval-functional run <skill-name> [--json]",
    "",
    "Commands:",
    "run --all Run all discipline skills",
    "run <skill-name> Run a specific skill",
    "",
  ];
  console.log(helpLines.join("\n"));
}
|
|
170
|
+
|
|
171
|
+
// ---- CLI entry point -------------------------------------------------------
const args = process.argv.slice(2);
const [command, target] = args;

// No command or an explicit help flag: print usage and exit successfully.
if (!command || command === "--help" || command === "-h") {
  usage();
  process.exit(0);
}

const asJson = args.includes("--json");

// `run` is the only supported command.
if (command !== "run") {
  console.error(`Unknown command: ${command}`);
  usage();
  process.exit(1);
}

if (!target) {
  console.error("Missing target. Use --all or a skill name.");
  usage();
  process.exit(1);
}

const skills = target === "--all" ? discoverDisciplineSkills() : [target];
const reports: EvalReport[] = [];

for (const skill of skills) {
  const scenarios = discoverScenarios(skill);
  if (scenarios.length > 0) {
    const results = scenarios.map((scenario) => runScenario(skill, scenario));
    reports.push(buildReport(skill, results));
  } else {
    console.error(`No scenarios found for ${skill} (expected test/functional/${skill}/*.md)`);
  }
}

// Nothing could be evaluated: signal failure to the caller.
if (reports.length === 0) {
  process.exit(1);
}

if (asJson) {
  console.log(JSON.stringify({ evaluated: reports.length, reports }, null, 2));
} else {
  reports.forEach((report) => {
    console.log(formatReport(report, false));
    console.log("");
  });
}
|
package/scripts/eval.ts
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* gxpm-eval — lightweight skill quality eval harness.
|
|
4
|
+
*
|
|
5
|
+
* Static analysis of SKILL.md files: structure, frontmatter, triggers, length.
|
|
6
|
+
* Future iterations can add LLM-based output quality scoring.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
10
|
+
import { join } from "node:path";
|
|
11
|
+
import { discoverTemplates } from "./discover-skills";
|
|
12
|
+
|
|
13
|
+
/** Static-analysis result for a single skill. */
interface EvalResult {
  /** Identifier of the evaluated skill. */
  skill: string;
  /** Ten points for every passing check. */
  score: number;
  /** Ten points for every check that was run. */
  maxScore: number;
  /** The individual checks, in the order they were run. */
  checks: EvalCheck[];
  /** Detected skill type (for example "discipline" or "unknown"). */
  type: string;
}
|
|
20
|
+
|
|
21
|
+
/** Outcome of one structural check on a skill document. */
interface EvalCheck {
  /** Short identifier of the check, e.g. "frontmatter". */
  name: string;
  /** Whether the check passed. */
  pass: boolean;
  /** Human-readable detail for the pass or fail case. */
  message: string;
}
|
|
26
|
+
|
|
27
|
+
// Parent of the directory containing this script (via Bun's `import.meta.dir`); presumably the package root — confirm.
const ROOT = join(import.meta.dir, "..");
|
|
28
|
+
|
|
29
|
+
// Skill categories that detectSkillType can return; "unknown" is the fallback when nothing matches.
type SkillType = "discipline" | "technique" | "pattern" | "reference" | "unknown";
|
|
30
|
+
|
|
31
|
+
/**
 * Classifies a skill document.
 *
 * 1. A `type:` line whose value is one of discipline, technique, pattern or
 *    reference wins (case-insensitive; surrounding quotes are ignored).
 * 2. Otherwise, any strong discipline signal in the body yields "discipline".
 * 3. Otherwise the result is "unknown".
 *
 * @param content Full text of the skill document.
 * @returns The detected skill type.
 */
function detectSkillType(content: string): SkillType {
  // `[ \t]*` rather than `\s*`: `\s` also matches newlines, which would let an
  // empty `type:` line capture the text of the following line.
  const typeMatch = content.match(/^type:[ \t]*(.+)$/m);
  if (typeMatch) {
    const declared = typeMatch[1]
      .trim()
      .replace(/^(["'])(.*)\1$/, "$2") // accept `type: "discipline"` as well
      .trim()
      .toLowerCase();
    if (["discipline", "technique", "pattern", "reference"].includes(declared)) {
      return declared as SkillType;
    }
  }

  // Content heuristic: strong signals of a discipline skill.
  const disciplineSignals = [
    /## The Iron Law/i,
    /## Red Flags/i,
    /## Common Rationalizations/i,
    /\*\*No exceptions:\*\*/i,
    /Violating the letter/i,
  ];
  if (disciplineSignals.some((signal) => signal.test(content))) {
    return "discipline";
  }

  return "unknown";
}
|
|
55
|
+
|
|
56
|
+
/**
 * Runs the static structure checks on one skill document.
 *
 * Six checks apply to every skill (frontmatter, name, description, triggers,
 * length, references); four more apply only when the skill is detected as a
 * discipline skill. Each check is worth 10 points.
 *
 * @param skillPath Identifier stored in the result's `skill` field.
 * @param content   Full text of the skill document.
 * @returns Score, maximum score, the ordered checks and the detected type.
 */
function evaluateSkill(skillPath: string, content: string): EvalResult {
  const checks: EvalCheck[] = [];
  const addCheck = (name: string, pass: boolean, passMessage: string, failMessage: string): void => {
    checks.push({ name, pass, message: pass ? passMessage : failMessage });
  };
  const skillType = detectSkillType(content);

  // 1. Frontmatter exists
  addCheck(
    "frontmatter",
    content.startsWith("---"),
    "Has YAML frontmatter",
    "Missing YAML frontmatter",
  );

  // 2. Name field. `[ \t]*` rather than `\s*`: `\s` matches newlines, so an
  //    empty `name:` used to capture the next line and pass the check.
  const nameMatch = content.match(/^name:[ \t]*(.+)$/m);
  const name = nameMatch ? nameMatch[1].trim() : "";
  addCheck("name", name.length > 0, `Name: ${name}`, "Missing 'name' in frontmatter");

  // 3. Description field (same single-line matching as the name field)
  const descMatch = content.match(/^description:[ \t]*(.+)$/m);
  const desc = descMatch ? descMatch[1].trim() : "";
  addCheck(
    "description",
    desc.length >= 20 && desc.length <= 300,
    `Description length: ${desc.length}`,
    `Description length ${desc.length} (want 20-300)`,
  );

  // 4. Has trigger section
  addCheck(
    "triggers",
    /## When to trigger|## Commands|## Trigger/i.test(content),
    "Has trigger/commands section",
    "Missing trigger/commands section",
  );

  // 5. Reasonable length
  const lineCount = content.split("\n").length;
  addCheck(
    "length",
    lineCount >= 10 && lineCount <= 1000,
    `${lineCount} lines`,
    `${lineCount} lines (want 10-1000)`,
  );

  // 6. Has references or read next
  addCheck(
    "references",
    /## Read Next|## References|## See Also/i.test(content),
    "Has references/read-next section",
    "Missing references/read-next",
  );

  // 7–10. Discipline-specific structural checks
  if (skillType === "discipline") {
    addCheck(
      "rationalization-table",
      /\|\s*Excuse\s*\|\s*Reality\s*\|/i.test(content),
      "Has rationalization table (Excuse / Reality)",
      "Missing rationalization table",
    );
    addCheck(
      "red-flags",
      /## Red Flags/i.test(content),
      "Has Red Flags section",
      "Missing Red Flags section",
    );
    addCheck(
      "explicit-negation",
      /\*\*No exceptions:\*\*/i.test(content),
      "Has explicit negation (No exceptions)",
      "Missing explicit negation clause",
    );
    addCheck(
      "foundational-principle",
      /Violating the letter/i.test(content),
      "Has foundational principle",
      "Missing foundational principle (e.g. 'Violating the letter')",
    );
  }

  const passCount = checks.filter((check) => check.pass).length;
  const maxScore = checks.length * 10;
  const score = passCount * 10;

  return { skill: skillPath, score, maxScore, checks, type: skillType };
}
|
|
158
|
+
|
|
159
|
+
/**
 * Collect the `name` of every template returned by `discoverTemplates(ROOT)`.
 *
 * @returns The template names, in the order `discoverTemplates` yields them.
 */
function listSkills(): string[] {
  const names: string[] = [];
  for (const template of discoverTemplates(ROOT)) {
    names.push(template.name);
  }
  return names;
}
|
|
163
|
+
|
|
164
|
+
/**
 * Evaluate the discovered skill templates.
 *
 * @param skillName Optional template name. When given (and non-empty), only
 *   templates whose `name` equals it are evaluated; otherwise all are.
 * @returns One `evaluateSkill` result per selected template. The array is
 *   empty when nothing matches.
 */
function runEval(skillName?: string): EvalResult[] {
  const selected = discoverTemplates(ROOT).filter(
    (candidate) => !skillName || candidate.name === skillName,
  );

  const results: EvalResult[] = [];
  for (const entry of selected) {
    // When the `tmpl` path ends in ".tmpl", evaluate the file at `output`
    // instead; otherwise evaluate the `tmpl` path itself.
    const relativePath = entry.tmpl.endsWith(".tmpl") ? entry.output : entry.tmpl;
    const content = readFileSync(join(ROOT, relativePath), "utf8");
    results.push(evaluateSkill(entry.name, content));
  }
  return results;
}
|
|
175
|
+
|
|
176
|
+
/**
 * Render evaluation results either as pretty-printed JSON or as a plain-text
 * report.
 *
 * Text layout: a header line, then for each skill a summary line
 * (`<icon> <skill>: <score>/<maxScore> (<pct>%) [type: <type>]`) followed by
 * one line per check and a blank line, and finally an `Overall:` total.
 * The summary icon is "✓" at 80% or above, "~" at 50% or above, "✗" otherwise.
 *
 * @param results Evaluation results to report.
 * @param asJson When true, return `{ evaluated, results }` as indented JSON.
 * @returns The formatted report (no trailing newline).
 */
function formatReport(results: EvalResult[], asJson: boolean): string {
  if (asJson) {
    return JSON.stringify({ evaluated: results.length, results }, null, 2);
  }

  // A zero denominator (a result with no checks, or no results at all) is
  // reported as 0% instead of letting the division produce "NaN%".
  const toPercent = (score: number, max: number): number =>
    max > 0 ? Math.round((score / max) * 100) : 0;

  const lines: string[] = [`Evaluated ${results.length} skill(s)`, ""];
  let totalScore = 0;
  let totalMax = 0;

  for (const r of results) {
    const pct = toPercent(r.score, r.maxScore);
    const icon = pct >= 80 ? "✓" : pct >= 50 ? "~" : "✗";
    lines.push(`${icon} ${r.skill}: ${r.score}/${r.maxScore} (${pct}%) [type: ${r.type}]`);

    for (const c of r.checks) {
      const cicon = c.pass ? " ✓" : " ✗";
      lines.push(`${cicon} ${c.name}: ${c.message}`);
    }
    lines.push("");

    totalScore += r.score;
    totalMax += r.maxScore;
  }

  lines.push(`Overall: ${totalScore}/${totalMax} (${toPercent(totalScore, totalMax)}%)`);
  return lines.join("\n");
}
|
|
203
|
+
|
|
204
|
+
/**
 * Print the gxpm-eval help text with a single `console.log` call: a one-line
 * description, the usage synopsis, and the `list` / `run` command summaries.
 * Takes no arguments and returns nothing.
 */
function usage() {
|
|
205
|
+
console.log(`gxpm-eval — skill quality eval harness
|
|
206
|
+
|
|
207
|
+
Usage:
|
|
208
|
+
gxpm-eval list
|
|
209
|
+
gxpm-eval run [<skill-name>] [--json]
|
|
210
|
+
|
|
211
|
+
Commands:
|
|
212
|
+
list List all discoverable skills
|
|
213
|
+
run Run eval on all skills or a specific skill
|
|
214
|
+
`);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// CLI entry point: read the command from argv and dispatch to `list` or `run`.
const args = process.argv.slice(2);
const command = args[0];

// No command, or an explicit help flag: show the help text and exit successfully.
if (!command || command === "--help" || command === "-h") {
  usage();
  process.exit(0);
}

// `--json` is recognised anywhere on the command line.
const asJson = args.includes("--json");

if (command === "list") {
  const skills = listSkills();
  if (asJson) {
    console.log(JSON.stringify({ skills }, null, 2));
  } else {
    console.log(skills.join("\n"));
  }
} else if (command === "run") {
  // The token after `run` is the skill name unless it looks like a flag.
  const skillName = args[1]?.startsWith("-") ? undefined : args[1];
  const results = runEval(skillName);

  // A named skill that matches nothing would otherwise yield a bare
  // "Evaluated 0 skill(s)" report; explain the failure on stderr so stdout
  // (including --json output) stays unchanged.
  if (skillName && results.length === 0) {
    console.error(
      `No skill named "${skillName}" was found; run "gxpm-eval list" to see available skills.`,
    );
  }

  console.log(formatReport(results, asJson));

  const totalScore = results.reduce((s, r) => s + r.score, 0);
  const totalMax = results.reduce((s, r) => s + r.maxScore, 0);
  const totalPct = totalMax > 0 ? Math.round((totalScore / totalMax) * 100) : 0;

  // Fail when the overall score is under 50%. Set exitCode instead of calling
  // process.exit() so the report written above is flushed before the process
  // ends (process.exit() can cut off pending stdout writes).
  if (totalPct < 50) process.exitCode = 1;
} else {
  console.error(`Unknown command: ${command}`);
  usage();
  process.exit(1);
}
|