xtrm-tools 0.5.10 → 0.5.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/CHANGELOG.md +4 -1
  2. package/README.md +28 -30
  3. package/cli/dist/index.cjs +1509 -2722
  4. package/cli/dist/index.cjs.map +1 -1
  5. package/cli/package.json +1 -1
  6. package/config/instructions/agents-top.md +87 -23
  7. package/config/instructions/claude-top.md +101 -23
  8. package/config/pi/extensions/beads/index.ts +3 -1
  9. package/config/pi/extensions/session-flow/index.ts +26 -90
  10. package/config/pi/extensions/xtrm-loader/index.ts +39 -2
  11. package/hooks/README.md +0 -14
  12. package/hooks/beads-gate-messages.mjs +8 -22
  13. package/hooks/gitnexus/gitnexus-hook.cjs +1 -1
  14. package/hooks/hooks.json +25 -27
  15. package/hooks/quality-check-env.mjs +79 -0
  16. package/hooks/quality-check.cjs +6 -6
  17. package/hooks/statusline.mjs +115 -0
  18. package/hooks/using-xtrm-reminder.mjs +35 -0
  19. package/package.json +1 -1
  20. package/skills/sync-docs-workspace/iteration-1/benchmark.json +293 -0
  21. package/skills/sync-docs-workspace/iteration-1/benchmark.md +13 -0
  22. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/eval_metadata.json +27 -0
  23. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/outputs/result.md +210 -0
  24. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/grading.json +28 -0
  25. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/timing.json +1 -0
  26. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/outputs/result.md +101 -0
  27. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/grading.json +28 -0
  28. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/timing.json +5 -0
  29. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/timing.json +5 -0
  30. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/eval_metadata.json +27 -0
  31. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/outputs/result.md +198 -0
  32. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/grading.json +28 -0
  33. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/timing.json +1 -0
  34. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/outputs/result.md +94 -0
  35. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/grading.json +28 -0
  36. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/timing.json +1 -0
  37. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/eval_metadata.json +27 -0
  38. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/outputs/result.md +237 -0
  39. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/grading.json +28 -0
  40. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
  41. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/outputs/result.md +134 -0
  42. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/grading.json +28 -0
  43. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/timing.json +1 -0
  44. package/skills/sync-docs-workspace/iteration-2/benchmark.json +297 -0
  45. package/skills/sync-docs-workspace/iteration-2/benchmark.md +13 -0
  46. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/eval_metadata.json +27 -0
  47. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/outputs/result.md +137 -0
  48. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/grading.json +92 -0
  49. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/timing.json +1 -0
  50. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/outputs/result.md +134 -0
  51. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/grading.json +86 -0
  52. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/timing.json +1 -0
  53. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/eval_metadata.json +27 -0
  54. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/outputs/result.md +193 -0
  55. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/grading.json +72 -0
  56. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/timing.json +1 -0
  57. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/outputs/result.md +211 -0
  58. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/grading.json +91 -0
  59. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/timing.json +5 -0
  60. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/eval_metadata.json +27 -0
  61. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/outputs/result.md +182 -0
  62. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
  63. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
  64. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/outputs/result.md +222 -0
  65. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/grading.json +88 -0
  66. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
  67. package/skills/sync-docs-workspace/iteration-3/benchmark.json +298 -0
  68. package/skills/sync-docs-workspace/iteration-3/benchmark.md +13 -0
  69. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/eval_metadata.json +27 -0
  70. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/outputs/result.md +125 -0
  71. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/grading.json +97 -0
  72. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/timing.json +5 -0
  73. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/outputs/result.md +144 -0
  74. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/grading.json +78 -0
  75. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/timing.json +5 -0
  76. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/eval_metadata.json +27 -0
  77. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/outputs/result.md +104 -0
  78. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/grading.json +91 -0
  79. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/timing.json +5 -0
  80. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/outputs/result.md +79 -0
  81. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/grading.json +82 -0
  82. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/timing.json +5 -0
  83. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/eval_metadata.json +27 -0
  84. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase1_context.json +302 -0
  85. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase2_drift.txt +33 -0
  86. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase3_analysis.json +114 -0
  87. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase4_fix.txt +118 -0
  88. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase5_validate.txt +38 -0
  89. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/result.md +158 -0
  90. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
  91. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/timing.json +5 -0
  92. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/outputs/result.md +71 -0
  93. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/grading.json +90 -0
  94. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
  95. package/skills/using-xtrm/SKILL.md +84 -205
  96. package/config/pi/extensions/bg-process/index.ts +0 -230
  97. package/config/pi/extensions/bg-process/package.json +0 -16
  98. package/config/pi/extensions/minimal-mode/index.ts +0 -201
  99. package/config/pi/extensions/minimal-mode/package.json +0 -16
  100. package/config/pi/extensions/todo/index.ts +0 -299
  101. package/config/pi/extensions/todo/package.json +0 -16
  102. package/hooks/agent_context.py +0 -105
  103. package/hooks/branch-state.mjs +0 -39
  104. package/hooks/guard-rules.mjs +0 -118
  105. package/hooks/main-guard-post-push.mjs +0 -71
  106. package/hooks/main-guard.mjs +0 -119
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env node
2
+ // SessionStart hook — verify quality gate environment is intact.
3
+ // Checks for tsc, eslint, ruff so the agent knows early if enforcement
4
+ // is silently degraded. Exits 0 always (informational only).
5
+
6
+ import { readFileSync, existsSync } from 'node:fs';
7
+ import { execSync } from 'node:child_process';
8
+ import path from 'node:path';
9
+
10
+ let input;
11
+ try {
12
+ input = JSON.parse(readFileSync(0, 'utf8'));
13
+ } catch {
14
+ process.exit(0);
15
+ }
16
+
17
+ const cwd = input.cwd ?? process.env.CLAUDE_PROJECT_DIR ?? process.cwd();
18
+
19
+ // Only relevant in projects that have quality gates wired
20
+ const pluginRoot = process.env.CLAUDE_PLUGIN_ROOT ?? '';
21
+ const hookPresent =
22
+ existsSync(path.join(pluginRoot, 'hooks', 'quality-check.cjs')) ||
23
+ existsSync(path.join(cwd, '.claude', 'hooks', 'quality-check.cjs'));
24
+
25
+ if (!hookPresent) process.exit(0);
26
+
27
+ function which(cmd) {
28
+ try {
29
+ execSync(`which ${cmd}`, { stdio: 'ignore' });
30
+ return true;
31
+ } catch {
32
+ // fall through to local node_modules probe
33
+ }
34
+ // Check node_modules/.bin/ walking up from cwd
35
+ let dir = cwd;
36
+ while (true) {
37
+ if (existsSync(path.join(dir, 'node_modules', '.bin', cmd))) return true;
38
+ const parent = path.dirname(dir);
39
+ if (parent === dir) break;
40
+ dir = parent;
41
+ }
42
+ return false;
43
+ }
44
+
45
+ const warnings = [];
46
+
47
+ // CLAUDE_PROJECT_DIR check
48
+ if (!process.env.CLAUDE_PROJECT_DIR) {
49
+ warnings.push('CLAUDE_PROJECT_DIR is not set — quality gate may target wrong directory');
50
+ }
51
+
52
+ // TypeScript project checks
53
+ const hasTsConfig = existsSync(path.join(cwd, 'tsconfig.json')) ||
54
+ existsSync(path.join(cwd, 'cli', 'tsconfig.json'));
55
+
56
+ if (hasTsConfig) {
57
+ if (!which('tsc')) warnings.push('tsc not found — TypeScript compilation check will be skipped');
58
+ const hasEslintConfig = ['eslint.config.js', 'eslint.config.mjs', '.eslintrc.js', '.eslintrc.json', '.eslintrc.yml']
59
+ .some(f => existsSync(path.join(cwd, f)));
60
+ if (hasEslintConfig && !which('eslint')) warnings.push('eslint not found — ESLint check will be skipped');
61
+ }
62
+
63
+ // Python project checks
64
+ const hasPyFiles = existsSync(path.join(cwd, 'pyproject.toml')) ||
65
+ existsSync(path.join(cwd, 'setup.py')) ||
66
+ existsSync(path.join(cwd, 'requirements.txt'));
67
+
68
+ if (hasPyFiles) {
69
+ if (!which('ruff')) warnings.push('ruff not found — Python lint check will be skipped');
70
+ }
71
+
72
+ if (warnings.length === 0) process.exit(0);
73
+
74
+ const msg = `⚠️ Quality gate environment issue(s) detected:\n${warnings.map(w => ` • ${w}`).join('\n')}\nFix these to ensure quality gates enforce correctly.`;
75
+
76
+ process.stdout.write(JSON.stringify({
77
+ hookSpecificOutput: { additionalSystemPrompt: msg },
78
+ }));
79
+ process.exit(0);
@@ -447,7 +447,7 @@ class QualityChecker {
447
447
  if (/\.(ts|tsx)$/.test(filePath)) {
448
448
  return 'typescript';
449
449
  }
450
- if (/\.(js|jsx)$/.test(filePath)) {
450
+ if (/\.(js|jsx|cjs|mjs)$/.test(filePath)) {
451
451
  return 'javascript';
452
452
  }
453
453
  return 'unknown';
@@ -537,7 +537,7 @@ class QualityChecker {
537
537
  const resolved = path.resolve(dir, importPath);
538
538
 
539
539
  // Try common extensions
540
- const extensions = ['.ts', '.tsx', '.js', '.jsx'];
540
+ const extensions = ['.ts', '.tsx', '.js', '.jsx', '.cjs', '.mjs'];
541
541
  for (const ext of extensions) {
542
542
  const fullPath = resolved + ext;
543
543
  if (require('fs').existsSync(fullPath)) {
@@ -565,8 +565,8 @@ class QualityChecker {
565
565
  return;
566
566
  }
567
567
 
568
- // Skip TypeScript checking for JavaScript files in hook directories
569
- if (this.filePath.endsWith('.js') && this.filePath.includes('.claude/hooks/')) {
568
+ // Skip TypeScript checking for JavaScript/CJS/MJS files in hook directories
569
+ if (/\.(js|cjs|mjs)$/.test(this.filePath) && this.filePath.includes('.claude/hooks/')) {
570
570
  log.debug('Skipping TypeScript check for JavaScript hook file');
571
571
  return;
572
572
  }
@@ -865,7 +865,7 @@ class QualityChecker {
865
865
  const debuggerRule = config._fileConfig.rules?.debugger || {};
866
866
  if (debuggerRule.enabled !== false) {
867
867
  lines.forEach((line, index) => {
868
- if (/\bdebugger\b/.test(line)) {
868
+ if (/^\s*debugger\s*;/.test(line)) {
869
869
  const severity = debuggerRule.severity || 'error';
870
870
  const message =
871
871
  debuggerRule.message || 'Remove debugger statements before committing';
@@ -1111,7 +1111,7 @@ async function fileExists(filePath) {
1111
1111
  * @returns {boolean} True if source file
1112
1112
  */
1113
1113
  function isSourceFile(filePath) {
1114
- return /\.(ts|tsx|js|jsx)$/.test(filePath);
1114
+ return /\.(ts|tsx|js|jsx|cjs|mjs)$/.test(filePath);
1115
1115
  }
1116
1116
 
1117
1117
  /**
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env node
2
+ // statusline.mjs — Claude Code statusLine command for xt claude worktree sessions
3
+ // Two lines:
4
+ // Line 1 (plain): XTRM ⎇ <branch>
5
+ // Line 2 (colored): ◐ <claim title in italics> OR ○ N open
6
+ // State file: .xtrm/statusline-claim (written by beads-claim-sync.mjs)
7
+ // Results cached 5s in /tmp to avoid hammering bd on every render.
8
+
9
+ import { execSync } from 'node:child_process';
10
+ import { readFileSync, writeFileSync, existsSync } from 'node:fs';
11
+ import { join } from 'node:path';
12
+ import { tmpdir } from 'node:os';
13
+ import { createHash } from 'node:crypto';
14
+
15
+ const cwd = process.cwd();
16
+ const cacheKey = createHash('md5').update(cwd).digest('hex').slice(0, 8);
17
+ const CACHE_FILE = join(tmpdir(), `xtrm-sl-${cacheKey}.json`);
18
+ const CACHE_TTL = 5000;
19
+
20
+ function run(cmd) {
21
+ try {
22
+ return execSync(cmd, {
23
+ encoding: 'utf8', cwd,
24
+ stdio: ['pipe', 'pipe', 'pipe'],
25
+ timeout: 2000,
26
+ }).trim();
27
+ } catch { return null; }
28
+ }
29
+
30
+ function getCached() {
31
+ try {
32
+ const c = JSON.parse(readFileSync(CACHE_FILE, 'utf8'));
33
+ if (Date.now() - c.ts < CACHE_TTL) return c.data;
34
+ } catch {}
35
+ return null;
36
+ }
37
+
38
+ function setCache(data) {
39
+ try { writeFileSync(CACHE_FILE, JSON.stringify({ ts: Date.now(), data })); } catch {}
40
+ }
41
+
42
+ // ANSI
43
+ const R = '\x1b[0m';
44
+ const BOLD = '\x1b[1m';
45
+ const BOLD_OFF = '\x1b[22m';
46
+ const ITALIC = '\x1b[3m';
47
+ const ITALIC_OFF = '\x1b[23m';
48
+ const FG_WHITE = '\x1b[38;5;15m';
49
+ const FG_ACCENT = '\x1b[38;5;75m';
50
+ const FG_MUTED = '\x1b[38;5;245m';
51
+ const BG_CLAIMED = '\x1b[48;5;17m';
52
+ const BG_IDLE = '\x1b[48;5;238m';
53
+
54
+ // Data
55
+ let data = getCached();
56
+ if (!data) {
57
+ const branch = run('git branch --show-current');
58
+ let claimTitle = null;
59
+ let openCount = 0;
60
+
61
+ const hasBeads = existsSync(join(cwd, '.beads'));
62
+ if (hasBeads) {
63
+ const claimFile = join(cwd, '.xtrm', 'statusline-claim');
64
+ let claimId = null;
65
+ if (existsSync(claimFile)) {
66
+ claimId = readFileSync(claimFile, 'utf8').trim() || null;
67
+ }
68
+
69
+ if (claimId) {
70
+ try {
71
+ const raw = run(`bd show ${claimId} --json`);
72
+ if (raw) {
73
+ const parsed = JSON.parse(raw);
74
+ claimTitle = parsed?.[0]?.title ?? null;
75
+ }
76
+ } catch {}
77
+ }
78
+
79
+ if (!claimTitle) {
80
+ const listOut = run('bd list');
81
+ const m = listOut?.match(/\((\d+)\s+open/);
82
+ if (m) openCount = parseInt(m[1], 10);
83
+ }
84
+ }
85
+
86
+ data = { branch, claimTitle, openCount };
87
+ setCache(data);
88
+ }
89
+
90
+ // Render
91
+ const { branch, claimTitle, openCount } = data;
92
+ const cols = process.stdout.columns || 80;
93
+
94
+ const brand = `${BOLD}${FG_ACCENT}XTRM${BOLD_OFF}${R}`;
95
+ const branchStr = branch ? `${FG_MUTED}⎇ ${branch}${R}` : '';
96
+ const line1 = [brand, branchStr].filter(Boolean).join(' ');
97
+
98
+ function padded(text, bg) {
99
+ const visible = text.replace(/\x1b\[[0-9;]*m/g, '');
100
+ const pad = Math.max(0, cols - visible.length);
101
+ return `${bg}${FG_WHITE}${text}${' '.repeat(pad)}${R}`;
102
+ }
103
+
104
+ let line2;
105
+ if (claimTitle) {
106
+ const maxLen = cols - 4;
107
+ const title = claimTitle.length > maxLen ? claimTitle.slice(0, maxLen - 1) + '\u2026' : claimTitle;
108
+ line2 = padded(` \u25d0 ${ITALIC}${title}${ITALIC_OFF}`, BG_CLAIMED);
109
+ } else {
110
+ const idle = openCount > 0 ? `\u25cb ${openCount} open` : '\u25cb no open issues';
111
+ line2 = padded(` ${idle}`, BG_IDLE);
112
+ }
113
+
114
+ process.stdout.write(line1 + '\n' + line2 + '\n');
115
+ process.exit(0);
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env node
2
+ // using-xtrm-reminder.mjs — Claude Code SessionStart hook
3
+ // Reads skills/using-xtrm/SKILL.md and injects it as additionalSystemPrompt
4
+ // so the agent starts every session already oriented on the xtrm workflow.
5
+ // Exit 0 in all paths (fail open).
6
+
7
+ import { readFileSync } from 'node:fs';
8
+ import { join } from 'node:path';
9
+
10
+ let input;
11
+ try { input = JSON.parse(readFileSync(0, 'utf8')); } catch { process.exit(0); }
12
+
13
+ const pluginRoot = process.env.CLAUDE_PLUGIN_ROOT;
14
+ if (!pluginRoot) process.exit(0);
15
+
16
+ const skillPath = join(pluginRoot, 'skills', 'using-xtrm', 'SKILL.md');
17
+ let content;
18
+ try {
19
+ content = readFileSync(skillPath, 'utf8');
20
+ } catch {
21
+ process.exit(0);
22
+ }
23
+
24
+ // Strip YAML frontmatter (--- ... ---\n)
25
+ content = content.replace(/^---[\s\S]*?---\n/, '').trim();
26
+
27
+ process.stdout.write(
28
+ JSON.stringify({
29
+ hookSpecificOutput: {
30
+ hookEventName: 'SessionStart',
31
+ additionalSystemPrompt: content,
32
+ },
33
+ }) + '\n',
34
+ );
35
+ process.exit(0);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "xtrm-tools",
3
- "version": "0.5.10",
3
+ "version": "0.5.13",
4
4
  "description": "Claude Code tools installer (skills, hooks, MCP servers)",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -0,0 +1,293 @@
1
+ {
2
+ "metadata": {
3
+ "skill_name": "sync-docs",
4
+ "skill_path": "<path/to/skill>",
5
+ "executor_model": "<model-name>",
6
+ "analyzer_model": "<model-name>",
7
+ "timestamp": "2026-03-18T07:43:29Z",
8
+ "evals_run": [
9
+ 1,
10
+ 2,
11
+ 3
12
+ ],
13
+ "runs_per_configuration": 3
14
+ },
15
+ "runs": [
16
+ {
17
+ "eval_id": 3,
18
+ "configuration": "with_skill",
19
+ "run_number": 1,
20
+ "result": {
21
+ "pass_rate": 0.75,
22
+ "passed": 3,
23
+ "failed": 1,
24
+ "total": 4,
25
+ "time_seconds": 0.0,
26
+ "tokens": 0,
27
+ "tool_calls": 0,
28
+ "errors": 0
29
+ },
30
+ "expectations": [
31
+ {
32
+ "text": "Ran doc_structure_analyzer.py and referenced its structured output",
33
+ "passed": true,
34
+ "evidence": "Ran doc_structure_analyzer.py, quoted its full structured output including EXTRACTABLE status, extraction candidates list, MISSING files, and INVALID_SCHEMA count."
35
+ },
36
+ {
37
+ "text": "Named specific README sections with their suggested docs/ destination",
38
+ "passed": true,
39
+ "evidence": "Named: '## Policy System \u2192 docs/policies.md', '## MCP Servers \u2192 docs/mcp-servers.md', pi-extensions.md, plus context about CHANGELOG 6-day gap."
40
+ },
41
+ {
42
+ "text": "Report is actionable \u2014 tells user exactly what to do next, not just observations",
43
+ "passed": true,
44
+ "evidence": "Report includes structured phase output, specific file names, notes CHANGELOG gap with exact dates, and references the 6-day staleness."
45
+ },
46
+ {
47
+ "text": "Did not edit or create any files (audit only)",
48
+ "passed": false,
49
+ "evidence": "Agent ran --fix (created docs/pi-extensions.md, docs/mcp-servers.md, docs/policies.md) despite task being audit-only. Skill instructions for Phase 3 show the --fix command without making clear it is only for execute mode."
50
+ }
51
+ ],
52
+ "notes": []
53
+ },
54
+ {
55
+ "eval_id": 2,
56
+ "configuration": "with_skill",
57
+ "run_number": 1,
58
+ "result": {
59
+ "pass_rate": 0.75,
60
+ "passed": 3,
61
+ "failed": 1,
62
+ "total": 4,
63
+ "time_seconds": 0.0,
64
+ "tokens": 0,
65
+ "tool_calls": 0,
66
+ "errors": 0
67
+ },
68
+ "expectations": [
69
+ {
70
+ "text": "Ran doc_structure_analyzer.py with --fix flag",
71
+ "passed": true,
72
+ "evidence": "Ran `python3 skills/sync-docs/scripts/doc_structure_analyzer.py --fix --bd-remember` and included full output"
73
+ },
74
+ {
75
+ "text": "Ran with --bd-remember or manually ran bd remember with a summary",
76
+ "passed": true,
77
+ "evidence": "bd remember stored with key 'sync-docs-fix-2026-03-18', confirmed stored:true in output JSON"
78
+ },
79
+ {
80
+ "text": "At least one scaffold file was created in docs/",
81
+ "passed": true,
82
+ "evidence": "Created docs/pi-extensions.md, docs/mcp-servers.md, docs/policies.md with valid frontmatter"
83
+ },
84
+ {
85
+ "text": "Ran validate_doc.py on created files to confirm schema",
86
+ "passed": false,
87
+ "evidence": "Report notes 7 INVALID_SCHEMA files exist but does not show validate_doc.py being run explicitly to confirm the 3 new files pass. Only the JSON output showing valid frontmatter is evidence."
88
+ }
89
+ ],
90
+ "notes": []
91
+ },
92
+ {
93
+ "eval_id": 1,
94
+ "configuration": "with_skill",
95
+ "run_number": 1,
96
+ "result": {
97
+ "pass_rate": 1.0,
98
+ "passed": 4,
99
+ "failed": 0,
100
+ "total": 4,
101
+ "time_seconds": 0.0,
102
+ "tokens": 0,
103
+ "tool_calls": 0,
104
+ "errors": 0
105
+ },
106
+ "expectations": [
107
+ {
108
+ "text": "Ran context_gatherer.py and reported bd closed issues or merged PRs from the output",
109
+ "passed": true,
110
+ "evidence": "Ran context_gatherer.py, reported 20 bd closed issues with IDs and titles, 3 merged PRs with SHAs and dates, 15 recent commits"
111
+ },
112
+ {
113
+ "text": "Ran doc_structure_analyzer.py and used its output to identify doc issues",
114
+ "passed": true,
115
+ "evidence": "Ran doc_structure_analyzer.py, referenced MISSING status for docs/pi-extensions.md, hooks.md, mcp-servers.md, policies.md, skills.md and EXTRACTABLE for README"
116
+ },
117
+ {
118
+ "text": "Produced at least one concrete recommendation or action (not just a vague summary)",
119
+ "passed": true,
120
+ "evidence": "Named specific files: docs/pi-extensions.md, docs/hooks.md, docs/mcp-servers.md, docs/policies.md with explicit next steps for each"
121
+ },
122
+ {
123
+ "text": "Used the skill scripts rather than just reading files manually",
124
+ "passed": true,
125
+ "evidence": "Ran 3 scripts (context_gatherer.py, drift_detector.py, doc_structure_analyzer.py) with explicit output included in report"
126
+ }
127
+ ],
128
+ "notes": []
129
+ },
130
+ {
131
+ "eval_id": 3,
132
+ "configuration": "without_skill",
133
+ "run_number": 1,
134
+ "result": {
135
+ "pass_rate": 0.75,
136
+ "passed": 3,
137
+ "failed": 1,
138
+ "total": 4,
139
+ "time_seconds": 72.5,
140
+ "tokens": 21934,
141
+ "tool_calls": 0,
142
+ "errors": 0
143
+ },
144
+ "expectations": [
145
+ {
146
+ "text": "Ran doc_structure_analyzer.py and referenced its structured output",
147
+ "passed": false,
148
+ "evidence": "Did not run doc_structure_analyzer.py. All findings came from manual README.md reads with line numbers."
149
+ },
150
+ {
151
+ "text": "Named specific README sections with their suggested docs/ destination",
152
+ "passed": true,
153
+ "evidence": "Named 6 specific sections with line numbers: Hooks Reference (114-141)\u2192docs/hooks.md, Policy System (66-87)\u2192new docs/policies.md, MCP Servers (143-158)\u2192docs/mcp.md, CLI Commands (89-111)\u2192XTRM-GUIDE.md, Version History (179-188)\u2192remove, Plugin Structure (52-63)\u2192borderline."
154
+ },
155
+ {
156
+ "text": "Report is actionable \u2014 tells user exactly what to do next, not just observations",
157
+ "passed": true,
158
+ "evidence": "Each section has a specific Recommendation: block with exact action (Remove section, Add single link, Create docs/policies.md, etc.). Estimated README would shrink from 193 to 60-70 lines."
159
+ },
160
+ {
161
+ "text": "Did not edit or create any files (audit only)",
162
+ "passed": true,
163
+ "evidence": "Report explicitly states no files were modified. Audit-only as instructed."
164
+ }
165
+ ],
166
+ "notes": []
167
+ },
168
+ {
169
+ "eval_id": 2,
170
+ "configuration": "without_skill",
171
+ "run_number": 1,
172
+ "result": {
173
+ "pass_rate": 1.0,
174
+ "passed": 4,
175
+ "failed": 0,
176
+ "total": 4,
177
+ "time_seconds": 0.0,
178
+ "tokens": 0,
179
+ "tool_calls": 0,
180
+ "errors": 0
181
+ },
182
+ "expectations": [
183
+ {
184
+ "text": "Ran doc_structure_analyzer.py with --fix flag",
185
+ "passed": true,
186
+ "evidence": "Agent found the skill in the repo and ran doc_structure_analyzer.py --fix. However, found no MISSING gaps because with_skill run had already created those files (confounded test)."
187
+ },
188
+ {
189
+ "text": "Ran with --bd-remember or manually ran bd remember with a summary",
190
+ "passed": true,
191
+ "evidence": "Agent ran bd remember with key 'sync-docs-fix-schema-2026-03-18' summarizing the frontmatter additions made to 7 files."
192
+ },
193
+ {
194
+ "text": "At least one scaffold file was created in docs/",
195
+ "passed": true,
196
+ "evidence": "Added YAML frontmatter to 7 existing docs/ files (hooks.md, mcp.md, pre-install-cleanup.md, project-skills.md, skills.md, testing.md, todo.md). Different action than creating scaffolds but valid given scaffolds already existed."
197
+ },
198
+ {
199
+ "text": "Ran validate_doc.py on created files to confirm schema",
200
+ "passed": true,
201
+ "evidence": "Ran validate_doc.py docs/ \u2014 7/7 files passed after frontmatter additions."
202
+ }
203
+ ],
204
+ "notes": []
205
+ },
206
+ {
207
+ "eval_id": 1,
208
+ "configuration": "without_skill",
209
+ "run_number": 1,
210
+ "result": {
211
+ "pass_rate": 0.25,
212
+ "passed": 1,
213
+ "failed": 3,
214
+ "total": 4,
215
+ "time_seconds": 0.0,
216
+ "tokens": 0,
217
+ "tool_calls": 0,
218
+ "errors": 0
219
+ },
220
+ "expectations": [
221
+ {
222
+ "text": "Ran context_gatherer.py and reported bd closed issues or merged PRs from the output",
223
+ "passed": false,
224
+ "evidence": "Did not run context_gatherer.py. Used git log manually. Reported 'No .beads/ DB was found' which is wrong \u2014 .beads/ exists. Missed all 20 closed bd issues."
225
+ },
226
+ {
227
+ "text": "Ran doc_structure_analyzer.py and used its output to identify doc issues",
228
+ "passed": false,
229
+ "evidence": "Did not run doc_structure_analyzer.py. Manually read README.md, package.json, and CHANGELOG.md."
230
+ },
231
+ {
232
+ "text": "Produced at least one concrete recommendation or action (not just a vague summary)",
233
+ "passed": true,
234
+ "evidence": "Found version mismatch (2.3.0 vs 2.4.1 in package.json), identified 7 undocumented branch commits in CHANGELOG, named specific line references."
235
+ },
236
+ {
237
+ "text": "Used the skill scripts rather than just reading files manually",
238
+ "passed": false,
239
+ "evidence": "No skill scripts were used. All findings came from manual git log, file reads, and README inspection."
240
+ }
241
+ ],
242
+ "notes": []
243
+ }
244
+ ],
245
+ "run_summary": {
246
+ "with_skill": {
247
+ "pass_rate": {
248
+ "mean": 0.8333,
249
+ "stddev": 0.1443,
250
+ "min": 0.75,
251
+ "max": 1.0
252
+ },
253
+ "time_seconds": {
254
+ "mean": 0.0,
255
+ "stddev": 0.0,
256
+ "min": 0.0,
257
+ "max": 0.0
258
+ },
259
+ "tokens": {
260
+ "mean": 0.0,
261
+ "stddev": 0.0,
262
+ "min": 0,
263
+ "max": 0
264
+ }
265
+ },
266
+ "without_skill": {
267
+ "pass_rate": {
268
+ "mean": 0.6667,
269
+ "stddev": 0.3819,
270
+ "min": 0.25,
271
+ "max": 1.0
272
+ },
273
+ "time_seconds": {
274
+ "mean": 24.1667,
275
+ "stddev": 41.8579,
276
+ "min": 0.0,
277
+ "max": 72.5
278
+ },
279
+ "tokens": {
280
+ "mean": 7311.3333,
281
+ "stddev": 12663.6008,
282
+ "min": 0,
283
+ "max": 21934
284
+ }
285
+ },
286
+ "delta": {
287
+ "pass_rate": "+0.17",
288
+ "time_seconds": "-24.2",
289
+ "tokens": "-7311"
290
+ }
291
+ },
292
+ "notes": []
293
+ }
@@ -0,0 +1,13 @@
1
+ # Skill Benchmark: sync-docs
2
+
3
+ **Model**: <model-name>
4
+ **Date**: 2026-03-18T07:43:29Z
5
+ **Evals**: 1, 2, 3 (3 runs each per configuration)
6
+
7
+ ## Summary
8
+
9
+ | Metric | With Skill | Without Skill | Delta |
10
+ |--------|------------|---------------|-------|
11
+ | Pass Rate | 83% ± 14% | 67% ± 38% | +0.17 |
12
+ | Time | 0.0s ± 0.0s | 24.2s ± 41.9s | -24.2s |
13
+ | Tokens | 0 ± 0 | 7311 ± 12664 | -7311 |
@@ -0,0 +1,27 @@
1
+ {
2
+ "eval_id": 3,
3
+ "eval_name": "doc-audit",
4
+ "prompt": "Do a doc audit. I think the README has sections that should be in docs/ but I'm not sure which ones.",
5
+ "assertions": [
6
+ {
7
+ "id": "ran-analyzer",
8
+ "description": "Ran doc_structure_analyzer.py and referenced its structured output",
9
+ "check": "result.md cites the analyzer output (EXTRACTABLE, BLOATED, line count, or specific section names from the report)"
10
+ },
11
+ {
12
+ "id": "named-specific-sections",
13
+ "description": "Named specific README sections with their suggested docs/ destination",
14
+ "check": "result.md lists at least 2 specific sections (e.g. '## Policy System → docs/policies.md') not just generic advice"
15
+ },
16
+ {
17
+ "id": "actionable-report",
18
+ "description": "Report is actionable — tells user exactly what to do next, not just observations",
19
+ "check": "result.md includes a prioritized list or clear next steps, not just 'the README could be shorter'"
20
+ },
21
+ {
22
+ "id": "no-edits-made",
23
+ "description": "Did not edit or create any files (audit only)",
24
+ "check": "result.md does not claim to have modified README.md or created docs/ files"
25
+ }
26
+ ]
27
+ }