xtrm-tools 2.4.1 → 2.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. package/README.md +15 -6
  2. package/cli/dist/index.cjs +738 -239
  3. package/cli/dist/index.cjs.map +1 -1
  4. package/cli/package.json +1 -1
  5. package/config/hooks.json +10 -0
  6. package/config/pi/extensions/core/adapter.ts +2 -14
  7. package/config/pi/extensions/core/guard-rules.ts +70 -0
  8. package/config/pi/extensions/core/session-state.ts +59 -0
  9. package/config/pi/extensions/main-guard.ts +10 -14
  10. package/config/pi/extensions/plan-mode/README.md +65 -0
  11. package/config/pi/extensions/plan-mode/index.ts +340 -0
  12. package/config/pi/extensions/plan-mode/utils.ts +168 -0
  13. package/config/pi/extensions/service-skills.ts +51 -7
  14. package/config/pi/extensions/session-flow.ts +117 -0
  15. package/hooks/beads-claim-sync.mjs +123 -2
  16. package/hooks/beads-compact-restore.mjs +41 -9
  17. package/hooks/beads-compact-save.mjs +36 -5
  18. package/hooks/beads-gate-messages.mjs +27 -1
  19. package/hooks/beads-stop-gate.mjs +58 -8
  20. package/hooks/guard-rules.mjs +86 -0
  21. package/hooks/hooks.json +28 -18
  22. package/hooks/main-guard.mjs +3 -21
  23. package/hooks/quality-check.cjs +1286 -0
  24. package/hooks/quality-check.py +345 -0
  25. package/hooks/session-state.mjs +138 -0
  26. package/package.json +2 -1
  27. package/project-skills/quality-gates/.claude/settings.json +1 -24
  28. package/skills/creating-service-skills/SKILL.md +433 -0
  29. package/skills/creating-service-skills/references/script_quality_standards.md +425 -0
  30. package/skills/creating-service-skills/references/service_skill_system_guide.md +278 -0
  31. package/skills/creating-service-skills/scripts/bootstrap.py +326 -0
  32. package/skills/creating-service-skills/scripts/deep_dive.py +304 -0
  33. package/skills/creating-service-skills/scripts/scaffolder.py +482 -0
  34. package/skills/scoping-service-skills/SKILL.md +231 -0
  35. package/skills/scoping-service-skills/scripts/scope.py +74 -0
  36. package/skills/sync-docs/SKILL.md +235 -0
  37. package/skills/sync-docs/evals/evals.json +89 -0
  38. package/skills/sync-docs/references/doc-structure.md +104 -0
  39. package/skills/sync-docs/references/schema.md +103 -0
  40. package/skills/sync-docs/scripts/context_gatherer.py +246 -0
  41. package/skills/sync-docs/scripts/doc_structure_analyzer.py +495 -0
  42. package/skills/sync-docs/scripts/validate_doc.py +365 -0
  43. package/skills/sync-docs-workspace/iteration-1/benchmark.json +293 -0
  44. package/skills/sync-docs-workspace/iteration-1/benchmark.md +13 -0
  45. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/eval_metadata.json +27 -0
  46. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/outputs/result.md +210 -0
  47. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/grading.json +28 -0
  48. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/timing.json +1 -0
  49. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/outputs/result.md +101 -0
  50. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/grading.json +28 -0
  51. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/timing.json +5 -0
  52. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/timing.json +5 -0
  53. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/eval_metadata.json +27 -0
  54. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/outputs/result.md +198 -0
  55. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/grading.json +28 -0
  56. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/timing.json +1 -0
  57. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/outputs/result.md +94 -0
  58. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/grading.json +28 -0
  59. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/timing.json +1 -0
  60. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/eval_metadata.json +27 -0
  61. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/outputs/result.md +237 -0
  62. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/grading.json +28 -0
  63. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
  64. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/outputs/result.md +134 -0
  65. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/grading.json +28 -0
  66. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/timing.json +1 -0
  67. package/skills/sync-docs-workspace/iteration-2/benchmark.json +297 -0
  68. package/skills/sync-docs-workspace/iteration-2/benchmark.md +13 -0
  69. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/eval_metadata.json +27 -0
  70. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/outputs/result.md +137 -0
  71. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/grading.json +92 -0
  72. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/timing.json +1 -0
  73. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/outputs/result.md +134 -0
  74. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/grading.json +86 -0
  75. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/timing.json +1 -0
  76. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/eval_metadata.json +27 -0
  77. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/outputs/result.md +193 -0
  78. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/grading.json +72 -0
  79. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/timing.json +1 -0
  80. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/outputs/result.md +211 -0
  81. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/grading.json +91 -0
  82. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/timing.json +5 -0
  83. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/eval_metadata.json +27 -0
  84. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/outputs/result.md +182 -0
  85. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
  86. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
  87. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/outputs/result.md +222 -0
  88. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/grading.json +88 -0
  89. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
  90. package/skills/sync-docs-workspace/iteration-3/benchmark.json +298 -0
  91. package/skills/sync-docs-workspace/iteration-3/benchmark.md +13 -0
  92. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/eval_metadata.json +27 -0
  93. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/outputs/result.md +125 -0
  94. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/grading.json +97 -0
  95. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/timing.json +5 -0
  96. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/outputs/result.md +144 -0
  97. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/grading.json +78 -0
  98. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/timing.json +5 -0
  99. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/eval_metadata.json +27 -0
  100. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/outputs/result.md +104 -0
  101. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/grading.json +91 -0
  102. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/timing.json +5 -0
  103. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/outputs/result.md +79 -0
  104. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/grading.json +82 -0
  105. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/timing.json +5 -0
  106. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/eval_metadata.json +27 -0
  107. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase1_context.json +302 -0
  108. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase2_drift.txt +33 -0
  109. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase3_analysis.json +114 -0
  110. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase4_fix.txt +118 -0
  111. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase5_validate.txt +38 -0
  112. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/result.md +158 -0
  113. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
  114. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/timing.json +5 -0
  115. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/outputs/result.md +71 -0
  116. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/grading.json +90 -0
  117. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
  118. package/skills/updating-service-skills/SKILL.md +136 -0
  119. package/skills/updating-service-skills/scripts/drift_detector.py +222 -0
  120. package/skills/using-quality-gates/SKILL.md +254 -0
  121. package/skills/using-service-skills/SKILL.md +108 -0
  122. package/skills/using-service-skills/scripts/cataloger.py +74 -0
  123. package/skills/using-service-skills/scripts/skill_activator.py +152 -0
  124. package/skills/using-service-skills/scripts/test_skill_activator.py +58 -0
  125. package/skills/using-xtrm/SKILL.md +34 -38
@@ -0,0 +1,297 @@
1
+ {
2
+ "metadata": {
3
+ "skill_name": "sync-docs",
4
+ "skill_path": "<path/to/skill>",
5
+ "executor_model": "<model-name>",
6
+ "analyzer_model": "<model-name>",
7
+ "timestamp": "2026-03-18T13:15:12Z",
8
+ "evals_run": [
9
+ 1,
10
+ 2,
11
+ 3
12
+ ],
13
+ "runs_per_configuration": 3
14
+ },
15
+ "runs": [
16
+ {
17
+ "eval_id": 3,
18
+ "configuration": "with_skill",
19
+ "run_number": 1,
20
+ "result": {
21
+ "pass_rate": 1.0,
22
+ "passed": 4,
23
+ "failed": 0,
24
+ "total": 4,
25
+ "time_seconds": 82.0,
26
+ "tokens": 26000,
27
+ "tool_calls": 0,
28
+ "errors": 0
29
+ },
30
+ "expectations": [
31
+ {
32
+ "text": "Ran doc_structure_analyzer.py and cited its output",
33
+ "passed": true,
34
+ "evidence": "Phase 3 of the report cites specific script output: 'Line count: 192 / 200 threshold', 'Sections: 24', 'Extraction candidates flagged by script: None'. These metrics are consistent with structured script output rather than manual inspection, and the phrasing 'flagged by script' directly attributes them to an automated tool."
35
+ },
36
+ {
37
+ "text": "Named at least 2 specific README sections with their suggested docs/ destination",
38
+ "passed": true,
39
+ "evidence": "The report names three sections with explicit docs/ destinations: 'Hooks Reference' (lines 114-141) -> docs/hooks.md, 'Policy System' (lines 66-87) -> docs/policies.md, 'MCP Servers' (lines 143-158) -> docs/mcp-servers.md. These appear both in the Phase 3 table and in the numbered summary under 'Three sections to replace with summary + link'."
40
+ },
41
+ {
42
+ "text": "Did NOT run --fix or create/edit any files (audit-only mode respected)",
43
+ "passed": true,
44
+ "evidence": "The report header states 'Mode: Audit only (Phase 1-3). No files were modified.' The 'Recommended Next Steps' section frames all actions as future work 'for explicit execution, not done here'. The 'What NOT to Do' section further reinforces restraint. No output files other than result.md are present in the outputs directory."
45
+ },
46
+ {
47
+ "text": "Report is actionable with clear next steps",
48
+ "passed": true,
49
+ "evidence": "The 'Recommended Next Steps' section lists 4 numbered, specific, executable actions: (1) Fix CHANGELOG using add_entry.py, (2) Fix README version badge from 2.3.0 to 2.4.0, (3) Trim README duplicate sections using Serena tools, (4) Update stale Serena memories for hooks and installer architecture. Each step names the specific tool or method to use."
50
+ }
51
+ ],
52
+ "notes": []
53
+ },
54
+ {
55
+ "eval_id": 2,
56
+ "configuration": "with_skill",
57
+ "run_number": 1,
58
+ "result": {
59
+ "pass_rate": 1.0,
60
+ "passed": 4,
61
+ "failed": 0,
62
+ "total": 4,
63
+ "time_seconds": 120.0,
64
+ "tokens": 0,
65
+ "tool_calls": 0,
66
+ "errors": 0
67
+ },
68
+ "expectations": [
69
+ {
70
+ "text": "Ran doc_structure_analyzer.py with --fix flag",
71
+ "passed": true,
72
+ "evidence": "result.md Command Executed section shows: `python3 /home/dawid/projects/xtrm-tools/skills/sync-docs/scripts/doc_structure_analyzer.py --fix --bd-remember --root=/home/dawid/projects/xtrm-tools/.claude/worktrees/agent-ad500878`"
73
+ },
74
+ {
75
+ "text": "Handled both MISSING scaffolds AND INVALID_SCHEMA files (or correctly reported none found)",
76
+ "passed": true,
77
+ "evidence": "result.md shows 5 MISSING scaffold files created (hooks.md, pi-extensions.md, mcp-servers.md, policies.md, skills.md) and 7 INVALID_SCHEMA files fixed by injecting frontmatter (cleanup.md, delegation-architecture.md, hook-system-summary.md, mcp-servers-config.md, pi-extensions-migration.md, pre-install-cleanup.md, todo.md). Both categories were handled."
78
+ },
79
+ {
80
+ "text": "Ran bd remember and reported the memory key",
81
+ "passed": true,
82
+ "evidence": "result.md bd Memory section reports key `sync-docs-fix-2026-03-18` and the full insight string. The script output JSON includes `bd_remember.key = 'sync-docs-fix-2026-03-18'`. Note: `stored: false` because no `.beads/` directory existed in the worktree, but the key was computed and reported, which satisfies the expectation."
83
+ },
84
+ {
85
+ "text": "Ran validate_doc.py docs/ after fixing to confirm results",
86
+ "passed": true,
87
+ "evidence": "result.md validate_doc.py docs/ Result section shows all 12 files passed (12/12) with exit code 0, covering both the 5 newly created scaffolds and the 7 schema-fixed files."
88
+ }
89
+ ],
90
+ "notes": []
91
+ },
92
+ {
93
+ "eval_id": 1,
94
+ "configuration": "with_skill",
95
+ "run_number": 1,
96
+ "result": {
97
+ "pass_rate": 1.0,
98
+ "passed": 4,
99
+ "failed": 0,
100
+ "total": 4,
101
+ "time_seconds": 170.0,
102
+ "tokens": 27000,
103
+ "tool_calls": 0,
104
+ "errors": 2
105
+ },
106
+ "expectations": [
107
+ {
108
+ "text": "Ran context_gatherer.py and reported bd closed issues or merged PRs with specific data",
109
+ "passed": true,
110
+ "evidence": "Phase 1 documents running 'skills/sync-docs/scripts/context_gatherer.py --since=30'. While the script returned empty arrays for bd_closed_issues due to a Dolt server connectivity issue from the worktree, the agent performed a manual fallback using 'bd list --state=closed --limit=10' and reported 8 specific named issues (jaggers-agent-tools-1lc, -7dwo, -8dhs, -9bl, -ihgz, -l1g, -p9wc, plus subtasks). Merged PRs were also reported with specific details: 10 PRs including PR #15 release/2.0.1, PR #14 chore/update-status-doc, PR #13 fix/agents-target, etc."
111
+ },
112
+ {
113
+ "text": "Ran doc_structure_analyzer.py and cited its structured output (STALE, EXTRACTABLE, MISSING, etc.)",
114
+ "passed": true,
115
+ "evidence": "Phase 3 documents running 'skills/sync-docs/scripts/doc_structure_analyzer.py'. The result explicitly cites: README.md as EXTRACTABLE (192 lines, threshold 200) with 5 sections identified for extraction; CHANGELOG.md as STALE; 5 MISSING docs/ files (docs/hooks.md, docs/pi-extensions.md, docs/mcp-servers.md, docs/policies.md, docs/skills.md); and 7 existing docs/ files with INVALID_SCHEMA. The structured output vocabulary (STALE, EXTRACTABLE, MISSING, INVALID_SCHEMA) is used throughout."
116
+ },
117
+ {
118
+ "text": "Detected the CHANGELOG version gap (package.json v2.4.0 vs CHANGELOG v2.0.0)",
119
+ "passed": true,
120
+ "evidence": "Phase 3 explicitly states: 'package.json version: 2.4.0', 'Latest CHANGELOG entry: v2.0.0', 'Gap: v2.1.0 through v2.4.0 are undocumented'. Outstanding Actions item 1 also calls out: 'Add entries for v2.1.0 through v2.4.0 using skills/documenting/scripts/changelog/add_entry.py'."
121
+ },
122
+ {
123
+ "text": "Named at least one concrete next step with a specific file or action",
124
+ "passed": true,
125
+ "evidence": "The 'Outstanding Actions' section names 4 concrete next steps with specific files/tools: (1) 'Add entries for v2.1.0 through v2.4.0 using skills/documenting/scripts/changelog/add_entry.py'; (2) 'Use Serena to extract sections into their new docs/ files'; (3) 'Update 5 stale memories, especially ssot_cli_hooks_2026-02-03'; (4) 'Fill [5 scaffolded files] using Serena or by extracting from README'."
126
+ }
127
+ ],
128
+ "notes": []
129
+ },
130
+ {
131
+ "eval_id": 3,
132
+ "configuration": "without_skill",
133
+ "run_number": 1,
134
+ "result": {
135
+ "pass_rate": 0.75,
136
+ "passed": 3,
137
+ "failed": 1,
138
+ "total": 4,
139
+ "time_seconds": 75.0,
140
+ "tokens": 15000,
141
+ "tool_calls": 0,
142
+ "errors": 0
143
+ },
144
+ "expectations": [
145
+ {
146
+ "text": "Ran doc_structure_analyzer.py and cited its output",
147
+ "passed": false,
148
+ "evidence": "No mention of doc_structure_analyzer.py anywhere in result.md. The audit was performed by directly reading files rather than running any script. No script output is quoted or referenced."
149
+ },
150
+ {
151
+ "text": "Named at least 2 specific README sections with their suggested docs/ destination",
152
+ "passed": true,
153
+ "evidence": "result.md names six README sections with specific destinations: 'Hooks Reference' -> docs/hooks.md, 'Policy System' -> docs/policies.md, 'MCP Servers' -> docs/mcp.md, 'Plugin Structure' -> docs/plugin.md or XTRM-GUIDE.md, 'Skills table' -> docs/skills.md, 'Version History' -> CHANGELOG.md. The recommended moves table further enumerates these with priority levels."
154
+ },
155
+ {
156
+ "text": "Did NOT run --fix or create/edit any files (audit-only mode respected)",
157
+ "passed": true,
158
+ "evidence": "result.md is a read-only audit report. No --fix flag is mentioned, no files were edited, no docs/ files were created or modified. The report only analyzes and recommends."
159
+ },
160
+ {
161
+ "text": "Report is actionable with clear next steps",
162
+ "passed": true,
163
+ "evidence": "result.md includes a 'Recommended Moves (Prioritized)' table with Priority (High/Medium/Low), README Section, Action (Move/Remove/Add), and Target columns. Each section also has a specific Verdict with a concrete instruction (e.g., 'Move this content to docs/policies.md', 'Remove this table from the README and rely on the CHANGELOG.md link')."
164
+ }
165
+ ],
166
+ "notes": []
167
+ },
168
+ {
169
+ "eval_id": 2,
170
+ "configuration": "without_skill",
171
+ "run_number": 1,
172
+ "result": {
173
+ "pass_rate": 1.0,
174
+ "passed": 4,
175
+ "failed": 0,
176
+ "total": 4,
177
+ "time_seconds": 129.8,
178
+ "tokens": 5843,
179
+ "tool_calls": 0,
180
+ "errors": 1
181
+ },
182
+ "expectations": [
183
+ {
184
+ "text": "Ran doc_structure_analyzer.py with --fix flag",
185
+ "passed": true,
186
+ "evidence": "Step 2 of result.md shows the exact command run: 'python3 .../doc_structure_analyzer.py --root=... --fix' with stdout confirming 5 files created and 7 files fixed."
187
+ },
188
+ {
189
+ "text": "Handled both MISSING scaffolds AND INVALID_SCHEMA files (or correctly reported none found)",
190
+ "passed": true,
191
+ "evidence": "Step 2 shows 5 MISSING scaffold files were created (hooks.md, pi-extensions.md, mcp-servers.md, policies.md, skills.md) and 7 INVALID_SCHEMA files had frontmatter injected (cleanup.md, delegation-architecture.md, hook-system-summary.md, mcp-servers-config.md, pi-extensions-migration.md, pre-install-cleanup.md, todo.md). Both categories were explicitly handled."
192
+ },
193
+ {
194
+ "text": "Ran bd remember and reported the memory key",
195
+ "passed": true,
196
+ "evidence": "Step 4 shows bd remember was attempted with key 'sync-docs-fix-2026-03-18'. It failed with 'database jaggers_agent_tools not found on Dolt server at 127.0.0.1:13800', but the key was explicitly reported ('## bd Memory Key (intended): sync-docs-fix-2026-03-18') and the attempt was fully documented. Per grading note, attempt + reported key satisfies this assertion."
197
+ },
198
+ {
199
+ "text": "Ran validate_doc.py docs/ after fixing to confirm results",
200
+ "passed": true,
201
+ "evidence": "Step 3 shows the command 'python3 .../validate_doc.py .../docs/' was run post-fix, with all 12 files showing [PASS] and final output 'Result: 12/12 files passed'."
202
+ }
203
+ ],
204
+ "notes": [
205
+ "README.md EXTRACTABLE \u2014 5 sections should move to docs/ but requires content judgment",
206
+ "CHANGELOG.md STALE \u2014 v2.4.0 undocumented; requires add_entry.py and human input",
207
+ "bd remember failed due to Dolt server database mismatch; key was reported manually in result.md"
208
+ ]
209
+ },
210
+ {
211
+ "eval_id": 1,
212
+ "configuration": "without_skill",
213
+ "run_number": 1,
214
+ "result": {
215
+ "pass_rate": 0.5,
216
+ "passed": 2,
217
+ "failed": 2,
218
+ "total": 4,
219
+ "time_seconds": 219.9,
220
+ "tokens": 8123,
221
+ "tool_calls": 0,
222
+ "errors": 0
223
+ },
224
+ "expectations": [
225
+ {
226
+ "text": "Ran context_gatherer.py and reported bd closed issues or merged PRs with specific data",
227
+ "passed": false,
228
+ "evidence": "The result contains specific bd closed issue data (#38, #33 with dates) and merged PR data (#102\u2013#111 with titles and dates). However, the report header explicitly states 'Assessed by: Manual review (git log, gh issue/pr, file reads)' \u2014 context_gatherer.py was never invoked. The data is present but was gathered manually, not via the script. The expectation requires the script to have been run."
229
+ },
230
+ {
231
+ "text": "Ran doc_structure_analyzer.py and cited its structured output (STALE, EXTRACTABLE, MISSING, etc.)",
232
+ "passed": false,
233
+ "evidence": "No mention of doc_structure_analyzer.py anywhere in the result. The documentation analysis uses informal labels like 'Drifted' and 'Stale' from the agent's own judgment, not the structured taxonomy (STALE, EXTRACTABLE, MISSING) that the script would emit. The script was not run."
234
+ },
235
+ {
236
+ "text": "Detected the CHANGELOG version gap (package.json v2.4.0 vs CHANGELOG v2.0.0)",
237
+ "passed": true,
238
+ "evidence": "Section 2 of the result is titled 'CHANGELOG.md Has No v2.4.0 Entry' and is rated High severity and listed as the #1 recommended action: 'Write the [2.4.0] CHANGELOG entry \u2014 this is the most critical gap. The release shipped but has no record.' The result also references the package version as 2.4.1 (cli/package.json) and 2.4.0 (released via PR #110). The specific last CHANGELOG version is not named but the gap is clearly identified and substantiated."
239
+ },
240
+ {
241
+ "text": "Named at least one concrete next step with a specific file or action",
242
+ "passed": true,
243
+ "evidence": "The 'Recommended Actions (Priority Order)' section lists 7 concrete steps, each referencing specific files: e.g., '1. Write the [2.4.0] CHANGELOG entry', '2. Update README.md version badge, example output, and version history table to 2.4.0', '4. Bump plugins/xtrm-tools/.claude-plugin/plugin.json version to 2.4.0'."
244
+ }
245
+ ],
246
+ "notes": []
247
+ }
248
+ ],
249
+ "run_summary": {
250
+ "with_skill": {
251
+ "pass_rate": {
252
+ "mean": 1.0,
253
+ "stddev": 0.0,
254
+ "min": 1.0,
255
+ "max": 1.0
256
+ },
257
+ "time_seconds": {
258
+ "mean": 124.0,
259
+ "stddev": 44.1362,
260
+ "min": 82.0,
261
+ "max": 170.0
262
+ },
263
+ "tokens": {
264
+ "mean": 17666.6667,
265
+ "stddev": 15307.95,
266
+ "min": 0,
267
+ "max": 27000
268
+ }
269
+ },
270
+ "without_skill": {
271
+ "pass_rate": {
272
+ "mean": 0.75,
273
+ "stddev": 0.25,
274
+ "min": 0.5,
275
+ "max": 1.0
276
+ },
277
+ "time_seconds": {
278
+ "mean": 141.5667,
279
+ "stddev": 73.1631,
280
+ "min": 75.0,
281
+ "max": 219.9
282
+ },
283
+ "tokens": {
284
+ "mean": 9655.3333,
285
+ "stddev": 4766.9378,
286
+ "min": 5843,
287
+ "max": 15000
288
+ }
289
+ },
290
+ "delta": {
291
+ "pass_rate": "+0.25",
292
+ "time_seconds": "-17.6",
293
+ "tokens": "+8011"
294
+ }
295
+ },
296
+ "notes": []
297
+ }
@@ -0,0 +1,13 @@
1
+ # Skill Benchmark: sync-docs
2
+
3
+ **Model**: <model-name>
4
+ **Date**: 2026-03-18T13:15:12Z
5
+ **Evals**: 1, 2, 3 (1 run each; 3 runs per configuration)
6
+
7
+ ## Summary
8
+
9
+ | Metric | With Skill | Without Skill | Delta |
10
+ |--------|------------|---------------|-------|
11
+ | Pass Rate | 100% ± 0% | 75% ± 25% | +25 pp |
12
+ | Time | 124.0s ± 44.1s | 141.6s ± 73.2s | -17.6s |
13
+ | Tokens | 17667 ± 15308 | 9655 ± 4767 | +8011 |
@@ -0,0 +1,27 @@
1
+ {
2
+ "eval_id": 3,
3
+ "eval_name": "doc-audit",
4
+ "prompt": "Do a doc audit. I think the README has sections that should be in docs/ but I'm not sure which ones.",
5
+ "assertions": [
6
+ {
7
+ "text": "Ran doc_structure_analyzer.py and cited its output",
8
+ "passed": false,
9
+ "evidence": ""
10
+ },
11
+ {
12
+ "text": "Named at least 2 specific README sections with their suggested docs/ destination",
13
+ "passed": false,
14
+ "evidence": ""
15
+ },
16
+ {
17
+ "text": "Did NOT run --fix or create/edit any files (audit-only mode respected)",
18
+ "passed": false,
19
+ "evidence": ""
20
+ },
21
+ {
22
+ "text": "Report is actionable with clear next steps",
23
+ "passed": false,
24
+ "evidence": ""
25
+ }
26
+ ]
27
+ }
@@ -0,0 +1,137 @@
1
+ # Doc Audit Report — xtrm-tools
2
+
3
+ **Date:** 2026-03-18
4
+ **Task:** Audit README for sections that should be in docs/
5
+ **Mode:** Audit only (Phase 1–3). No files were modified.
6
+
7
+ ---
8
+
9
+ ## Phase 1: Context Summary
10
+
11
+ ### Recent Activity
12
+
13
+ **Merged PRs (last ~30 days):**
14
+ - PR #15 — release/2.0.1 (2026-03-13)
15
+ - PR #14 — chore/update-status-doc (2026-03-13)
16
+ - PR #13 — fix/agents-target (2026-03-13)
17
+ - PR #12 — feat/project-install-all (2026-03-13)
18
+ - PR #8 — phase2-cli-refactor (2026-03-12)
19
+
20
+ **Recent significant commits (today):**
21
+ - Add Pi extension drift checks and guard-rules parity
22
+ - Centralize guard tool rules and matcher expansion
23
+ - Deprecate install project command in favor of xtrm init
24
+ - Add global-first architecture regression tests
25
+ - Add project detection and service registry scaffolding to xtrm init
26
+
27
+ **Active epic:** `jaggers-agent-tools-4xr6` — Global-first plugin architecture (hooks, skills, Pi extensions all go global; `xtrm init` replaces `install project`)
28
+
29
+ This is a significant structural change cycle. The CLI commands table in README already shows `project init` but also still lists `install project <name>` — these may be in conflict now that install-project is deprecated.
30
+
31
+ ---
32
+
33
+ ## Phase 2: SSOT Drift (Serena Memories)
34
+
35
+ **5 stale memories detected:**
36
+
37
+ | Memory | Last Updated | Modified Files |
38
+ |---|---|---|
39
+ | `ssot_cli_hooks_2026-02-03` | 2026-02-25 | hooks/guard-rules.mjs, hooks/hooks.json, hooks/main-guard.mjs |
40
+ | `ssot_cli_universal_hub_2026-02-19` | 2026-02-25 | cli/src/commands/install-pi.ts, cli/src/tests/policy-parity.test.ts |
41
+ | `ssot_cli_ux_improvements_2026-02-22` | 2026-02-25 | cli/src/commands/install-pi.ts, cli/src/commands/install-project.ts |
42
+ | `ssot_jaggers-agent-tools_installer_architecture_2026-02-03` | 2026-02-25 | cli/src/commands/install-pi.ts, cli/src/tests/policy-parity.test.ts |
43
+ | `ssot_jaggers-agent-tools_migration_2026-02-01` | 2026-02-01 | cli/src/commands/install-pi.ts, cli/src/tests/policy-parity.test.ts |
44
+
45
+ The hooks memory (`ssot_cli_hooks_2026-02-03`) is stale due to the guard-rules centralization work done today. The installer architecture memories are stale due to the global-first migration and deprecation of `install-project`. These need updating but are out of scope for this audit (they require Serena tools and explicit intent to fix).
46
+
47
+ ---
48
+
49
+ ## Phase 3: Document Structure Analysis
50
+
51
+ ### README.md — Status: OK (borderline)
52
+
53
+ - **Line count:** 192 / 200 threshold
54
+ - **Sections:** 24
55
+ - **Extraction candidates flagged by script:** None
56
+
57
+ The script reports `OK` because README is 8 lines under the 200-line bloat threshold. However, manual review reveals several sections that either already have a dedicated docs/ file or are substantive enough to warrant one:
58
+
59
+ #### Sections with candidate docs/ homes
60
+
61
+ | README Section | Lines | Status | Recommended Action |
62
+ |---|---|---|---|
63
+ | **Hooks Reference** (lines 114–141) | ~28 lines | Has `docs/hooks.md` | README section should be a 1-line summary + link to `docs/hooks.md` |
64
+ | **MCP Servers** (lines 143–158) | ~16 lines | Has `docs/mcp-servers.md` | README section is a partial duplicate of `docs/mcp-servers.md` |
65
+ | **Policy System** (lines 66–87) | ~22 lines | Has `docs/policies.md` | README section should be a 1-line summary + link to `docs/policies.md` |
66
+ | **CLI Commands** (lines 89–111) | ~23 lines | No `docs/cli-reference.md` | At 6 commands + 3 flags this is borderline; when CLI grows, extract |
67
+ | **Version History** (lines 179–187) | ~9 lines | Has CHANGELOG.md | Already linked; table is a useful quick summary, keep |
68
+ | **Issue Tracking (Beads)** (lines 161–168) | ~8 lines | No dedicated docs/ file | Short enough to keep in README |
69
+
70
+ **Key finding:** README has three sections (Hooks Reference, MCP Servers, Policy System) that directly duplicate content already in dedicated docs/ files. These sections should be replaced with single-line summaries + links. This would bring README down to approximately 130–140 lines and eliminate the drift risk.
71
+
72
+ ### CHANGELOG.md — Status: STALE (critical)
73
+
74
+ - **package.json version:** 2.4.0
75
+ - **Latest CHANGELOG entry:** 2.0.0 (2026-03-12)
76
+ - **Gap:** v2.1.x, v2.2.0, v2.3.0, and v2.4.0 are all undocumented in CHANGELOG
77
+ - **Note:** README header also still says "Version 2.3.0" — should be 2.4.0
78
+
79
+ The CHANGELOG has not been updated across multiple release cycles. This is the most critical finding.
80
+
81
+ ### docs/ Files — Status: All OK
82
+
83
+ All 10 existing docs/ files pass schema validation (have YAML frontmatter, no structural issues):
84
+
85
+ - docs/hooks.md (133 lines) — OK
86
+ - docs/mcp-servers.md (25 lines) — OK
87
+ - docs/mcp.md (110 lines) — OK
88
+ - docs/pi-extensions.md (25 lines) — OK
89
+ - docs/policies.md (25 lines) — OK
90
+ - docs/pre-install-cleanup.md (128 lines) — OK
91
+ - docs/project-skills.md (105 lines) — OK
92
+ - docs/skills.md (114 lines) — OK
93
+ - docs/testing.md (148 lines) — OK
94
+ - docs/todo.md (14 lines) — OK
95
+
96
+ **No missing docs/ gaps** were flagged by the script for existing subsystems.
97
+
98
+ ---
99
+
100
+ ## Summary of Findings
101
+
102
+ ### README Duplication (answer to the user's question)
103
+
104
+ The README does contain sections that belong in docs/ — but not because they're missing docs/ files. The docs/ files already exist. The problem is that README still carries full content in those sections instead of pointing to the dedicated files.
105
+
106
+ **Three sections to replace with summary + link:**
107
+
108
+ 1. **"Hooks Reference"** (lines 114–141, ~28 lines) → summarize in 2–3 lines + link to `docs/hooks.md`
109
+ 2. **"Policy System"** (lines 66–87, ~22 lines) → summarize in 2–3 lines + link to `docs/policies.md`
110
+ 3. **"MCP Servers"** (lines 143–158, ~16 lines) → summarize in 1–2 lines + link to `docs/mcp-servers.md`
111
+
112
+ Doing this would remove ~60 lines from README (192 → ~132), well within the healthy range, and eliminate drift between README and its docs/ counterparts.
113
+
114
+ ### CHANGELOG is critically stale
115
+
116
+ Four release versions (2.1.x through 2.4.0) have no CHANGELOG entries. The README version badge is also one version behind (shows 2.3.0, package.json is 2.4.0).
117
+
118
+ ### Stale Serena memories (5 total)
119
+
120
+ All relate to hooks and installer architecture — both subsystems were modified today as part of the global-first migration. These need updating via the `/documenting` skill or Phase 4 of this skill.
121
+
122
+ ---
123
+
124
+ ## What NOT to Do
125
+
126
+ - Do not extract Beads, CLI Commands, Version History, or Quick Start from README — these are appropriate for a README-level entry point.
127
+ - Do not create new docs/ files — all expected subsystem files already exist.
128
+ - Do not touch the docs/ files themselves — they all pass schema validation.
129
+
130
+ ---
131
+
132
+ ## Recommended Next Steps (for explicit execution, not done here)
133
+
134
+ 1. **Fix CHANGELOG** — add entries for v2.1.x, v2.2.0, v2.3.0, v2.4.0 using `add_entry.py`
135
+ 2. **Fix README version badge** — update "Version 2.3.0" to "Version 2.4.0"
136
+ 3. **Trim README duplicate sections** — replace Hooks Reference, Policy System, MCP Servers with summary + link (use Serena tools, not direct Edit)
137
+ 4. **Update stale Serena memories** — especially `ssot_cli_hooks_*` and `ssot_jaggers-agent-tools_installer_architecture_*` given today's guard-rules and global-first changes
@@ -0,0 +1,92 @@
1
+ {
2
+ "expectations": [
3
+ {
4
+ "text": "Ran doc_structure_analyzer.py and cited its output",
5
+ "passed": true,
6
+ "evidence": "Phase 3 of the report cites specific script output: 'Line count: 192 / 200 threshold', 'Sections: 24', 'Extraction candidates flagged by script: None'. These metrics are consistent with structured script output rather than manual inspection, and the phrasing 'flagged by script' directly attributes them to an automated tool."
7
+ },
8
+ {
9
+ "text": "Named at least 2 specific README sections with their suggested docs/ destination",
10
+ "passed": true,
11
+ "evidence": "The report names three sections with explicit docs/ destinations: 'Hooks Reference' (lines 114-141) -> docs/hooks.md, 'Policy System' (lines 66-87) -> docs/policies.md, 'MCP Servers' (lines 143-158) -> docs/mcp-servers.md. These appear both in the Phase 3 table and in the numbered summary under 'Three sections to replace with summary + link'."
12
+ },
13
+ {
14
+ "text": "Did NOT run --fix or create/edit any files (audit-only mode respected)",
15
+ "passed": true,
16
+ "evidence": "The report header states 'Mode: Audit only (Phase 1-3). No files were modified.' The 'Recommended Next Steps' section frames all actions as future work 'for explicit execution, not done here'. The 'What NOT to Do' section further reinforces restraint. No output files other than result.md are present in the outputs directory."
17
+ },
18
+ {
19
+ "text": "Report is actionable with clear next steps",
20
+ "passed": true,
21
+ "evidence": "The 'Recommended Next Steps' section lists 4 numbered, specific, executable actions: (1) Fix CHANGELOG using add_entry.py, (2) Fix README version badge from 2.3.0 to 2.4.0, (3) Trim README duplicate sections using Serena tools, (4) Update stale Serena memories for hooks and installer architecture. Each step names the specific tool or method to use."
22
+ }
23
+ ],
24
+ "summary": {
25
+ "passed": 4,
26
+ "failed": 0,
27
+ "total": 4,
28
+ "pass_rate": 1.0
29
+ },
30
+ "execution_metrics": {
31
+ "tool_calls": {},
32
+ "total_tool_calls": 0,
33
+ "total_steps": 0,
34
+ "errors_encountered": 0,
35
+ "output_chars": 3871,
36
+ "transcript_chars": 0
37
+ },
38
+ "timing": {
39
+ "executor_duration_seconds": 0.0,
40
+ "grader_duration_seconds": 0.0,
41
+ "total_duration_seconds": 0.0
42
+ },
43
+ "claims": [
44
+ {
45
+ "claim": "README is 192 lines, just under the 200-line threshold",
46
+ "type": "factual",
47
+ "verified": false,
48
+ "evidence": "Reported as script output but the README was not independently verified in grading. The number is internally consistent with the report's conclusion that README is 'borderline'."
49
+ },
50
+ {
51
+ "claim": "5 stale Serena memories were detected",
52
+ "type": "factual",
53
+ "verified": false,
54
+ "evidence": "The report lists 5 specific memory entries with dates and associated files. Cannot verify without access to the Serena memory store, but the specificity (named files, timestamps) suggests genuine inspection rather than fabrication."
55
+ },
56
+ {
57
+ "claim": "All 10 existing docs/ files pass schema validation",
58
+ "type": "quality",
59
+ "verified": false,
60
+ "evidence": "10 docs/ files are listed with 'OK' status. The report attributes this to schema validation but no validator output or script name is cited for this check, making it less verifiable than the doc_structure_analyzer output."
61
+ },
62
+ {
63
+ "claim": "CHANGELOG is missing entries for v2.1.x through v2.4.0",
64
+ "type": "factual",
65
+ "verified": false,
66
+ "evidence": "Report states package.json shows 2.4.0 but CHANGELOG's latest entry is 2.0.0. Plausible given the stated active development cycle but not independently verified in grading."
67
+ },
68
+ {
69
+ "claim": "No files were modified during the run",
70
+ "type": "process",
71
+ "verified": true,
72
+ "evidence": "Only result.md is present in the outputs directory. No other files were created or edited as part of the run."
73
+ }
74
+ ],
75
+ "user_notes_summary": {
76
+ "uncertainties": [],
77
+ "needs_review": [],
78
+ "workarounds": []
79
+ },
80
+ "eval_feedback": {
81
+ "suggestions": [
82
+ {
83
+ "assertion": "Ran doc_structure_analyzer.py and cited its output",
84
+ "reason": "The assertion passes based on plausible script-attributed output in the report, but there is no transcript available to confirm the script was actually executed (e.g., a Bash tool call). A stronger assertion would require a transcript showing the actual tool invocation, or the raw script output as a separate artifact in outputs/. As written, a well-crafted fabrication would also pass this assertion."
85
+ },
86
+ {
87
+ "reason": "No assertion checks the accuracy of the section-to-docs/ mapping. The report correctly identifies docs/hooks.md, docs/policies.md, and docs/mcp-servers.md as targets, but an eval that verifies those files actually exist in the repo would catch hallucinated destinations. Consider adding: 'All recommended docs/ destinations exist as actual files in the repository'."
88
+ }
89
+ ],
90
+ "overall": "The four assertions are well-chosen and cover the key audit behaviors. The main gap is that expectation 1 (script was run) cannot be fully verified without a transcript or raw script output artifact. The eval would be stronger if it required evidence of actual execution rather than cited output alone."
91
+ }
92
+ }
@@ -0,0 +1 @@
1
+ {"total_tokens": 26000, "duration_ms": 82000, "total_duration_seconds": 82.0}