xtrm-tools 2.4.1 → 2.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. package/README.md +15 -6
  2. package/cli/dist/index.cjs +738 -239
  3. package/cli/dist/index.cjs.map +1 -1
  4. package/cli/package.json +1 -1
  5. package/config/hooks.json +10 -0
  6. package/config/pi/extensions/core/adapter.ts +2 -14
  7. package/config/pi/extensions/core/guard-rules.ts +70 -0
  8. package/config/pi/extensions/core/session-state.ts +59 -0
  9. package/config/pi/extensions/main-guard.ts +10 -14
  10. package/config/pi/extensions/plan-mode/README.md +65 -0
  11. package/config/pi/extensions/plan-mode/index.ts +340 -0
  12. package/config/pi/extensions/plan-mode/utils.ts +168 -0
  13. package/config/pi/extensions/service-skills.ts +51 -7
  14. package/config/pi/extensions/session-flow.ts +117 -0
  15. package/hooks/beads-claim-sync.mjs +140 -14
  16. package/hooks/beads-compact-restore.mjs +41 -9
  17. package/hooks/beads-compact-save.mjs +36 -5
  18. package/hooks/beads-gate-messages.mjs +27 -1
  19. package/hooks/beads-memory-gate.mjs +24 -16
  20. package/hooks/beads-stop-gate.mjs +58 -8
  21. package/hooks/guard-rules.mjs +117 -0
  22. package/hooks/hooks.json +28 -18
  23. package/hooks/main-guard.mjs +22 -22
  24. package/hooks/quality-check.cjs +1286 -0
  25. package/hooks/quality-check.py +345 -0
  26. package/hooks/session-state.mjs +138 -0
  27. package/package.json +2 -1
  28. package/project-skills/quality-gates/.claude/settings.json +1 -24
  29. package/skills/creating-service-skills/SKILL.md +433 -0
  30. package/skills/creating-service-skills/references/script_quality_standards.md +425 -0
  31. package/skills/creating-service-skills/references/service_skill_system_guide.md +278 -0
  32. package/skills/creating-service-skills/scripts/bootstrap.py +326 -0
  33. package/skills/creating-service-skills/scripts/deep_dive.py +304 -0
  34. package/skills/creating-service-skills/scripts/scaffolder.py +482 -0
  35. package/skills/scoping-service-skills/SKILL.md +231 -0
  36. package/skills/scoping-service-skills/scripts/scope.py +74 -0
  37. package/skills/sync-docs/SKILL.md +235 -0
  38. package/skills/sync-docs/evals/evals.json +89 -0
  39. package/skills/sync-docs/references/doc-structure.md +104 -0
  40. package/skills/sync-docs/references/schema.md +103 -0
  41. package/skills/sync-docs/scripts/context_gatherer.py +246 -0
  42. package/skills/sync-docs/scripts/doc_structure_analyzer.py +495 -0
  43. package/skills/sync-docs/scripts/validate_doc.py +365 -0
  44. package/skills/sync-docs-workspace/iteration-1/benchmark.json +293 -0
  45. package/skills/sync-docs-workspace/iteration-1/benchmark.md +13 -0
  46. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/eval_metadata.json +27 -0
  47. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/outputs/result.md +210 -0
  48. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/grading.json +28 -0
  49. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/timing.json +1 -0
  50. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/outputs/result.md +101 -0
  51. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/grading.json +28 -0
  52. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/timing.json +5 -0
  53. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/timing.json +5 -0
  54. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/eval_metadata.json +27 -0
  55. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/outputs/result.md +198 -0
  56. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/grading.json +28 -0
  57. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/timing.json +1 -0
  58. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/outputs/result.md +94 -0
  59. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/grading.json +28 -0
  60. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/timing.json +1 -0
  61. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/eval_metadata.json +27 -0
  62. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/outputs/result.md +237 -0
  63. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/grading.json +28 -0
  64. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
  65. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/outputs/result.md +134 -0
  66. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/grading.json +28 -0
  67. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/timing.json +1 -0
  68. package/skills/sync-docs-workspace/iteration-2/benchmark.json +297 -0
  69. package/skills/sync-docs-workspace/iteration-2/benchmark.md +13 -0
  70. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/eval_metadata.json +27 -0
  71. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/outputs/result.md +137 -0
  72. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/grading.json +92 -0
  73. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/timing.json +1 -0
  74. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/outputs/result.md +134 -0
  75. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/grading.json +86 -0
  76. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/timing.json +1 -0
  77. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/eval_metadata.json +27 -0
  78. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/outputs/result.md +193 -0
  79. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/grading.json +72 -0
  80. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/timing.json +1 -0
  81. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/outputs/result.md +211 -0
  82. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/grading.json +91 -0
  83. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/timing.json +5 -0
  84. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/eval_metadata.json +27 -0
  85. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/outputs/result.md +182 -0
  86. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
  87. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
  88. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/outputs/result.md +222 -0
  89. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/grading.json +88 -0
  90. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
  91. package/skills/sync-docs-workspace/iteration-3/benchmark.json +298 -0
  92. package/skills/sync-docs-workspace/iteration-3/benchmark.md +13 -0
  93. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/eval_metadata.json +27 -0
  94. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/outputs/result.md +125 -0
  95. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/grading.json +97 -0
  96. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/timing.json +5 -0
  97. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/outputs/result.md +144 -0
  98. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/grading.json +78 -0
  99. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/timing.json +5 -0
  100. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/eval_metadata.json +27 -0
  101. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/outputs/result.md +104 -0
  102. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/grading.json +91 -0
  103. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/timing.json +5 -0
  104. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/outputs/result.md +79 -0
  105. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/grading.json +82 -0
  106. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/timing.json +5 -0
  107. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/eval_metadata.json +27 -0
  108. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase1_context.json +302 -0
  109. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase2_drift.txt +33 -0
  110. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase3_analysis.json +114 -0
  111. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase4_fix.txt +118 -0
  112. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase5_validate.txt +38 -0
  113. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/result.md +158 -0
  114. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
  115. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/timing.json +5 -0
  116. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/outputs/result.md +71 -0
  117. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/grading.json +90 -0
  118. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
  119. package/skills/updating-service-skills/SKILL.md +136 -0
  120. package/skills/updating-service-skills/scripts/drift_detector.py +222 -0
  121. package/skills/using-quality-gates/SKILL.md +254 -0
  122. package/skills/using-service-skills/SKILL.md +108 -0
  123. package/skills/using-service-skills/scripts/cataloger.py +74 -0
  124. package/skills/using-service-skills/scripts/skill_activator.py +152 -0
  125. package/skills/using-service-skills/scripts/test_skill_activator.py +58 -0
  126. package/skills/using-xtrm/SKILL.md +34 -38
@@ -0,0 +1,211 @@
1
+ # sync-docs --fix: Evaluation Result (without_skill)
2
+
3
+ ## Summary
4
+
5
+ Ran `doc_structure_analyzer.py --fix` on the project worktree at
6
+ `/home/dawid/projects/xtrm-tools/.claude/worktrees/agent-a881ecc1`.
7
+
8
+ All MISSING scaffold files were created and all INVALID_SCHEMA files had
9
+ frontmatter injected. All 12 docs/ files passed schema validation afterward.
10
+ `bd remember` was attempted but failed — the Dolt server is running but the
11
+ `jaggers_agent_tools` database was not found at `127.0.0.1:13800`.
12
+
13
+ ---
14
+
15
+ ## Step 1: Initial analysis (no --fix)
16
+
17
+ **Command:**
18
+ ```
19
+ python3 /home/dawid/projects/xtrm-tools/skills/sync-docs/scripts/doc_structure_analyzer.py \
20
+ --root=/home/dawid/projects/xtrm-tools/.claude/worktrees/agent-a881ecc1
21
+ ```
22
+
23
+ **Result (exit 1, 14 issues):**
24
+
25
+ - README.md: EXTRACTABLE (192 lines, 5 sections suggest docs/ files)
26
+ - CHANGELOG.md: STALE (package.json at v2.4.0, latest CHANGELOG entry is v2.0.0)
27
+ - docs_gaps (5 MISSING):
28
+ - docs/hooks.md — hooks/ directory exists
29
+ - docs/pi-extensions.md — Pi extensions directory exists
30
+ - docs/mcp-servers.md — .mcp.json present
31
+ - docs/policies.md — policies/ directory exists
32
+ - docs/skills.md — skills/ directory exists
33
+ - existing_docs (7 INVALID_SCHEMA — all missing YAML frontmatter):
34
+ - docs/cleanup.md
35
+ - docs/delegation-architecture.md
36
+ - docs/hook-system-summary.md
37
+ - docs/mcp-servers-config.md
38
+ - docs/pi-extensions-migration.md
39
+ - docs/pre-install-cleanup.md
40
+ - docs/todo.md
41
+
42
+ ---
43
+
44
+ ## Step 2: Run --fix
45
+
46
+ **Command:**
47
+ ```
48
+ python3 /home/dawid/projects/xtrm-tools/skills/sync-docs/scripts/doc_structure_analyzer.py \
49
+ --root=/home/dawid/projects/xtrm-tools/.claude/worktrees/agent-a881ecc1 --fix
50
+ ```
51
+
52
+ **Script stdout:**
53
+ ```
54
+ Fixing 5 missing docs/ files...
55
+ CREATED docs/hooks.md
56
+ CREATED docs/pi-extensions.md
57
+ CREATED docs/mcp-servers.md
58
+ CREATED docs/policies.md
59
+ CREATED docs/skills.md
60
+
61
+ Injecting frontmatter into 7 schema-invalid docs/ files...
62
+ FIXED docs/cleanup.md
63
+ FIXED docs/delegation-architecture.md
64
+ FIXED docs/hook-system-summary.md
65
+ FIXED docs/mcp-servers-config.md
66
+ FIXED docs/pi-extensions-migration.md
67
+ FIXED docs/pre-install-cleanup.md
68
+ FIXED docs/todo.md
69
+ ```
70
+
71
+ **fix_created** (5 scaffold files generated via validate_doc.py --generate):
72
+ - docs/hooks.md
73
+ - docs/pi-extensions.md
74
+ - docs/mcp-servers.md
75
+ - docs/policies.md
76
+ - docs/skills.md
77
+
78
+ **fix_schema_fixed** (7 files had minimal frontmatter prepended):
79
+ - docs/cleanup.md
80
+ - docs/delegation-architecture.md
81
+ - docs/hook-system-summary.md
82
+ - docs/mcp-servers-config.md
83
+ - docs/pi-extensions-migration.md
84
+ - docs/pre-install-cleanup.md
85
+ - docs/todo.md
86
+
87
+ Note: The JSON report's `docs_gaps` and `existing_docs` still show pre-fix
88
+ state — this is expected. The script captures the state before applying fixes.
89
+ The `fix_created` and `fix_schema_fixed` keys confirm actual changes made.
90
+
91
+ ---
92
+
93
+ ## Step 3: Post-fix validation
94
+
95
+ **Command:**
96
+ ```
97
+ python3 /home/dawid/projects/xtrm-tools/skills/sync-docs/scripts/validate_doc.py \
98
+ /home/dawid/projects/xtrm-tools/.claude/worktrees/agent-a881ecc1/docs/
99
+ ```
100
+
101
+ **Output:**
102
+ ```
103
+ docs/cleanup.md [PASS]
104
+ WARN: INDEX regenerated
105
+
106
+ docs/delegation-architecture.md [PASS]
107
+ WARN: INDEX regenerated
108
+
109
+ docs/hook-system-summary.md [PASS]
110
+ WARN: INDEX regenerated
111
+
112
+ docs/hooks.md [PASS]
113
+ WARN: INDEX regenerated
114
+
115
+ docs/mcp-servers-config.md [PASS]
116
+ WARN: INDEX regenerated
117
+
118
+ docs/mcp-servers.md [PASS]
119
+ WARN: INDEX regenerated
120
+
121
+ docs/pi-extensions-migration.md [PASS]
122
+ WARN: INDEX regenerated
123
+
124
+ docs/pi-extensions.md [PASS]
125
+ WARN: INDEX regenerated
126
+
127
+ docs/policies.md [PASS]
128
+ WARN: INDEX regenerated
129
+
130
+ docs/pre-install-cleanup.md [PASS]
131
+ WARN: INDEX regenerated
132
+
133
+ docs/skills.md [PASS]
134
+ WARN: INDEX regenerated
135
+
136
+ docs/todo.md [PASS]
137
+ All checks passed.
138
+
139
+ Result: 12/12 files passed
140
+ ```
141
+
142
+ All 12 docs/ files pass schema validation. The INDEX regenerated warnings are
143
+ expected — validate_doc.py auto-regenerates the `<!-- INDEX -->` table on
144
+ first pass for files that didn't have one yet.
145
+
146
+ ---
147
+
148
+ ## Step 4: bd remember
149
+
150
+ **Attempted key:** `sync-docs-fix-2026-03-18`
151
+
152
+ **Attempted insight:**
153
+ ```
154
+ sync-docs --fix: created 5 scaffold(s): hooks.md, pi-extensions.md,
155
+ mcp-servers.md, policies.md, skills.md; added frontmatter to 7 existing
156
+ file(s): cleanup.md, delegation-architecture.md, hook-system-summary.md,
157
+ mcp-servers-config.md, pi-extensions-migration.md, pre-install-cleanup.md,
158
+ todo.md. All 12 docs/ files now pass schema validation. Fill in content and
159
+ run validate_doc.py docs/ to confirm schema.
160
+ ```
161
+
162
+ **Command:**
163
+ ```
164
+ bd remember "sync-docs --fix: created 5 scaffold(s)..." --key sync-docs-fix-2026-03-18
165
+ ```
166
+
167
+ **Result: FAILED**
168
+
169
+ Error from `bd remember`:
170
+ ```
171
+ Error: failed to open database: database "jaggers_agent_tools" not found on
172
+ Dolt server at 127.0.0.1:13800
173
+ ```
174
+
175
+ `bd doctor` confirmed the Dolt server is running but serving a different data
176
+ directory — the `jaggers_agent_tools` database is not present. This is a
177
+ server configuration mismatch, not a code issue. The `--bd-remember` flag in
178
+ `doc_structure_analyzer.py` would have hit the same failure (it also guards
179
+ on `(root / ".beads").exists()` — which passed — but the actual `bd` CLI call
180
+ fails at the database layer).
181
+
182
+ **Note on --bd-remember vs manual bd remember:** The script's `--bd-remember`
183
+ flag skips calling `bd remember` unless `.beads/` exists, which it does. The
184
+ failure happens deeper, in the Dolt connection. Running `bd remember` manually
185
+ produces the same error.
186
+
187
+ ---
188
+
189
+ ## What Was Fixed
190
+
191
+ | Category | Count | Files |
192
+ |---|---|---|
193
+ | MISSING scaffolds created | 5 | hooks.md, pi-extensions.md, mcp-servers.md, policies.md, skills.md |
194
+ | INVALID_SCHEMA frontmatter injected | 7 | cleanup.md, delegation-architecture.md, hook-system-summary.md, mcp-servers-config.md, pi-extensions-migration.md, pre-install-cleanup.md, todo.md |
195
+ | Schema validation passes | 12/12 | All docs/ files |
196
+
197
+ ## What Was NOT Fixed (requires manual/Serena work)
198
+
199
+ | Issue | Reason |
200
+ |---|---|
201
+ | README.md EXTRACTABLE | 5 sections should move to docs/ — requires content judgment, not automated |
202
+ | CHANGELOG.md STALE | v2.4.0 undocumented; requires `add_entry.py` + human input |
203
+ | bd remember | Dolt server database mismatch — `bd doctor --fix` or server reconfiguration needed |
204
+
205
+ ---
206
+
207
+ ## bd Memory Key (intended)
208
+
209
+ `sync-docs-fix-2026-03-18`
210
+
211
+ This key was not stored due to the Dolt server database error described above.
@@ -0,0 +1,91 @@
1
+ {
2
+ "expectations": [
3
+ {
4
+ "text": "Ran doc_structure_analyzer.py with --fix flag",
5
+ "passed": true,
6
+ "evidence": "Step 2 of result.md shows the exact command run: 'python3 .../doc_structure_analyzer.py --root=... --fix' with stdout confirming 5 files created and 7 files fixed."
7
+ },
8
+ {
9
+ "text": "Handled both MISSING scaffolds AND INVALID_SCHEMA files (or correctly reported none found)",
10
+ "passed": true,
11
+ "evidence": "Step 2 shows 5 MISSING scaffold files were created (hooks.md, pi-extensions.md, mcp-servers.md, policies.md, skills.md) and 7 INVALID_SCHEMA files had frontmatter injected (cleanup.md, delegation-architecture.md, hook-system-summary.md, mcp-servers-config.md, pi-extensions-migration.md, pre-install-cleanup.md, todo.md). Both categories were explicitly handled."
12
+ },
13
+ {
14
+ "text": "Ran bd remember and reported the memory key",
15
+ "passed": true,
16
+ "evidence": "Step 4 shows bd remember was attempted with key 'sync-docs-fix-2026-03-18'. It failed with 'database jaggers_agent_tools not found on Dolt server at 127.0.0.1:13800', but the key was explicitly reported ('## bd Memory Key (intended): sync-docs-fix-2026-03-18') and the attempt was fully documented. Per grading note, attempt + reported key satisfies this assertion."
17
+ },
18
+ {
19
+ "text": "Ran validate_doc.py docs/ after fixing to confirm results",
20
+ "passed": true,
21
+ "evidence": "Step 3 shows the command 'python3 .../validate_doc.py .../docs/' was run post-fix, with all 12 files showing [PASS] and final output 'Result: 12/12 files passed'."
22
+ }
23
+ ],
24
+ "summary": {
25
+ "passed": 4,
26
+ "failed": 0,
27
+ "total": 4,
28
+ "pass_rate": 1.0
29
+ },
30
+ "execution_metrics": {
31
+ "tool_calls": {},
32
+ "total_tool_calls": 0,
33
+ "total_steps": 4,
34
+ "errors_encountered": 1,
35
+ "output_chars": 5843,
36
+ "transcript_chars": 5843
37
+ },
38
+ "timing": {
39
+ "executor_duration_seconds": 129.8,
40
+ "grader_duration_seconds": 0.0,
41
+ "total_duration_seconds": 129.8
42
+ },
43
+ "claims": [
44
+ {
45
+ "claim": "All 12 docs/ files pass schema validation after the fix",
46
+ "type": "factual",
47
+ "verified": true,
48
+ "evidence": "validate_doc.py output in Step 3 shows 'Result: 12/12 files passed' with each file individually listed as [PASS]."
49
+ },
50
+ {
51
+ "claim": "The Dolt server is running but serving a different data directory \u2014 jaggers_agent_tools database is not present",
52
+ "type": "factual",
53
+ "verified": true,
54
+ "evidence": "Error message quoted in Step 4: 'database jaggers_agent_tools not found on Dolt server at 127.0.0.1:13800'. bd doctor confirmed server running but wrong data directory."
55
+ },
56
+ {
57
+ "claim": "The JSON report's docs_gaps and existing_docs keys still show pre-fix state \u2014 this is expected",
58
+ "type": "quality",
59
+ "verified": true,
60
+ "evidence": "result.md notes this explicitly and explains fix_created and fix_schema_fixed keys confirm actual changes. Consistent with a script capturing state before applying fixes."
61
+ },
62
+ {
63
+ "claim": "README.md EXTRACTABLE and CHANGELOG.md STALE issues were not fixed",
64
+ "type": "factual",
65
+ "verified": true,
66
+ "evidence": "Step 1 identifies these issues. The 'What Was NOT Fixed' table confirms they were left as requiring manual work \u2014 appropriate since they require content judgment, not automated fixes."
67
+ }
68
+ ],
69
+ "user_notes_summary": {
70
+ "uncertainties": [],
71
+ "needs_review": [
72
+ "README.md EXTRACTABLE \u2014 5 sections should move to docs/ but requires content judgment",
73
+ "CHANGELOG.md STALE \u2014 v2.4.0 undocumented; requires add_entry.py and human input"
74
+ ],
75
+ "workarounds": [
76
+ "bd remember failed due to Dolt server database mismatch; key was reported manually in result.md"
77
+ ]
78
+ },
79
+ "eval_feedback": {
80
+ "suggestions": [
81
+ {
82
+ "assertion": "Ran validate_doc.py docs/ after fixing to confirm results",
83
+ "reason": "This assertion passes even if validate_doc.py reported failures. A stronger assertion would check that all files passed (e.g., 'validate_doc.py reported 0 failures after the fix'), which would only pass on a genuinely successful run."
84
+ },
85
+ {
86
+ "reason": "No assertion covers the quality of generated scaffold files \u2014 e.g., that docs/hooks.md contains meaningful scaffold content (title, frontmatter fields) rather than being empty or trivially minimal. The agent created 5 files and fixed 7, but scaffold content correctness is unchecked."
87
+ }
88
+ ],
89
+ "overall": "All four assertions were satisfied with clear evidence. The main gap is that assertions check process steps rather than outcome quality \u2014 specifically validate_doc.py pass rate and scaffold content correctness."
90
+ }
91
+ }
@@ -0,0 +1,5 @@
1
+ {
2
+ "total_tokens": 34078,
3
+ "duration_ms": 129817,
4
+ "total_duration_seconds": 129.8
5
+ }
@@ -0,0 +1,27 @@
1
+ {
2
+ "eval_id": 1,
3
+ "eval_name": "sprint-closeout",
4
+ "prompt": "I just closed a bunch of bd issues this sprint and merged 3 PRs. The README.md is getting long — can you sync the docs and make sure everything's in order? Use sync-docs.",
5
+ "assertions": [
6
+ {
7
+ "text": "Ran context_gatherer.py and reported bd closed issues or merged PRs with specific data",
8
+ "passed": false,
9
+ "evidence": ""
10
+ },
11
+ {
12
+ "text": "Ran doc_structure_analyzer.py and cited its structured output (STALE, EXTRACTABLE, MISSING, etc.)",
13
+ "passed": false,
14
+ "evidence": ""
15
+ },
16
+ {
17
+ "text": "Detected the CHANGELOG version gap (package.json v2.4.0 vs CHANGELOG v2.0.0)",
18
+ "passed": false,
19
+ "evidence": ""
20
+ },
21
+ {
22
+ "text": "Named at least one concrete next step with a specific file or action",
23
+ "passed": false,
24
+ "evidence": ""
25
+ }
26
+ ]
27
+ }
@@ -0,0 +1,182 @@
1
+ # sync-docs Skill Evaluation: Sprint Closeout
2
+ **Task:** "I just closed a bunch of bd issues this sprint and merged 3 PRs. The README.md is getting long — can you sync the docs and make sure everything's in order? Use sync-docs."
3
+ **Date:** 2026-03-18
4
+ **Worktree:** /home/dawid/projects/xtrm-tools/.claude/worktrees/agent-ae8223b7
5
+ **Scripts run from:** skills/sync-docs/scripts/ (repo-local, not globally installed)
6
+
7
+ ---
8
+
9
+ ## Phases Executed
10
+
11
+ All 5 phases ran. No commits were made.
12
+
13
+ ---
14
+
15
+ ## Phase 1: Gather Context
16
+
17
+ **Script:** `skills/sync-docs/scripts/context_gatherer.py --since=30`
18
+
19
+ **Key findings:**
20
+
21
+ - **bd closed issues:** 0 returned by the script (see issue note below). Manually confirmed via `bd list --state=closed --limit=10` from the main repo directory: 10 recently closed issues including:
22
+ - `jaggers-agent-tools-1lc` — Remove dead code cli/index.js (P0 bug)
23
+ - `jaggers-agent-tools-7dwo` — Fix commit gate blocking when session claim is already closed (P0 bug)
24
+ - `jaggers-agent-tools-8dhs` — Wire quality gates PostToolUse hooks into project .claude/settings.json (P0 bug)
25
+ - `jaggers-agent-tools-9bl` — Fix hash-based drift detection misclassification (P0 bug)
26
+ - `jaggers-agent-tools-ihgz` — Clean up legacy hooks and config on current machine before fresh install (P0 task)
27
+ - `jaggers-agent-tools-l1g` — Fix MCP sync guard placement (P0 bug)
28
+ - `jaggers-agent-tools-p9wc` — Install official Claude plugins during xtrm install and prune duplicate .mcp servers (P1 task)
29
+ - (plus 3 subtasks of jaggers-agent-tools-4xr6)
30
+
31
+ - **Merged PRs (last 30 days, from git log):** 10 found, including:
32
+ - PR #15: release/2.0.1
33
+ - PR #14: chore/update-status-doc
34
+ - PR #13: fix/agents-target
35
+ - PR #12: feat/project-install-all
36
+ - PR #8: phase2-cli-refactor
37
+
38
+ - **bd memories:** 0 (Dolt server not accessible from worktree)
39
+ - **Serena drift:** `available: false` — drift_detector.py was run separately (see Phase 2)
40
+
41
+ **Issue with script:** `context_gatherer.py` returned `bd_closed_issues: []` and `bd_memories: []` because the Dolt server is unavailable from the worktree path (`database "jaggers_agent_tools" not found on Dolt server at 127.0.0.1:13794`). The script silently returns empty arrays instead of warning about this. The `bd list` command only works from the main repo directory, not the worktree. This is a skill reliability gap for worktree-based execution.
42
+
43
+ ---
44
+
45
+ ## Phase 2: Detect SSOT Drift
46
+
47
+ **Script:** `skills/documenting/scripts/drift_detector.py scan` (from skill's cross-reference)
48
+
49
+ **Result:** Exit code 1 (stale detected). **5 memories stale:**
50
+
51
+ | Memory | Last Updated | Stale Files |
52
+ |--------|-------------|-------------|
53
+ | `ssot_cli_hooks_2026-02-03` | 2026-02-25 | hooks/beads-gate-core.mjs, hooks/hooks.json, hooks/beads-memory-gate.mjs |
54
+ | `ssot_cli_universal_hub_2026-02-19` | 2026-02-25 | cli/src/commands/install.ts, cli/src/core/diff.ts |
55
+ | `ssot_cli_ux_improvements_2026-02-22` | 2026-02-25 | cli/src/commands/install.ts, cli/src/core/diff.ts |
56
+ | `ssot_jaggers-agent-tools_installer_architecture_2026-02-03` | 2026-02-25 | cli/dist/index.cjs, cli/src/commands/install.ts |
57
+ | `ssot_jaggers-agent-tools_migration_2026-02-01` | 2026-02-01 | cli/dist/index.cjs, cli/src/commands/install.ts |
58
+
59
+ These Serena memories need updating via Serena tools. Not done in this run (Serena MCP not available in eval context), but flagged for manual follow-up.
60
+
61
+ ---
62
+
63
+ ## Phase 3: Analyze Document Structure
64
+
65
+ **Script:** `skills/sync-docs/scripts/doc_structure_analyzer.py`
66
+
67
+ **Result:** Exit code 1 (issues found). 14 total issues.
68
+
69
+ ### README.md
70
+ - **Status:** EXTRACTABLE (192 lines, threshold 200)
71
+ - Not BLOATED yet, but close. Contains 5 sections that belong in docs/:
72
+ - `### Skills` → docs/skills.md
73
+ - `## Policy System` → docs/policies.md
74
+ - `### Policy Files` → docs/policies.md
75
+ - `## Hooks Reference` → docs/hooks.md
76
+ - `## MCP Servers` → docs/mcp-servers.md
77
+
78
+ ### CHANGELOG.md
79
+ - **Status:** STALE
80
+ - Last entry date: 2026-03-12
81
+ - Last commit date: 2026-03-18
82
+ - package.json version: **2.4.0**
83
+ - Latest CHANGELOG entry: **v2.0.0**
84
+ - Gap: v2.1.0 through v2.4.0 are undocumented
85
+
86
+ ### Missing docs/ files (5)
87
+ | Path | Signal |
88
+ |------|--------|
89
+ | docs/hooks.md | hooks/ directory exists |
90
+ | docs/pi-extensions.md | config/pi/extensions/ exists |
91
+ | docs/mcp-servers.md | .mcp.json present |
92
+ | docs/policies.md | policies/ directory exists |
93
+ | docs/skills.md | skills/ directory exists |
94
+
95
+ ### Existing docs/ with invalid schema (7)
96
+ All 7 existing docs/ files were missing YAML frontmatter:
97
+ - docs/cleanup.md (438 lines)
98
+ - docs/delegation-architecture.md (185 lines)
99
+ - docs/hook-system-summary.md (176 lines)
100
+ - docs/mcp-servers-config.md (364 lines)
101
+ - docs/pi-extensions-migration.md (56 lines)
102
+ - docs/pre-install-cleanup.md (107 lines)
103
+ - docs/todo.md (4 lines)
104
+
105
+ ---
106
+
107
+ ## Phase 4: Execute Fixes
108
+
109
+ **Script:** `skills/sync-docs/scripts/doc_structure_analyzer.py --fix`
110
+
111
+ **Actions taken:**
112
+
113
+ - CREATED 5 missing docs/ scaffolds with valid YAML frontmatter:
114
+ - docs/hooks.md
115
+ - docs/pi-extensions.md
116
+ - docs/mcp-servers.md
117
+ - docs/policies.md
118
+ - docs/skills.md
119
+
120
+ - FIXED 7 existing docs/ files by injecting YAML frontmatter:
121
+ - docs/cleanup.md
122
+ - docs/delegation-architecture.md
123
+ - docs/hook-system-summary.md
124
+ - docs/mcp-servers-config.md
125
+ - docs/pi-extensions-migration.md
126
+ - docs/pre-install-cleanup.md
127
+ - docs/todo.md
128
+
129
+ **NOT done (require Serena or manual action):**
130
+ - README.md extraction: sections identified as EXTRACTABLE cannot be auto-moved by `--fix`; requires Serena tools (`mcp__serena__get_symbols_overview`, `mcp__serena__replace_symbol_body`) to safely extract and replace with summary + link.
131
+ - CHANGELOG gap: v2.1.0 through v2.4.0 are undocumented (the latest CHANGELOG entry is v2.0.0). Requires `skills/documenting/scripts/changelog/add_entry.py` to add entries for v2.1.0–v2.4.0.
132
+ - Serena memory updates: 5 stale memories need updating via Serena tools.
133
+
134
+ **Issue with --fix output:** After running `--fix`, the JSON report still shows all items as MISSING/INVALID_SCHEMA. The analysis section reflects the state before fixes were applied (pre-fix snapshot), not the post-fix state. This is confusing — the fix confirmation lines at the top confirm success, but the JSON section contradicts it. The script should re-analyze after applying fixes, or clearly label the JSON as "pre-fix state".
135
+
136
+ ---
137
+
138
+ ## Phase 5: Validate
139
+
140
+ **Script:** `skills/sync-docs/scripts/validate_doc.py /home/dawid/projects/xtrm-tools/.claude/worktrees/agent-ae8223b7/docs/`
141
+
142
+ **Result:** 12/12 files PASSED
143
+
144
+ All docs/ files passed schema validation after Phase 4 fixes. Each file received an auto-generated INDEX table. Warnings noted: "INDEX regenerated" on 11 of 12 files — expected behavior for newly-created or newly-fixed files.
145
+
146
+ ---
147
+
148
+ ## Summary of Actions Taken
149
+
150
+ | Action | Status |
151
+ |--------|--------|
152
+ | Phase 1: Context gathered | Done (10 merged PRs, 10 closed issues via manual check) |
153
+ | Phase 2: SSOT drift detected | Done (5 memories stale) |
154
+ | Phase 3: Structure analyzed | Done (14 issues found) |
155
+ | Phase 4: Missing docs scaffolded | Done (5 created, 7 fixed) |
156
+ | Phase 4: README extraction | NOT done — requires Serena MCP |
157
+ | Phase 4: CHANGELOG gap filled | NOT done — requires manual add_entry.py calls |
158
+ | Phase 4: Serena memory updates | NOT done — requires Serena MCP |
159
+ | Phase 5: Validation | Done (12/12 PASS) |
160
+
161
+ ---
162
+
163
+ ## Outstanding Actions (for human or next agent)
164
+
165
+ 1. **CHANGELOG:** Add entries for v2.1.0 through v2.4.0 using `skills/documenting/scripts/changelog/add_entry.py`. Key changes to document: plugin-based install, quality gates, policy system, Pi memory gate, MCP cleanup.
166
+
167
+ 2. **README extraction:** Use Serena to extract the `## Hooks Reference`, `## MCP Servers`, `## Policy System` (including `### Policy Files`), and `### Skills` sections into their new docs/ files, then replace each section with a short summary + link.
168
+
169
+ 3. **Serena memory updates:** Update 5 stale memories, especially `ssot_cli_hooks_2026-02-03` (hooks heavily modified) and installer architecture memories (CLI refactored).
170
+
171
+ 4. **New docs/ files need content:** The 5 scaffolded files (`docs/hooks.md`, `docs/skills.md`, etc.) have valid frontmatter and structure but contain only placeholder content. Fill them using Serena or by extracting from README.
172
+
173
+ ---
174
+
175
+ ## Skill Issues Found
176
+
177
+ | Issue | Severity | Description |
178
+ |-------|----------|-------------|
179
+ | bd closed issues empty in worktree | Medium | `context_gatherer.py` silently returns empty arrays when Dolt server is unreachable from worktree. No warning emitted. Should log a clear error and fall back to git-based inference of closed issues (e.g. parse commit messages for issue IDs). |
180
+ | `--fix` JSON output shows pre-fix state | Low | After `--fix`, the JSON analysis section still shows MISSING/INVALID_SCHEMA statuses. Re-analyze post-fix or clearly label as pre-fix snapshot. |
181
+ | Phase 2 uses a different skill's script | Low | Phase 2 calls `skills/documenting/scripts/drift_detector.py`, not a sync-docs script. This cross-skill dependency is easy to miss: the SKILL.md instructions do note it, but they reference the script via `$HOME/.claude/skills/documenting/` (global install path), not a repo-local path. Evaluators running repo-local must resolve the path manually. |
182
+ | Serena required for README extraction | Note | Phase 4 cannot fully execute without Serena MCP. The skill correctly documents this but offers no fallback. In a non-Serena eval context, README extraction simply cannot proceed. |
@@ -0,0 +1,95 @@
1
+ {
2
+ "expectations": [
3
+ {
4
+ "text": "Ran context_gatherer.py and reported bd closed issues or merged PRs with specific data",
5
+ "passed": true,
6
+ "evidence": "Phase 1 documents running 'skills/sync-docs/scripts/context_gatherer.py --since=30'. While the script returned empty arrays for bd_closed_issues due to a Dolt server connectivity issue from the worktree, the agent performed a manual fallback using 'bd list --state=closed --limit=10' and reported 8 specific named issues (jaggers-agent-tools-1lc, -7dwo, -8dhs, -9bl, -ihgz, -l1g, -p9wc, plus subtasks). Merged PRs were also reported with specific details: 10 PRs including PR #15 release/2.0.1, PR #14 chore/update-status-doc, PR #13 fix/agents-target, etc."
7
+ },
8
+ {
9
+ "text": "Ran doc_structure_analyzer.py and cited its structured output (STALE, EXTRACTABLE, MISSING, etc.)",
10
+ "passed": true,
11
+ "evidence": "Phase 3 documents running 'skills/sync-docs/scripts/doc_structure_analyzer.py'. The result explicitly cites: README.md as EXTRACTABLE (192 lines, threshold 200) with 5 sections identified for extraction; CHANGELOG.md as STALE; 5 MISSING docs/ files (docs/hooks.md, docs/pi-extensions.md, docs/mcp-servers.md, docs/policies.md, docs/skills.md); and 7 existing docs/ files with INVALID_SCHEMA. The structured output vocabulary (STALE, EXTRACTABLE, MISSING, INVALID_SCHEMA) is used throughout."
12
+ },
13
+ {
14
+ "text": "Detected the CHANGELOG version gap (package.json v2.4.0 vs CHANGELOG v2.0.0)",
15
+ "passed": true,
16
+ "evidence": "Phase 3 explicitly states: 'package.json version: 2.4.0', 'Latest CHANGELOG entry: v2.0.0', 'Gap: v2.1.0 through v2.4.0 are undocumented'. Outstanding Actions item 1 also calls out: 'Add entries for v2.1.0 through v2.4.0 using skills/documenting/scripts/changelog/add_entry.py'."
17
+ },
18
+ {
19
+ "text": "Named at least one concrete next step with a specific file or action",
20
+ "passed": true,
21
+ "evidence": "The 'Outstanding Actions' section names 4 concrete next steps with specific files/tools: (1) 'Add entries for v2.1.0 through v2.4.0 using skills/documenting/scripts/changelog/add_entry.py'; (2) 'Use Serena to extract sections into their new docs/ files'; (3) 'Update 5 stale memories, especially ssot_cli_hooks_2026-02-03'; (4) 'Fill [5 scaffolded files] using Serena or by extracting from README'."
22
+ }
23
+ ],
24
+ "summary": {
25
+ "passed": 4,
26
+ "failed": 0,
27
+ "total": 4,
28
+ "pass_rate": 1.0
29
+ },
30
+ "execution_metrics": {
31
+ "tool_calls": 0,
32
+ "total_tool_calls": 0,
33
+ "total_steps": 5,
34
+ "errors_encountered": 2,
35
+ "output_chars": 6127,
36
+ "transcript_chars": 0
37
+ },
38
+ "timing": {
39
+ "executor_duration_seconds": 0.0,
40
+ "grader_duration_seconds": 0.0,
41
+ "total_duration_seconds": 0.0
42
+ },
43
+ "claims": [
44
+ {
45
+ "claim": "context_gatherer.py returned 0 bd_closed_issues and 0 bd_memories",
46
+ "type": "factual",
47
+ "verified": true,
48
+ "evidence": "The result documents this explicitly, attributing it to the Dolt server being unreachable from the worktree path (database 'jaggers_agent_tools' not found on Dolt server at 127.0.0.1:13794). The agent correctly identified this as a skill reliability gap."
49
+ },
50
+ {
51
+ "claim": "10 merged PRs found in last 30 days via git log",
52
+ "type": "factual",
53
+ "verified": false,
54
+ "evidence": "The result lists 5 PRs by name (PR #15, #14, #13, #12, #8) but says 10 were found. The remaining 5 are not enumerated, so the count of 10 cannot be fully verified from the output alone."
55
+ },
56
+ {
57
+ "claim": "Phase 5 validation: 12/12 files PASSED",
58
+ "type": "quality",
59
+ "verified": true,
60
+ "evidence": "The result states 'All docs/ files passed schema validation after Phase 4 fixes' and notes 'INDEX regenerated' warnings on 11/12 files, which is consistent with newly created or patched files. The count of 12 matches 5 newly created + 7 patched existing files."
61
+ },
62
+ {
63
+ "claim": "CHANGELOG last entry date is 2026-03-12",
64
+ "type": "factual",
65
+ "verified": false,
66
+ "evidence": "This value came from doc_structure_analyzer.py output. The result.md cites it directly but there is no way to independently verify this from the output file alone \u2014 CHANGELOG.md was not read or quoted."
67
+ },
68
+ {
69
+ "claim": "README.md is 192 lines (near EXTRACTABLE threshold of 200)",
70
+ "type": "factual",
71
+ "verified": false,
72
+ "evidence": "Reported from doc_structure_analyzer.py output. Not independently verified from outputs_dir, but consistent with the script's structured analysis."
73
+ }
74
+ ],
75
+ "user_notes_summary": {
76
+ "uncertainties": [],
77
+ "needs_review": [],
78
+ "workarounds": []
79
+ },
80
+ "eval_feedback": {
81
+ "suggestions": [
82
+ {
83
+ "assertion": "Ran context_gatherer.py and reported bd closed issues or merged PRs with specific data",
84
+ "reason": "The assertion passes even though context_gatherer.py itself returned empty arrays and the agent had to work around it manually. This means the assertion rewards partial skill execution + manual workaround rather than the skill actually working. Consider splitting into: (a) script ran successfully, and (b) specific bd or PR data was surfaced \u2014 or requiring that the data come from the script rather than a manual fallback."
85
+ },
86
+ {
87
+ "reason": "No assertion checks whether Phase 4 fixes were actually applied correctly \u2014 the result notes a confusing bug where '--fix' JSON output shows pre-fix state while the confirmation lines say success. An assertion like 'Phase 4 --fix created or patched at least N docs/ files' would catch regressions in the fix pipeline."
88
+ },
89
+ {
90
+ "reason": "No assertion checks that the agent identified Serena as a dependency for README extraction and CHANGELOG gap closure. This is a meaningful skill-awareness outcome: a run that blindly claimed to fix the README without Serena would be factually wrong, but the current assertions would still pass."
91
+ }
92
+ ],
93
+ "overall": "All four assertions pass with strong, specific evidence. The weakest assertion is #1, which can be satisfied by a manual workaround rather than the script functioning correctly. The other three are discriminating and well-scoped."
94
+ }
95
+ }
@@ -0,0 +1 @@
1
+ {"total_tokens": 27000, "duration_ms": 170000, "total_duration_seconds": 170.0}