xtrm-tools 2.4.1 → 2.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -6
- package/cli/dist/index.cjs +738 -239
- package/cli/dist/index.cjs.map +1 -1
- package/cli/package.json +1 -1
- package/config/hooks.json +10 -0
- package/config/pi/extensions/core/adapter.ts +2 -14
- package/config/pi/extensions/core/guard-rules.ts +70 -0
- package/config/pi/extensions/core/session-state.ts +59 -0
- package/config/pi/extensions/main-guard.ts +10 -14
- package/config/pi/extensions/plan-mode/README.md +65 -0
- package/config/pi/extensions/plan-mode/index.ts +340 -0
- package/config/pi/extensions/plan-mode/utils.ts +168 -0
- package/config/pi/extensions/service-skills.ts +51 -7
- package/config/pi/extensions/session-flow.ts +117 -0
- package/hooks/beads-claim-sync.mjs +140 -14
- package/hooks/beads-compact-restore.mjs +41 -9
- package/hooks/beads-compact-save.mjs +36 -5
- package/hooks/beads-gate-messages.mjs +27 -1
- package/hooks/beads-memory-gate.mjs +24 -16
- package/hooks/beads-stop-gate.mjs +58 -8
- package/hooks/guard-rules.mjs +117 -0
- package/hooks/hooks.json +28 -18
- package/hooks/main-guard.mjs +22 -22
- package/hooks/quality-check.cjs +1286 -0
- package/hooks/quality-check.py +345 -0
- package/hooks/session-state.mjs +138 -0
- package/package.json +2 -1
- package/project-skills/quality-gates/.claude/settings.json +1 -24
- package/skills/creating-service-skills/SKILL.md +433 -0
- package/skills/creating-service-skills/references/script_quality_standards.md +425 -0
- package/skills/creating-service-skills/references/service_skill_system_guide.md +278 -0
- package/skills/creating-service-skills/scripts/bootstrap.py +326 -0
- package/skills/creating-service-skills/scripts/deep_dive.py +304 -0
- package/skills/creating-service-skills/scripts/scaffolder.py +482 -0
- package/skills/scoping-service-skills/SKILL.md +231 -0
- package/skills/scoping-service-skills/scripts/scope.py +74 -0
- package/skills/sync-docs/SKILL.md +235 -0
- package/skills/sync-docs/evals/evals.json +89 -0
- package/skills/sync-docs/references/doc-structure.md +104 -0
- package/skills/sync-docs/references/schema.md +103 -0
- package/skills/sync-docs/scripts/context_gatherer.py +246 -0
- package/skills/sync-docs/scripts/doc_structure_analyzer.py +495 -0
- package/skills/sync-docs/scripts/validate_doc.py +365 -0
- package/skills/sync-docs-workspace/iteration-1/benchmark.json +293 -0
- package/skills/sync-docs-workspace/iteration-1/benchmark.md +13 -0
- package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/outputs/result.md +210 -0
- package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/grading.json +28 -0
- package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/outputs/result.md +101 -0
- package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/grading.json +28 -0
- package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/outputs/result.md +198 -0
- package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/grading.json +28 -0
- package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/outputs/result.md +94 -0
- package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/grading.json +28 -0
- package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/outputs/result.md +237 -0
- package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/grading.json +28 -0
- package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/outputs/result.md +134 -0
- package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/grading.json +28 -0
- package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-2/benchmark.json +297 -0
- package/skills/sync-docs-workspace/iteration-2/benchmark.md +13 -0
- package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/outputs/result.md +137 -0
- package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/grading.json +92 -0
- package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/outputs/result.md +134 -0
- package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/grading.json +86 -0
- package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/outputs/result.md +193 -0
- package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/grading.json +72 -0
- package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/outputs/result.md +211 -0
- package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/grading.json +91 -0
- package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/outputs/result.md +182 -0
- package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
- package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
- package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/outputs/result.md +222 -0
- package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/grading.json +88 -0
- package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-3/benchmark.json +298 -0
- package/skills/sync-docs-workspace/iteration-3/benchmark.md +13 -0
- package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/outputs/result.md +125 -0
- package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/grading.json +97 -0
- package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/outputs/result.md +144 -0
- package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/grading.json +78 -0
- package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/outputs/result.md +104 -0
- package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/grading.json +91 -0
- package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/outputs/result.md +79 -0
- package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/grading.json +82 -0
- package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/eval_metadata.json +27 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase1_context.json +302 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase2_drift.txt +33 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase3_analysis.json +114 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase4_fix.txt +118 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase5_validate.txt +38 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/result.md +158 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/timing.json +5 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/outputs/result.md +71 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/grading.json +90 -0
- package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
- package/skills/updating-service-skills/SKILL.md +136 -0
- package/skills/updating-service-skills/scripts/drift_detector.py +222 -0
- package/skills/using-quality-gates/SKILL.md +254 -0
- package/skills/using-service-skills/SKILL.md +108 -0
- package/skills/using-service-skills/scripts/cataloger.py +74 -0
- package/skills/using-service-skills/scripts/skill_activator.py +152 -0
- package/skills/using-service-skills/scripts/test_skill_activator.py +58 -0
- package/skills/using-xtrm/SKILL.md +34 -38
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# Doc Audit: README.md vs docs/
|
|
2
|
+
|
|
3
|
+
**Date:** 2026-03-18
|
|
4
|
+
**Scope:** `/home/dawid/projects/xtrm-tools/README.md` and `/home/dawid/projects/xtrm-tools/docs/`
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Summary
|
|
9
|
+
|
|
10
|
+
The README is 193 lines and contains six substantive sections beyond the quick-start, including the version history table. Most of them already have dedicated counterparts in `docs/`, but the README duplicates or partially overlaps that content. Below is a section-by-section breakdown with a concrete recommendation for each.
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## README Sections Reviewed
|
|
15
|
+
|
|
16
|
+
### 1. "Hooks Reference" (lines 114-141)
|
|
17
|
+
|
|
18
|
+
**What it contains:**
|
|
19
|
+
- Event type table (SessionStart, PreToolUse, PostToolUse, Stop, PreCompact)
|
|
20
|
+
- Main Guard behavior summary
|
|
21
|
+
- Beads Gates behavior table (Edit Gate, Commit Gate, Stop Gate, Memory Gate)
|
|
22
|
+
|
|
23
|
+
**Existing doc:** `docs/hooks.md` — a fully developed reference (134 lines) covering the event model, all hook groups, install profiles, operational workflow, and troubleshooting.
|
|
24
|
+
|
|
25
|
+
**Verdict: Move this content to `docs/hooks.md`.**
|
|
26
|
+
|
|
27
|
+
The README's hooks section is a shallow subset of what is already in `docs/hooks.md`. The event types table duplicates the Event Model section. The Main Guard and Beads Gates summaries duplicate the Hook Groups section. The README should keep at most one sentence pointing to `docs/hooks.md`.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
### 2. "Policy System" (lines 66-87)
|
|
32
|
+
|
|
33
|
+
**What it contains:**
|
|
34
|
+
- Policy files table (5 policies, runtime, purpose)
|
|
35
|
+
- Compiler commands (`compile-policies.mjs`)
|
|
36
|
+
|
|
37
|
+
**Existing doc:** `docs/policies.md` — currently a stub with only a placeholder "Overview" section and no real content.
|
|
38
|
+
|
|
39
|
+
**Verdict: Move this content to `docs/policies.md`.**
|
|
40
|
+
|
|
41
|
+
The README's Policy System section is the only place this information is documented. The stub at `docs/policies.md` exists but is empty. The policy files table and compiler commands belong there. The README should summarize in one sentence and link to `docs/policies.md`.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
### 3. "MCP Servers" (lines 143-158)
|
|
46
|
+
|
|
47
|
+
**What it contains:**
|
|
48
|
+
- Table of xtrm-managed MCP servers (gitnexus, github-grep, deepwiki)
|
|
49
|
+
- List of official Claude plugins installed during `xtrm install all`
|
|
50
|
+
|
|
51
|
+
**Existing docs:** `docs/mcp.md` — a developed reference covering canonical sources, server inventory (core + optional), operational workflow, and troubleshooting. `docs/mcp-servers.md` — a stub with only a placeholder "Overview" section.
|
|
52
|
+
|
|
53
|
+
**Verdict: Move this content to `docs/mcp.md` and fill `docs/mcp-servers.md`.**
|
|
54
|
+
|
|
55
|
+
The README's server list is an abbreviated duplicate of `docs/mcp.md`'s Server Inventory section. The official Claude plugins list (serena, context7, github, ralph-loop) is not captured anywhere in `docs/` — it should be added to `docs/mcp.md` under a "Plugin Installation" subsection, not kept only in the README. The README should link to `docs/mcp.md`.
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
### 4. "Plugin Structure" (lines 52-63)
|
|
60
|
+
|
|
61
|
+
**What it contains:**
|
|
62
|
+
- Directory tree of `plugins/xtrm-tools/`
|
|
63
|
+
- Note about `${CLAUDE_PLUGIN_ROOT}` path resolution
|
|
64
|
+
|
|
65
|
+
**Existing doc:** None. There is no `docs/plugin.md` or equivalent.
|
|
66
|
+
|
|
67
|
+
**Verdict: Move this content to a new `docs/plugin.md` (or into XTRM-GUIDE.md).**
|
|
68
|
+
|
|
69
|
+
XTRM-GUIDE.md already contains an Architecture section and an Installation section — the plugin structure tree logically belongs there. The README can keep a condensed one-liner summary. Either move the tree to XTRM-GUIDE.md's existing "Plugin Structure" section (line 4 of the guide ToC) or create `docs/plugin.md`.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
### 5. "Skills" table (lines 42-49, inside "What's Included")
|
|
74
|
+
|
|
75
|
+
**What it contains:**
|
|
76
|
+
- Table of 4 global skills with type and purpose
|
|
77
|
+
|
|
78
|
+
**Existing doc:** `docs/skills.md` — a fully developed reference covering the runtime model, core global skills, specialized global skills, authoring contract, and operational commands.
|
|
79
|
+
|
|
80
|
+
**Verdict: The README table is a useful at-a-glance summary — keep it, but ensure it stays in sync with `docs/skills.md`.**
|
|
81
|
+
|
|
82
|
+
The README lists only 4 skills while `docs/skills.md` documents many more. This is an acceptable intentional narrowing for a README intro, but the content overlap means these can diverge. If the team prefers a single source of truth, the README table should be removed and replaced with a link to `docs/skills.md`. If the README table is kept, it should explicitly say it is not exhaustive.
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
### 6. "Version History" table (lines 179-187)
|
|
87
|
+
|
|
88
|
+
**What it contains:**
|
|
89
|
+
- 4-row table of recent versions (2.0.0–2.3.0) with dates and highlights
|
|
90
|
+
|
|
91
|
+
**Existing doc:** `CHANGELOG.md` (linked from README line 5).
|
|
92
|
+
|
|
93
|
+
**Verdict: Remove this table from the README and rely on the CHANGELOG.md link.**
|
|
94
|
+
|
|
95
|
+
A partial version table in the README that duplicates CHANGELOG.md adds maintenance burden. The link on line 5 already points to the full changelog. The README version history adds no value that the link does not already provide.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Recommended Moves (Prioritized)
|
|
100
|
+
|
|
101
|
+
| Priority | README Section | Action | Target |
|
|
102
|
+
|----------|---------------|--------|--------|
|
|
103
|
+
| High | Policy System | Move policy files table + compiler commands | `docs/policies.md` (currently a stub) |
|
|
104
|
+
| High | MCP Servers — official plugins list | Add to existing docs | `docs/mcp.md` (this data is missing from docs/) |
|
|
105
|
+
| Medium | Hooks Reference | Remove from README, link to existing doc | `docs/hooks.md` |
|
|
106
|
+
| Medium | Version History table | Remove from README | Already covered by `CHANGELOG.md` link |
|
|
107
|
+
| Medium | MCP Servers table | Remove from README, link to existing doc | `docs/mcp.md` |
|
|
108
|
+
| Low | Plugin Structure tree | Move to XTRM-GUIDE.md or new `docs/plugin.md` | `XTRM-GUIDE.md` (Plugin Structure section) |
|
|
109
|
+
| Low | Skills table | Keep as intentional summary or remove + link | `docs/skills.md` |
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## What Should Stay in README
|
|
114
|
+
|
|
115
|
+
The README should remain a fast-path entry point containing only:
|
|
116
|
+
- One-paragraph description and version badge
|
|
117
|
+
- Quick Start commands (install + verify)
|
|
118
|
+
- "What's Included" as a brief feature summary (not a full reference)
|
|
119
|
+
- Links to docs/ and XTRM-GUIDE.md for details
|
|
120
|
+
- License
|
|
121
|
+
|
|
122
|
+
All reference-level detail (event types, hook behaviors, policy files, MCP server inventories) belongs in `docs/`.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Stubs That Need Content
|
|
127
|
+
|
|
128
|
+
Three files in `docs/` exist as nearly-empty stubs and should be filled during any move:
|
|
129
|
+
|
|
130
|
+
| File | Current state | Should contain |
|
|
131
|
+
|------|--------------|----------------|
|
|
132
|
+
| `docs/policies.md` | Placeholder only | Policy files table, compiler usage, policy-to-hook compilation model |
|
|
133
|
+
| `docs/mcp-servers.md` | Placeholder only | Could absorb or replace `docs/mcp.md`, or be removed to avoid duplication |
|
|
134
|
+
| `docs/pi-extensions.md` | Placeholder only | Pi extensions system (config/pi/extensions/), runtime model, migration from project skills |
|
package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/grading.json
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
{
|
|
2
|
+
"expectations": [
|
|
3
|
+
{
|
|
4
|
+
"text": "Ran doc_structure_analyzer.py and cited its output",
|
|
5
|
+
"passed": false,
|
|
6
|
+
"evidence": "No mention of doc_structure_analyzer.py anywhere in result.md. The audit was performed by directly reading files rather than running any script. No script output is quoted or referenced."
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
"text": "Named at least 2 specific README sections with their suggested docs/ destination",
|
|
10
|
+
"passed": true,
|
|
11
|
+
"evidence": "result.md names six README sections with specific destinations: 'Hooks Reference' -> docs/hooks.md, 'Policy System' -> docs/policies.md, 'MCP Servers' -> docs/mcp.md, 'Plugin Structure' -> docs/plugin.md or XTRM-GUIDE.md, 'Skills table' -> docs/skills.md, 'Version History' -> CHANGELOG.md. The recommended moves table further enumerates these with priority levels."
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"text": "Did NOT run --fix or create/edit any files (audit-only mode respected)",
|
|
15
|
+
"passed": true,
|
|
16
|
+
"evidence": "result.md is a read-only audit report. No --fix flag is mentioned, no files were edited, no docs/ files were created or modified. The report only analyzes and recommends."
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"text": "Report is actionable with clear next steps",
|
|
20
|
+
"passed": true,
|
|
21
|
+
"evidence": "result.md includes a 'Recommended Moves (Prioritized)' table with Priority (High/Medium/Low), README Section, Action (Move/Remove/Add), and Target columns. Each section also has a specific Verdict with a concrete instruction (e.g., 'Move this content to docs/policies.md', 'Remove this table from the README and rely on the CHANGELOG.md link')."
|
|
22
|
+
}
|
|
23
|
+
],
|
|
24
|
+
"summary": {
|
|
25
|
+
"passed": 3,
|
|
26
|
+
"failed": 1,
|
|
27
|
+
"total": 4,
|
|
28
|
+
"pass_rate": 0.75
|
|
29
|
+
},
|
|
30
|
+
"execution_metrics": {
|
|
31
|
+
"tool_calls": {},
|
|
32
|
+
"total_tool_calls": 0,
|
|
33
|
+
"total_steps": 0,
|
|
34
|
+
"errors_encountered": 0,
|
|
35
|
+
"output_chars": 3847,
|
|
36
|
+
"transcript_chars": 0
|
|
37
|
+
},
|
|
38
|
+
"timing": {
|
|
39
|
+
"executor_duration_seconds": 0.0,
|
|
40
|
+
"grader_duration_seconds": 0.0,
|
|
41
|
+
"total_duration_seconds": 0.0
|
|
42
|
+
},
|
|
43
|
+
"claims": [
|
|
44
|
+
{
|
|
45
|
+
"claim": "The README is 193 lines",
|
|
46
|
+
"type": "factual",
|
|
47
|
+
"verified": false,
|
|
48
|
+
"evidence": "No transcript or tool call log available to verify the README was actually read. The line count is plausible but cannot be confirmed from available outputs."
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"claim": "docs/policies.md is currently a stub with only a placeholder 'Overview' section",
|
|
52
|
+
"type": "factual",
|
|
53
|
+
"verified": false,
|
|
54
|
+
"evidence": "No transcript available to confirm the file was read. The claim is specific and detailed, but cannot be verified from the available output alone."
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
"claim": "The official Claude plugins list (serena, context7, github, ralph-loop) is not captured anywhere in docs/",
|
|
58
|
+
"type": "factual",
|
|
59
|
+
"verified": false,
|
|
60
|
+
"evidence": "Without a transcript showing the docs/ directory was searched, this claim cannot be fully verified. It is a meaningful and specific claim, but unverifiable from result.md alone."
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"claim": "The audit covered six substantive reference sections beyond quick-start and version history",
|
|
64
|
+
"type": "quality",
|
|
65
|
+
"verified": true,
|
|
66
|
+
"evidence": "result.md explicitly reviews six named sections with line references, verdicts, and recommendations for each."
|
|
67
|
+
}
|
|
68
|
+
],
|
|
69
|
+
"user_notes_summary": {
|
|
70
|
+
"uncertainties": [],
|
|
71
|
+
"needs_review": [],
|
|
72
|
+
"workarounds": []
|
|
73
|
+
},
|
|
74
|
+
"eval_feedback": {
|
|
75
|
+
"suggestions": [
|
|
76
|
+
{
|
|
77
|
+
"assertion": "Ran doc_structure_analyzer.py and cited its output",
|
|
78
|
+
"reason": "This is the most discriminating assertion and it failed \u2014 the run produced a high-quality audit without using the designated script at all. This reveals that a skilled agent can satisfy the other three expectations (section names, no-fix, actionable report) purely through manual file reading. The eval should add an assertion that verifies a specific artifact from the script's output (e.g., a section header or JSON field that only the script produces), making it impossible to pass by circumventing the tool."
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"reason": "No assertion checks the accuracy of line number references or section names cited. The report references specific line ranges (e.g., 'lines 114-141' for Hooks Reference). An assertion verifying at least one cited line range matches actual README content would catch hallucinated citations."
|
|
82
|
+
}
|
|
83
|
+
],
|
|
84
|
+
"overall": "The three passing assertions are satisfied by a well-written audit regardless of whether the prescribed script was used. Assertion 1 is the only one that distinguishes correct process from bypassed process \u2014 and it failed. The eval should strengthen the script-usage assertion with output-artifact verification to prevent this class of bypass."
|
|
85
|
+
}
|
|
86
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"total_tokens": 15000, "duration_ms": 75000, "total_duration_seconds": 75.0}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"eval_id": 2,
|
|
3
|
+
"eval_name": "fix-mode",
|
|
4
|
+
"prompt": "Run sync-docs --fix on this project and remember what you did with bd.",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{
|
|
7
|
+
"text": "Ran doc_structure_analyzer.py with --fix flag",
|
|
8
|
+
"passed": false,
|
|
9
|
+
"evidence": ""
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"text": "Handled both MISSING scaffolds AND INVALID_SCHEMA files (or correctly reported none found)",
|
|
13
|
+
"passed": false,
|
|
14
|
+
"evidence": ""
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"text": "Ran bd remember and reported the memory key",
|
|
18
|
+
"passed": false,
|
|
19
|
+
"evidence": ""
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"text": "Ran validate_doc.py docs/ after fixing to confirm results",
|
|
23
|
+
"passed": false,
|
|
24
|
+
"evidence": ""
|
|
25
|
+
}
|
|
26
|
+
]
|
|
27
|
+
}
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# sync-docs --fix Evaluation Result
|
|
2
|
+
|
|
3
|
+
**Date:** 2026-03-18
|
|
4
|
+
**Working directory:** `/home/dawid/projects/xtrm-tools/.claude/worktrees/agent-ad500878`
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Command Executed
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
python3 /home/dawid/projects/xtrm-tools/skills/sync-docs/scripts/doc_structure_analyzer.py \
|
|
12
|
+
--fix --bd-remember \
|
|
13
|
+
--root=/home/dawid/projects/xtrm-tools/.claude/worktrees/agent-ad500878
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Note: The script was invoked from the main repo path (not the worktree) because `skills/sync-docs/` is not present in the worktree. `--root` was passed explicitly to target the worktree.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Full Script Output
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
Fixing 5 missing docs/ files...
|
|
24
|
+
CREATED docs/hooks.md
|
|
25
|
+
CREATED docs/pi-extensions.md
|
|
26
|
+
CREATED docs/mcp-servers.md
|
|
27
|
+
CREATED docs/policies.md
|
|
28
|
+
CREATED docs/skills.md
|
|
29
|
+
|
|
30
|
+
Injecting frontmatter into 7 schema-invalid docs/ files...
|
|
31
|
+
FIXED docs/cleanup.md
|
|
32
|
+
FIXED docs/delegation-architecture.md
|
|
33
|
+
FIXED docs/hook-system-summary.md
|
|
34
|
+
FIXED docs/mcp-servers-config.md
|
|
35
|
+
FIXED docs/pi-extensions-migration.md
|
|
36
|
+
FIXED docs/pre-install-cleanup.md
|
|
37
|
+
FIXED docs/todo.md
|
|
38
|
+
{
|
|
39
|
+
"project_root": "/home/dawid/projects/xtrm-tools/.claude/worktrees/agent-ad500878",
|
|
40
|
+
"summary": {
|
|
41
|
+
"total_issues": 14,
|
|
42
|
+
"needs_attention": true
|
|
43
|
+
},
|
|
44
|
+
"readme": {
|
|
45
|
+
"status": "EXTRACTABLE",
|
|
46
|
+
"path": "README.md",
|
|
47
|
+
"line_count": 192,
|
|
48
|
+
"section_count": 24,
|
|
49
|
+
"threshold": 200,
|
|
50
|
+
"extraction_candidates": [
|
|
51
|
+
{ "section": "### Skills", "suggest": "docs/skills.md", "reason": "Skills catalog" },
|
|
52
|
+
{ "section": "## Policy System", "suggest": "docs/policies.md", "reason": "Policy reference" },
|
|
53
|
+
{ "section": "### Policy Files", "suggest": "docs/policies.md", "reason": "Policy reference" },
|
|
54
|
+
{ "section": "## Hooks Reference", "suggest": "docs/hooks.md", "reason": "Hooks reference" },
|
|
55
|
+
{ "section": "## MCP Servers", "suggest": "docs/mcp-servers.md", "reason": "MCP server configuration" }
|
|
56
|
+
],
|
|
57
|
+
"issues": []
|
|
58
|
+
},
|
|
59
|
+
"changelog": {
|
|
60
|
+
"status": "STALE",
|
|
61
|
+
"path": "CHANGELOG.md",
|
|
62
|
+
"last_entry_date": "2026-03-12",
|
|
63
|
+
"last_commit_date": "2026-03-18",
|
|
64
|
+
"package_version": "2.4.0",
|
|
65
|
+
"latest_changelog_version": "2.0.0",
|
|
66
|
+
"issues": [
|
|
67
|
+
"package.json is at v2.4.0 but latest CHANGELOG entry is v2.0.0 — release is undocumented"
|
|
68
|
+
]
|
|
69
|
+
},
|
|
70
|
+
"docs_gaps": [
|
|
71
|
+
{ "status": "MISSING", "path": "docs/hooks.md", "reason": "hooks/ directory exists", "signal": "hooks/" },
|
|
72
|
+
{ "status": "MISSING", "path": "docs/pi-extensions.md", "reason": "Pi extensions directory exists", "signal": "config/pi/extensions/" },
|
|
73
|
+
{ "status": "MISSING", "path": "docs/mcp-servers.md", "reason": ".mcp.json present", "signal": ".mcp.json" },
|
|
74
|
+
{ "status": "MISSING", "path": "docs/policies.md", "reason": "policies/ directory exists", "signal": "policies/" },
|
|
75
|
+
{ "status": "MISSING", "path": "docs/skills.md", "reason": "skills/ directory exists", "signal": "skills/" }
|
|
76
|
+
],
|
|
77
|
+
"existing_docs": [
|
|
78
|
+
{ "status": "INVALID_SCHEMA", "path": "docs/cleanup.md", "line_count": 438, "has_frontmatter": false },
|
|
79
|
+
{ "status": "INVALID_SCHEMA", "path": "docs/delegation-architecture.md", "line_count": 185, "has_frontmatter": false },
|
|
80
|
+
{ "status": "INVALID_SCHEMA", "path": "docs/hook-system-summary.md", "line_count": 176, "has_frontmatter": false },
|
|
81
|
+
{ "status": "INVALID_SCHEMA", "path": "docs/mcp-servers-config.md", "line_count": 364, "has_frontmatter": false },
|
|
82
|
+
{ "status": "INVALID_SCHEMA", "path": "docs/pi-extensions-migration.md", "line_count": 56, "has_frontmatter": false },
|
|
83
|
+
{ "status": "INVALID_SCHEMA", "path": "docs/pre-install-cleanup.md", "line_count": 107, "has_frontmatter": false },
|
|
84
|
+
{ "status": "INVALID_SCHEMA", "path": "docs/todo.md", "line_count": 4, "has_frontmatter": false }
|
|
85
|
+
],
|
|
86
|
+
"fix_created": [
|
|
87
|
+
"docs/hooks.md",
|
|
88
|
+
"docs/pi-extensions.md",
|
|
89
|
+
"docs/mcp-servers.md",
|
|
90
|
+
"docs/policies.md",
|
|
91
|
+
"docs/skills.md"
|
|
92
|
+
],
|
|
93
|
+
"fix_schema_fixed": [
|
|
94
|
+
"docs/cleanup.md",
|
|
95
|
+
"docs/delegation-architecture.md",
|
|
96
|
+
"docs/hook-system-summary.md",
|
|
97
|
+
"docs/mcp-servers-config.md",
|
|
98
|
+
"docs/pi-extensions-migration.md",
|
|
99
|
+
"docs/pre-install-cleanup.md",
|
|
100
|
+
"docs/todo.md"
|
|
101
|
+
],
|
|
102
|
+
"bd_remember": {
|
|
103
|
+
"stored": false,
|
|
104
|
+
"key": "sync-docs-fix-2026-03-18",
|
|
105
|
+
"insight": "sync-docs --fix: created 5 scaffold(s): hooks.md, pi-extensions.md, mcp-servers.md, policies.md, skills.md; added frontmatter to 7 existing file(s): cleanup.md, delegation-architecture.md, hook-system-summary.md, mcp-servers-config.md, pi-extensions-migration.md, pre-install-cleanup.md, todo.md. Fill in content and run validate_doc.py docs/ to confirm schema."
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Files Created (scaffolds for MISSING docs/ gaps)
|
|
113
|
+
|
|
114
|
+
| File | Reason |
|
|
115
|
+
|---|---|
|
|
116
|
+
| `docs/hooks.md` | `hooks/` directory exists |
|
|
117
|
+
| `docs/pi-extensions.md` | `config/pi/extensions/` directory exists |
|
|
118
|
+
| `docs/mcp-servers.md` | `.mcp.json` present |
|
|
119
|
+
| `docs/policies.md` | `policies/` directory exists |
|
|
120
|
+
| `docs/skills.md` | `skills/` directory exists |
|
|
121
|
+
|
|
122
|
+
## Files Schema-Fixed (frontmatter injected into INVALID_SCHEMA files)
|
|
123
|
+
|
|
124
|
+
| File | Lines before fix |
|
|
125
|
+
|---|---|
|
|
126
|
+
| `docs/cleanup.md` | 438 |
|
|
127
|
+
| `docs/delegation-architecture.md` | 185 |
|
|
128
|
+
| `docs/hook-system-summary.md` | 176 |
|
|
129
|
+
| `docs/mcp-servers-config.md` | 364 |
|
|
130
|
+
| `docs/pi-extensions-migration.md` | 56 |
|
|
131
|
+
| `docs/pre-install-cleanup.md` | 107 |
|
|
132
|
+
| `docs/todo.md` | 4 |
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## bd Memory
|
|
137
|
+
|
|
138
|
+
- **Key attempted:** `sync-docs-fix-2026-03-18`
|
|
139
|
+
- **Insight:** `sync-docs --fix: created 5 scaffold(s): hooks.md, pi-extensions.md, mcp-servers.md, policies.md, skills.md; added frontmatter to 7 existing file(s): cleanup.md, delegation-architecture.md, hook-system-summary.md, mcp-servers-config.md, pi-extensions-migration.md, pre-install-cleanup.md, todo.md. Fill in content and run validate_doc.py docs/ to confirm schema.`
|
|
140
|
+
- **Stored:** false — `bd` could not persist because no `.beads/` directory exists in the worktree (the script guards on `(root / ".beads").exists()`). The key and insight were computed and are recorded here for reference.
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## validate_doc.py docs/ Result
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
docs/cleanup.md [PASS]
|
|
148
|
+
WARN: INDEX regenerated
|
|
149
|
+
|
|
150
|
+
docs/delegation-architecture.md [PASS]
|
|
151
|
+
WARN: INDEX regenerated
|
|
152
|
+
|
|
153
|
+
docs/hook-system-summary.md [PASS]
|
|
154
|
+
WARN: INDEX regenerated
|
|
155
|
+
|
|
156
|
+
docs/hooks.md [PASS]
|
|
157
|
+
WARN: INDEX regenerated
|
|
158
|
+
|
|
159
|
+
docs/mcp-servers-config.md [PASS]
|
|
160
|
+
WARN: INDEX regenerated
|
|
161
|
+
|
|
162
|
+
docs/mcp-servers.md [PASS]
|
|
163
|
+
WARN: INDEX regenerated
|
|
164
|
+
|
|
165
|
+
docs/pi-extensions-migration.md [PASS]
|
|
166
|
+
WARN: INDEX regenerated
|
|
167
|
+
|
|
168
|
+
docs/pi-extensions.md [PASS]
|
|
169
|
+
WARN: INDEX regenerated
|
|
170
|
+
|
|
171
|
+
docs/policies.md [PASS]
|
|
172
|
+
WARN: INDEX regenerated
|
|
173
|
+
|
|
174
|
+
docs/pre-install-cleanup.md [PASS]
|
|
175
|
+
WARN: INDEX regenerated
|
|
176
|
+
|
|
177
|
+
docs/skills.md [PASS]
|
|
178
|
+
WARN: INDEX regenerated
|
|
179
|
+
|
|
180
|
+
docs/todo.md [PASS]
|
|
181
|
+
All checks passed.
|
|
182
|
+
|
|
183
|
+
Result: 12/12 files passed
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**Exit code:** 0
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## Remaining Issues (not handled by --fix)
|
|
191
|
+
|
|
192
|
+
1. **README.md EXTRACTABLE** — 192 lines, 5 sections flagged for extraction into docs/ (Skills, Policy System, Policy Files, Hooks Reference, MCP Servers). README extraction requires Serena tools per the skill protocol — content judgment is needed to split correctly.
|
|
193
|
+
2. **CHANGELOG.md STALE** — `package.json` is at v2.4.0 but the latest CHANGELOG entry is v2.0.0. Versions 2.1.0, 2.2.0, 2.3.0, and 2.4.0 are undocumented.
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
{
|
|
2
|
+
"expectations": [
|
|
3
|
+
{
|
|
4
|
+
"text": "Ran doc_structure_analyzer.py with --fix flag",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "result.md Command Executed section shows: `python3 /home/dawid/projects/xtrm-tools/skills/sync-docs/scripts/doc_structure_analyzer.py --fix --bd-remember --root=/home/dawid/projects/xtrm-tools/.claude/worktrees/agent-ad500878`"
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
"text": "Handled both MISSING scaffolds AND INVALID_SCHEMA files (or correctly reported none found)",
|
|
10
|
+
"passed": true,
|
|
11
|
+
"evidence": "result.md shows 5 MISSING scaffold files created (hooks.md, pi-extensions.md, mcp-servers.md, policies.md, skills.md) and 7 INVALID_SCHEMA files fixed by injecting frontmatter (cleanup.md, delegation-architecture.md, hook-system-summary.md, mcp-servers-config.md, pi-extensions-migration.md, pre-install-cleanup.md, todo.md). Both categories were handled."
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"text": "Ran bd remember and reported the memory key",
|
|
15
|
+
"passed": true,
|
|
16
|
+
"evidence": "result.md bd Memory section reports key `sync-docs-fix-2026-03-18` and the full insight string. The script output JSON includes `bd_remember.key = 'sync-docs-fix-2026-03-18'`. Note: `stored: false` because no `.beads/` directory existed in the worktree, but the key was computed and reported, which satisfies the expectation."
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"text": "Ran validate_doc.py docs/ after fixing to confirm results",
|
|
20
|
+
"passed": true,
|
|
21
|
+
"evidence": "result.md validate_doc.py docs/ Result section shows all 12 files passed (12/12) with exit code 0, covering both the 5 newly created scaffolds and the 7 schema-fixed files."
|
|
22
|
+
}
|
|
23
|
+
],
|
|
24
|
+
"summary": {
|
|
25
|
+
"passed": 4,
|
|
26
|
+
"failed": 0,
|
|
27
|
+
"total": 4,
|
|
28
|
+
"pass_rate": 1.0
|
|
29
|
+
},
|
|
30
|
+
"execution_metrics": {},
|
|
31
|
+
"timing": {
|
|
32
|
+
"executor_duration_seconds": 120.0,
|
|
33
|
+
"grader_duration_seconds": 0.0,
|
|
34
|
+
"total_duration_seconds": 120.0
|
|
35
|
+
},
|
|
36
|
+
"claims": [
|
|
37
|
+
{
|
|
38
|
+
"claim": "5 missing scaffold files were created",
|
|
39
|
+
"type": "factual",
|
|
40
|
+
"verified": true,
|
|
41
|
+
"evidence": "Script output lists `fix_created` with 5 entries: hooks.md, pi-extensions.md, mcp-servers.md, policies.md, skills.md. validate_doc.py confirms these 5 files passed."
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"claim": "7 INVALID_SCHEMA files had frontmatter injected",
|
|
45
|
+
"type": "factual",
|
|
46
|
+
"verified": true,
|
|
47
|
+
"evidence": "Script output lists `fix_schema_fixed` with 7 entries. validate_doc.py confirms all 7 passed with exit code 0."
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"claim": "bd memory could not be persisted because no .beads/ directory exists in the worktree",
|
|
51
|
+
"type": "factual",
|
|
52
|
+
"verified": true,
|
|
53
|
+
"evidence": "result.md states: 'Stored: false \u2014 bd could not persist because no .beads/ directory exists in the worktree (the script guards on (root / \".beads\").exists())'"
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"claim": "All 12 docs/ files passed validation after fixes",
|
|
57
|
+
"type": "quality",
|
|
58
|
+
"verified": true,
|
|
59
|
+
"evidence": "validate_doc.py output shows 12/12 files passed with exit code 0."
|
|
60
|
+
}
|
|
61
|
+
],
|
|
62
|
+
"user_notes_summary": {},
|
|
63
|
+
"eval_feedback": {
|
|
64
|
+
"suggestions": [
|
|
65
|
+
{
|
|
66
|
+
"assertion": "Ran bd remember and reported the memory key",
|
|
67
|
+
"reason": "The assertion passes even though bd storage failed (`stored: false`). If the intent is to verify that the memory was actually persisted (not just that the key was computed), the assertion should check that `stored: true`. Consider splitting into 'reported the key' and 'successfully stored the memory' if persistence matters for the eval."
|
|
68
|
+
}
|
|
69
|
+
],
|
|
70
|
+
"overall": "Three of the four assertions are well-targeted and discriminating. The bd remember assertion is ambiguous about whether successful storage is required \u2014 worth clarifying."
|
|
71
|
+
}
|
|
72
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"total_tokens": 25000, "duration_ms": 120000, "total_duration_seconds": 120.0}
|