xtrm-tools 2.4.1 → 2.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. package/README.md +15 -6
  2. package/cli/dist/index.cjs +738 -239
  3. package/cli/dist/index.cjs.map +1 -1
  4. package/cli/package.json +1 -1
  5. package/config/hooks.json +10 -0
  6. package/config/pi/extensions/core/adapter.ts +2 -14
  7. package/config/pi/extensions/core/guard-rules.ts +70 -0
  8. package/config/pi/extensions/core/session-state.ts +59 -0
  9. package/config/pi/extensions/main-guard.ts +10 -14
  10. package/config/pi/extensions/plan-mode/README.md +65 -0
  11. package/config/pi/extensions/plan-mode/index.ts +340 -0
  12. package/config/pi/extensions/plan-mode/utils.ts +168 -0
  13. package/config/pi/extensions/service-skills.ts +51 -7
  14. package/config/pi/extensions/session-flow.ts +117 -0
  15. package/hooks/beads-claim-sync.mjs +140 -14
  16. package/hooks/beads-compact-restore.mjs +41 -9
  17. package/hooks/beads-compact-save.mjs +36 -5
  18. package/hooks/beads-gate-messages.mjs +27 -1
  19. package/hooks/beads-memory-gate.mjs +24 -16
  20. package/hooks/beads-stop-gate.mjs +58 -8
  21. package/hooks/guard-rules.mjs +117 -0
  22. package/hooks/hooks.json +28 -18
  23. package/hooks/main-guard.mjs +22 -22
  24. package/hooks/quality-check.cjs +1286 -0
  25. package/hooks/quality-check.py +345 -0
  26. package/hooks/session-state.mjs +138 -0
  27. package/package.json +2 -1
  28. package/project-skills/quality-gates/.claude/settings.json +1 -24
  29. package/skills/creating-service-skills/SKILL.md +433 -0
  30. package/skills/creating-service-skills/references/script_quality_standards.md +425 -0
  31. package/skills/creating-service-skills/references/service_skill_system_guide.md +278 -0
  32. package/skills/creating-service-skills/scripts/bootstrap.py +326 -0
  33. package/skills/creating-service-skills/scripts/deep_dive.py +304 -0
  34. package/skills/creating-service-skills/scripts/scaffolder.py +482 -0
  35. package/skills/scoping-service-skills/SKILL.md +231 -0
  36. package/skills/scoping-service-skills/scripts/scope.py +74 -0
  37. package/skills/sync-docs/SKILL.md +235 -0
  38. package/skills/sync-docs/evals/evals.json +89 -0
  39. package/skills/sync-docs/references/doc-structure.md +104 -0
  40. package/skills/sync-docs/references/schema.md +103 -0
  41. package/skills/sync-docs/scripts/context_gatherer.py +246 -0
  42. package/skills/sync-docs/scripts/doc_structure_analyzer.py +495 -0
  43. package/skills/sync-docs/scripts/validate_doc.py +365 -0
  44. package/skills/sync-docs-workspace/iteration-1/benchmark.json +293 -0
  45. package/skills/sync-docs-workspace/iteration-1/benchmark.md +13 -0
  46. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/eval_metadata.json +27 -0
  47. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/outputs/result.md +210 -0
  48. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/grading.json +28 -0
  49. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/with_skill/run-1/timing.json +1 -0
  50. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/outputs/result.md +101 -0
  51. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/grading.json +28 -0
  52. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/run-1/timing.json +5 -0
  53. package/skills/sync-docs-workspace/iteration-1/eval-doc-audit/without_skill/timing.json +5 -0
  54. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/eval_metadata.json +27 -0
  55. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/outputs/result.md +198 -0
  56. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/grading.json +28 -0
  57. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/with_skill/run-1/timing.json +1 -0
  58. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/outputs/result.md +94 -0
  59. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/grading.json +28 -0
  60. package/skills/sync-docs-workspace/iteration-1/eval-fix-mode/without_skill/run-1/timing.json +1 -0
  61. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/eval_metadata.json +27 -0
  62. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/outputs/result.md +237 -0
  63. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/grading.json +28 -0
  64. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
  65. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/outputs/result.md +134 -0
  66. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/grading.json +28 -0
  67. package/skills/sync-docs-workspace/iteration-1/eval-sprint-closeout/without_skill/run-1/timing.json +1 -0
  68. package/skills/sync-docs-workspace/iteration-2/benchmark.json +297 -0
  69. package/skills/sync-docs-workspace/iteration-2/benchmark.md +13 -0
  70. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/eval_metadata.json +27 -0
  71. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/outputs/result.md +137 -0
  72. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/grading.json +92 -0
  73. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/with_skill/run-1/timing.json +1 -0
  74. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/outputs/result.md +134 -0
  75. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/grading.json +86 -0
  76. package/skills/sync-docs-workspace/iteration-2/eval-doc-audit/without_skill/run-1/timing.json +1 -0
  77. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/eval_metadata.json +27 -0
  78. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/outputs/result.md +193 -0
  79. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/grading.json +72 -0
  80. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/with_skill/run-1/timing.json +1 -0
  81. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/outputs/result.md +211 -0
  82. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/grading.json +91 -0
  83. package/skills/sync-docs-workspace/iteration-2/eval-fix-mode/without_skill/run-1/timing.json +5 -0
  84. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/eval_metadata.json +27 -0
  85. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/outputs/result.md +182 -0
  86. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
  87. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/with_skill/run-1/timing.json +1 -0
  88. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/outputs/result.md +222 -0
  89. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/grading.json +88 -0
  90. package/skills/sync-docs-workspace/iteration-2/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
  91. package/skills/sync-docs-workspace/iteration-3/benchmark.json +298 -0
  92. package/skills/sync-docs-workspace/iteration-3/benchmark.md +13 -0
  93. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/eval_metadata.json +27 -0
  94. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/outputs/result.md +125 -0
  95. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/grading.json +97 -0
  96. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/with_skill/run-1/timing.json +5 -0
  97. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/outputs/result.md +144 -0
  98. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/grading.json +78 -0
  99. package/skills/sync-docs-workspace/iteration-3/eval-doc-audit/without_skill/run-1/timing.json +5 -0
  100. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/eval_metadata.json +27 -0
  101. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/outputs/result.md +104 -0
  102. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/grading.json +91 -0
  103. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/with_skill/run-1/timing.json +5 -0
  104. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/outputs/result.md +79 -0
  105. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/grading.json +82 -0
  106. package/skills/sync-docs-workspace/iteration-3/eval-fix-mode/without_skill/run-1/timing.json +5 -0
  107. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/eval_metadata.json +27 -0
  108. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase1_context.json +302 -0
  109. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase2_drift.txt +33 -0
  110. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase3_analysis.json +114 -0
  111. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase4_fix.txt +118 -0
  112. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/phase5_validate.txt +38 -0
  113. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/outputs/result.md +158 -0
  114. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/grading.json +95 -0
  115. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/with_skill/run-1/timing.json +5 -0
  116. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/outputs/result.md +71 -0
  117. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/grading.json +90 -0
  118. package/skills/sync-docs-workspace/iteration-3/eval-sprint-closeout/without_skill/run-1/timing.json +5 -0
  119. package/skills/updating-service-skills/SKILL.md +136 -0
  120. package/skills/updating-service-skills/scripts/drift_detector.py +222 -0
  121. package/skills/using-quality-gates/SKILL.md +254 -0
  122. package/skills/using-service-skills/SKILL.md +108 -0
  123. package/skills/using-service-skills/scripts/cataloger.py +74 -0
  124. package/skills/using-service-skills/scripts/skill_activator.py +152 -0
  125. package/skills/using-service-skills/scripts/test_skill_activator.py +58 -0
  126. package/skills/using-xtrm/SKILL.md +34 -38
@@ -0,0 +1,365 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Validate and generate schema-compliant docs/ files.
4
+
5
+ Schema for docs/ files:
6
+ Required: title, scope, category, version, updated
7
+ Optional: description, source_of_truth_for (glob list), tracks (glob list), domain (tag list)
8
+
9
+ Usage:
10
+ validate_doc.py <file_or_dir> # validate one file or all *.md in dir
11
+ validate_doc.py --generate <path> # generate scaffold + required flags below
12
+ --title="..."
13
+ --scope="..."
14
+ --category="reference|guide|architecture|api|plan|overview"
15
+ --source-for="glob1,glob2" # optional
16
+ --description="..." # optional
17
+ """
18
+
19
+ import sys
20
+ import re
21
+ import json
22
+ from pathlib import Path
23
+ from datetime import date
24
+
25
+ # ── Schema ────────────────────────────────────────────────────────────────────
26
+
27
# One-line explanation for every accepted ``category`` value. The keys of this
# table are the single source of truth for which categories are valid.
CATEGORY_DESCRIPTIONS = dict(
    reference="Look-up table, cheat sheet, or technical specification",
    guide="How-to documentation with step-by-step instructions",
    architecture="System design, component relationships, high-level overview",
    api="API contracts, interfaces, or data schemas",
    plan="Implementation plan or roadmap",
    overview="Summary introduction to a subsystem",
)

# Accepted ``category`` values, in the same order as the table above.
VALID_CATEGORIES = list(CATEGORY_DESCRIPTIONS)

# Frontmatter keys every docs/ file must declare.
REQUIRED_FIELDS = "title scope category version updated".split()
38
+
39
+
40
+ # ── Frontmatter helpers ───────────────────────────────────────────────────────
41
+
42
+ def extract_frontmatter(content: str) -> dict | None:
43
+ """Parse simple YAML frontmatter without external dependencies.
44
+
45
+ Handles scalar values, quoted strings, and list fields (- item syntax).
46
+ """
47
+ m = re.match(r"^---\n(.*?)\n---\n", content, re.DOTALL)
48
+ if not m:
49
+ return None
50
+
51
+ result: dict[str, object] = {}
52
+ current_key: str | None = None
53
+ current_list: list[str] | None = None
54
+
55
+ for line in m.group(1).splitlines():
56
+ # List item under current key
57
+ if current_list is not None and re.match(r"^\s+-\s+", line):
58
+ current_list.append(re.sub(r"^\s+-\s+", "", line).strip().strip('"\''))
59
+ continue
60
+
61
+ # New key: value line
62
+ kv = re.match(r'^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*(.*)', line)
63
+ if kv:
64
+ # Flush previous list
65
+ if current_key is not None and current_list is not None:
66
+ result[current_key] = current_list
67
+
68
+ current_key = kv.group(1)
69
+ raw_val = kv.group(2).strip()
70
+
71
+ if raw_val == "" or raw_val == ">-":
72
+ # Value on following lines (list or multiline) — start list
73
+ current_list = []
74
+ elif raw_val.startswith("["):
75
+ # Inline list: [a, b, c]
76
+ current_list = None
77
+ inner = raw_val.strip("[]")
78
+ result[current_key] = [v.strip().strip('"\'') for v in inner.split(",") if v.strip()]
79
+ else:
80
+ current_list = None
81
+ result[current_key] = raw_val.strip('"\'')
82
+ else:
83
+ # Continuation line for multiline scalar — ignore for our purposes
84
+ pass
85
+
86
+ # Flush trailing list
87
+ if current_key is not None and current_list is not None:
88
+ result[current_key] = current_list
89
+
90
+ return result
91
+
92
+
93
def extract_headings(content: str) -> list[tuple[str, str]]:
    """Return ``(heading, first_sentence)`` for every ``## `` section.

    The summary is the text up to the first ``.`` of the first non-blank,
    non-heading line inside the section, capped at 120 characters. Lines in
    fenced code blocks are skipped both when detecting headings and when
    looking for the summary. A section with no body text before the next
    ``# `` / ``## `` heading gets an empty summary instead of borrowing text
    from the following section.

    Args:
        content: Markdown document text.

    Returns:
        One ``(heading, summary)`` tuple per level-2 heading, in order.
    """
    lines = content.splitlines()
    results: list[tuple[str, str]] = []
    in_fence = False

    for i, line in enumerate(lines):
        if line.strip().startswith("```"):
            in_fence = not in_fence
            continue
        # "## " inside a code fence is sample text, not a section heading.
        if in_fence or not line.startswith("## "):
            continue

        heading = line[3:].strip()
        summary = ""
        in_code = False
        for j in range(i + 1, len(lines)):
            raw = lines[j]
            ln = raw.strip()
            if ln.startswith("```"):
                in_code = not in_code
                continue
            if in_code or not ln:
                continue
            if raw.startswith("## ") or raw.startswith("# "):
                # Next sibling/parent section reached: this one has no body.
                break
            if ln.startswith("#"):
                # Sub-heading: keep looking for text beneath it.
                continue
            summary = ln.split(".")[0].strip()[:120]
            break
        results.append((heading, summary))

    return results
118
+
119
+
120
def make_anchor(heading: str) -> str:
    """Generate a GitHub-compatible anchor from a heading string.

    Mirrors GitHub's heading slug rules: lowercase the text, drop every
    character that is not a letter, digit, underscore, hyphen or space, then
    turn each space into a hyphen. Runs of hyphens are kept as-is (GitHub does
    not collapse them, so ``"Foo & Bar"`` becomes ``foo--bar``).

    Args:
        heading: Heading text without the leading ``#`` markers.

    Returns:
        The anchor fragment, without the leading ``#``.
    """
    anchor = heading.strip().lower()
    anchor = re.sub(r"[^\w\- ]", "", anchor)  # strip punctuation, keep _ and -
    return anchor.replace(" ", "-")           # each space → one hyphen
127
+
128
+
129
def generate_index_table(headings: list[tuple[str, str]]) -> str:
    """Render the section index as a two-column Markdown table.

    Args:
        headings: ``(heading, summary)`` pairs; an empty summary is shown as
            ``_no summary_``.

    Returns:
        The table text, terminated by a newline. Pipe characters in headings
        and summaries are backslash-escaped so they cannot split a table cell.
    """
    def cell(text: str) -> str:
        # A literal "|" would otherwise start a new column in the row.
        return text.replace("|", "\\|")

    rows = ["| Section | Summary |", "|---|---|"]
    for heading, summary in headings:
        rows.append(
            f"| [{cell(heading)}](#{make_anchor(heading)}) | {cell(summary) or '_no summary_'} |"
        )
    return "\n".join(rows) + "\n"
134
+
135
+
136
def inject_index(content: str, table: str) -> str:
    """Insert or refresh the auto-generated INDEX block in a document.

    Placement rules, in order:
      1. An existing ``<!-- INDEX: ... -->`` … ``<!-- END INDEX -->`` block is
         replaced in place.
      2. Otherwise the block goes right after the frontmatter, separated by
         blank lines.
      3. Otherwise it is prepended to the document.

    Args:
        content: Current document text.
        table: Rendered index table (expected to end with a newline).

    Returns:
        The document text with the INDEX block in place.
    """
    block = "".join((
        "<!-- INDEX: auto-generated by validate_doc.py — do not edit manually -->\n",
        table,
        "<!-- END INDEX -->",
    ))

    previous = re.search(r"<!-- INDEX:.*?-->.*?<!-- END INDEX -->", content, re.DOTALL)
    if previous is not None:
        start, stop = previous.span()
        return f"{content[:start]}{block}{content[stop:]}"

    frontmatter = re.match(r"^(---\n.*?\n---\n)(.*)", content, re.DOTALL)
    if frontmatter is None:
        return f"{block}\n{content}"

    split_at = frontmatter.end(1)
    return f"{content[:split_at]}\n{block}\n{content[split_at:]}"
150
+
151
+
152
+ # ── Validation ────────────────────────────────────────────────────────────────
153
+
154
def validate_file(path: Path) -> tuple[bool, list[str], list[str]]:
    """Validate one docs/ file against the frontmatter schema.

    Side effect: when the file has no errors and contains ``## `` sections,
    its INDEX block is regenerated and the file is rewritten if that changes
    the content (reported as an ``INDEX regenerated`` warning).

    Args:
        path: Markdown file to check.

    Returns:
        ``(passed, errors, warnings)`` where ``passed`` is true only when
        ``errors`` is empty.
    """
    errors: list[str] = []
    warnings: list[str] = []

    if not path.exists():
        return False, [f"File not found: {path}"], []

    content = path.read_text(encoding="utf-8")
    fm = extract_frontmatter(content)

    if fm is None:
        errors.append("Missing or invalid YAML frontmatter (wrap in --- markers)")
        return False, errors, warnings

    # Required fields must be present AND non-empty. The parser yields "" for
    # `key: ""` and [] for a bare `key:`, so both count as empty here.
    blank: set[str] = set()
    for field in REQUIRED_FIELDS:
        if field not in fm:
            errors.append(f"Missing required field: {field}")
        elif fm[field] in ("", []):
            blank.add(field)
            errors.append(f"Required field is empty: {field}")

    # version format (skipped when already reported as empty)
    if "version" in fm and "version" not in blank:
        if not re.match(r"^\d+\.\d+\.\d+$", str(fm["version"])):
            errors.append(f"version must be semver (x.y.z), got: {fm['version']}")

    # updated format: only the leading YYYY-MM-DD is checked
    if "updated" in fm and "updated" not in blank:
        if not re.match(r"^\d{4}-\d{2}-\d{2}", str(fm["updated"])):
            warnings.append(f"updated should be ISO date (YYYY-MM-DD), got: {fm['updated']}")

    # category valid
    if "category" in fm and "category" not in blank and fm["category"] not in VALID_CATEGORIES:
        errors.append(
            f"category '{fm['category']}' not valid. Choose from: {', '.join(VALID_CATEGORIES)}"
        )

    # domain is list
    if "domain" in fm and not isinstance(fm["domain"], list):
        errors.append("domain must be a list, e.g. [hooks, claude]")

    # source_of_truth_for and tracks are lists of globs
    if "source_of_truth_for" in fm and not isinstance(fm["source_of_truth_for"], list):
        errors.append("source_of_truth_for must be a list of glob patterns")
    if "tracks" in fm and not isinstance(fm["tracks"], list):
        errors.append("tracks must be a list of glob patterns")

    # Regenerate INDEX only for files that passed every check
    if not errors:
        headings = extract_headings(content)
        if headings:
            table = generate_index_table(headings)
            new_content = inject_index(content, table)
            if new_content != content:
                path.write_text(new_content, encoding="utf-8")
                warnings.append("INDEX regenerated")

    return len(errors) == 0, errors, warnings
211
+
212
+
213
def validate_directory(docs_dir: Path) -> dict:
    """Validate every ``*.md`` file directly inside ``docs_dir``.

    Args:
        docs_dir: Directory to scan (not recursive).

    Returns:
        A dict keyed by each file's path relative to ``docs_dir``'s parent,
        in sorted file order, whose values hold ``passed``, ``errors`` and
        ``warnings`` as returned by ``validate_file``.
    """
    base = docs_dir.parent
    field_names = ("passed", "errors", "warnings")
    report = {}
    for md_file in sorted(docs_dir.glob("*.md")):
        outcome = validate_file(md_file)
        report[str(md_file.relative_to(base))] = dict(zip(field_names, outcome))
    return report
223
+
224
+
225
def print_file_result(path: str, passed: bool, errors: list[str], warnings: list[str]) -> None:
    """Print the validation outcome for one file to stdout.

    Output is a header line with the path and PASS/FAIL status, followed by
    one line per error, one line per warning, and a closing confirmation when
    the file passed without warnings.
    """
    # NOTE(review): the pass and fail markers are both empty strings in this
    # copy of the file — presumably status glyphs were lost; confirm upstream.
    mark = ""
    status = "FAIL" if not passed else "PASS"

    report = [f"\n{mark} {path} [{status}]"]
    report.extend(f" ERROR: {e}" for e in errors)
    report.extend(f" WARN: {w}" for w in warnings)
    if passed and not warnings:
        report.append(" All checks passed.")
    print("\n".join(report))
235
+
236
+
237
+ # ── Generator ─────────────────────────────────────────────────────────────────
238
+
239
# str.format() template for a new docs/ file: frontmatter, an empty INDEX
# block, the H1 title, a category blurb and a placeholder Overview section.
# The three optional *_field placeholders are either "" or complete
# newline-terminated YAML entries, which is why they share a line with
# ``domain``.
SCAFFOLD_TEMPLATE = (
    "---\n"
    "title: {title}\n"
    "scope: {scope}\n"
    "category: {category}\n"
    "version: 1.0.0\n"
    "updated: {today}\n"
    "{source_field}{tracks_field}{description_field}domain: []\n"
    "---\n"
    "\n"
    "<!-- INDEX: auto-generated by validate_doc.py — do not edit manually -->\n"
    "<!-- END INDEX -->\n"
    "\n"
    "# {title}\n"
    "\n"
    "> {category_desc}\n"
    "\n"
    "## Overview\n"
    "\n"
    "_Describe what this document covers._\n"
    "\n"
)
261
+
262
+
263
def generate_scaffold(output_path: Path, title: str, scope: str, category: str,
                      source_for: list[str], description: str) -> None:
    """Write a schema-compliant scaffold document to ``output_path``.

    Parent directories are created as needed and an existing file is
    overwritten. The path written is printed to stdout.

    Args:
        output_path: Destination file.
        title: Document title (frontmatter and H1).
        scope: Value for the ``scope`` field.
        category: Value for the ``category`` field; its description from
            ``CATEGORY_DESCRIPTIONS`` is used as the blurb, falling back to
            the category itself.
        source_for: Glob patterns emitted as both ``source_of_truth_for`` and
            ``tracks``; omitted when empty.
        description: Optional ``description`` value; omitted when empty.
    """
    def quoted(text: str) -> str:
        # JSON string syntax is a valid YAML double-quoted scalar, so this
        # escapes embedded quotes/backslashes that would otherwise produce
        # broken frontmatter. Plain values render exactly as before.
        return json.dumps(text, ensure_ascii=False)

    source_field = ""
    tracks_field = ""
    if source_for:
        items = "\n".join(f" - {quoted(g)}" for g in source_for)
        source_field = f"source_of_truth_for:\n{items}\n"
        # tracks: mirrors source_of_truth_for so drift_detector.py picks up changes
        tracks_field = f"tracks:\n{items}\n"

    desc_field = f"description: {quoted(description)}\n" if description else ""
    category_desc = CATEGORY_DESCRIPTIONS.get(category, category)

    content = SCAFFOLD_TEMPLATE.format(
        title=title,
        scope=scope,
        category=category,
        today=date.today().isoformat(),
        source_field=source_field,
        tracks_field=tracks_field,
        description_field=desc_field,
        category_desc=category_desc,
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content, encoding="utf-8")
    print(f"Generated: {output_path}")
290
+
291
+
292
+ # ── Entry point ───────────────────────────────────────────────────────────────
293
+
294
def main() -> None:
    """Command-line entry point.

    Modes:
      * ``--generate <path> --title=... --scope=... --category=...`` writes a
        scaffold (optional ``--source-for=glob1,glob2`` and
        ``--description=...``) and exits 0.
      * ``<file_or_dir>`` validates one file, or every ``*.md`` directly
        inside a directory, and exits 0 only when everything passed.

    Exits with status 1 on usage errors or failed validation.
    """
    args = sys.argv[1:]

    if not args:
        print("Usage:")
        print(" validate_doc.py <file_or_dir>")
        print(" validate_doc.py --generate <path> --title=... --scope=... --category=...")
        sys.exit(1)

    # Generate mode
    if "--generate" in args:
        idx = args.index("--generate")
        # The path must follow --generate directly; another flag in that slot
        # would otherwise be used as the output file name.
        if idx + 1 >= len(args) or args[idx + 1].startswith("--"):
            print("ERROR: --generate requires a path argument")
            sys.exit(1)
        output_path = Path(args[idx + 1])

        kw: dict[str, str] = {}
        source_for: list[str] = []
        for arg in args:
            name, sep, value = arg.partition("=")
            if not sep:
                continue
            if name in ("--title", "--scope", "--category", "--description"):
                kw[name[2:]] = value
            elif name == "--source-for":
                # Drop empty segments so "--source-for=" or a trailing comma
                # does not emit an empty glob entry.
                source_for = [g.strip() for g in value.split(",") if g.strip()]

        for req in ["title", "scope", "category"]:
            if req not in kw:
                print(f"ERROR: --{req} is required for --generate")
                sys.exit(1)

        # Refuse categories the validator would reject in the generated file.
        if kw["category"] not in VALID_CATEGORIES:
            print(f"ERROR: --category must be one of: {', '.join(VALID_CATEGORIES)}")
            sys.exit(1)

        generate_scaffold(
            output_path,
            title=kw["title"],
            scope=kw["scope"],
            category=kw["category"],
            source_for=source_for,
            description=kw.get("description", ""),
        )
        sys.exit(0)

    # Validate mode
    target = Path(args[0])
    all_passed = True

    if target.is_dir():
        results = validate_directory(target)
        for path_str, res in results.items():
            print_file_result(path_str, res["passed"], res["errors"], res["warnings"])
            if not res["passed"]:
                all_passed = False
        if results:
            total = len(results)
            passed = sum(1 for r in results.values() if r["passed"])
            print(f"\nResult: {passed}/{total} files passed")
        else:
            print(f"No .md files found in {target}")
    else:
        passed, errors, warnings = validate_file(target)
        print_file_result(str(target), passed, errors, warnings)
        all_passed = passed

    sys.exit(0 if all_passed else 1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,293 @@
1
+ {
2
+ "metadata": {
3
+ "skill_name": "sync-docs",
4
+ "skill_path": "<path/to/skill>",
5
+ "executor_model": "<model-name>",
6
+ "analyzer_model": "<model-name>",
7
+ "timestamp": "2026-03-18T07:43:29Z",
8
+ "evals_run": [
9
+ 1,
10
+ 2,
11
+ 3
12
+ ],
13
+ "runs_per_configuration": 3
14
+ },
15
+ "runs": [
16
+ {
17
+ "eval_id": 3,
18
+ "configuration": "with_skill",
19
+ "run_number": 1,
20
+ "result": {
21
+ "pass_rate": 0.75,
22
+ "passed": 3,
23
+ "failed": 1,
24
+ "total": 4,
25
+ "time_seconds": 0.0,
26
+ "tokens": 0,
27
+ "tool_calls": 0,
28
+ "errors": 0
29
+ },
30
+ "expectations": [
31
+ {
32
+ "text": "Ran doc_structure_analyzer.py and referenced its structured output",
33
+ "passed": true,
34
+ "evidence": "Ran doc_structure_analyzer.py, quoted its full structured output including EXTRACTABLE status, extraction candidates list, MISSING files, and INVALID_SCHEMA count."
35
+ },
36
+ {
37
+ "text": "Named specific README sections with their suggested docs/ destination",
38
+ "passed": true,
39
+ "evidence": "Named: '## Policy System \u2192 docs/policies.md', '## MCP Servers \u2192 docs/mcp-servers.md', pi-extensions.md, plus context about CHANGELOG 6-day gap."
40
+ },
41
+ {
42
+ "text": "Report is actionable \u2014 tells user exactly what to do next, not just observations",
43
+ "passed": true,
44
+ "evidence": "Report includes structured phase output, specific file names, notes CHANGELOG gap with exact dates, and references the 6-day staleness."
45
+ },
46
+ {
47
+ "text": "Did not edit or create any files (audit only)",
48
+ "passed": false,
49
+ "evidence": "Agent ran --fix (created docs/pi-extensions.md, docs/mcp-servers.md, docs/policies.md) despite task being audit-only. Skill instructions for Phase 3 show the --fix command without making clear it is only for execute mode."
50
+ }
51
+ ],
52
+ "notes": []
53
+ },
54
+ {
55
+ "eval_id": 2,
56
+ "configuration": "with_skill",
57
+ "run_number": 1,
58
+ "result": {
59
+ "pass_rate": 0.75,
60
+ "passed": 3,
61
+ "failed": 1,
62
+ "total": 4,
63
+ "time_seconds": 0.0,
64
+ "tokens": 0,
65
+ "tool_calls": 0,
66
+ "errors": 0
67
+ },
68
+ "expectations": [
69
+ {
70
+ "text": "Ran doc_structure_analyzer.py with --fix flag",
71
+ "passed": true,
72
+ "evidence": "Ran `python3 skills/sync-docs/scripts/doc_structure_analyzer.py --fix --bd-remember` and included full output"
73
+ },
74
+ {
75
+ "text": "Ran with --bd-remember or manually ran bd remember with a summary",
76
+ "passed": true,
77
+ "evidence": "bd remember stored with key 'sync-docs-fix-2026-03-18', confirmed stored:true in output JSON"
78
+ },
79
+ {
80
+ "text": "At least one scaffold file was created in docs/",
81
+ "passed": true,
82
+ "evidence": "Created docs/pi-extensions.md, docs/mcp-servers.md, docs/policies.md with valid frontmatter"
83
+ },
84
+ {
85
+ "text": "Ran validate_doc.py on created files to confirm schema",
86
+ "passed": false,
87
+ "evidence": "Report notes 7 INVALID_SCHEMA files exist but does not show validate_doc.py being run explicitly to confirm the 3 new files pass. Only the JSON output showing valid frontmatter is evidence."
88
+ }
89
+ ],
90
+ "notes": []
91
+ },
92
+ {
93
+ "eval_id": 1,
94
+ "configuration": "with_skill",
95
+ "run_number": 1,
96
+ "result": {
97
+ "pass_rate": 1.0,
98
+ "passed": 4,
99
+ "failed": 0,
100
+ "total": 4,
101
+ "time_seconds": 0.0,
102
+ "tokens": 0,
103
+ "tool_calls": 0,
104
+ "errors": 0
105
+ },
106
+ "expectations": [
107
+ {
108
+ "text": "Ran context_gatherer.py and reported bd closed issues or merged PRs from the output",
109
+ "passed": true,
110
+ "evidence": "Ran context_gatherer.py, reported 20 bd closed issues with IDs and titles, 3 merged PRs with SHAs and dates, 15 recent commits"
111
+ },
112
+ {
113
+ "text": "Ran doc_structure_analyzer.py and used its output to identify doc issues",
114
+ "passed": true,
115
+ "evidence": "Ran doc_structure_analyzer.py, referenced MISSING status for docs/pi-extensions.md, hooks.md, mcp-servers.md, policies.md, skills.md and EXTRACTABLE for README"
116
+ },
117
+ {
118
+ "text": "Produced at least one concrete recommendation or action (not just a vague summary)",
119
+ "passed": true,
120
+ "evidence": "Named specific files: docs/pi-extensions.md, docs/hooks.md, docs/mcp-servers.md, docs/policies.md with explicit next steps for each"
121
+ },
122
+ {
123
+ "text": "Used the skill scripts rather than just reading files manually",
124
+ "passed": true,
125
+ "evidence": "Ran 3 scripts (context_gatherer.py, drift_detector.py, doc_structure_analyzer.py) with explicit output included in report"
126
+ }
127
+ ],
128
+ "notes": []
129
+ },
130
+ {
131
+ "eval_id": 3,
132
+ "configuration": "without_skill",
133
+ "run_number": 1,
134
+ "result": {
135
+ "pass_rate": 0.75,
136
+ "passed": 3,
137
+ "failed": 1,
138
+ "total": 4,
139
+ "time_seconds": 72.5,
140
+ "tokens": 21934,
141
+ "tool_calls": 0,
142
+ "errors": 0
143
+ },
144
+ "expectations": [
145
+ {
146
+ "text": "Ran doc_structure_analyzer.py and referenced its structured output",
147
+ "passed": false,
148
+ "evidence": "Did not run doc_structure_analyzer.py. All findings came from manual README.md reads with line numbers."
149
+ },
150
+ {
151
+ "text": "Named specific README sections with their suggested docs/ destination",
152
+ "passed": true,
153
+ "evidence": "Named 6 specific sections with line numbers: Hooks Reference (114-141)\u2192docs/hooks.md, Policy System (66-87)\u2192new docs/policies.md, MCP Servers (143-158)\u2192docs/mcp.md, CLI Commands (89-111)\u2192XTRM-GUIDE.md, Version History (179-188)\u2192remove, Plugin Structure (52-63)\u2192borderline."
154
+ },
155
+ {
156
+ "text": "Report is actionable \u2014 tells user exactly what to do next, not just observations",
157
+ "passed": true,
158
+ "evidence": "Each section has a specific Recommendation: block with exact action (Remove section, Add single link, Create docs/policies.md, etc.). Estimated README would shrink from 193 to 60-70 lines."
159
+ },
160
+ {
161
+ "text": "Did not edit or create any files (audit only)",
162
+ "passed": true,
163
+ "evidence": "Report explicitly states no files were modified. Audit-only as instructed."
164
+ }
165
+ ],
166
+ "notes": []
167
+ },
168
+ {
169
+ "eval_id": 2,
170
+ "configuration": "without_skill",
171
+ "run_number": 1,
172
+ "result": {
173
+ "pass_rate": 1.0,
174
+ "passed": 4,
175
+ "failed": 0,
176
+ "total": 4,
177
+ "time_seconds": 0.0,
178
+ "tokens": 0,
179
+ "tool_calls": 0,
180
+ "errors": 0
181
+ },
182
+ "expectations": [
183
+ {
184
+ "text": "Ran doc_structure_analyzer.py with --fix flag",
185
+ "passed": true,
186
+ "evidence": "Agent found the skill in the repo and ran doc_structure_analyzer.py --fix. However, found no MISSING gaps because with_skill run had already created those files (confounded test)."
187
+ },
188
+ {
189
+ "text": "Ran with --bd-remember or manually ran bd remember with a summary",
190
+ "passed": true,
191
+ "evidence": "Agent ran bd remember with key 'sync-docs-fix-schema-2026-03-18' summarizing the frontmatter additions made to 7 files."
192
+ },
193
+ {
194
+ "text": "At least one scaffold file was created in docs/",
195
+ "passed": true,
196
+ "evidence": "Added YAML frontmatter to 7 existing docs/ files (hooks.md, mcp.md, pre-install-cleanup.md, project-skills.md, skills.md, testing.md, todo.md). Different action than creating scaffolds but valid given scaffolds already existed."
197
+ },
198
+ {
199
+ "text": "Ran validate_doc.py on created files to confirm schema",
200
+ "passed": true,
201
+ "evidence": "Ran validate_doc.py docs/ \u2014 7/7 files passed after frontmatter additions."
202
+ }
203
+ ],
204
+ "notes": []
205
+ },
206
+ {
207
+ "eval_id": 1,
208
+ "configuration": "without_skill",
209
+ "run_number": 1,
210
+ "result": {
211
+ "pass_rate": 0.25,
212
+ "passed": 1,
213
+ "failed": 3,
214
+ "total": 4,
215
+ "time_seconds": 0.0,
216
+ "tokens": 0,
217
+ "tool_calls": 0,
218
+ "errors": 0
219
+ },
220
+ "expectations": [
221
+ {
222
+ "text": "Ran context_gatherer.py and reported bd closed issues or merged PRs from the output",
223
+ "passed": false,
224
+ "evidence": "Did not run context_gatherer.py. Used git log manually. Reported 'No .beads/ DB was found' which is wrong \u2014 .beads/ exists. Missed all 20 closed bd issues."
225
+ },
226
+ {
227
+ "text": "Ran doc_structure_analyzer.py and used its output to identify doc issues",
228
+ "passed": false,
229
+ "evidence": "Did not run doc_structure_analyzer.py. Manually read README.md, package.json, and CHANGELOG.md."
230
+ },
231
+ {
232
+ "text": "Produced at least one concrete recommendation or action (not just a vague summary)",
233
+ "passed": true,
234
+ "evidence": "Found version mismatch (2.3.0 vs 2.4.1 in package.json), identified 7 undocumented branch commits in CHANGELOG, named specific line references."
235
+ },
236
+ {
237
+ "text": "Used the skill scripts rather than just reading files manually",
238
+ "passed": false,
239
+ "evidence": "No skill scripts were used. All findings came from manual git log, file reads, and README inspection."
240
+ }
241
+ ],
242
+ "notes": []
243
+ }
244
+ ],
245
+ "run_summary": {
246
+ "with_skill": {
247
+ "pass_rate": {
248
+ "mean": 0.8333,
249
+ "stddev": 0.1443,
250
+ "min": 0.75,
251
+ "max": 1.0
252
+ },
253
+ "time_seconds": {
254
+ "mean": 0.0,
255
+ "stddev": 0.0,
256
+ "min": 0.0,
257
+ "max": 0.0
258
+ },
259
+ "tokens": {
260
+ "mean": 0.0,
261
+ "stddev": 0.0,
262
+ "min": 0,
263
+ "max": 0
264
+ }
265
+ },
266
+ "without_skill": {
267
+ "pass_rate": {
268
+ "mean": 0.6667,
269
+ "stddev": 0.3819,
270
+ "min": 0.25,
271
+ "max": 1.0
272
+ },
273
+ "time_seconds": {
274
+ "mean": 24.1667,
275
+ "stddev": 41.8579,
276
+ "min": 0.0,
277
+ "max": 72.5
278
+ },
279
+ "tokens": {
280
+ "mean": 7311.3333,
281
+ "stddev": 12663.6008,
282
+ "min": 0,
283
+ "max": 21934
284
+ }
285
+ },
286
+ "delta": {
287
+ "pass_rate": "+0.17",
288
+ "time_seconds": "-24.2",
289
+ "tokens": "-7311"
290
+ }
291
+ },
292
+ "notes": []
293
+ }
@@ -0,0 +1,13 @@
1
+ # Skill Benchmark: sync-docs
2
+
3
+ **Model**: <model-name>
4
+ **Date**: 2026-03-18T07:43:29Z
5
+ **Evals**: 1, 2, 3 (3 runs each per configuration)
6
+
7
+ ## Summary
8
+
9
+ | Metric | With Skill | Without Skill | Delta |
10
+ |--------|------------|---------------|-------|
11
+ | Pass Rate | 83% ± 14% | 67% ± 38% | +0.17 |
12
+ | Time | 0.0s ± 0.0s | 24.2s ± 41.9s | -24.2s |
13
+ | Tokens | 0 ± 0 | 7311 ± 12664 | -7311 |