sdtk-wiki-kit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1110 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SDTK-WIKI Builder -- generic local-project edition.
4
+
5
+ Scans markdown files under configured scan roots, builds a document index
6
+ and graph, and generates a static local viewer.
7
+
8
+ Usage:
9
+ python build_atlas.py --project-root <path> --output-dir <path>
10
+ [--scan-root <path> ...] [--exclude <frag> ...]
11
+ [--verbose]
12
+
13
+ Outputs (written to <output-dir>/):
14
+ ATLAS_STATE.json - incremental scan/build state
15
+ SDTK_DOC_INDEX.json - full document index
16
+ SDTK_DOC_GRAPH.json - nodes + typed edges
17
+ SDTK_DOC_ATLAS_SUMMARY.md - human-readable summary
18
+ viewer.html - static local viewer (data embedded)
19
+ mermaid.min.js - vendored Mermaid runtime copied into <output-dir>
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import argparse
25
+ import hashlib
26
+ import json
27
+ import re
28
+ import shutil
29
+ import sys
30
+ from datetime import datetime, timezone
31
+ from pathlib import Path
32
+ from typing import Any
33
+
34
+ ATLAS_STATE_VERSION = 6
35
+ WIKI_PAGE_SCHEMA_VERSION = 1
36
+ WIKI_PROVENANCE_SCHEMA_VERSION = 1
37
+ MERMAID_VENDOR_PATH = Path(__file__).parent / "vendor" / "mermaid.min.js"
38
+ MERMAID_ASSET_NAME = "mermaid.min.js"
39
+ _VIEWER_TEMPLATE_PATH = Path(__file__).parent / "doc_atlas_viewer_template.html"
40
+
41
+
42
+ def _json_for_inline_script(value: Any) -> str:
43
+ return (
44
+ json.dumps(value, ensure_ascii=True, separators=(",", ":"))
45
+ .replace("</", "<\\/")
46
+ .replace("<!--", "<\\!--")
47
+ )
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Default consumer project exclude fragments
51
+ # ---------------------------------------------------------------------------
52
# Path fragments skipped by default when scanning a consumer project.
DEFAULT_EXCLUDE_FRAGS: list[str] = [
    # Version control and this tool's own generated workspaces.
    ".git",
    ".sdtk/wiki",
    ".sdtk/atlas",
    # Dependency and virtual-environment directories.
    "node_modules",
    ".venv",
    "venv",
    # Build output, reports and caches.
    "dist",
    "build",
    "coverage",
    ".next",
    ".turbo",
    ".cache",
    "__pycache__",
]
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Reference patterns
70
+ # ---------------------------------------------------------------------------
71
# Backlog issue ids such as "BK-123"; the capture is the numeric part.
RE_BK = re.compile(r"\bBK-(\d{3,})\b")

# Knowledge-object ids such as "KD-0001"; captures (prefix, four digits).
RE_KNOWLEDGE_ID = re.compile(r"\b(KD|KT|KP|KA|KR|KRB|KF)-(\d{4})\b")

# Repository-style relative paths ending in one of the listed extensions.
RE_REPO_PATH = re.compile(
    r"(?:^|[\s`(\[])([a-zA-Z0-9_\-]+(?:/[a-zA-Z0-9_\-. ]+)+\."
    r"(?:md|py|ps1|json|yaml|yml|html|txt))"
)

# [[wiki links]]; the capture is everything between the double brackets.
RE_WIKI_LINK = re.compile(r"\[\[([^\]]+)\]\]")

# [text](target) links; the lookbehind skips ![image](...) syntax.
RE_MARKDOWN_LINK = re.compile(r"(?<!!)\[[^\]]+\]\(([^)]+)\)")

# Skill names that start with "sdtk-".
RE_SKILL_REF = re.compile(r"\b(sdtk-[a-z0-9][a-z0-9-]*)\b")

# 0.x.y release numbers, optionally prefixed with "sdtk-spec-kit@".
RE_RELEASE_REF = re.compile(r"\b(?:sdtk-spec-kit@)?(0\.\d+\.\d+)\b")
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Generic doc-family classifier (project-scope, no maintainer assumptions)
85
+ # ---------------------------------------------------------------------------
86
def classify_family(rel: str) -> str:
    """Map a project-relative markdown path to a document family name.

    Rules are evaluated in order and the first match wins; anything that
    matches no rule is ``"other-markdown"``.

    Directory rules (``database``, ``specs``, ``api``, ...) match a whole path
    segment, the same way the guides rule does, so a directory such as
    ``okapi/`` or ``webdev/`` is not mistaken for ``api/`` or ``dev/``.
    """
    p = rel.replace("\\", "/").lower()
    # Take the basename from the normalised path so backslash-separated input
    # is handled the same on every platform.
    name = p.rsplit("/", 1)[-1]

    def in_dir(dirname: str) -> bool:
        # True when *dirname* is a complete directory segment of the path.
        return p.startswith(f"{dirname}/") or f"/{dirname}/" in p

    if p == "readme.md":
        return "root-readme"
    if "backlog" in name:
        return "backlog"
    if "skills" in p:
        return "skill"
    if "templates" in p:
        return "template"

    # (family, directory name) pairs, in precedence order.
    directory_families = (
        ("database", "database"),
        ("spec", "specs"),
        ("architecture", "architecture"),
        ("api", "api"),
        ("qa", "qa"),
        ("design", "design"),
        ("dev", "dev"),
        ("product", "product"),
    )
    for family, dirname in directory_families:
        if f"docs/{dirname}" in p or in_dir(dirname):
            return family

    if in_dir("guides"):
        return "guide"
    if "governance" in p:
        return "governance"
    return "other-markdown"
119
+
120
+
121
def classify_role(rel: str) -> str:
    """Return a coarse role for a document based on substrings of its path.

    The path is lower-cased with backslashes turned into slashes; the first
    role whose marker substring occurs in it wins, otherwise ``"other"``.
    """
    normalized = rel.replace("\\", "/").lower()
    role_markers = (
        ("governance", ("governance",)),
        ("spec-artifact", ("spec", "architecture")),
        ("skill", ("skill",)),
    )
    for role, markers in role_markers:
        if any(marker in normalized for marker in markers):
            return role
    return "other"
130
+
131
+
132
+ # ---------------------------------------------------------------------------
133
+ # Scanner helpers
134
+ # ---------------------------------------------------------------------------
135
+ def _now_utc() -> str:
136
+ return datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
137
+
138
+
139
+ def _write_text_lf(path: Path, content: str) -> None:
140
+ path.write_text(content, encoding="utf-8", newline="\n")
141
+
142
+
143
+ def _assert_inside(base: Path, target: Path) -> None:
144
+ resolved_base = base.resolve()
145
+ resolved_target = target.resolve()
146
+ if resolved_target != resolved_base and resolved_base not in resolved_target.parents:
147
+ raise ValueError(f"Refusing to write outside SDTK-WIKI workspace: {resolved_target}")
148
+
149
+
150
+ def _is_excluded(
151
+ path: Path,
152
+ root: Path,
153
+ exclude_frags: list[str],
154
+ ) -> bool:
155
+ try:
156
+ rel = path.relative_to(root).as_posix().lower()
157
+ except ValueError:
158
+ rel = path.as_posix().lower()
159
+ for frag in exclude_frags:
160
+ norm_frag = frag.replace("\\", "/").lower()
161
+ if norm_frag in rel:
162
+ return True
163
+ return False
164
+
165
+
166
+ def _extract_title(text: str) -> str:
167
+ for line in text.splitlines():
168
+ stripped = line.strip()
169
+ if stripped.startswith("# "):
170
+ return stripped[2:].strip()
171
+ return ""
172
+
173
+
174
+ def _extract_headings(text: str) -> list[str]:
175
+ headings: list[str] = []
176
+ for line in text.splitlines():
177
+ stripped = line.strip()
178
+ if not stripped.startswith("#"):
179
+ continue
180
+ level = len(stripped) - len(stripped.lstrip("#"))
181
+ if 1 <= level <= 6 and len(stripped) > level and stripped[level] == " ":
182
+ headings.append(stripped[level + 1:].strip())
183
+ return headings
184
+
185
+
186
+ def _parse_frontmatter(text: str) -> tuple[dict[str, Any], str]:
187
+ lines = text.splitlines()
188
+ if not lines or lines[0].strip() != "---":
189
+ return {}, text
190
+
191
+ fields: dict[str, Any] = {}
192
+ current_list_key: str | None = None
193
+ for idx in range(1, len(lines)):
194
+ raw = lines[idx]
195
+ stripped = raw.strip()
196
+ if stripped in {"---", "..."}:
197
+ body = "\n".join(lines[idx + 1:])
198
+ if text.endswith("\n"):
199
+ body += "\n"
200
+ return fields, body
201
+ if not stripped:
202
+ current_list_key = None
203
+ continue
204
+ if stripped.startswith("- ") and current_list_key and isinstance(fields.get(current_list_key), list):
205
+ fields[current_list_key].append(stripped[2:].strip().strip('"\''))
206
+ continue
207
+ if ":" not in raw:
208
+ current_list_key = None
209
+ continue
210
+ key, value = raw.split(":", 1)
211
+ key = key.strip()
212
+ value = value.strip()
213
+ if not key:
214
+ current_list_key = None
215
+ continue
216
+ if not value:
217
+ fields[key] = []
218
+ current_list_key = key
219
+ continue
220
+ if value.startswith("[") and value.endswith("]"):
221
+ inner = value[1:-1].strip()
222
+ if inner:
223
+ fields[key] = [part.strip().strip('"\'') for part in inner.split(",") if part.strip()]
224
+ else:
225
+ fields[key] = []
226
+ current_list_key = None
227
+ continue
228
+ fields[key] = value.strip('"\'')
229
+ current_list_key = None
230
+
231
+ return {}, text
232
+
233
+
234
+ def _normalize_internal_ref(raw: str) -> str:
235
+ value = raw.strip()
236
+ if not value:
237
+ return ""
238
+ value = value.split("|", 1)[0].strip()
239
+ value = value.split("#", 1)[0].strip()
240
+ value = value.replace("\\", "/")
241
+ while value.startswith("./"):
242
+ value = value[2:]
243
+ if value.startswith("/"):
244
+ value = value[1:]
245
+ return value.strip()
246
+
247
+
248
def _extract_references(text: str) -> tuple[list[str], list[str], list[str]]:
    """Collect backlog ids, knowledge ids and repository paths mentioned in *text*.

    Returns ``(issues, knowledge_ids, paths)``. The two id lists are sorted
    and de-duplicated; paths are normalised, de-duplicated and kept in order
    of first appearance.
    """
    issue_ids = sorted({f"BK-{number}" for number in RE_BK.findall(text)})
    knowledge = sorted(
        {f"{prefix}-{number}" for prefix, number in RE_KNOWLEDGE_ID.findall(text)}
    )
    # A dict serves as an insertion-ordered set.
    ordered_paths: dict[str, None] = {}
    for candidate in RE_REPO_PATH.findall(text):
        cleaned = _normalize_internal_ref(candidate)
        if cleaned:
            ordered_paths.setdefault(cleaned, None)
    return issue_ids, knowledge, list(ordered_paths)
262
+
263
+
264
def _extract_wiki_links(text: str) -> list[str]:
    """Return normalised ``[[wiki link]]`` targets in order of first appearance."""
    cleaned = (_normalize_internal_ref(raw) for raw in RE_WIKI_LINK.findall(text))
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(link for link in cleaned if link))
273
+
274
+
275
def _extract_markdown_links(text: str) -> list[str]:
    """Return normalised targets of local ``[text](target)`` links.

    Targets that are external URLs (anything with ``://``), ``mailto:`` links
    or pure ``#anchor`` links are ignored. Results are de-duplicated and kept
    in order of first appearance.
    """
    unique: dict[str, None] = {}
    for captured in RE_MARKDOWN_LINK.findall(text):
        target = captured.strip().strip('<>')
        if not target:
            continue
        if target.lower().startswith(("http://", "https://", "mailto:", "#")) or "://" in target:
            continue
        # Markdown links may include optional titles: [x](path.md "title").
        for title_opener in (' "', " '"):
            if title_opener in target:
                target = target.split(title_opener, 1)[0]
        cleaned = _normalize_internal_ref(target)
        if cleaned:
            unique.setdefault(cleaned, None)
    return list(unique)
293
+
294
+
295
def _extract_skill_refs(text: str, path_refs: list[str], wiki_links: list[str]) -> list[str]:
    """Return the sorted, lower-cased skill names referenced by a document.

    Names come from ``sdtk-*`` tokens in *text* and from the path segment that
    directly follows a ``skills`` or ``skills-claude`` segment in any
    referenced path or wiki link.
    """
    found = {hit.lower() for hit in RE_SKILL_REF.findall(text)}
    for ref in path_refs + wiki_links:
        segments = [segment for segment in ref.split("/") if segment]
        for marker in ("skills", "skills-claude"):
            try:
                following = segments[segments.index(marker) + 1]
            except (ValueError, IndexError):
                # Marker absent, or it is the last segment of the path.
                continue
            found.add(following.lower())
    return sorted(found)
305
+
306
+
307
def _extract_template_refs(path_refs: list[str], wiki_links: list[str]) -> list[str]:
    """Return sorted references whose normalised path has a ``templates`` directory segment."""
    normalised = (_normalize_internal_ref(ref) for ref in path_refs + wiki_links)
    # Prefixing "/" lets a leading "templates/..." match the same test.
    return sorted({ref for ref in normalised if "/templates/" in f"/{ref}"})
314
+
315
+
316
def _extract_release_refs(text: str) -> list[str]:
    """Return the sorted, unique ``0.x.y`` version numbers found in *text*."""
    versions = {match.group(1) for match in RE_RELEASE_REF.finditer(text)}
    return sorted(versions)
318
+
319
+
320
+ def _compute_file_hash(md_file: Path) -> str:
321
+ content = md_file.read_bytes()
322
+ return hashlib.sha256(content).hexdigest()
323
+
324
+
325
def _parse_doc_record(md_file: Path, root: Path) -> dict[str, Any]:
    """Read one markdown file and build its index record.

    The record id/path is the POSIX path relative to *root*. For a file that
    is not located under *root* the path is used as given, which is the same
    fallback the file listing and the incremental builder apply, so the
    record id always matches the key it is cached under.

    The title is taken from the frontmatter ``title`` field, else the first
    level-1 heading of the body, else the file stem with ``_``/``-`` replaced
    by spaces. References are extracted from the full text (frontmatter
    included); headings come from the body only.
    """
    try:
        rel = md_file.relative_to(root).as_posix()
    except ValueError:
        # Scan root outside the project root: keep the path as supplied
        # instead of aborting the whole build.
        rel = md_file.as_posix()

    text = md_file.read_text(encoding="utf-8", errors="replace")
    frontmatter_fields, body_text = _parse_frontmatter(text)
    title = str(
        frontmatter_fields.get("title")
        or _extract_title(body_text)
        or md_file.stem.replace("_", " ").replace("-", " ")
    )
    headings = _extract_headings(body_text)
    issues, knowledge_ids, path_refs = _extract_references(text)
    wiki_links = _extract_wiki_links(text)
    markdown_links = _extract_markdown_links(text)
    # Markdown link targets are treated as path references as well.
    path_refs = sorted(set(path_refs + markdown_links))
    family = classify_family(rel)
    role = classify_role(rel)
    skill_refs = _extract_skill_refs(text, path_refs, wiki_links)
    template_refs = _extract_template_refs(path_refs, wiki_links)
    release_refs = _extract_release_refs(text)
    return {
        "id": rel,
        "path": rel,
        "title": title,
        "family": family,
        "role": role,
        "trust_zone": "medium",
        "body_markdown": body_text,
        "issues": issues,
        "knowledge_ids": knowledge_ids,
        "headings": headings,
        "frontmatter_fields": frontmatter_fields,
        "skill_refs": skill_refs,
        "template_refs": template_refs,
        "release_refs": release_refs,
        "lane_refs": [],
        "wiki_links": wiki_links,
        "path_refs": path_refs,
        "outgoing_paths": path_refs,
    }
364
+
365
+
366
def list_indexable_markdown_files(
    root: Path,
    scan_roots: list[Path],
    exclude_frags: list[str],
) -> list[Path]:
    """Return every non-excluded markdown file under the scan roots.

    A scan root may be a single ``.md`` file or a directory that is searched
    recursively for ``*.md``. Missing scan roots produce a warning on stderr
    and are skipped. Each file is reported once even if several scan roots
    reach it, and the result is sorted by POSIX path.
    """
    collected: dict[str, Path] = {}

    for scan_root in scan_roots:
        if not scan_root.exists():
            print(f"[atlas] Warning: scan root does not exist, skipping: {scan_root}", file=sys.stderr)
            continue

        if scan_root.is_file() and scan_root.suffix.lower() == ".md":
            candidates = [scan_root]
        elif scan_root.is_dir():
            candidates = [entry for entry in sorted(scan_root.rglob("*.md")) if entry.is_file()]
        else:
            candidates = []

        for md_file in candidates:
            if _is_excluded(md_file, root=root, exclude_frags=exclude_frags):
                continue
            try:
                key = md_file.relative_to(root).as_posix()
            except ValueError:
                key = md_file.as_posix()
            # First occurrence wins when scan roots overlap.
            collected.setdefault(key, md_file)

    return sorted(collected.values(), key=lambda entry: entry.as_posix())
399
+
400
+
401
+ # ---------------------------------------------------------------------------
402
+ # Incremental build
403
+ # ---------------------------------------------------------------------------
404
def _empty_atlas_state() -> dict[str, Any]:
    """Return a fresh state for the current state version with no cached documents."""
    return dict(version=ATLAS_STATE_VERSION, documents={})
406
+
407
+
408
+ def _atlas_state_path(atlas_dir: Path) -> Path:
409
+ return atlas_dir / "ATLAS_STATE.json"
410
+
411
+
412
def load_atlas_state(atlas_dir: Path) -> dict[str, Any]:
    """Load the cached build state from *atlas_dir*.

    An empty state is returned whenever the cache cannot be trusted: the file
    is missing or unreadable, it is not valid UTF-8 JSON, its top level is
    not an object, its version differs from ``ATLAS_STATE_VERSION``, or
    ``documents`` is not an object.
    """
    state_path = _atlas_state_path(atlas_dir)
    try:
        data = json.loads(state_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError from a corrupt file.
        return _empty_atlas_state()
    if not isinstance(data, dict):
        return _empty_atlas_state()
    if data.get("version") != ATLAS_STATE_VERSION:
        return _empty_atlas_state()
    documents = data.get("documents")
    if not isinstance(documents, dict):
        return _empty_atlas_state()
    return {"version": ATLAS_STATE_VERSION, "generated": data.get("generated"), "documents": documents}
428
+
429
+
430
def save_atlas_state(state: dict[str, Any], atlas_dir: Path) -> Path:
    """Write *state* as indented JSON into *atlas_dir* and return the file path.

    The directory is created when it does not exist yet.
    """
    atlas_dir.mkdir(parents=True, exist_ok=True)
    destination = _atlas_state_path(atlas_dir)
    serialized = json.dumps(state, ensure_ascii=True, indent=2, sort_keys=False)
    _write_text_lf(destination, serialized)
    return destination
435
+
436
+
437
+ # ---------------------------------------------------------------------------
438
+ # Wiki pages and provenance
439
+ # ---------------------------------------------------------------------------
440
+ def _wiki_workspace_root(root: Path) -> Path:
441
+ return root / ".sdtk" / "wiki"
442
+
443
+
444
def _wiki_pages_root(root: Path) -> Path:
    """Return the ``pages`` directory inside the wiki workspace."""
    return _wiki_workspace_root(root).joinpath("pages")
446
+
447
+
448
def _wiki_provenance_root(root: Path) -> Path:
    """Return the ``provenance`` directory inside the wiki workspace."""
    return _wiki_workspace_root(root).joinpath("provenance")
450
+
451
+
452
+ def _stable_page_id(source_path: str) -> str:
453
+ norm = source_path.replace("\\", "/")
454
+ digest = hashlib.sha256(norm.encode("utf-8")).hexdigest()
455
+ return f"wiki:{digest[:16]}"
456
+
457
+
458
+ def _safe_slug(value: str) -> str:
459
+ slug = value.strip().lower()
460
+ slug = re.sub(r"[^a-z0-9]+", "-", slug)
461
+ slug = slug.strip("-")
462
+ return slug or "page"
463
+
464
+
465
def _page_relative_path(doc: dict[str, Any]) -> str:
    """Return the project-relative path of the generated page for *doc*.

    The layout is ``.sdtk/wiki/pages/<family>/<title-slug>--<digest>.md``,
    where the digest is the first 8 hex characters of the SHA-256 of the
    source path; it keeps pages with identical titles apart.
    """
    source = str(doc["path"]).replace("\\", "/")
    short_digest = hashlib.sha256(source.encode("utf-8")).hexdigest()[:8]
    title_slug = _safe_slug(str(doc.get("title") or Path(source).stem))
    family_slug = _safe_slug(str(doc.get("family") or "other-markdown"))
    return "/".join((".sdtk/wiki/pages", family_slug, f"{title_slug}--{short_digest}.md"))
471
+
472
+
473
+ def _yaml_quote(value: Any) -> str:
474
+ text = str(value)
475
+ escaped = text.replace("\\", "\\\\").replace('"', '\\"')
476
+ return f'"{escaped}"'
477
+
478
+
479
def _render_generated_page(doc: dict[str, Any], page_id: str, source_hash: str, generated: str) -> str:
    """Render the generated wiki page: a frontmatter header followed by the source body.

    The header records the schema version, product markers, page id, source
    path and hash, title, family, role and generation time. A non-empty body
    always ends with a newline.
    """
    quoted_fields = (
        ("page_id", page_id),
        ("source_path", doc["path"]),
        ("source_hash", source_hash),
        ("title", doc.get("title") or ""),
        ("family", doc.get("family") or ""),
        ("role", doc.get("role") or ""),
        ("generated_at", generated),
    )
    header = [
        "---",
        f"schema_version: {WIKI_PAGE_SCHEMA_VERSION}",
        'product: "SDTK-WIKI"',
        'managed_by: "sdtk-wiki"',
    ]
    header.extend(f"{key}: {_yaml_quote(item)}" for key, item in quoted_fields)
    header.append("---")

    content = str(doc.get("body_markdown") or "")
    if content and not content.endswith("\n"):
        content += "\n"
    return "\n".join(header) + "\n" + content
499
+
500
+
501
+ def _prior_source_hashes(sources_path: Path) -> dict[str, str]:
502
+ if not sources_path.exists():
503
+ return {}
504
+ try:
505
+ payload = json.loads(sources_path.read_text(encoding="utf-8"))
506
+ except (OSError, json.JSONDecodeError):
507
+ return {}
508
+ sources = payload.get("sources")
509
+ if not isinstance(sources, list):
510
+ return {}
511
+ hashes: dict[str, str] = {}
512
+ for record in sources:
513
+ if not isinstance(record, dict):
514
+ continue
515
+ source_path = record.get("sourcePath")
516
+ source_hash = record.get("sourceHash")
517
+ if isinstance(source_path, str) and isinstance(source_hash, str):
518
+ hashes[source_path] = source_hash
519
+ return hashes
520
+
521
+
522
+ def _build_change_set(
523
+ prior_hashes: dict[str, str],
524
+ current_hashes: dict[str, str],
525
+ ) -> dict[str, list[str]]:
526
+ prior_paths = set(prior_hashes)
527
+ current_paths = set(current_hashes)
528
+ added = sorted(current_paths - prior_paths)
529
+ removed = sorted(prior_paths - current_paths)
530
+ changed = sorted(
531
+ path for path in current_paths & prior_paths
532
+ if prior_hashes.get(path) != current_hashes.get(path)
533
+ )
534
+ unchanged = sorted(
535
+ path for path in current_paths & prior_paths
536
+ if prior_hashes.get(path) == current_hashes.get(path)
537
+ )
538
+ return {
539
+ "added": added,
540
+ "changed": changed,
541
+ "unchanged": unchanged,
542
+ "removed": removed,
543
+ }
544
+
545
+
546
def write_wiki_pages_and_provenance(
    docs: list[dict[str, Any]],
    state: dict[str, Any],
    root: Path,
    generated: str,
    scan_roots: list[Path],
) -> dict[str, Any]:
    """Write generated wiki pages, the page index and the provenance files.

    For every document a page is written below ``.sdtk/wiki/pages``. The
    function also writes ``pages/_index.md`` (a Markdown table of all pages),
    ``provenance/sources.json`` (one record per source) and
    ``provenance/changes.json`` (added/changed/unchanged/removed paths
    compared with the previous ``sources.json``).

    Returns a summary dict with the page count, the written paths and the
    change set. Raises ``ValueError`` if a page path would fall outside the
    wiki workspace.

    NOTE(review): pages written by earlier runs are never deleted here, so a
    removed or retitled source leaves its old page file behind.
    """
    workspace_root = _wiki_workspace_root(root)
    pages_root = _wiki_pages_root(root)
    provenance_root = _wiki_provenance_root(root)
    sources_path = provenance_root / "sources.json"
    changes_path = provenance_root / "changes.json"

    pages_root.mkdir(parents=True, exist_ok=True)
    provenance_root.mkdir(parents=True, exist_ok=True)

    # Read the previous hashes before sources.json is overwritten below.
    prior_hashes = _prior_source_hashes(sources_path)
    state_docs = state.get("documents", {})
    provenance_records: list[dict[str, Any]] = []
    index_rows: list[tuple[str, str, str, str]] = []
    current_hashes: dict[str, str] = {}

    for doc in sorted(docs, key=lambda d: d["path"]):
        source_path = str(doc["path"]).replace("\\", "/")
        state_record = state_docs.get(source_path, {}) if isinstance(state_docs, dict) else {}
        source_hash = state_record.get("hash") if isinstance(state_record, dict) else None
        if not isinstance(source_hash, str):
            # No content hash recorded in the state: fall back to a hash of the path.
            source_hash = hashlib.sha256(source_path.encode("utf-8")).hexdigest()
        page_id = _stable_page_id(source_path)
        page_rel = _page_relative_path(doc)
        page_path = root / page_rel
        _assert_inside(workspace_root, page_path)
        page_path.parent.mkdir(parents=True, exist_ok=True)
        _write_text_lf(page_path, _render_generated_page(doc, page_id, source_hash, generated))

        current_hashes[source_path] = source_hash
        provenance_records.append({
            "pageId": page_id,
            "sourcePath": source_path,
            "sourceHash": source_hash,
            "pagePath": page_rel,
            "graphNodeId": doc["id"],
            "title": doc.get("title") or "",
            "family": doc.get("family") or "",
            "role": doc.get("role") or "",
            "frontmatter": doc.get("frontmatter_fields") or {},
            "headings": doc.get("headings") or [],
            "issues": doc.get("issues") or [],
            "knowledgeIds": doc.get("knowledge_ids") or [],
            "pathRefs": doc.get("path_refs") or [],
            "wikiLinks": doc.get("wiki_links") or [],
        })
        index_rows.append((doc.get("title") or source_path, source_path, page_rel, page_id))

    index_lines = [
        "# SDTK-WIKI Page Index",
        "",
        f"Generated: {generated}",
        "",
        "| Title | Source | Page | Page ID |",
        "|---|---|---|---|",
    ]
    for title, source_path, page_rel, page_id in sorted(index_rows, key=lambda row: row[1]):
        # A raw "|" or line break in the title would split or end the table row.
        cell_title = (
            str(title).replace("|", "\\|").replace("\r", " ").replace("\n", " ")
        )
        index_lines.append(f"| {cell_title} | `{source_path}` | `{page_rel}` | `{page_id}` |")
    _write_text_lf(pages_root / "_index.md", "\n".join(index_lines) + "\n")

    source_payload = {
        "schemaVersion": WIKI_PROVENANCE_SCHEMA_VERSION,
        "product": "SDTK-WIKI",
        "generatedAt": generated,
        "projectRoot": str(root),
        "scanRoots": [str(sr) for sr in scan_roots],
        "sourceCount": len(provenance_records),
        "sources": provenance_records,
    }
    _write_text_lf(sources_path, json.dumps(source_payload, ensure_ascii=True, indent=2, sort_keys=False) + "\n")

    change_set = _build_change_set(prior_hashes, current_hashes)
    change_payload = {
        "schemaVersion": WIKI_PROVENANCE_SCHEMA_VERSION,
        "product": "SDTK-WIKI",
        "generatedAt": generated,
        **change_set,
    }
    _write_text_lf(changes_path, json.dumps(change_payload, ensure_ascii=True, indent=2, sort_keys=False) + "\n")

    return {
        "page_count": len(provenance_records),
        "pages_root": str(pages_root),
        "page_index_path": str(pages_root / "_index.md"),
        "provenance_path": str(sources_path),
        "changes_path": str(changes_path),
        "changes": change_set,
    }
640
+
641
+
642
def build_docs_incremental(
    root: Path,
    atlas_dir: Path,
    generated: str,
    scan_roots: list[Path],
    exclude_frags: list[str],
) -> tuple[list[dict[str, Any]], dict[str, Any], dict[str, int]]:
    """Index all markdown files, reusing cached records for unchanged files.

    A cached record is reused as-is when its stored mtime equals the file's
    current mtime. Otherwise the file is hashed: if the hash equals the
    cached one, the cached document is kept with a refreshed mtime; if not,
    the file is parsed again.

    Returns ``(docs, next_state, build_stats)``: the documents sorted by id,
    the state to persist for the next run, and the discovered / reused /
    reparsed / removed counts.
    """
    previous = load_atlas_state(atlas_dir).get("documents", {})

    files_by_rel: dict[str, Path] = {}
    for md_file in list_indexable_markdown_files(root, scan_roots, exclude_frags):
        try:
            key = md_file.relative_to(root).as_posix()
        except ValueError:
            key = md_file.as_posix()
        files_by_rel[key] = md_file

    fresh: dict[str, Any] = {}
    reused = 0
    reparsed = 0

    for rel, md_file in files_by_rel.items():
        mtime_ns = md_file.stat().st_mtime_ns
        cached = previous.get(rel)
        cached_doc = cached.get("doc") if isinstance(cached, dict) else None
        # cached_doc can only be a dict when the cached record itself is one.
        has_cached_doc = isinstance(cached_doc, dict)

        if has_cached_doc and cached.get("mtime") == mtime_ns:
            fresh[rel] = cached
            reused += 1
            continue

        # The mtime moved (or nothing is cached): compare content hashes.
        digest = _compute_file_hash(md_file)
        if has_cached_doc and cached.get("hash") == digest:
            fresh[rel] = {
                "mtime": mtime_ns,
                "hash": digest,
                "last_indexed": cached.get("last_indexed") or generated,
                "doc": cached_doc,
            }
            reused += 1
        else:
            fresh[rel] = {
                "mtime": mtime_ns,
                "hash": digest,
                "last_indexed": generated,
                "doc": _parse_doc_record(md_file, root=root),
            }
            reparsed += 1

    removed = len(set(previous.keys()) - set(files_by_rel.keys()))
    docs = [record["doc"] for record in fresh.values()]
    docs.sort(key=lambda doc: doc["id"])

    next_state = {
        "version": ATLAS_STATE_VERSION,
        "generated": generated,
        "documents": fresh,
    }
    build_stats = {
        "discovered_count": len(files_by_rel),
        "reused_count": reused,
        "reparsed_count": reparsed,
        "removed_count": removed,
    }
    return docs, next_state, build_stats
720
+
721
+
722
+ # ---------------------------------------------------------------------------
723
+ # Graph builder
724
+ # ---------------------------------------------------------------------------
725
+ def _build_doc_alias_map(docs: list[dict[str, Any]]) -> dict[str, set[str]]:
726
+ alias_map: dict[str, set[str]] = {}
727
+ for doc in docs:
728
+ doc_id = doc["id"]
729
+ path_obj = Path(doc_id)
730
+ aliases = {
731
+ doc_id,
732
+ doc_id.lower(),
733
+ path_obj.name,
734
+ path_obj.name.lower(),
735
+ path_obj.stem,
736
+ path_obj.stem.lower(),
737
+ }
738
+ if doc_id.lower().endswith(".md"):
739
+ no_ext = doc_id[:-3]
740
+ no_ext_path = Path(no_ext)
741
+ aliases.update({no_ext, no_ext.lower(), no_ext_path.name, no_ext_path.name.lower()})
742
+ for alias in aliases:
743
+ alias_map.setdefault(alias, set()).add(doc_id)
744
+ return alias_map
745
+
746
+
747
def _resolve_doc_reference(raw: str, alias_map: dict[str, set[str]]) -> str | None:
    """Resolve a raw reference to a single document id, or ``None``.

    The normalised reference is tried as written and lower-cased, then (when
    it does not already end in ``.md``) with ``.md`` appended. The first
    candidate that maps to exactly one document wins; candidates that map to
    several documents are skipped.
    """
    ref = _normalize_internal_ref(raw)
    if not ref:
        return None

    lowered = ref.lower()
    attempts = [ref, lowered]
    if not lowered.endswith(".md"):
        attempts += [f"{ref}.md", f"{lowered}.md"]

    for attempt in attempts:
        hits = alias_map.get(attempt)
        if hits and len(hits) == 1:
            (only,) = hits
            return only
    return None
759
+
760
+
761
def build_graph(docs: list[dict[str, Any]]) -> dict[str, Any]:
    """Build the document graph: one node per document plus typed edges.

    Edges are emitted per document in this order: issue, knowledge object,
    path, wiki-link, skill and template references. Path and wiki-link edges
    are only added when the reference resolves to exactly one document.
    Finally, every document in a family with at least two members gets a
    ``same_family`` edge to a synthetic ``__family__<name>`` target.
    """
    alias_map = _build_doc_alias_map(docs)

    nodes = []
    for doc in docs:
        nodes.append({
            "id": doc["id"],
            "title": doc["title"],
            "family": doc["family"],
            "role": doc["role"],
            "trust_zone": doc.get("trust_zone", "medium"),
        })

    edges: list[dict[str, Any]] = []

    def connect(source: str, target: str, kind: str, label: str) -> None:
        edges.append({"source": source, "target": target, "type": kind, "label": label})

    for doc in docs:
        origin = doc["id"]

        for issue in doc.get("issues", []):
            connect(origin, issue, "references_issue", issue)

        for knowledge_id in doc.get("knowledge_ids", []):
            connect(origin, knowledge_id, "references_knowledge_object", knowledge_id)

        for path_ref in doc.get("path_refs", doc.get("outgoing_paths", [])):
            resolved = _resolve_doc_reference(path_ref, alias_map)
            if resolved:
                connect(origin, resolved, "references_path", path_ref)

        for wiki_ref in doc.get("wiki_links", []):
            resolved = _resolve_doc_reference(wiki_ref, alias_map)
            if resolved:
                connect(origin, resolved, "references_wiki_link", wiki_ref)

        for skill_ref in doc.get("skill_refs", []):
            connect(origin, f"__skill__{skill_ref}", "references_skill", skill_ref)

        for template_ref in doc.get("template_refs", []):
            connect(origin, f"__template__{template_ref}", "references_template", template_ref)

    members_by_family: dict[str, list[str]] = {}
    for doc in docs:
        members_by_family.setdefault(doc["family"], []).append(doc["id"])
    for family, members in members_by_family.items():
        if len(members) >= 2:
            for member in members:
                connect(member, f"__family__{family}", "same_family", family)

    return {"nodes": nodes, "edges": edges}
812
+
813
+
814
+ # ---------------------------------------------------------------------------
815
+ # Summary markdown
816
+ # ---------------------------------------------------------------------------
817
def build_summary(
    docs: list[dict[str, Any]],
    graph: dict[str, Any],
    generated: str,
    stats: dict[str, int] | None,
    root: Path,
    scan_roots: list[Path],
    exclude_frags: list[str],
) -> str:
    """Render the human-readable Markdown summary of a build.

    Sections: document counts per family (largest first), the incremental
    build counters (only when *stats* is given), node/edge totals, the scan
    roots and the exclusion fragments. The result ends with a newline.
    """
    family_counts: dict[str, int] = {}
    for d in docs:
        family_counts[d["family"]] = family_counts.get(d["family"], 0) + 1

    lines: list[str] = [
        "# SDTK-WIKI Graph Summary",
        "",
        f"Generated: {generated}",
        f"Project root: {root}",
        "",
        "## Document Counts",
        "",
        f"Total documents indexed: {len(docs)}",
        "",
        "| Family | Count |",
        "|--------|-------|",
    ]
    # Largest family first; ties keep the order in which families first appear.
    for fam, cnt in sorted(family_counts.items(), key=lambda x: -x[1]):
        lines.append(f"| {fam} | {cnt} |")

    if stats is not None:
        lines += [
            "",
            "## Incremental Build",
            "",
            f"Discovered markdown docs: {stats['discovered_count']}",
            f"Reused cached docs: {stats['reused_count']}",
            f"Reparsed docs: {stats['reparsed_count']}",
            f"Removed stale docs: {stats['removed_count']}",
        ]

    lines += [
        "",
        "## Graph Summary",
        "",
        f"Total nodes: {len(graph['nodes'])}",
        f"Total edges: {len(graph['edges'])}",
        "",
        "## Scan Roots",
        "",
    ]
    lines.extend(f"- {sr}" for sr in scan_roots)

    lines += [
        "",
        "## Exclusions Applied",
        "",
    ]
    lines.extend(f"- {frag}" for frag in exclude_frags)

    return "\n".join(lines) + "\n"
884
+
885
+
886
+ # ---------------------------------------------------------------------------
887
+ # Static viewer
888
+ # ---------------------------------------------------------------------------
889
+ _FAMILY_COLORS = {
890
+ "governance": "#58a6ff",
891
+ "guide": "#14b8a6",
892
+ "backlog": "#d2a8ff",
893
+ "spec": "#f0883e",
894
+ "architecture": "#3fb950",
895
+ "database": "#a371f7",
896
+ "api": "#f778ba",
897
+ "qa": "#79c0ff",
898
+ "design": "#ffa657",
899
+ "dev": "#56d364",
900
+ "product": "#e3b341",
901
+ "skill": "#58a6ff",
902
+ "template": "#f0883e",
903
+ "root-readme": "#e3b341",
904
+ "other-markdown": "#8b949e",
905
+ }
906
+
907
+
908
def build_viewer(index: dict, graph: dict, generated: str) -> str:
    """Fill the viewer HTML template with the generated data and return the page.

    The placeholders ``__ATLAS_GENERATED__``, ``__ATLAS_INDEX_JSON__``,
    ``__ATLAS_GRAPH_JSON__`` and ``__ATLAS_FAMILY_COLORS_JSON__`` are replaced
    in a single pass over the template. Substituted values are therefore
    never rescanned, so a placeholder token that merely appears inside the
    embedded data is left untouched instead of being replaced a second time.

    Raises ``FileNotFoundError`` when the template file is missing.
    """
    if not _VIEWER_TEMPLATE_PATH.exists():
        raise FileNotFoundError(f"Viewer template not found: {_VIEWER_TEMPLATE_PATH}")

    substitutions = {
        "__ATLAS_GENERATED__": generated,
        "__ATLAS_INDEX_JSON__": _json_for_inline_script(index),
        "__ATLAS_GRAPH_JSON__": _json_for_inline_script(graph),
        "__ATLAS_FAMILY_COLORS_JSON__": _json_for_inline_script(_FAMILY_COLORS),
    }
    template = _VIEWER_TEMPLATE_PATH.read_text(encoding="utf-8")
    placeholder = re.compile("|".join(re.escape(token) for token in substitutions))
    # A function replacement inserts the value verbatim (no backslash processing).
    return placeholder.sub(lambda match: substitutions[match.group(0)], template)
922
+
923
+
924
def copy_viewer_assets(atlas_dir: Path) -> list[Path]:
    """Copy the vendored Mermaid runtime into *atlas_dir*.

    Returns a list with the path of the copied file. Raises
    ``FileNotFoundError`` when the vendored asset is missing.
    """
    if not MERMAID_VENDOR_PATH.exists():
        raise FileNotFoundError(f"Missing Mermaid runtime asset: {MERMAID_VENDOR_PATH}")

    # Copy mermaid to atlas root (same location the viewer template expects)
    target = atlas_dir / MERMAID_ASSET_NAME
    atlas_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(MERMAID_VENDOR_PATH, target)
    return [target]
932
+
933
+
934
+ # ---------------------------------------------------------------------------
935
+ # Main build
936
+ # ---------------------------------------------------------------------------
937
def build_atlas(
    root: Path,
    atlas_dir: Path,
    scan_roots: list[Path] | None = None,
    exclude_frags: list[str] | None = None,
    verbose: bool = False,
) -> dict[str, Any]:
    """Run the full atlas build and write every artifact into ``atlas_dir``.

    Steps, in order: incremental document scan, graph construction, wiki
    page/provenance generation, then writing the state file, the document
    index, the graph, the markdown summary, the viewer HTML and the viewer
    assets. Progress is reported on stdout with an ``[atlas]`` prefix.

    Args:
        root: Project root being documented.
        atlas_dir: Output directory; created if it does not exist.
        scan_roots: Directories to scan. ``None`` or an empty list falls back
            to ``[root]``.
        exclude_frags: Exclusion fragments. ``None`` selects
            ``DEFAULT_EXCLUDE_FRAGS``; an explicit empty list is kept as is.
        verbose: When true, also print incremental-build counters and the
            names of the copied viewer assets.

    Returns:
        A summary dict with the generation timestamp, document/node/edge
        counts, incremental stats, the output directory as a string, and the
        page/provenance/change fields reported by
        ``write_wiki_pages_and_provenance``.
    """

    def _to_json(payload: Any) -> str:
        # Shared serialization settings for the on-disk JSON artifacts.
        return json.dumps(payload, ensure_ascii=True, indent=2, sort_keys=False)

    generated = _now_utc()
    active_frags = DEFAULT_EXCLUDE_FRAGS if exclude_frags is None else exclude_frags
    active_roots = scan_roots or [root]

    print(f"[atlas] Project root: {root}")
    print(f"[atlas] Output dir: {atlas_dir}")
    print(f"[atlas] Scan roots: {list(map(str, active_roots))}")

    atlas_dir.mkdir(parents=True, exist_ok=True)

    # --- Scan ---------------------------------------------------------------
    print("[atlas] Scanning markdown files...")
    docs, state, stats = build_docs_incremental(
        root=root,
        atlas_dir=atlas_dir,
        generated=generated,
        scan_roots=active_roots,
        exclude_frags=active_frags,
    )
    print(f"[atlas] Indexed {len(docs)} documents.")
    if verbose:
        reused = stats["reused_count"]
        reparsed = stats["reparsed_count"]
        removed = stats["removed_count"]
        print(
            f"[atlas] Incremental build: reused {reused} cached, "
            f"reparsed {reparsed}, removed {removed}."
        )

    # --- Graph and wiki pages -----------------------------------------------
    print("[atlas] Building graph...")
    graph = build_graph(docs)
    print(f"[atlas] Graph: {len(graph['nodes'])} nodes, {len(graph['edges'])} edges.")
    print("[atlas] Writing wiki pages and provenance...")
    wiki_result = write_wiki_pages_and_provenance(
        docs=docs,
        state=state,
        root=root,
        generated=generated,
        scan_roots=active_roots,
    )
    print(f"[atlas] Wiki pages: {wiki_result['page_count']}")

    # --- Persist artifacts (state first, then index, graph, summary, viewer)
    index_data = dict(generated=generated, count=len(docs), documents=docs)

    save_atlas_state(state, atlas_dir=atlas_dir)

    _write_text_lf(atlas_dir / "SDTK_DOC_INDEX.json", _to_json(index_data))

    graph_out = dict(
        generated=generated,
        node_count=len(graph["nodes"]),
        edge_count=len(graph["edges"]),
        nodes=graph["nodes"],
        edges=graph["edges"],
    )
    _write_text_lf(atlas_dir / "SDTK_DOC_GRAPH.json", _to_json(graph_out))

    summary_text = build_summary(
        docs,
        graph,
        generated,
        stats=stats,
        root=root,
        scan_roots=active_roots,
        exclude_frags=active_frags,
    )
    _write_text_lf(atlas_dir / "SDTK_DOC_ATLAS_SUMMARY.md", summary_text)

    viewer_html = build_viewer(index_data, graph_out, generated)
    _write_text_lf(atlas_dir / "viewer.html", viewer_html)

    for copied in copy_viewer_assets(atlas_dir=atlas_dir):
        if verbose:
            print(f"[atlas] Wrote asset: {copied.name}")

    print(f"[atlas] Done. Output: {atlas_dir}")

    result: dict[str, Any] = {
        "generated": generated,
        "doc_count": len(docs),
        "node_count": len(graph["nodes"]),
        "edge_count": len(graph["edges"]),
        "stats": stats,
        "atlas_dir": str(atlas_dir),
    }
    # Forward the wiki-writer fields unchanged, in a fixed key order.
    for key in (
        "page_count",
        "pages_root",
        "page_index_path",
        "provenance_path",
        "changes_path",
        "changes",
    ):
        result[key] = wiki_result[key]
    return result
1030
+
1031
+
1032
+ # ---------------------------------------------------------------------------
1033
+ # CLI entry point
1034
+ # ---------------------------------------------------------------------------
1035
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the SDTK-WIKI builder.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behavior; passing a
            list allows the builder to be invoked programmatically.

    Returns:
        Process exit code: ``0`` on success, ``2`` when a required file is
        missing (``FileNotFoundError``), ``1`` for an invalid project root or
        any other build error.
    """
    parser = argparse.ArgumentParser(
        description="SDTK-WIKI Builder -- build a local document graph, wiki pages, and viewer.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--project-root",
        required=True,
        help="Absolute path to the project root to scan.",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write atlas artifacts into.",
    )
    parser.add_argument(
        "--scan-root",
        dest="scan_roots",
        action="append",
        metavar="PATH",
        default=None,
        help="Explicit scan root (repeatable). Defaults to project root.",
    )
    parser.add_argument(
        "--exclude",
        dest="excludes",
        action="append",
        metavar="FRAG",
        default=None,
        help="Exclusion path fragment (repeatable). Defaults to standard set.",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="Show incremental build detail.",
    )

    # parse_args(None) falls back to sys.argv[1:], so CLI behavior is unchanged.
    args = parser.parse_args(argv)

    root = Path(args.project_root).resolve()
    if not root.is_dir():
        print(f"[atlas] ERROR: --project-root is not a directory: {root}", file=sys.stderr)
        return 1

    atlas_dir = Path(args.output_dir).resolve()

    # An omitted option is forwarded as None so build_atlas applies its defaults.
    scan_roots: list[Path] | None = (
        [Path(sr).resolve() for sr in args.scan_roots] if args.scan_roots else None
    )
    excludes: list[str] | None = args.excludes or None

    try:
        result = build_atlas(
            root=root,
            atlas_dir=atlas_dir,
            scan_roots=scan_roots,
            exclude_frags=excludes,
            verbose=args.verbose,
        )
        # Print JSON summary to stdout for Node CLI to parse
        print(f"[atlas:result] {json.dumps(result)}")
        return 0
    except FileNotFoundError as e:
        print(f"[atlas] ERROR: {e}", file=sys.stderr)
        return 2
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"[atlas] ERROR: {e}", file=sys.stderr)
        return 1
1107
+
1108
+
1109
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())