docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/repo.py ADDED
@@ -0,0 +1,1486 @@
1
+ """Repository-level documentation indexing workflow.
2
+
3
+ This module powers the CodeGraph-like UX for project documents:
4
+ ``cairn init -y``, ``cairn sync``, ``cairn status``, and repo-scoped MCP
5
+ serving. It keeps repository state in ``.cairn/`` and stores one normal Cairn
6
+ document index per discovered source file under ``.cairn/documents/<doc_id>/``.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ import json
13
+ import re
14
+ import tomllib
15
+ from collections import Counter, defaultdict
16
+ from collections.abc import Callable, Collection, Iterable
17
+ from datetime import UTC, datetime
18
+ from fnmatch import fnmatchcase
19
+ from pathlib import Path
20
+ from typing import Any, Final, Literal, Protocol
21
+
22
+ from pydantic import BaseModel, ConfigDict, Field
23
+ from slugify import slugify
24
+
25
+ from cairn import __version__
26
+ from cairn.core.errors import ConfigError, IndexNotFoundError, ToolError
27
+ from cairn.embed.base import Embedder
28
+ from cairn.engine.indexer import Indexer
29
+ from cairn.engine.manifest import read_manifest
30
+ from cairn.entity.heuristic import HeuristicExtractor
31
+ from cairn.ingest import parser_for_path, supported_extensions
32
+ from cairn.repo_search import search_repo_index
33
+ from cairn.summarize.base import Summarizer
34
+ from cairn.tools.base import DocumentIndex, estimate_tokens_of_payload
35
+ from cairn.tools.search_semantic import IncludeField
36
+ from cairn.xref.heuristic import HeuristicXRefExtractor
37
+
38
# Layout of the per-repository state directory.
CAIRN_DIR: Final = ".cairn"
CONFIG_FILENAME: Final = "config.toml"
REPO_MANIFEST_FILENAME: Final = "manifest.json"
REPO_MANIFEST_VERSION: Final = 1

# Include globs written into a fresh config: natively parsed formats at the
# repository root, READMEs one directory down, and everything under docs/.
DEFAULT_INCLUDE: Final[tuple[str, ...]] = (
    # repository root
    "*.md", "*.markdown", "*.mdown", "*.mkd", "*.pdf",
    # READMEs one level below the root
    "*/README.md", "*/README.markdown",
    # docs/ tree
    "docs/**/*.md", "docs/**/*.markdown", "docs/**/*.mdown", "docs/**/*.mkd",
    "docs/**/*.pdf",
)

# Extra include globs appended when the MarkItDown option is enabled.
MARKITDOWN_INCLUDE: Final[tuple[str, ...]] = (
    # repository root
    "*.docx", "*.pptx", "*.xlsx", "*.html", "*.htm", "*.epub",
    # docs/ tree
    "docs/**/*.docx", "docs/**/*.pptx", "docs/**/*.xlsx", "docs/**/*.xls",
    "docs/**/*.html", "docs/**/*.htm", "docs/**/*.csv", "docs/**/*.json",
    "docs/**/*.xml", "docs/**/*.epub",
)

# Exclude globs: VCS data, tool caches, virtualenvs, and build output.
DEFAULT_EXCLUDE: Final[tuple[str, ...]] = (
    ".git/**", ".cairn/**", ".codegraph/**",
    ".hypothesis/**", ".mypy_cache/**", ".pytest_cache/**", ".ruff_cache/**",
    ".venv/**", ".tox/**", ".nox/**", "venv/**",
    "node_modules/**", "dist/**", "build/**", "site/**", "__pycache__/**",
)

# Suffixes accepted by discovery when MarkItDown is disabled.
NATIVE_SUFFIXES: Final = frozenset((".md", ".markdown", ".mdown", ".mkd", ".pdf"))
94
# Suffixes accepted by discovery when ``enable_markitdown`` is on; computed
# once at import time from the ingest package.
SUPPORTED_SUFFIXES: Final = supported_extensions()

# Per-document index states. ``RepoStatus`` exposes counters for "indexed",
# "stale", "missing" and "error".
DocState = Literal["indexed", "stale", "missing", "error", "orphaned"]
97
+
98
+
99
class IndexSettings(Protocol):
    """Indexing knobs needed by repo sync without importing the CLI layer."""

    # Forwarded by ``sync_repo`` to ``Indexer(summary_concurrency=...)``.
    summary_concurrency: int
    # Forwarded by ``sync_repo`` to ``Indexer(embed_batch_size=...)``.
    embed_batch_size: int
104
+
105
+
106
class RepoConfig(BaseModel):
    """Configuration stored in ``.cairn/config.toml``."""

    # Immutable after load; unknown keys in the TOML file are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Glob patterns evaluated against the repo root by ``discover_documents``.
    include: tuple[str, ...] = DEFAULT_INCLUDE
    # Patterns matched against root-relative POSIX paths to drop candidates.
    exclude: tuple[str, ...] = DEFAULT_EXCLUDE
    # Sub-directory of ``.cairn/`` that holds one directory per document id.
    documents_dir: str = "documents"
    # ``write_default_config`` sets this to "readme".
    # NOTE(review): presumably consumed by ``_choose_primary_doc`` - confirm.
    primary_doc: str | None = None
    # When true, discovery accepts SUPPORTED_SUFFIXES instead of NATIVE_SUFFIXES.
    enable_markitdown: bool = False
    # Default ``sections_per_doc`` for ``search_repo_documents``; limited to 1..8.
    search_sections_per_doc: int = Field(default=1, ge=1, le=8)
    # Forwarded unchanged to ``search_repo_index`` as ``preferred_locales``.
    preferred_locales: tuple[str, ...] = Field(default=())
118
+
119
+
120
class DiscoveredDocument(BaseModel):
    """One source document discovered from repo config globs."""

    # Immutable record; unexpected fields are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Document id, made unique within one discovery run.
    id: str
    # Resolved path of the source file.
    source: Path
    # POSIX-style path of ``source`` relative to the repository root.
    relative_source: str
    # Directory that receives this document's index (``document_dir(...)``).
    out_dir: Path
129
+
130
+
131
class RepoDocumentStatus(BaseModel):
    """Status for one repo document index."""

    # Immutable record; unexpected fields are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Document id used to select this entry (for example by ``repo_impact``).
    id: str
    # Source location as a string.
    # NOTE(review): presumably relative to the repo root - confirm.
    source: str
    # Index directory as a string; callers join it onto the repo root before
    # passing it to ``DocumentIndex.load``.
    doc_dir: str
    # One of the ``DocState`` literals.
    state: DocState
    # The remaining fields are optional details and default to ``None``.
    section_count: int | None = None
    source_hash: str | None = None
    indexed_hash: str | None = None
    source_file_hash: str | None = None
    indexed_source_file_hash: str | None = None
    indexed_at: datetime | None = None
    error: str | None = None
147
+
148
+
149
class RepoStatus(BaseModel):
    """Snapshot of the documentation index state for one repository.

    Holds the per-document statuses plus the repo root, config path and
    primary document id, and exposes one counter property per tracked state.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    root: Path
    config_path: Path
    documents: tuple[RepoDocumentStatus, ...]
    primary_doc: str | None

    @property
    def indexed_count(self) -> int:
        """Number of documents whose state is ``"indexed"``."""
        return [entry.state for entry in self.documents].count("indexed")

    @property
    def stale_count(self) -> int:
        """Number of documents whose state is ``"stale"``."""
        return [entry.state for entry in self.documents].count("stale")

    @property
    def missing_count(self) -> int:
        """Number of documents whose state is ``"missing"``."""
        return [entry.state for entry in self.documents].count("missing")

    @property
    def error_count(self) -> int:
        """Number of documents whose state is ``"error"``."""
        return [entry.state for entry in self.documents].count("error")
174
+
175
+
176
class RepoSyncResult(BaseModel):
    """Outcome for one document during ``cairn sync``."""

    # Immutable record; unexpected fields are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Id of the document that was processed.
    id: str
    # Root-relative source path (``DiscoveredDocument.relative_source``).
    source: str
    # Manifest path reported by the indexer; left ``None`` on failure.
    manifest_path: Path | None = None
    # Whether the indexer reported a rebuild; ``False`` for failed documents.
    rebuilt: bool
    # ``False`` when indexing raised; ``error`` then carries ``str(exc)``.
    ok: bool = True
    error: str | None = None
187
+
188
+
189
def cairn_dir(root: Path) -> Path:
    """Return the ``.cairn`` state directory located under *root*."""
    return root.joinpath(CAIRN_DIR)
191
+
192
+
193
def config_path(root: Path) -> Path:
    """Return the path of the repo config file inside the Cairn directory."""
    state_dir = cairn_dir(root)
    return state_dir.joinpath(CONFIG_FILENAME)
195
+
196
+
197
def repo_manifest_path(root: Path) -> Path:
    """Return the path of the repo-level manifest inside the Cairn directory."""
    state_dir = cairn_dir(root)
    return state_dir.joinpath(REPO_MANIFEST_FILENAME)
199
+
200
+
201
def find_repo_root(start: Path | None = None) -> Path:
    """Locate the closest directory that contains ``.cairn/config.toml``.

    The search begins at *start* (the current working directory when omitted),
    or at its parent when *start* is a file, and walks up through every
    ancestor.

    Raises:
        ConfigError: no directory on the way up holds a repo config.
    """
    origin = (start or Path.cwd()).resolve()
    if origin.is_file():
        origin = origin.parent

    lineage = [origin, *origin.parents]
    match = next((folder for folder in lineage if config_path(folder).exists()), None)
    if match is None:
        msg = "Cairn repo config not found. Run `cairn init -y` first."
        raise ConfigError(msg, details={"start": str(origin)})
    return match
211
+
212
+
213
def write_default_config(
    root: Path,
    *,
    force: bool = False,
    enable_markitdown: bool = False,
) -> Path:
    """Write the default ``.cairn/config.toml`` and return its path.

    An existing file is left untouched unless *force* is set. With
    *enable_markitdown* the MarkItDown include patterns are appended to the
    default ones and the flag is recorded in the config.
    """
    target = config_path(root)
    already_there = target.exists()
    if already_there and not force:
        return target

    target.parent.mkdir(parents=True, exist_ok=True)
    patterns = (
        (*DEFAULT_INCLUDE, *MARKITDOWN_INCLUDE) if enable_markitdown else DEFAULT_INCLUDE
    )
    defaults = RepoConfig(
        include=patterns,
        primary_doc="readme",
        enable_markitdown=enable_markitdown,
    )
    rendered = _render_config(defaults)
    target.write_text(rendered, encoding="utf-8")
    return target
234
+
235
+
236
def load_repo_config(root: Path) -> RepoConfig:
    """Read and validate ``.cairn/config.toml`` for the repository at *root*.

    Raises:
        ConfigError: the config file does not exist, is not valid TOML, or
            does not validate against ``RepoConfig``.
    """
    path = config_path(root)
    if not path.exists():
        msg = "Cairn repo config not found. Run `cairn init -y` first."
        raise ConfigError(msg, details={"path": str(path)})
    try:
        # Parsing sits inside the ``try`` on purpose: ``tomllib.TOMLDecodeError``
        # is a ``ValueError`` subclass, so a syntactically broken file is
        # reported as the same ``ConfigError`` as a schema violation instead of
        # leaking a raw TOML exception to callers.
        with path.open("rb") as fh:
            payload = tomllib.load(fh)
        return RepoConfig.model_validate(payload)
    except ValueError as exc:
        msg = f"invalid Cairn repo config: {path}"
        raise ConfigError(msg, details={"path": str(path)}) from exc
248
+
249
+
250
def discover_documents(root: Path, config: RepoConfig) -> tuple[DiscoveredDocument, ...]:
    """Collect the source documents selected by *config*, sorted by relative path.

    Each include pattern is globbed under *root*. A match is kept when it is a
    regular file, is not excluded, and has an accepted suffix; files reached
    through several patterns are kept once. Ids are assigned in sorted order
    so they stay deterministic.
    """
    accepted_suffixes = SUPPORTED_SUFFIXES if config.enable_markitdown else NATIVE_SUFFIXES

    # resolved path -> root-relative POSIX path, for every accepted file
    accepted: dict[Path, str] = {}
    for glob_pattern in config.include:
        for match in root.glob(glob_pattern):
            if not match.is_file():
                continue
            real_path = match.resolve()
            if real_path in accepted:
                continue
            relative = _relative_posix(root, real_path)
            keep = (
                not _is_excluded(relative, config.exclude)
                and real_path.suffix.lower() in accepted_suffixes
            )
            if keep:
                accepted[real_path] = relative

    taken_ids: set[str] = set()
    discovered: list[DiscoveredDocument] = []
    for real_path, relative in sorted(accepted.items(), key=lambda pair: pair[1]):
        identifier = _unique_doc_id(_doc_id_for_relative_path(relative), taken_ids)
        taken_ids.add(identifier)
        discovered.append(
            DiscoveredDocument(
                id=identifier,
                source=real_path,
                relative_source=relative,
                out_dir=document_dir(root, config, identifier),
            )
        )
    return tuple(discovered)
285
+
286
+
287
def document_dir(root: Path, config: RepoConfig, doc_id: str) -> Path:
    """Return the index directory for *doc_id* under the configured documents dir."""
    documents_root = cairn_dir(root).joinpath(config.documents_dir)
    return documents_root.joinpath(doc_id)
289
+
290
+
291
def load_repo_document_index(
    root: Path,
    *,
    doc_id: str | None = None,
) -> DocumentIndex:
    """Open the index of one repo document.

    When *doc_id* is not given (or empty) the document chosen by
    ``_choose_primary_doc`` is used instead.

    Raises:
        IndexNotFoundError: no document could be selected, the id is unknown,
            or the selected document is in the ``"missing"`` state.
    """
    config = load_repo_config(root)
    status = repo_status(root, config=config)

    wanted = doc_id if doc_id else _choose_primary_doc(status)
    if wanted is None:
        msg = "no indexed Cairn documents found. Run `cairn sync` first."
        raise IndexNotFoundError(msg, details={"root": str(root)})

    # Only the first status entry with a matching id is considered.
    for entry in status.documents:
        if entry.id != wanted:
            continue
        if entry.state == "missing":
            break
        return DocumentIndex.load(root / entry.doc_dir)

    msg = f"repo document is not indexed: {wanted!r}"
    raise IndexNotFoundError(msg, details={"doc": wanted})
308
+
309
+
310
async def sync_repo(
    root: Path,
    *,
    summarizer: Summarizer,
    embedder: Embedder,
    index_config: IndexSettings,
    force: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[RepoSyncResult, ...]:
    """Index every configured repo document, reusing per-document no-op checks.

    Documents are processed one after another. A failure while preparing or
    indexing one document is recorded as a ``RepoSyncResult`` with
    ``ok=False`` and does not stop the remaining documents. The repo manifest
    is rewritten from a fresh ``repo_status`` once all documents were tried.

    Args:
        root: Repository root that contains ``.cairn/config.toml``.
        summarizer: Summarizer handed to every ``Indexer``.
        embedder: Embedder handed to every ``Indexer``.
        index_config: Supplies ``summary_concurrency`` and ``embed_batch_size``.
        force: Passed through to ``Indexer.index_path``.
        progress: Optional sink for human-readable progress lines.

    Returns:
        One ``RepoSyncResult`` per discovered document, in discovery order.

    Raises:
        ConfigError: the config is missing/invalid or matches no document.
    """
    config = load_repo_config(root)
    docs = discover_documents(root, config)
    if not docs:
        msg = "no documents matched .cairn/config.toml include patterns"
        raise ConfigError(msg, details={"root": str(root)})

    results: list[RepoSyncResult] = []
    for number, doc in enumerate(docs, start=1):
        _emit(progress, f"doc {number}/{len(docs)} {doc.id}: {doc.relative_source}")

        # ``doc_id`` is bound as a default so the callback keeps this
        # iteration's id even if it is invoked later.
        def doc_progress(message: str, doc_id: str = doc.id) -> None:
            _emit(progress, f"{doc_id}: {message}")

        try:
            # Parser lookup and indexer construction live inside the ``try``
            # so that a document whose parser cannot be created is reported as
            # a failed result instead of aborting the whole sync before the
            # manifest is written.
            indexer = Indexer(
                parser=parser_for_path(doc.source),
                summarizer=summarizer,
                embedder=embedder,
                entity_extractor=HeuristicExtractor(),
                xref_extractor=HeuristicXRefExtractor(),
                summary_concurrency=index_config.summary_concurrency,
                embed_batch_size=index_config.embed_batch_size,
                progress=doc_progress,
            )
            result = await indexer.index_path(
                doc.source,
                out_dir=doc.out_dir,
                doc_id=doc.id,
                force=force,
            )
            results.append(
                RepoSyncResult(
                    id=doc.id,
                    source=doc.relative_source,
                    manifest_path=result.manifest_path,
                    rebuilt=result.rebuilt,
                )
            )
        except Exception as exc:
            # Deliberate per-document boundary: report and move on.
            _emit(progress, f"{doc.id}: failed: {exc}")
            results.append(
                RepoSyncResult(
                    id=doc.id,
                    source=doc.relative_source,
                    rebuilt=False,
                    ok=False,
                    error=str(exc),
                )
            )

    write_repo_manifest(root, repo_status(root, config=config))
    return tuple(results)
372
+
373
+
374
async def search_repo_documents(
    root: Path,
    *,
    embedder: Embedder,
    query: str,
    k: int = 8,
    include: Iterable[IncludeField] = ("synopsis", "head", "evidence"),
    sections_per_doc: int | None = None,
) -> dict[str, Any]:
    """Run one query against all indexed documents of a repository.

    Arguments are validated first, then the query is embedded and handed to
    ``search_repo_index`` together with the repo's search candidates. When
    *sections_per_doc* is ``None`` the configured
    ``search_sections_per_doc`` applies.

    Returns:
        ``{"tokens_returned": ..., "data": <search payload>}``.

    Raises:
        ToolError: *k*, *query*, *sections_per_doc* or *include* is invalid,
            or the embedder produced no vector.
    """
    if not 1 <= k <= 32:
        msg = f"k must be in [1, 32]; got {k}"
        raise ToolError(msg, details={"k": k})
    if not query.strip():
        msg = "query must not be empty"
        raise ToolError(msg)

    config = load_repo_config(root)
    per_doc = config.search_sections_per_doc
    if sections_per_doc is not None:
        per_doc = sections_per_doc
    if not 1 <= per_doc <= 8:
        msg = f"sections_per_doc must be in [1, 8]; got {sections_per_doc}"
        raise ToolError(msg, details={"sections_per_doc": sections_per_doc})

    requested_fields = set(include)
    unknown_fields = sorted(requested_fields - {"synopsis", "head", "evidence"})
    if unknown_fields:
        msg = f"invalid include values: {unknown_fields}"
        raise ToolError(msg, details={"invalid": unknown_fields})

    embedded = await embedder.embed([query])
    if not embedded:
        msg = "embedder returned no vector for query"
        raise ToolError(msg)

    payload = await search_repo_index(
        root,
        candidates=_repo_search_candidates(root, config),
        query=query,
        query_vec=embedded[0],
        k=k,
        include_set=requested_fields,
        sections_per_doc=per_doc,
        preferred_locales=config.preferred_locales,
    )
    return {
        "tokens_returned": estimate_tokens_of_payload(payload),
        "data": payload,
    }
428
+
429
+
430
async def repo_context(
    root: Path,
    *,
    embedder: Embedder,
    query: str,
    k: int = 5,
    sections_per_doc: int | None = None,
    related_k: int = 3,
    level: Literal["gist", "synopsis", "full"] = "synopsis",
    max_section_chars: int = 1600,
) -> dict[str, Any]:
    """Build a compact repo-scoped context pack for an agent query.

    Runs a repo search, then for every hit whose section still exists in its
    document tree collects the section content at *level* (cut to
    *max_section_chars*), up to *related_k* related sections, and the matching
    nodes/edges of a small relationship map.

    Args:
        root: Repository root.
        embedder: Embedder used for the underlying search.
        query: Search query.
        k: Number of search hits requested.
        sections_per_doc: Forwarded to ``search_repo_documents``.
        related_k: Related sections per hit; must be in ``[0, 12]``.
        level: Which representation of a section to return as content.
        max_section_chars: Content cut-off; must be in ``[200, 8000]``.

    Returns:
        ``{"tokens_returned": ..., "data": <context payload>}``.

    Raises:
        ToolError: *related_k* or *max_section_chars* is out of range (search
            argument errors propagate from ``search_repo_documents``).
    """
    if related_k < 0 or related_k > 12:
        msg = f"related_k must be in [0, 12]; got {related_k}"
        raise ToolError(msg, details={"related_k": related_k})
    if max_section_chars < 200 or max_section_chars > 8000:
        msg = f"max_section_chars must be in [200, 8000]; got {max_section_chars}"
        raise ToolError(msg, details={"max_section_chars": max_section_chars})

    search = await search_repo_documents(
        root,
        embedder=embedder,
        query=query,
        k=k,
        include=("synopsis", "evidence"),
        sections_per_doc=sections_per_doc,
    )
    hits = list(search["data"]["hits"])
    context_sections: list[dict[str, Any]] = []
    graph_nodes: dict[str, dict[str, Any]] = {}
    graph_edges: list[dict[str, Any]] = []
    seen_edges: set[tuple[str, str, str]] = set()

    # Several hits can point at the same document. Loading an index goes
    # through config loading and a full repo status computation, so each
    # document is loaded once per call and reused for its other hits.
    loaded_indexes: dict[str, DocumentIndex] = {}

    for rank, hit in enumerate(hits, start=1):
        doc_id = hit["doc"]
        index = loaded_indexes.get(doc_id)
        if index is None:
            index = load_repo_document_index(root, doc_id=doc_id)
            loaded_indexes[doc_id] = index
        node = index.tree.get(hit["id"])
        if node is None:
            # The hit refers to a section the tree no longer has; its rank
            # number is intentionally left unused.
            continue
        content = _repo_context_content(
            index,
            section_id=node.id,
            level=level,
            fallback=node.raw_text,
        )[:max_section_chars]
        relationships = _section_relationships(index, node.id, k=related_k)
        context_sections.append(
            {
                "rank": rank,
                "doc": hit["doc"],
                "source": hit["source"],
                "id": node.id,
                "title": node.title,
                "path": list(node.path),
                "anchor": index.anchor(node.id),
                "level": level,
                "content": content,
                "hit": hit,
                "relationships": relationships,
            }
        )
        _add_repo_doc_graph_node(graph_nodes, hit["doc"], source=hit["source"])
        _add_repo_section_graph_node(graph_nodes, hit["doc"], index, node.id)
        _add_repo_graph_edge(
            graph_edges,
            seen_edges,
            source=_repo_doc_node_id(hit["doc"]),
            target=_repo_section_node_id(hit["doc"], node.id),
            kind="contains",
            relation=None,
            confidence=1.0,
        )
        for related in relationships:
            _add_repo_section_graph_node(graph_nodes, hit["doc"], index, related["id"])
            _add_repo_graph_edge(
                graph_edges,
                seen_edges,
                source=_repo_section_node_id(hit["doc"], node.id),
                target=_repo_section_node_id(hit["doc"], related["id"]),
                kind=related["kind"],
                relation=related.get("relation"),
                confidence=float(related.get("confidence", 1.0)),
            )

    payload: dict[str, Any] = {
        "query": query,
        "hits": hits,
        "context_sections": context_sections,
        "relationship_map": {
            "nodes": list(graph_nodes.values()),
            "edges": graph_edges,
        },
        "stale_documents": search["data"].get("stale_documents", []),
        "skipped_documents": search["data"]["skipped_documents"],
        "codegraph_bridge": {
            "status": "not_invoked",
            "note": (
                "Cairn does not parse source code. Pair this context with the "
                "CodeGraph MCP server for symbol callers, callees, and code impact."
            ),
        },
    }
    return {
        "tokens_returned": estimate_tokens_of_payload(payload),
        "data": payload,
    }
535
+
536
+
537
async def repo_graph(
    root: Path,
    *,
    doc: str | None = None,
    max_sections: int = 120,
    max_entities: int = 40,
    include_entities: bool = True,
    include_xrefs: bool = True,
) -> dict[str, Any]:
    """Produce the documentation relationship map for a repository.

    Only documents in the ``"indexed"`` or ``"stale"`` state take part; *doc*
    narrows the map to a single document id. Entities are left out entirely
    when *include_entities* is false.

    Returns:
        ``{"tokens_returned": ..., "data": <graph payload>}``.

    Raises:
        ToolError: *max_sections* is outside ``[1, 500]`` or *max_entities*
            is outside ``[0, 200]``.
        IndexNotFoundError: *doc* was given but no usable document matches.
    """
    if not 1 <= max_sections <= 500:
        msg = f"max_sections must be in [1, 500]; got {max_sections}"
        raise ToolError(msg, details={"max_sections": max_sections})
    if not 0 <= max_entities <= 200:
        msg = f"max_entities must be in [0, 200]; got {max_entities}"
        raise ToolError(msg, details={"max_entities": max_entities})

    status = repo_status(root)

    def usable(entry: RepoDocumentStatus) -> bool:
        return entry.state in {"indexed", "stale"} and (doc is None or entry.id == doc)

    candidates = list(filter(usable, status.documents))
    if doc is not None and not candidates:
        msg = f"repo document is not indexed: {doc!r}"
        raise IndexNotFoundError(msg, details={"doc": doc})

    entity_budget = max_entities if include_entities else 0
    graph = _build_repo_graph_payload(
        root,
        candidates,
        max_sections=max_sections,
        max_entities=entity_budget,
        include_xrefs=include_xrefs,
    )
    bridge_note = (
        "This graph covers repository documentation only. Do not use Cairn "
        "as a source-code graph; connect CodeGraph for AST symbols and code edges."
    )
    payload: dict[str, Any] = {
        "root": str(status.root),
        "doc": doc,
        "nodes": graph["nodes"],
        "edges": graph["edges"],
        "stats": graph["stats"],
        "skipped_documents": graph["skipped_documents"],
        "codegraph_bridge": {"status": "external", "note": bridge_note},
    }
    return {
        "tokens_returned": estimate_tokens_of_payload(payload),
        "data": payload,
    }
590
+
591
+
592
async def repo_impact(
    root: Path,
    *,
    doc: str,
    id: str | None = None,
    max_results: int = 24,
) -> dict[str, Any]:
    """Describe which documentation surfaces a change would touch.

    Without *id* the whole document *doc* is assessed; with *id* only that
    section is.

    Returns:
        ``{"tokens_returned": ..., "data": <impact payload>}``.

    Raises:
        ToolError: *max_results* is outside ``[1, 100]``.
        IndexNotFoundError: *doc* is unknown or in the ``"missing"`` state.
    """
    if not 1 <= max_results <= 100:
        msg = f"max_results must be in [1, 100]; got {max_results}"
        raise ToolError(msg, details={"max_results": max_results})

    status = repo_status(root)
    doc_status = None
    for entry in status.documents:
        if entry.id == doc:
            doc_status = entry
            break
    if doc_status is None or doc_status.state == "missing":
        msg = f"repo document is not indexed: {doc!r}"
        raise IndexNotFoundError(msg, details={"doc": doc})

    index = DocumentIndex.load(root / doc_status.doc_dir)
    common: dict[str, Any] = {
        "status": status,
        "doc_status": doc_status,
        "index": index,
        "max_results": max_results,
    }
    if id is None:
        payload = _repo_document_impact_payload(root, **common)
    else:
        payload = _repo_section_impact_payload(root, section_id=id, **common)
    return {
        "tokens_returned": estimate_tokens_of_payload(payload),
        "data": payload,
    }
631
+
632
+
633
def _repo_context_content(
    index: DocumentIndex,
    *,
    section_id: str,
    level: Literal["gist", "synopsis", "full"],
    fallback: str,
) -> str:
    """Pick the text to return for a section at the requested detail level.

    ``"full"`` always yields *fallback*. Otherwise the section's summary is
    looked up: its gist for ``"gist"``, its synopsis for any other level, and
    *fallback* when the section has no summary.
    """
    # The summary store is not consulted at all for the "full" level.
    record = None if level == "full" else index.summaries.get(section_id)
    if record is None:
        return fallback
    return record.gist if level == "gist" else record.synopsis
648
+
649
+
650
def _section_relationships(
    index: DocumentIndex,
    section_id: str,
    *,
    k: int,
) -> list[dict[str, Any]]:
    """List up to *k* sections related to *section_id* within one document.

    Candidates are the parent, the children, and, when the index has
    cross-references, the targets of outgoing and the sources of incoming
    references. Duplicates (same target, kind, relation and direction) are
    dropped, keeping the first occurrence. The result is ordered by
    descending confidence, then kind, id and direction.
    """
    if k <= 0:
        return []
    origin = index.tree.require(section_id)

    def candidates():
        # Yields (target id, kind, relation, confidence, direction).
        if origin.parent is not None:
            yield origin.parent, "parent", None, 1.0, None
        for child in origin.children:
            yield child, "child", None, 1.0, None
        if index.xrefs is not None:
            for link in index.xrefs.outgoing_from(section_id):
                yield link.dst, "xref", link.kind, link.confidence, "outgoing"
            for link in index.xrefs.incoming_to(section_id):
                yield link.src, "xref", link.kind, link.confidence, "incoming"

    def rank(row: dict[str, Any]) -> tuple[float, str, str, str]:
        return (
            -float(row["confidence"]),
            str(row["kind"]),
            str(row["id"]),
            str(row.get("direction") or ""),
        )

    emitted: set[tuple[str, str, str | None, str | None]] = set()
    rows: list[dict[str, Any]] = []
    for target_id, kind, relation, confidence, direction in candidates():
        signature = (target_id, kind, relation, direction)
        if signature in emitted:
            continue
        emitted.add(signature)
        target = index.tree.get(target_id)
        rows.append(
            {
                "id": target_id,
                # Fall back to the raw id when the target is not in the tree.
                "title": target_id if target is None else target.title,
                "kind": kind,
                "relation": relation,
                "direction": direction,
                "confidence": round(float(confidence), 4),
                "anchor": index.anchor(target_id),
            }
        )

    return sorted(rows, key=rank)[:k]
717
+
718
+
719
def _build_repo_graph_payload(
    root: Path,
    candidates: Collection[RepoDocumentStatus],
    *,
    max_sections: int,
    max_entities: int,
    include_xrefs: bool,
) -> dict[str, Any]:
    """Assemble nodes, edges and statistics of the repo documentation graph.

    Every candidate gets a document node. Its sections are added until the
    graph holds *max_sections* section nodes; further sections are only
    counted. Containment edges link each kept section to its kept parent or
    else to the document. Cross-reference edges are added between kept
    sections when *include_xrefs* is set. The *max_entities* entities with the
    most kept mentions become entity nodes with ``mentions`` edges. Documents
    whose index cannot be loaded are reported under ``skipped_documents``.
    """
    graph_nodes: dict[str, dict[str, Any]] = {}
    graph_edges: list[dict[str, Any]] = []
    edge_keys: set[tuple[str, str, str]] = set()
    unreadable: list[dict[str, str]] = []
    kept_by_doc: dict[str, set[str]] = defaultdict(set)
    mentions_by_entity: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set)
    sections_seen = 0
    hit_limit = False

    for entry in candidates:
        _add_repo_doc_graph_node(graph_nodes, entry.id, source=entry.source, state=entry.state)
        try:
            index = DocumentIndex.load(root / entry.doc_dir)
        except Exception as exc:
            unreadable.append({"doc": entry.id, "reason": str(exc)})
            continue

        # The section budget is shared by all documents, so it is re-checked
        # against the whole graph before each addition.
        for section in index.tree:
            sections_seen += 1
            if _repo_section_count(graph_nodes) < max_sections:
                kept_by_doc[entry.id].add(section.id)
                _add_repo_section_graph_node(graph_nodes, entry.id, index, section.id)
            else:
                hit_limit = True

        kept = kept_by_doc[entry.id]
        for section in index.tree:
            if section.id in kept:
                if section.parent in kept:
                    container = _repo_section_node_id(entry.id, section.parent)
                else:
                    container = _repo_doc_node_id(entry.id)
                _add_repo_graph_edge(
                    graph_edges,
                    edge_keys,
                    source=container,
                    target=_repo_section_node_id(entry.id, section.id),
                    kind="contains",
                    relation=None,
                    confidence=1.0,
                )

        if include_xrefs and index.xrefs is not None:
            for link in index.xrefs:
                if not (link.src in kept and link.dst in kept):
                    continue
                _add_repo_graph_edge(
                    graph_edges,
                    edge_keys,
                    source=_repo_section_node_id(entry.id, link.src),
                    target=_repo_section_node_id(entry.id, link.dst),
                    kind="xref",
                    relation=link.kind,
                    confidence=link.confidence,
                )

        if max_entities > 0 and index.entities is not None:
            for entity in index.entities:
                entity_key = (entity.kind, entity.canonical)
                for mention in entity.mentions:
                    if mention.section_id in kept:
                        mentions_by_entity[entity_key].add((entry.id, mention.section_id))

    def by_popularity(
        item: tuple[tuple[str, str], set[tuple[str, str]]],
    ) -> tuple[int, str, str]:
        (entity_kind, canonical), places = item
        return (-len(places), entity_kind, canonical.lower())

    ranked_entities = sorted(mentions_by_entity.items(), key=by_popularity)
    for (entity_kind, canonical), places in ranked_entities[:max_entities]:
        entity_node = _repo_entity_node_id(entity_kind, canonical)
        graph_nodes[entity_node] = {
            "id": entity_node,
            "kind": "entity",
            "entity_kind": entity_kind,
            "label": canonical,
            "mentions": len(places),
        }
        for mention_doc, mention_section in sorted(places):
            _add_repo_graph_edge(
                graph_edges,
                edge_keys,
                source=_repo_section_node_id(mention_doc, mention_section),
                target=entity_node,
                kind="mentions",
                relation=entity_kind,
                confidence=1.0,
            )

    node_kinds = Counter(node["kind"] for node in graph_nodes.values())
    return {
        "nodes": list(graph_nodes.values()),
        "edges": graph_edges,
        "stats": {
            "documents": node_kinds["document"],
            "sections": node_kinds["section"],
            "entities": node_kinds["entity"],
            "edges": len(graph_edges),
            "total_sections": sections_seen,
            "truncated": hit_limit,
        },
        "skipped_documents": unreadable,
    }
827
+
828
+
829
def _repo_document_impact_payload(
    root: Path,
    *,
    status: RepoStatus,
    doc_status: RepoDocumentStatus,
    index: DocumentIndex,
    max_results: int,
) -> dict[str, Any]:
    """Build the impact payload for a change to a whole document.

    Lists the first *max_results* sections of the document, the other
    documents that share entities with it, and the derived artifacts and
    surfaces reported by the corresponding helpers.
    """
    doc_id = doc_status.id
    leading_sections = list(index.tree)[:max_results]
    sections = []
    for section in leading_sections:
        sections.append(_impact_section_ref(doc_id, index, section.id, kind="contains"))

    related_documents = _related_documents_by_entities(
        root,
        status=status,
        doc_id=doc_id,
        max_results=max_results,
    )
    reader_note = (
        "Repo search, repo_context, repo_graph, inspectors, and MCP "
        "drilldown read derived artifacts."
    )
    return {
        "scope": "document",
        "doc": doc_id,
        "source": doc_status.source,
        "state": doc_status.state,
        "section_count": len(index.tree),
        "derived_artifacts": _repo_derived_artifacts(doc_id),
        "affected_surfaces": _repo_affected_surfaces(),
        "sections": sections,
        "related_documents": related_documents,
        "notes": [
            "Changing this source can make the document index stale.",
            reader_note,
        ],
    }
865
+
866
+
867
def _repo_section_impact_payload(
    root: Path,
    *,
    status: RepoStatus,
    doc_status: RepoDocumentStatus,
    index: DocumentIndex,
    section_id: str,
    max_results: int,
) -> dict[str, Any]:
    """Build the impact payload for a change to a single section.

    Combines the section's in-document relationships with sections found
    through shared entities, removes duplicates by (doc, id, kind) keeping the
    first occurrence, and cuts the list to *max_results* entries.
    """
    doc_id = doc_status.id
    node = index.tree.require(section_id)
    in_document = _section_relationships(index, section_id, k=max_results)
    via_entities = _shared_entity_section_refs(
        root,
        status=status,
        doc_id=doc_id,
        section_id=section_id,
        max_results=max_results,
    )

    taken: set[tuple[str, str, str]] = set()
    merged: list[dict[str, Any]] = []

    def keep(signature: tuple[str, str, str], row: dict[str, Any]) -> None:
        if signature not in taken:
            taken.add(signature)
            merged.append(row)

    # In-document relations come first and are tagged with this document's id.
    for item in in_document:
        keep((doc_id, item["id"], item["kind"]), {"doc": doc_id, **item})
    for item in via_entities:
        keep((item["doc"], item["id"], item["kind"]), item)

    merged = merged[:max_results]
    documents = sorted({row["doc"] for row in merged} | {doc_id})

    # NOTE(review): these paths hard-code ".cairn/documents" rather than using
    # the configured documents_dir - confirm that is intended.
    artifact_dir = f".cairn/documents/{doc_id}"
    artifact_files = (
        "tree.json",
        "summaries.json",
        "vectors.lance",
        "entities.json",
        "refs.json",
    )
    derived_artifacts = [f"{artifact_dir}/{name}" for name in artifact_files]
    derived_artifacts += ["repo search cache", "repo inspectors"]

    return {
        "scope": "section",
        "doc": doc_id,
        "source": doc_status.source,
        "id": node.id,
        "title": node.title,
        "path": list(node.path),
        "anchor": index.anchor(node.id),
        "derived_artifacts": derived_artifacts,
        "affected_surfaces": _repo_affected_surfaces(),
        "sections": merged,
        "documents": documents,
        "notes": [
            "Impact is documentation-graph impact, not source-code symbol impact.",
            "Use the CodeGraph MCP server for callers, callees, and code symbol impact.",
        ],
    }
926
+
927
+
928
def _related_documents_by_entities(
    root: Path,
    *,
    status: RepoStatus,
    doc_id: str,
    max_results: int,
) -> list[dict[str, Any]]:
    """Rank other documents by how many entities they share with ``doc_id``.

    Only documents whose state is ``indexed`` or ``stale`` are considered, and
    a document whose index cannot be loaded is skipped. Entities are compared
    as ``(kind, canonical)`` pairs. Returns at most ``max_results`` rows of
    ``{"doc", "shared_entities"}``, highest overlap first, leaving out
    documents with no overlap.
    """
    usable = [
        entry for entry in status.documents if entry.state in {"indexed", "stale"}
    ]

    # Step 1: entity keys of the target document.
    wanted: set[tuple[str, str]] = set()
    for entry in usable:
        if entry.id != doc_id:
            continue
        try:
            loaded = DocumentIndex.load(root / entry.doc_dir)
        except Exception:
            continue
        if loaded.entities is not None:
            wanted = {(ent.kind, ent.canonical) for ent in loaded.entities}
        break
    if not wanted:
        return []

    # Step 2: overlap size for every other usable document.
    overlap: Counter[str] = Counter()
    for entry in usable:
        if entry.id == doc_id:
            continue
        try:
            loaded = DocumentIndex.load(root / entry.doc_dir)
        except Exception:
            continue
        if loaded.entities is None:
            continue
        theirs = {(ent.kind, ent.canonical) for ent in loaded.entities}
        overlap[entry.id] += len(wanted & theirs)

    return [
        {"doc": name, "shared_entities": shared}
        for name, shared in overlap.most_common(max_results)
        if shared > 0
    ]
969
+
970
+
971
def _shared_entity_section_refs(
    root: Path,
    *,
    status: RepoStatus,
    doc_id: str,
    section_id: str,
    max_results: int,
) -> list[dict[str, Any]]:
    """List sections that mention an entity also mentioned by the target section.

    Only documents whose state is ``indexed`` or ``stale`` take part, and a
    document whose index cannot be loaded is skipped. The target section
    itself is never returned. Each distinct ``(document, section, entity)``
    combination is reported once, so repeated mentions of one entity in one
    section do not use up the ``max_results`` budget. Collection stops as soon
    as ``max_results`` refs have been gathered.

    Returns refs built by ``_impact_section_ref`` with kind ``shared_entity``,
    relation ``"<entity kind>:<canonical>"`` and confidence ``0.18``.
    """
    active = [
        item for item in status.documents if item.state in {"indexed", "stale"}
    ]

    # Pass 1: entities mentioned by the target section. Only the target
    # document is loaded; other indexes are not needed for this step.
    target_entities: set[tuple[str, str]] = set()
    preloaded: dict[str, DocumentIndex] = {}
    for item in active:
        if item.id != doc_id:
            continue
        try:
            index = DocumentIndex.load(root / item.doc_dir)
        except Exception:
            continue
        if index.entities is None:
            continue
        preloaded[item.id] = index
        for entity in index.entities:
            if any(mention.section_id == section_id for mention in entity.mentions):
                target_entities.add((entity.kind, entity.canonical))
        break
    if not target_entities:
        return []

    # Pass 2: every other mention of those entities, in document order.
    refs: list[dict[str, Any]] = []
    emitted: set[tuple[str, str, str]] = set()
    for item in active:
        index = preloaded.get(item.id)
        if index is None:
            try:
                index = DocumentIndex.load(root / item.doc_dir)
            except Exception:
                continue
        if index.entities is None:
            continue
        for entity in index.entities:
            if (entity.kind, entity.canonical) not in target_entities:
                continue
            relation = f"{entity.kind}:{entity.canonical}"
            for mention in entity.mentions:
                if item.id == doc_id and mention.section_id == section_id:
                    continue
                identity = (item.id, mention.section_id, relation)
                if identity in emitted:
                    # Same entity mentioned again in the same section.
                    continue
                if index.tree.get(mention.section_id) is None:
                    continue
                emitted.add(identity)
                refs.append(
                    _impact_section_ref(
                        item.id,
                        index,
                        mention.section_id,
                        kind="shared_entity",
                        relation=relation,
                        confidence=0.18,
                    )
                )
                if len(refs) >= max_results:
                    return refs
    return refs
1027
+
1028
+
1029
+ def _repo_derived_artifacts(doc_id: str) -> list[str]:
1030
+ prefix = f".cairn/documents/{doc_id}"
1031
+ return [
1032
+ ".cairn/manifest.json",
1033
+ f"{prefix}/manifest.json",
1034
+ f"{prefix}/tree.json",
1035
+ f"{prefix}/summaries.json",
1036
+ f"{prefix}/vectors.lance",
1037
+ f"{prefix}/entities.json",
1038
+ f"{prefix}/refs.json",
1039
+ ]
1040
+
1041
+
1042
+ def _repo_affected_surfaces() -> list[str]:
1043
+ return [
1044
+ "list_documents",
1045
+ "search_documents",
1046
+ "repo_context",
1047
+ "repo_graph",
1048
+ "repo_impact",
1049
+ "outline/get_section/expand/read_range with doc",
1050
+ "find_mentions/get_related with doc",
1051
+ "generated inspector HTML",
1052
+ ]
1053
+
1054
+
1055
def _impact_section_ref(
    doc_id: str,
    index: DocumentIndex,
    section_id: str,
    *,
    kind: str,
    relation: str | None = None,
    confidence: float = 1.0,
) -> dict[str, Any]:
    """Describe one section as an impact reference entry.

    The section is resolved with ``index.tree.require``, so lookup errors from
    it propagate. ``confidence`` is coerced to ``float`` and rounded to four
    decimal places.
    """
    section = index.tree.require(section_id)
    rounded_confidence = round(float(confidence), 4)
    anchor = index.anchor(section.id)
    return {
        "doc": doc_id,
        "id": section.id,
        "title": section.title,
        "kind": kind,
        "relation": relation,
        "confidence": rounded_confidence,
        "anchor": anchor,
        "path": list(section.path),
    }
1075
+
1076
+
1077
def _add_repo_doc_graph_node(
    nodes: dict[str, dict[str, Any]],
    doc_id: str,
    *,
    source: str,
    state: str | None = None,
) -> None:
    """Register a document node in ``nodes`` unless one already exists.

    An existing node with the same id is left untouched. The ``state`` key is
    only present when a state was supplied.
    """
    node_id = _repo_doc_node_id(doc_id)
    if node_id in nodes:
        return
    entry: dict[str, Any] = {
        "id": node_id,
        "kind": "document",
        "doc": doc_id,
        "label": doc_id,
        "source": source,
    }
    if state is not None:
        entry["state"] = state
    nodes[node_id] = entry
1096
+
1097
+
1098
def _add_repo_section_graph_node(
    nodes: dict[str, dict[str, Any]],
    doc_id: str,
    index: DocumentIndex,
    section_id: str,
) -> None:
    """Register a section node in ``nodes`` unless one already exists.

    Does nothing when ``index.tree.get(section_id)`` returns ``None``. An
    existing node with the same id is left untouched.
    """
    section = index.tree.get(section_id)
    if section is None:
        return
    node_id = _repo_section_node_id(doc_id, section_id)
    # The entry is built before the membership check, as ``setdefault`` would.
    entry = {
        "id": node_id,
        "kind": "section",
        "doc": doc_id,
        "section_id": section_id,
        "label": section.title,
        "level": section.level,
        "path": list(section.path),
        "anchor": index.anchor(section_id),
    }
    if node_id not in nodes:
        nodes[node_id] = entry
1121
+
1122
+
1123
+ def _add_repo_graph_edge(
1124
+ edges: list[dict[str, Any]],
1125
+ seen: set[tuple[str, str, str]],
1126
+ *,
1127
+ source: str,
1128
+ target: str,
1129
+ kind: str,
1130
+ relation: str | None,
1131
+ confidence: float,
1132
+ ) -> None:
1133
+ edge_kind = kind if relation is None else f"{kind}:{relation}"
1134
+ key = (source, target, edge_kind)
1135
+ if key in seen:
1136
+ return
1137
+ seen.add(key)
1138
+ edges.append(
1139
+ {
1140
+ "source": source,
1141
+ "target": target,
1142
+ "kind": kind,
1143
+ "relation": relation,
1144
+ "confidence": round(float(confidence), 4),
1145
+ }
1146
+ )
1147
+
1148
+
1149
+ def _repo_doc_node_id(doc_id: str) -> str:
1150
+ return f"doc:{doc_id}"
1151
+
1152
+
1153
+ def _repo_section_node_id(doc_id: str, section_id: str) -> str:
1154
+ return f"section:{doc_id}:{section_id}"
1155
+
1156
+
1157
def _repo_entity_node_id(kind: str, canonical: str) -> str:
    """Return the graph node id for an entity: ``entity:<kind>:<slug>``.

    The slug comes from ``slugify``; when that yields an empty value, the
    normalized search text with spaces turned into hyphens is used instead.
    """
    slug = slugify(canonical)
    if not slug:
        slug = _normalize_search_text(canonical).replace(" ", "-")
    return f"entity:{kind}:{slug}"
1160
+
1161
+
1162
+ def _repo_section_count(nodes: dict[str, dict[str, Any]]) -> int:
1163
+ return sum(1 for node in nodes.values() if node["kind"] == "section")
1164
+
1165
+
1166
+ def _normalize_search_text(text: str) -> str:
1167
+ normalized = text.lower().replace("/", " ").replace("-", " ").replace("_", " ")
1168
+ return " ".join(re.findall(r"[a-z0-9][a-z0-9]*", normalized))
1169
+
1170
+
1171
def _repo_search_candidates(
    root: Path,
    config: RepoConfig,
) -> tuple[RepoDocumentStatus, ...]:
    """Return the documents whose current state is ``indexed`` or ``stale``.

    Status is recomputed through ``repo_status`` on every call.
    """
    searchable_states = {"indexed", "stale"}
    current = repo_status(root, config=config)
    return tuple(
        entry for entry in current.documents if entry.state in searchable_states
    )
1177
+
1178
+
1179
def _read_repo_manifest_status(root: Path) -> tuple[RepoDocumentStatus, ...] | None:
    """Load the document statuses recorded in the repo-level manifest.

    Returns ``None`` when the manifest is absent, unreadable, not valid JSON,
    not a JSON object, written with a different ``format_version``, or when an
    entry fails validation. The manifest is treated as a best-effort cache, so
    none of these conditions raise.
    """
    path = repo_manifest_path(root)
    if not path.exists():
        return None
    try:
        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
        # A hand-edited or truncated manifest may decode to a list or scalar;
        # without this guard ``payload.get`` would raise AttributeError.
        if not isinstance(payload, dict):
            return None
        if payload.get("format_version") != REPO_MANIFEST_VERSION:
            return None
        return tuple(
            RepoDocumentStatus.model_validate(item)
            for item in payload.get("documents", [])
        )
    except (OSError, ValueError, TypeError):
        return None
1194
+
1195
+
1196
def repo_status(root: Path, *, config: RepoConfig | None = None) -> RepoStatus:
    """Compute indexed/stale/missing status for configured repo docs.

    When ``config`` is not supplied it is loaded from ``root``. Statuses from
    the previously written repo manifest, when readable, are passed to
    ``_document_status`` per document id. Document directories that no
    discovered document claims are appended via ``_orphaned_statuses``.
    """
    cfg = config or load_repo_config(root)
    discovered = discover_documents(root, cfg)
    recorded = _read_repo_manifest_status(root) or ()
    previous_by_id = {entry.id: entry for entry in recorded}

    statuses: list[RepoDocumentStatus] = [
        _document_status(root, doc, previous=previous_by_id.get(doc.id))
        for doc in discovered
    ]
    discovered_ids = {doc.id for doc in discovered}
    statuses += _orphaned_statuses(root, cfg, discovered_ids)

    return RepoStatus(
        root=root,
        config_path=config_path(root),
        documents=tuple(statuses),
        primary_doc=cfg.primary_doc,
    )
1213
+
1214
+
1215
def write_repo_manifest(root: Path, status: RepoStatus) -> Path:
    """Write a lightweight repo-level manifest for humans and tools.

    The parent directory is created when missing. The manifest is UTF-8 JSON
    indented by two spaces, followed by a trailing newline. Returns the path
    that was written.
    """
    target = repo_manifest_path(root)
    target.parent.mkdir(parents=True, exist_ok=True)

    payload: dict[str, Any] = {}
    payload["format_version"] = REPO_MANIFEST_VERSION
    payload["cairn_version"] = __version__
    payload["generated_at"] = datetime.now(UTC).isoformat()
    payload["root"] = str(root)
    payload["primary_doc"] = status.primary_doc
    payload["documents"] = [entry.model_dump(mode="json") for entry in status.documents]

    with target.open("w", encoding="utf-8") as fh:
        fh.write(json.dumps(payload, ensure_ascii=False, indent=2))
        fh.write("\n")
    return target
1231
+
1232
+
1233
def _document_status(
    root: Path,
    doc: DiscoveredDocument,
    *,
    previous: RepoDocumentStatus | None = None,
) -> RepoDocumentStatus:
    """Work out the current status of one discovered document.

    Outcomes, in the order they are checked:

    * ``error`` - the source file cannot be hashed, or parsing, reading the
      manifest, or loading the index fails.
    * ``missing`` - the document has no ``manifest.json`` in its output
      directory.
    * fast path - ``previous`` recorded the same indexed hash as the manifest
      and an indexed file hash; the state follows from comparing that file
      hash with the current one, without parsing the source.
    * slow path - the source is parsed and its hash compared with the
      manifest's source hash.
    """

    def failure(exc: Exception, **known: Any) -> RepoDocumentStatus:
        # Shared shape of every error status; ``known`` carries the fields
        # already computed when the failure happened.
        return RepoDocumentStatus(
            id=doc.id,
            source=doc.relative_source,
            doc_dir=_relative_posix(root, doc.out_dir),
            state="error",
            error=str(exc),
            **known,
        )

    manifest_file = doc.out_dir / "manifest.json"
    try:
        source_file_hash = _file_hash(doc.source)
    except OSError as exc:
        return failure(exc)

    # Never indexed: report "missing" together with the parsed source hash.
    if not manifest_file.exists():
        try:
            fresh = parser_for_path(doc.source).parse(doc.source, doc_id=doc.id)
            source_hash = fresh.source_hash
        except Exception as exc:
            return failure(exc, source_file_hash=source_file_hash)
        return RepoDocumentStatus(
            id=doc.id,
            source=doc.relative_source,
            doc_dir=_relative_posix(root, doc.out_dir),
            state="missing",
            source_hash=source_hash,
            source_file_hash=source_file_hash,
        )

    try:
        manifest = read_manifest(doc.out_dir)
    except Exception as exc:
        return failure(exc, source_file_hash=source_file_hash)

    recorded_file_hash = (
        None if previous is None else previous.indexed_source_file_hash
    )

    # Fast path: the earlier status still describes this index, so comparing
    # raw file hashes is enough and the source need not be parsed.
    if (
        previous is not None
        and recorded_file_hash is not None
        and previous.indexed_hash == manifest.source_hash
    ):
        unchanged = recorded_file_hash == source_file_hash
        return RepoDocumentStatus(
            id=doc.id,
            source=doc.relative_source,
            doc_dir=_relative_posix(root, doc.out_dir),
            state="indexed" if unchanged else "stale",
            section_count=previous.section_count,
            source_hash=manifest.source_hash if unchanged else previous.source_hash,
            indexed_hash=manifest.source_hash,
            source_file_hash=source_file_hash,
            indexed_source_file_hash=recorded_file_hash,
            indexed_at=manifest.indexed_at,
        )

    # Slow path: parse the source and load the index to compare hashes.
    try:
        fresh = parser_for_path(doc.source).parse(doc.source, doc_id=doc.id)
        source_hash = fresh.source_hash
        index = DocumentIndex.load(doc.out_dir)
    except Exception as exc:
        return failure(exc, source_file_hash=source_file_hash)

    up_to_date = manifest.source_hash == source_hash
    return RepoDocumentStatus(
        id=doc.id,
        source=doc.relative_source,
        doc_dir=_relative_posix(root, doc.out_dir),
        state="indexed" if up_to_date else "stale",
        section_count=len(index.tree),
        source_hash=source_hash,
        indexed_hash=manifest.source_hash,
        source_file_hash=source_file_hash,
        # An up-to-date index pins the current file hash; otherwise keep
        # whatever the earlier status had recorded, if anything.
        indexed_source_file_hash=source_file_hash if up_to_date else recorded_file_hash,
        indexed_at=manifest.indexed_at,
    )
1347
+
1348
+
1349
def _orphaned_statuses(
    root: Path,
    config: RepoConfig,
    discovered_ids: set[str],
) -> Iterable[RepoDocumentStatus]:
    """Report document directories that no discovered document claims.

    Sub-directories of the configured documents directory are visited in name
    order. Each one not in ``discovered_ids`` yields an ``orphaned`` status,
    or an ``error`` status when its manifest or index cannot be read. Returns
    an empty tuple when the documents directory does not exist.
    """
    docs_root = cairn_dir(root) / config.documents_dir
    if not docs_root.exists():
        return ()

    found: list[RepoDocumentStatus] = []
    for doc_dir in sorted(docs_root.iterdir(), key=lambda entry: entry.name):
        if doc_dir.name in discovered_ids or not doc_dir.is_dir():
            continue
        try:
            manifest = read_manifest(doc_dir)
            index = DocumentIndex.load(doc_dir)
            recorded = Path(manifest.source_path)
            # Relative source paths in a manifest are resolved against root.
            source_file = recorded if recorded.is_absolute() else root / recorded
            file_hash = _file_hash(source_file) if source_file.exists() else None
            entry = RepoDocumentStatus(
                id=doc_dir.name,
                source=manifest.source_path,
                doc_dir=_relative_posix(root, doc_dir),
                state="orphaned",
                section_count=len(index.tree),
                indexed_hash=manifest.source_hash,
                indexed_source_file_hash=file_hash,
                indexed_at=manifest.indexed_at,
            )
        except Exception as exc:
            entry = RepoDocumentStatus(
                id=doc_dir.name,
                source="",
                doc_dir=_relative_posix(root, doc_dir),
                state="error",
                error=str(exc),
            )
        found.append(entry)
    return tuple(found)
1397
+
1398
+
1399
def _choose_primary_doc(status: RepoStatus) -> str | None:
    """Pick the document id to treat as primary.

    The configured ``primary_doc`` wins when it is among the documents in
    state ``indexed`` or ``stale``; otherwise the first such document is
    used. Returns ``None`` when there is none.
    """
    usable_ids = [
        entry.id for entry in status.documents if entry.state in {"indexed", "stale"}
    ]
    preferred = status.primary_doc
    if preferred and preferred in usable_ids:
        return preferred
    return usable_ids[0] if usable_ids else None
1406
+
1407
+
1408
def _render_config(config: RepoConfig) -> str:
    """Render ``config`` as the text of the repository config file.

    Scalar settings come first, then ``primary_doc`` when it is set, then the
    ``include`` and ``exclude`` arrays with one item per line. The text ends
    with a newline after the closing bracket of ``exclude``.
    """

    def array_block(name: str, values: Iterable[str]) -> list[str]:
        # Multi-line array: opening line, one quoted item per line, closer.
        return [f"{name} = [", *(f" {_toml_string(value)}," for value in values), "]"]

    locales = ", ".join(map(_toml_string, config.preferred_locales))
    rendered = [
        "# Cairn repository documentation index.",
        "# Paths are relative to the repository root.",
        f"documents_dir = {_toml_string(config.documents_dir)}",
        f"enable_markitdown = {str(config.enable_markitdown).lower()}",
        f"search_sections_per_doc = {config.search_sections_per_doc}",
        "preferred_locales = [" + locales + "]",
    ]
    if config.primary_doc is not None:
        rendered.append(f"primary_doc = {_toml_string(config.primary_doc)}")

    rendered.append("")
    rendered += array_block("include", config.include)
    rendered.append("")
    rendered += array_block("exclude", config.exclude)
    rendered.append("")
    return "\n".join(rendered)
1435
+
1436
+
1437
+ def _toml_string(value: str) -> str:
1438
+ return json.dumps(value)
1439
+
1440
+
1441
+ def _relative_posix(root: Path, path: Path) -> str:
1442
+ return path.resolve().relative_to(root.resolve()).as_posix()
1443
+
1444
+
1445
+ def _is_excluded(relative_path: str, patterns: tuple[str, ...]) -> bool:
1446
+ rel = Path(relative_path)
1447
+ rel_posix = rel.as_posix()
1448
+ for pattern in patterns:
1449
+ if rel.match(pattern) or fnmatchcase(rel_posix, pattern):
1450
+ return True
1451
+ if _matches_excluded_dir(rel, pattern):
1452
+ return True
1453
+ return False
1454
+
1455
+
1456
+ def _matches_excluded_dir(relative_path: Path, pattern: str) -> bool:
1457
+ """Treat simple ``name/**`` excludes as directory names at any depth."""
1458
+ if not pattern.endswith("/**"):
1459
+ return False
1460
+ dirname = pattern[:-3]
1461
+ if not dirname or "/" in dirname:
1462
+ return False
1463
+ return dirname in relative_path.parts
1464
+
1465
+
1466
def _doc_id_for_relative_path(relative_path: str) -> str:
    """Derive a document id from a repository-relative path.

    The final suffix is dropped, path separators become hyphens, and the
    result goes through ``slugify``. ``"document"`` is returned when the slug
    comes out empty.
    """
    without_suffix = Path(relative_path).with_suffix("")
    flattened = without_suffix.as_posix().replace("/", "-")
    slug = slugify(flattened)
    return slug if slug else "document"
1469
+
1470
+
1471
+ def _file_hash(path: Path) -> str:
1472
+ return hashlib.sha256(path.read_bytes()).hexdigest()
1473
+
1474
+
1475
+ def _unique_doc_id(base: str, used: set[str]) -> str:
1476
+ if base not in used:
1477
+ return base
1478
+ suffix = 2
1479
+ while f"{base}-{suffix}" in used:
1480
+ suffix += 1
1481
+ return f"{base}-{suffix}"
1482
+
1483
+
1484
+ def _emit(callback: Callable[[str], None] | None, message: str) -> None:
1485
+ if callback is not None:
1486
+ callback(message)