docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/repo.py
ADDED
|
@@ -0,0 +1,1486 @@
|
|
|
1
|
+
"""Repository-level documentation indexing workflow.
|
|
2
|
+
|
|
3
|
+
This module powers the CodeGraph-like UX for project documents:
|
|
4
|
+
``cairn init -y``, ``cairn sync``, ``cairn status``, and repo-scoped MCP
|
|
5
|
+
serving. It keeps repository state in ``.cairn/`` and stores one normal Cairn
|
|
6
|
+
document index per discovered source file under ``.cairn/documents/<doc_id>/``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
import tomllib
|
|
15
|
+
from collections import Counter, defaultdict
|
|
16
|
+
from collections.abc import Callable, Collection, Iterable
|
|
17
|
+
from datetime import UTC, datetime
|
|
18
|
+
from fnmatch import fnmatchcase
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any, Final, Literal, Protocol
|
|
21
|
+
|
|
22
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
23
|
+
from slugify import slugify
|
|
24
|
+
|
|
25
|
+
from cairn import __version__
|
|
26
|
+
from cairn.core.errors import ConfigError, IndexNotFoundError, ToolError
|
|
27
|
+
from cairn.embed.base import Embedder
|
|
28
|
+
from cairn.engine.indexer import Indexer
|
|
29
|
+
from cairn.engine.manifest import read_manifest
|
|
30
|
+
from cairn.entity.heuristic import HeuristicExtractor
|
|
31
|
+
from cairn.ingest import parser_for_path, supported_extensions
|
|
32
|
+
from cairn.repo_search import search_repo_index
|
|
33
|
+
from cairn.summarize.base import Summarizer
|
|
34
|
+
from cairn.tools.base import DocumentIndex, estimate_tokens_of_payload
|
|
35
|
+
from cairn.tools.search_semantic import IncludeField
|
|
36
|
+
from cairn.xref.heuristic import HeuristicXRefExtractor
|
|
37
|
+
|
|
38
|
+
# Name of the per-repository state directory; joined onto the repo root by
# ``cairn_dir``.
CAIRN_DIR: Final = ".cairn"
# File names resolved inside the state directory by ``config_path`` and
# ``repo_manifest_path``.
CONFIG_FILENAME: Final = "config.toml"
REPO_MANIFEST_FILENAME: Final = "manifest.json"
REPO_MANIFEST_VERSION: Final = 1

# Default value of ``RepoConfig.include``: glob patterns that
# ``discover_documents`` expands relative to the repo root.
DEFAULT_INCLUDE: Final[tuple[str, ...]] = (
    "*.md",
    "*.markdown",
    "*.mdown",
    "*.mkd",
    "*.pdf",
    "*/README.md",
    "*/README.markdown",
    "docs/**/*.md",
    "docs/**/*.markdown",
    "docs/**/*.mdown",
    "docs/**/*.mkd",
    "docs/**/*.pdf",
)
# Extra include patterns that ``write_default_config`` appends to
# ``DEFAULT_INCLUDE`` when ``enable_markitdown`` is requested.
MARKITDOWN_INCLUDE: Final[tuple[str, ...]] = (
    "*.docx",
    "*.pptx",
    "*.xlsx",
    "*.html",
    "*.htm",
    "*.epub",
    "docs/**/*.docx",
    "docs/**/*.pptx",
    "docs/**/*.xlsx",
    "docs/**/*.xls",
    "docs/**/*.html",
    "docs/**/*.htm",
    "docs/**/*.csv",
    "docs/**/*.json",
    "docs/**/*.xml",
    "docs/**/*.epub",
)
# Default value of ``RepoConfig.exclude``: patterns checked against each
# candidate's repo-relative path during discovery.
DEFAULT_EXCLUDE: Final[tuple[str, ...]] = (
    ".git/**",
    ".cairn/**",
    ".codegraph/**",
    ".hypothesis/**",
    ".mypy_cache/**",
    ".pytest_cache/**",
    ".ruff_cache/**",
    ".venv/**",
    ".tox/**",
    ".nox/**",
    "venv/**",
    "node_modules/**",
    "dist/**",
    "build/**",
    "site/**",
    "__pycache__/**",
)
# Lower-cased file suffixes accepted by discovery when markitdown is disabled.
NATIVE_SUFFIXES: Final = frozenset({".md", ".markdown", ".mdown", ".mkd", ".pdf"})
# Suffixes accepted by discovery when markitdown is enabled; computed once at
# import time by ``cairn.ingest.supported_extensions``.
SUPPORTED_SUFFIXES: Final = supported_extensions()

# Closed set of per-document index states reported in ``RepoDocumentStatus``.
DocState = Literal["indexed", "stale", "missing", "error", "orphaned"]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class IndexSettings(Protocol):
    """Indexing knobs needed by repo sync without importing the CLI layer.

    Structural protocol: any object exposing these two integer attributes can
    be passed as ``index_config`` to ``sync_repo``.
    """

    # Forwarded to ``Indexer(summary_concurrency=...)`` by ``sync_repo``.
    summary_concurrency: int
    # Forwarded to ``Indexer(embed_batch_size=...)`` by ``sync_repo``.
    embed_batch_size: int
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class RepoConfig(BaseModel):
    """Configuration stored in ``.cairn/config.toml``.

    Instances are immutable (``frozen=True``) and unknown keys are rejected
    (``extra="forbid"``).
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Glob patterns expanded relative to the repo root during discovery.
    include: tuple[str, ...] = DEFAULT_INCLUDE
    # Patterns matched against repo-relative paths to drop candidates.
    exclude: tuple[str, ...] = DEFAULT_EXCLUDE
    # Sub-directory of ``.cairn/`` that holds one index directory per document.
    documents_dir: str = "documents"
    # Id of the preferred document; ``write_default_config`` sets it to "readme".
    primary_doc: str | None = None
    # When true, discovery accepts SUPPORTED_SUFFIXES instead of NATIVE_SUFFIXES.
    enable_markitdown: bool = False
    # Default for ``sections_per_doc`` in ``search_repo_documents`` (1..8).
    search_sections_per_doc: int = Field(default=1, ge=1, le=8)
    # Passed through to ``search_repo_index`` as ``preferred_locales``.
    preferred_locales: tuple[str, ...] = Field(default=())
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class DiscoveredDocument(BaseModel):
    """One source document discovered from repo config globs.

    Built by ``discover_documents``; immutable and closed to extra fields.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Document id, unique within one discovery run.
    id: str
    # Resolved path of the source file.
    source: Path
    # POSIX-style path of the source relative to the repo root.
    relative_source: str
    # Directory that receives this document's index (see ``document_dir``).
    out_dir: Path
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class RepoDocumentStatus(BaseModel):
    """Status for one repo document index.

    Immutable and closed to extra fields. Only ``id``, ``source``, ``doc_dir``
    and ``state`` are required; every other field defaults to ``None``.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Document id used to look the entry up (e.g. by ``repo_impact``).
    id: str
    # Source path as a string; presumably repo-relative — TODO confirm against
    # ``repo_status``, which is not visible here.
    source: str
    # Index directory as a string; callers join it onto the repo root
    # (``root / doc_dir``) before loading.
    doc_dir: str
    # One of the ``DocState`` literals.
    state: DocState
    # NOTE(review): the producers of the fields below live outside this block;
    # their exact semantics (content hash vs. file hash) should be confirmed.
    section_count: int | None = None
    source_hash: str | None = None
    indexed_hash: str | None = None
    source_file_hash: str | None = None
    indexed_source_file_hash: str | None = None
    indexed_at: datetime | None = None
    error: str | None = None
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class RepoStatus(BaseModel):
    """Computed repository documentation index status.

    Holds the per-document statuses for one repository and exposes simple
    per-state tallies as read-only properties.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    root: Path
    config_path: Path
    documents: tuple[RepoDocumentStatus, ...]
    primary_doc: str | None

    @property
    def indexed_count(self) -> int:
        """Number of documents whose state is ``"indexed"``."""
        return sum(entry.state == "indexed" for entry in self.documents)

    @property
    def stale_count(self) -> int:
        """Number of documents whose state is ``"stale"``."""
        return sum(entry.state == "stale" for entry in self.documents)

    @property
    def missing_count(self) -> int:
        """Number of documents whose state is ``"missing"``."""
        return sum(entry.state == "missing" for entry in self.documents)

    @property
    def error_count(self) -> int:
        """Number of documents whose state is ``"error"``."""
        return sum(entry.state == "error" for entry in self.documents)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class RepoSyncResult(BaseModel):
    """Outcome for one document during ``cairn sync``.

    ``sync_repo`` emits one instance per discovered document, for successes
    and failures alike.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Id of the document that was processed.
    id: str
    # Repo-relative source path of the document.
    source: str
    # Manifest path reported by the indexer; left as None on failure.
    manifest_path: Path | None = None
    # Whether the indexer reported a rebuild; False on failure.
    rebuilt: bool
    # False when indexing raised; ``error`` then carries ``str(exc)``.
    ok: bool = True
    error: str | None = None
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def cairn_dir(root: Path) -> Path:
    """Return the ``.cairn`` state directory path for the repository *root*."""
    return root.joinpath(CAIRN_DIR)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def config_path(root: Path) -> Path:
    """Return the path of the repo config file inside the state directory."""
    state_dir = cairn_dir(root)
    return state_dir.joinpath(CONFIG_FILENAME)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def repo_manifest_path(root: Path) -> Path:
    """Return the path of the repo-level manifest inside the state directory."""
    state_dir = cairn_dir(root)
    return state_dir.joinpath(REPO_MANIFEST_FILENAME)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def find_repo_root(start: Path | None = None) -> Path:
    """Find the nearest ancestor with ``.cairn/config.toml``.

    The search begins at *start* (the current working directory when omitted);
    if that is a file, its parent directory is used instead. The first
    directory on the way up that contains a config file is returned.

    Raises:
        ConfigError: when no directory up to the filesystem root has a config.
    """
    origin = (start or Path.cwd()).resolve()
    if origin.is_file():
        origin = origin.parent
    lineage = [origin, *origin.parents]
    found = next(
        (folder for folder in lineage if config_path(folder).exists()),
        None,
    )
    if found is not None:
        return found
    raise ConfigError(
        "Cairn repo config not found. Run `cairn init -y` first.",
        details={"start": str(origin)},
    )
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def write_default_config(
    root: Path,
    *,
    force: bool = False,
    enable_markitdown: bool = False,
) -> Path:
    """Create ``.cairn/config.toml`` with conservative repo-doc defaults.

    An existing config is left untouched unless *force* is set. With
    *enable_markitdown* the markitdown include patterns are appended to the
    default ones and the flag is recorded in the written config.

    Returns:
        The path of the config file (whether or not it was rewritten).
    """
    target = config_path(root)
    if target.exists() and not force:
        return target
    target.parent.mkdir(parents=True, exist_ok=True)
    patterns = (
        (*DEFAULT_INCLUDE, *MARKITDOWN_INCLUDE)
        if enable_markitdown
        else DEFAULT_INCLUDE
    )
    defaults = RepoConfig(
        include=patterns,
        primary_doc="readme",
        enable_markitdown=enable_markitdown,
    )
    rendered = _render_config(defaults)
    target.write_text(rendered, encoding="utf-8")
    return target
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def load_repo_config(root: Path) -> RepoConfig:
    """Load and validate ``.cairn/config.toml`` for the repository *root*.

    Returns:
        The validated, immutable ``RepoConfig``.

    Raises:
        ConfigError: when the config file does not exist, is not valid TOML,
            or does not satisfy the ``RepoConfig`` schema. The underlying
            parse/validation error is chained as ``__cause__``.
    """
    path = config_path(root)
    if not path.exists():
        msg = "Cairn repo config not found. Run `cairn init -y` first."
        raise ConfigError(msg, details={"path": str(path)})
    try:
        # Parsing lives inside the ``try`` so that malformed TOML
        # (``tomllib.TOMLDecodeError``, a ``ValueError`` subclass) surfaces as
        # ``ConfigError`` just like a schema violation does.
        with path.open("rb") as fh:
            payload = tomllib.load(fh)
        return RepoConfig.model_validate(payload)
    except ValueError as exc:
        msg = f"invalid Cairn repo config: {path}"
        raise ConfigError(msg, details={"path": str(path)}) from exc
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def discover_documents(root: Path, config: RepoConfig) -> tuple[DiscoveredDocument, ...]:
    """Discover configured source documents in deterministic order.

    Every include pattern is globbed under *root*. Regular files are resolved,
    de-duplicated, filtered by the exclude patterns and by the allowed suffix
    set, then sorted by repo-relative path before ids are assigned.
    """
    suffixes = SUPPORTED_SUFFIXES if config.enable_markitdown else NATIVE_SUFFIXES

    # Maps each accepted resolved path to its repo-relative POSIX path; the
    # dict doubles as the "already seen" set and keeps first-seen order.
    accepted: dict[Path, str] = {}
    for pattern in config.include:
        for match in root.glob(pattern):
            if not match.is_file():
                continue
            target = match.resolve()
            if target in accepted:
                continue
            relative = _relative_posix(root, target)
            if _is_excluded(relative, config.exclude):
                continue
            if target.suffix.lower() not in suffixes:
                continue
            accepted[target] = relative

    taken_ids: set[str] = set()
    discovered: list[DiscoveredDocument] = []
    for target, relative in sorted(accepted.items(), key=lambda pair: pair[1]):
        ident = _unique_doc_id(_doc_id_for_relative_path(relative), taken_ids)
        taken_ids.add(ident)
        discovered.append(
            DiscoveredDocument(
                id=ident,
                source=target,
                relative_source=relative,
                out_dir=document_dir(root, config, ident),
            )
        )
    return tuple(discovered)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def document_dir(root: Path, config: RepoConfig, doc_id: str) -> Path:
    """Return the index directory for *doc_id* under the configured documents dir."""
    return cairn_dir(root).joinpath(config.documents_dir, doc_id)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def load_repo_document_index(
    root: Path,
    *,
    doc_id: str | None = None,
) -> DocumentIndex:
    """Load a repo document by id, or the configured primary document.

    Raises:
        IndexNotFoundError: when no document can be selected, or when the
            selected document is unknown or in the ``"missing"`` state.
    """
    overview = repo_status(root, config=load_repo_config(root))
    wanted = doc_id or _choose_primary_doc(overview)
    if wanted is None:
        raise IndexNotFoundError(
            "no indexed Cairn documents found. Run `cairn sync` first.",
            details={"root": str(root)},
        )

    entry = None
    for candidate in overview.documents:
        if candidate.id == wanted:
            entry = candidate
            break

    if entry is None or entry.state == "missing":
        raise IndexNotFoundError(
            f"repo document is not indexed: {wanted!r}",
            details={"doc": wanted},
        )
    return DocumentIndex.load(root / entry.doc_dir)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
async def sync_repo(
    root: Path,
    *,
    summarizer: Summarizer,
    embedder: Embedder,
    index_config: IndexSettings,
    force: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[RepoSyncResult, ...]:
    """Index every configured repo document, reusing per-document no-op checks.

    Documents are processed one after another. A failure while indexing one
    document is reported through *progress* and recorded as a failed result;
    it does not stop the remaining documents. The repo manifest is rewritten
    once all documents have been attempted.

    Raises:
        ConfigError: when no document matches the configured include patterns.
    """
    config = load_repo_config(root)
    docs = discover_documents(root, config)
    if not docs:
        raise ConfigError(
            "no documents matched .cairn/config.toml include patterns",
            details={"root": str(root)},
        )

    total = len(docs)
    outcomes: list[RepoSyncResult] = []
    for position, doc in enumerate(docs, start=1):
        _emit(progress, f"doc {position}/{total} {doc.id}: {doc.relative_source}")

        # The document id is bound as a default so each callback keeps its own.
        def doc_progress(message: str, doc_id: str = doc.id) -> None:
            _emit(progress, f"{doc_id}: {message}")

        indexer = Indexer(
            parser=parser_for_path(doc.source),
            summarizer=summarizer,
            embedder=embedder,
            entity_extractor=HeuristicExtractor(),
            xref_extractor=HeuristicXRefExtractor(),
            summary_concurrency=index_config.summary_concurrency,
            embed_batch_size=index_config.embed_batch_size,
            progress=doc_progress,
        )
        try:
            indexed = await indexer.index_path(
                doc.source,
                out_dir=doc.out_dir,
                doc_id=doc.id,
                force=force,
            )
            outcome = RepoSyncResult(
                id=doc.id,
                source=doc.relative_source,
                manifest_path=indexed.manifest_path,
                rebuilt=indexed.rebuilt,
            )
        except Exception as exc:
            # Deliberate best-effort: one broken document must not abort the sync.
            _emit(progress, f"{doc.id}: failed: {exc}")
            outcome = RepoSyncResult(
                id=doc.id,
                source=doc.relative_source,
                rebuilt=False,
                ok=False,
                error=str(exc),
            )
        outcomes.append(outcome)

    write_repo_manifest(root, repo_status(root, config=config))
    return tuple(outcomes)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
async def search_repo_documents(
    root: Path,
    *,
    embedder: Embedder,
    query: str,
    k: int = 8,
    include: Iterable[IncludeField] = ("synopsis", "head", "evidence"),
    sections_per_doc: int | None = None,
) -> dict[str, Any]:
    """Search across every indexed document in a repository Cairn index.

    Arguments are validated first, then the query is embedded and handed to
    ``search_repo_index`` together with the repo's search candidates.

    Returns:
        ``{"tokens_returned": ..., "data": ...}`` where ``data`` is the search
        payload and ``tokens_returned`` its estimated token size.

    Raises:
        ToolError: for an out-of-range ``k`` or ``sections_per_doc``, a blank
            query, unknown ``include`` values, or an embedder that returns no
            vector.
    """
    if not 1 <= k <= 32:
        raise ToolError(f"k must be in [1, 32]; got {k}", details={"k": k})
    if not query.strip():
        raise ToolError("query must not be empty")

    config = load_repo_config(root)
    # An explicit argument wins; otherwise fall back to the repo config.
    per_doc = (
        sections_per_doc
        if sections_per_doc is not None
        else config.search_sections_per_doc
    )
    if not 1 <= per_doc <= 8:
        raise ToolError(
            f"sections_per_doc must be in [1, 8]; got {sections_per_doc}",
            details={"sections_per_doc": sections_per_doc},
        )

    requested = set(include)
    unknown = requested.difference(("synopsis", "head", "evidence"))
    if unknown:
        raise ToolError(
            f"invalid include values: {sorted(unknown)}",
            details={"invalid": sorted(unknown)},
        )

    embedded = await embedder.embed([query])
    if not embedded:
        raise ToolError("embedder returned no vector for query")

    payload = await search_repo_index(
        root,
        candidates=_repo_search_candidates(root, config),
        query=query,
        query_vec=embedded[0],
        k=k,
        include_set=requested,
        sections_per_doc=per_doc,
        preferred_locales=config.preferred_locales,
    )
    return {
        "tokens_returned": estimate_tokens_of_payload(payload),
        "data": payload,
    }
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
async def repo_context(
    root: Path,
    *,
    embedder: Embedder,
    query: str,
    k: int = 5,
    sections_per_doc: int | None = None,
    related_k: int = 3,
    level: Literal["gist", "synopsis", "full"] = "synopsis",
    max_section_chars: int = 1600,
) -> dict[str, Any]:
    """Build a compact repo-scoped context pack for an agent query.

    Runs a repo search, then for each hit collects the section content at the
    requested *level* (truncated to *max_section_chars*), up to *related_k*
    related sections, and a small relationship map of documents and sections.

    Returns:
        ``{"tokens_returned": ..., "data": ...}`` where ``data`` holds the
        query, the raw hits, the context sections, the relationship map and
        the stale/skipped document lists from the search.

    Raises:
        ToolError: when ``related_k`` or ``max_section_chars`` is out of
            range, or when the underlying search rejects its arguments.
    """
    if related_k < 0 or related_k > 12:
        msg = f"related_k must be in [0, 12]; got {related_k}"
        raise ToolError(msg, details={"related_k": related_k})
    if max_section_chars < 200 or max_section_chars > 8000:
        msg = f"max_section_chars must be in [200, 8000]; got {max_section_chars}"
        raise ToolError(msg, details={"max_section_chars": max_section_chars})

    search = await search_repo_documents(
        root,
        embedder=embedder,
        query=query,
        k=k,
        include=("synopsis", "evidence"),
        sections_per_doc=sections_per_doc,
    )
    hits = list(search["data"]["hits"])
    context_sections: list[dict[str, Any]] = []
    graph_nodes: dict[str, dict[str, Any]] = {}
    graph_edges: list[dict[str, Any]] = []
    seen_edges: set[tuple[str, str, str]] = set()
    # Several hits can come from the same document. Loading an index reloads
    # the repo config and recomputes the repo status, so each document is
    # loaded at most once per call and reused for its remaining hits.
    loaded_indexes: dict[str, DocumentIndex] = {}

    for rank, hit in enumerate(hits, start=1):
        hit_doc = hit["doc"]
        index = loaded_indexes.get(hit_doc)
        if index is None:
            index = load_repo_document_index(root, doc_id=hit_doc)
            loaded_indexes[hit_doc] = index
        node = index.tree.get(hit["id"])
        if node is None:
            # The hit refers to a section the loaded tree does not contain.
            continue
        content = _repo_context_content(
            index,
            section_id=node.id,
            level=level,
            fallback=node.raw_text,
        )[:max_section_chars]
        relationships = _section_relationships(index, node.id, k=related_k)
        context_sections.append(
            {
                "rank": rank,
                "doc": hit_doc,
                "source": hit["source"],
                "id": node.id,
                "title": node.title,
                "path": list(node.path),
                "anchor": index.anchor(node.id),
                "level": level,
                "content": content,
                "hit": hit,
                "relationships": relationships,
            }
        )
        _add_repo_doc_graph_node(graph_nodes, hit_doc, source=hit["source"])
        _add_repo_section_graph_node(graph_nodes, hit_doc, index, node.id)
        _add_repo_graph_edge(
            graph_edges,
            seen_edges,
            source=_repo_doc_node_id(hit_doc),
            target=_repo_section_node_id(hit_doc, node.id),
            kind="contains",
            relation=None,
            confidence=1.0,
        )
        for related in relationships:
            _add_repo_section_graph_node(graph_nodes, hit_doc, index, related["id"])
            _add_repo_graph_edge(
                graph_edges,
                seen_edges,
                source=_repo_section_node_id(hit_doc, node.id),
                target=_repo_section_node_id(hit_doc, related["id"]),
                kind=related["kind"],
                relation=related.get("relation"),
                confidence=float(related.get("confidence", 1.0)),
            )

    payload: dict[str, Any] = {
        "query": query,
        "hits": hits,
        "context_sections": context_sections,
        "relationship_map": {
            "nodes": list(graph_nodes.values()),
            "edges": graph_edges,
        },
        "stale_documents": search["data"].get("stale_documents", []),
        "skipped_documents": search["data"]["skipped_documents"],
        "codegraph_bridge": {
            "status": "not_invoked",
            "note": (
                "Cairn does not parse source code. Pair this context with the "
                "CodeGraph MCP server for symbol callers, callees, and code impact."
            ),
        },
    }
    return {
        "tokens_returned": estimate_tokens_of_payload(payload),
        "data": payload,
    }
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
async def repo_graph(
    root: Path,
    *,
    doc: str | None = None,
    max_sections: int = 120,
    max_entities: int = 40,
    include_entities: bool = True,
    include_xrefs: bool = True,
) -> dict[str, Any]:
    """Return a repo-level documentation relationship map.

    Only documents in the ``"indexed"`` or ``"stale"`` state are considered;
    *doc* restricts the graph to a single document id.

    Raises:
        ToolError: when ``max_sections`` or ``max_entities`` is out of range.
        IndexNotFoundError: when *doc* is given but matches no usable document.
    """
    if not 1 <= max_sections <= 500:
        raise ToolError(
            f"max_sections must be in [1, 500]; got {max_sections}",
            details={"max_sections": max_sections},
        )
    if not 0 <= max_entities <= 200:
        raise ToolError(
            f"max_entities must be in [0, 200]; got {max_entities}",
            details={"max_entities": max_entities},
        )

    status = repo_status(root)
    candidates = []
    for item in status.documents:
        if item.state not in ("indexed", "stale"):
            continue
        if doc is not None and item.id != doc:
            continue
        candidates.append(item)
    if doc is not None and not candidates:
        raise IndexNotFoundError(
            f"repo document is not indexed: {doc!r}",
            details={"doc": doc},
        )

    # Disabling entities is expressed as a zero entity budget.
    entity_budget = max_entities if include_entities else 0
    graph = _build_repo_graph_payload(
        root,
        candidates,
        max_sections=max_sections,
        max_entities=entity_budget,
        include_xrefs=include_xrefs,
    )
    payload: dict[str, Any] = {
        "root": str(status.root),
        "doc": doc,
        **{part: graph[part] for part in ("nodes", "edges", "stats", "skipped_documents")},
        "codegraph_bridge": {
            "status": "external",
            "note": (
                "This graph covers repository documentation only. Do not use Cairn "
                "as a source-code graph; connect CodeGraph for AST symbols and code edges."
            ),
        },
    }
    return {
        "tokens_returned": estimate_tokens_of_payload(payload),
        "data": payload,
    }
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
async def repo_impact(
    root: Path,
    *,
    doc: str,
    id: str | None = None,
    max_results: int = 24,
) -> dict[str, Any]:
    """Estimate documentation surfaces affected by a document or section change.

    Without *id* the impact is computed for the whole document; with *id* it
    is computed for that single section.

    Raises:
        ToolError: when ``max_results`` is outside ``[1, 100]``.
        IndexNotFoundError: when *doc* is unknown or in the ``"missing"`` state.
    """
    if not 1 <= max_results <= 100:
        raise ToolError(
            f"max_results must be in [1, 100]; got {max_results}",
            details={"max_results": max_results},
        )

    status = repo_status(root)
    doc_status = None
    for item in status.documents:
        if item.id == doc:
            doc_status = item
            break
    if doc_status is None or doc_status.state == "missing":
        raise IndexNotFoundError(
            f"repo document is not indexed: {doc!r}",
            details={"doc": doc},
        )

    index = DocumentIndex.load(root / doc_status.doc_dir)
    # Keyword arguments common to the document-level and section-level builders.
    shared: dict[str, Any] = {
        "status": status,
        "doc_status": doc_status,
        "index": index,
        "max_results": max_results,
    }
    payload = (
        _repo_document_impact_payload(root, **shared)
        if id is None
        else _repo_section_impact_payload(root, section_id=id, **shared)
    )
    return {
        "tokens_returned": estimate_tokens_of_payload(payload),
        "data": payload,
    }
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def _repo_context_content(
    index: DocumentIndex,
    *,
    section_id: str,
    level: Literal["gist", "synopsis", "full"],
    fallback: str,
) -> str:
    """Pick the text shown for a section at the requested detail *level*.

    ``"full"`` always yields *fallback*. Otherwise the section's summary is
    looked up; a missing summary also yields *fallback*, while a present one
    yields its gist or synopsis.
    """
    # The summary store is only consulted for the two summarised levels.
    summary = None if level == "full" else index.summaries.get(section_id)
    if summary is None:
        return fallback
    return summary.gist if level == "gist" else summary.synopsis
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def _section_relationships(
    index: DocumentIndex,
    section_id: str,
    *,
    k: int,
) -> list[dict[str, Any]]:
    """List up to *k* sections related to *section_id* within one document.

    Candidates are the parent, the children, and (when the index has xrefs)
    the outgoing and incoming cross-references. Duplicates with the same
    target, kind, relation and direction are dropped; the rest are ordered by
    descending confidence, then kind, id and direction.
    """
    if k <= 0:
        return []
    node = index.tree.require(section_id)

    # Candidate tuples: (target_id, kind, relation, confidence, direction).
    pending: list[tuple[str, str, str | None, float, str | None]] = []
    if node.parent is not None:
        pending.append((node.parent, "parent", None, 1.0, None))
    pending.extend((child_id, "child", None, 1.0, None) for child_id in node.children)
    if index.xrefs is not None:
        pending.extend(
            (ref.dst, "xref", ref.kind, ref.confidence, "outgoing")
            for ref in index.xrefs.outgoing_from(section_id)
        )
        pending.extend(
            (ref.src, "xref", ref.kind, ref.confidence, "incoming")
            for ref in index.xrefs.incoming_to(section_id)
        )

    related: list[dict[str, Any]] = []
    emitted: set[tuple[str, str, str | None, str | None]] = set()
    for target_id, kind, relation, confidence, direction in pending:
        identity = (target_id, kind, relation, direction)
        if identity in emitted:
            continue
        emitted.add(identity)
        target = index.tree.get(target_id)
        related.append(
            {
                "id": target_id,
                # Targets missing from the tree are labelled with their id.
                "title": target_id if target is None else target.title,
                "kind": kind,
                "relation": relation,
                "direction": direction,
                "confidence": round(float(confidence), 4),
                "anchor": index.anchor(target_id),
            }
        )

    def ordering(item: dict[str, Any]) -> tuple[float, str, str, str]:
        return (
            -float(item["confidence"]),
            str(item["kind"]),
            str(item["id"]),
            str(item.get("direction") or ""),
        )

    related.sort(key=ordering)
    return related[:k]
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def _build_repo_graph_payload(
    root: Path,
    candidates: Collection[RepoDocumentStatus],
    *,
    max_sections: int,
    max_entities: int,
    include_xrefs: bool,
) -> dict[str, Any]:
    """Assemble nodes, edges and stats for the repo documentation graph.

    Each candidate document contributes a document node. Documents whose index
    fails to load are recorded under ``skipped_documents`` and contribute
    nothing else. Sections are added until the graph holds *max_sections*
    section nodes; later sections are counted but left out, which sets the
    ``truncated`` stat. Entity nodes are limited to the *max_entities*
    entities with the most selected mentions.
    """
    nodes: dict[str, dict[str, Any]] = {}
    edges: list[dict[str, Any]] = []
    seen_edges: set[tuple[str, str, str]] = set()
    skipped: list[dict[str, str]] = []
    selected_sections: dict[str, set[str]] = defaultdict(set)
    entity_mentions: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set)
    total_sections = 0
    truncated = False

    for doc in candidates:
        _add_repo_doc_graph_node(nodes, doc.id, source=doc.source, state=doc.state)
        try:
            index = DocumentIndex.load(root / doc.doc_dir)
        except Exception as exc:
            # Best-effort: an unreadable index is reported, not fatal.
            skipped.append({"doc": doc.id, "reason": str(exc)})
            continue

        # Pass 1: choose which sections fit within the section budget.
        for section in index.tree:
            total_sections += 1
            if _repo_section_count(nodes) < max_sections:
                selected_sections[doc.id].add(section.id)
                _add_repo_section_graph_node(nodes, doc.id, index, section.id)
            else:
                truncated = True

        # Pass 2: attach each chosen section to its parent when the parent was
        # chosen too, otherwise directly to the document node.
        chosen = selected_sections[doc.id]
        for node in index.tree:
            if node.id not in chosen:
                continue
            if node.parent in chosen:
                owner = _repo_section_node_id(doc.id, node.parent)
            else:
                owner = _repo_doc_node_id(doc.id)
            _add_repo_graph_edge(
                edges,
                seen_edges,
                source=owner,
                target=_repo_section_node_id(doc.id, node.id),
                kind="contains",
                relation=None,
                confidence=1.0,
            )

        if include_xrefs and index.xrefs is not None:
            for ref in index.xrefs:
                # Only keep cross-references whose both ends are in the graph.
                if ref.src not in chosen or ref.dst not in chosen:
                    continue
                _add_repo_graph_edge(
                    edges,
                    seen_edges,
                    source=_repo_section_node_id(doc.id, ref.src),
                    target=_repo_section_node_id(doc.id, ref.dst),
                    kind="xref",
                    relation=ref.kind,
                    confidence=ref.confidence,
                )

        if max_entities > 0 and index.entities is not None:
            for entity in index.entities:
                located = {
                    (doc.id, mention.section_id)
                    for mention in entity.mentions
                    if mention.section_id in chosen
                }
                if located:
                    entity_mentions[(entity.kind, entity.canonical)].update(located)

    # Most-mentioned entities first; ties broken by kind, then by name.
    ranked = sorted(
        entity_mentions.items(),
        key=lambda item: (-len(item[1]), item[0][0], item[0][1].lower()),
    )
    for (kind, canonical), mentions in ranked[:max_entities]:
        entity_id = _repo_entity_node_id(kind, canonical)
        nodes[entity_id] = {
            "id": entity_id,
            "kind": "entity",
            "entity_kind": kind,
            "label": canonical,
            "mentions": len(mentions),
        }
        for doc_id, section_id in sorted(mentions):
            _add_repo_graph_edge(
                edges,
                seen_edges,
                source=_repo_section_node_id(doc_id, section_id),
                target=entity_id,
                kind="mentions",
                relation=kind,
                confidence=1.0,
            )

    per_kind = Counter(entry["kind"] for entry in nodes.values())
    return {
        "nodes": list(nodes.values()),
        "edges": edges,
        "stats": {
            "documents": per_kind["document"],
            "sections": per_kind["section"],
            "entities": per_kind["entity"],
            "edges": len(edges),
            "total_sections": total_sections,
            "truncated": truncated,
        },
        "skipped_documents": skipped,
    }
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def _repo_document_impact_payload(
    root: Path,
    *,
    status: RepoStatus,
    doc_status: RepoDocumentStatus,
    index: DocumentIndex,
    max_results: int,
) -> dict[str, Any]:
    """Assemble the document-scoped impact payload for one document.

    The payload lists at most ``max_results`` sections of the document as
    ``contains`` references, the other documents that share entities with
    it, and the derived artifacts / surfaces that read from its index.
    """
    doc_id = doc_status.id

    contained: list[dict[str, Any]] = []
    for section in list(index.tree)[:max_results]:
        contained.append(
            _impact_section_ref(doc_id, index, section.id, kind="contains")
        )

    neighbours = _related_documents_by_entities(
        root,
        status=status,
        doc_id=doc_id,
        max_results=max_results,
    )

    payload: dict[str, Any] = {
        "scope": "document",
        "doc": doc_id,
        "source": doc_status.source,
        "state": doc_status.state,
        # Full section count, not the truncated list length.
        "section_count": len(index.tree),
        "derived_artifacts": _repo_derived_artifacts(doc_id),
        "affected_surfaces": _repo_affected_surfaces(),
        "sections": contained,
        "related_documents": neighbours,
        "notes": [
            "Changing this source can make the document index stale.",
            "Repo search, repo_context, repo_graph, inspectors, and MCP "
            "drilldown read derived artifacts.",
        ],
    }
    return payload
|
|
865
|
+
|
|
866
|
+
|
|
867
|
+
def _repo_section_impact_payload(
    root: Path,
    *,
    status: RepoStatus,
    doc_status: RepoDocumentStatus,
    index: DocumentIndex,
    section_id: str,
    max_results: int,
) -> dict[str, Any]:
    """Assemble the section-scoped impact payload for ``section_id``.

    Relationships found inside the document come first, followed by sections
    (in any document) that mention the same entities. Entries are
    de-duplicated on ``(doc, id, kind)`` and capped at ``max_results``.
    ``index.tree.require`` is called first, so an unknown section id fails
    before any other work is done.
    """
    section = index.tree.require(section_id)
    doc_id = doc_status.id

    local_refs = _section_relationships(index, section_id, k=max_results)
    cross_refs = _shared_entity_section_refs(
        root,
        status=status,
        doc_id=doc_id,
        section_id=section_id,
        max_results=max_results,
    )

    # Pair each entry with the document id used for its de-duplication key:
    # in-document relationships are always keyed by this document.
    candidates: list[tuple[str, dict[str, Any]]] = [
        (doc_id, {"doc": doc_id, **rel}) for rel in local_refs
    ]
    candidates.extend((ref["doc"], ref) for ref in cross_refs)

    impacted: list[dict[str, Any]] = []
    visited: set[tuple[str, str, str]] = set()
    for owner, entry in candidates:
        fingerprint = (owner, entry["id"], entry["kind"])
        if fingerprint not in visited:
            visited.add(fingerprint)
            impacted.append(entry)
    impacted = impacted[:max_results]

    # The document itself is always listed, even with no impacted sections.
    touched_docs = sorted({entry["doc"] for entry in impacted} | {doc_id})

    return {
        "scope": "section",
        "doc": doc_id,
        "source": doc_status.source,
        "id": section.id,
        "title": section.title,
        "path": list(section.path),
        "anchor": index.anchor(section.id),
        "derived_artifacts": [
            f".cairn/documents/{doc_id}/tree.json",
            f".cairn/documents/{doc_id}/summaries.json",
            f".cairn/documents/{doc_id}/vectors.lance",
            f".cairn/documents/{doc_id}/entities.json",
            f".cairn/documents/{doc_id}/refs.json",
            "repo search cache",
            "repo inspectors",
        ],
        "affected_surfaces": _repo_affected_surfaces(),
        "sections": impacted,
        "documents": touched_docs,
        "notes": [
            "Impact is documentation-graph impact, not source-code symbol impact.",
            "Use the CodeGraph MCP server for callers, callees, and code symbol impact.",
        ],
    }
|
|
926
|
+
|
|
927
|
+
|
|
928
|
+
def _related_documents_by_entities(
    root: Path,
    *,
    status: RepoStatus,
    doc_id: str,
    max_results: int,
) -> list[dict[str, Any]]:
    """Rank other documents by how many entities they share with ``doc_id``.

    Only documents in the ``indexed`` or ``stale`` state are considered, and
    documents whose index cannot be loaded are skipped. Returns rows of
    ``{"doc": ..., "shared_entities": ...}`` for the top ``max_results``
    documents, omitting those with no overlap.
    """
    usable_states = {"indexed", "stale"}

    # Pass 1: collect the (kind, canonical) keys of the target document.
    wanted: set[tuple[str, str]] = set()
    for candidate in status.documents:
        if candidate.id == doc_id and candidate.state in usable_states:
            try:
                target_index = DocumentIndex.load(root / candidate.doc_dir)
            except Exception:
                continue
            if target_index.entities is not None:
                for entity in target_index.entities:
                    wanted.add((entity.kind, entity.canonical))
            break
    if not wanted:
        return []

    # Pass 2: count overlapping keys in every other usable document.
    overlap: Counter[str] = Counter()
    for other in status.documents:
        if other.id == doc_id:
            continue
        if other.state not in usable_states:
            continue
        try:
            other_index = DocumentIndex.load(root / other.doc_dir)
        except Exception:
            continue
        if other_index.entities is None:
            continue
        present = {(entity.kind, entity.canonical) for entity in other_index.entities}
        overlap[other.id] += len(wanted & present)

    ranked: list[dict[str, Any]] = []
    for other_id, shared in overlap.most_common(max_results):
        if shared > 0:
            ranked.append({"doc": other_id, "shared_entities": shared})
    return ranked
|
|
969
|
+
|
|
970
|
+
|
|
971
|
+
def _shared_entity_section_refs(
    root: Path,
    *,
    status: RepoStatus,
    doc_id: str,
    section_id: str,
    max_results: int,
) -> list[dict[str, Any]]:
    """Find sections that mention the same entities as ``doc_id``/``section_id``.

    Args:
        root: Repository root; document directories are resolved against it.
        status: Repo status whose documents are scanned.
        doc_id: Document that owns the section being analysed.
        section_id: Section whose entity mentions seed the search.
        max_results: Maximum number of references to return. Non-positive
            values yield an empty list.

    Returns:
        ``shared_entity`` references (see ``_impact_section_ref``) with a
        fixed confidence of ``0.18``. The seed section itself is never
        returned, and each ``(doc, section, entity)`` triple appears once.

    Only ``indexed``/``stale`` documents are considered; documents whose
    index fails to load, or that carry no entity data, are skipped.
    """
    if max_results <= 0:
        return []

    usable_states = {"indexed", "stale"}

    # Pass 1: seed entities come from the target document only, so filter on
    # the id *before* loading to avoid reading unrelated indexes from disk.
    target_entities: set[tuple[str, str]] = set()
    for item in status.documents:
        if item.id != doc_id or item.state not in usable_states:
            continue
        try:
            index = DocumentIndex.load(root / item.doc_dir)
        except Exception:
            continue
        if index.entities is None:
            continue
        for entity in index.entities:
            if any(mention.section_id == section_id for mention in entity.mentions):
                target_entities.add((entity.kind, entity.canonical))
        break
    if not target_entities:
        return []

    # Pass 2: collect every other section mentioning one of those entities.
    refs: list[dict[str, Any]] = []
    emitted: set[tuple[str, str, str]] = set()
    for item in status.documents:
        if item.state not in usable_states:
            continue
        try:
            index = DocumentIndex.load(root / item.doc_dir)
        except Exception:
            continue
        if index.entities is None:
            continue
        for entity in index.entities:
            if (entity.kind, entity.canonical) not in target_entities:
                continue
            relation = f"{entity.kind}:{entity.canonical}"
            for mention in entity.mentions:
                if item.id == doc_id and mention.section_id == section_id:
                    continue
                if index.tree.get(mention.section_id) is None:
                    continue
                # An entity may be mentioned several times in one section;
                # report that section once so repeats do not eat the budget.
                fingerprint = (item.id, mention.section_id, relation)
                if fingerprint in emitted:
                    continue
                emitted.add(fingerprint)
                refs.append(
                    _impact_section_ref(
                        item.id,
                        index,
                        mention.section_id,
                        kind="shared_entity",
                        relation=relation,
                        confidence=0.18,
                    )
                )
                if len(refs) >= max_results:
                    return refs
    return refs
|
|
1027
|
+
|
|
1028
|
+
|
|
1029
|
+
def _repo_derived_artifacts(doc_id: str) -> list[str]:
    """Return the artifact paths derived from document ``doc_id``.

    The repo-level manifest comes first, followed by the per-document files
    under ``.cairn/documents/<doc_id>``.
    """
    doc_root = f".cairn/documents/{doc_id}"
    per_document = (
        "manifest.json",
        "tree.json",
        "summaries.json",
        "vectors.lance",
        "entities.json",
        "refs.json",
    )
    artifacts = [".cairn/manifest.json"]
    artifacts.extend(f"{doc_root}/{filename}" for filename in per_document)
    return artifacts
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
def _repo_affected_surfaces() -> list[str]:
    """Return the tool/UI surfaces listed as affected in impact payloads.

    A fresh list is built on every call so callers may mutate the result.
    """
    surfaces = (
        "list_documents",
        "search_documents",
        "repo_context",
        "repo_graph",
        "repo_impact",
        "outline/get_section/expand/read_range with doc",
        "find_mentions/get_related with doc",
        "generated inspector HTML",
    )
    return list(surfaces)
|
|
1053
|
+
|
|
1054
|
+
|
|
1055
|
+
def _impact_section_ref(
    doc_id: str,
    index: DocumentIndex,
    section_id: str,
    *,
    kind: str,
    relation: str | None = None,
    confidence: float = 1.0,
) -> dict[str, Any]:
    """Describe one section as an impact reference.

    The section is looked up with ``index.tree.require``, so an unknown
    ``section_id`` propagates whatever that call raises. ``confidence`` is
    coerced to ``float`` and rounded to four decimal places.
    """
    section = index.tree.require(section_id)
    ref: dict[str, Any] = {
        "doc": doc_id,
        "id": section.id,
        "title": section.title,
        "kind": kind,
        "relation": relation,
    }
    ref["confidence"] = round(float(confidence), 4)
    ref["anchor"] = index.anchor(section.id)
    ref["path"] = list(section.path)
    return ref
|
|
1075
|
+
|
|
1076
|
+
|
|
1077
|
+
def _add_repo_doc_graph_node(
    nodes: dict[str, dict[str, Any]],
    doc_id: str,
    *,
    source: str,
    state: str | None = None,
) -> None:
    """Register a document node in ``nodes`` unless it is already present.

    An existing node with the same id is left untouched. The ``state`` key is
    only included when a state is supplied.
    """
    node_id = _repo_doc_node_id(doc_id)
    if node_id in nodes:
        return
    entry: dict[str, Any] = {
        "id": node_id,
        "kind": "document",
        "doc": doc_id,
        "label": doc_id,
        "source": source,
    }
    if state is not None:
        entry["state"] = state
    nodes[node_id] = entry
|
|
1096
|
+
|
|
1097
|
+
|
|
1098
|
+
def _add_repo_section_graph_node(
    nodes: dict[str, dict[str, Any]],
    doc_id: str,
    index: DocumentIndex,
    section_id: str,
) -> None:
    """Register a section node in ``nodes`` unless it is already present.

    Does nothing when ``index.tree.get`` returns ``None`` for ``section_id``.
    """
    section = index.tree.get(section_id)
    if section is None:
        return
    node_id = _repo_section_node_id(doc_id, section_id)
    entry: dict[str, Any] = {
        "id": node_id,
        "kind": "section",
        "doc": doc_id,
        "section_id": section_id,
        "label": section.title,
        "level": section.level,
        "path": list(section.path),
        "anchor": index.anchor(section_id),
    }
    # setdefault keeps whichever entry was registered first.
    nodes.setdefault(node_id, entry)
|
|
1121
|
+
|
|
1122
|
+
|
|
1123
|
+
def _add_repo_graph_edge(
|
|
1124
|
+
edges: list[dict[str, Any]],
|
|
1125
|
+
seen: set[tuple[str, str, str]],
|
|
1126
|
+
*,
|
|
1127
|
+
source: str,
|
|
1128
|
+
target: str,
|
|
1129
|
+
kind: str,
|
|
1130
|
+
relation: str | None,
|
|
1131
|
+
confidence: float,
|
|
1132
|
+
) -> None:
|
|
1133
|
+
edge_kind = kind if relation is None else f"{kind}:{relation}"
|
|
1134
|
+
key = (source, target, edge_kind)
|
|
1135
|
+
if key in seen:
|
|
1136
|
+
return
|
|
1137
|
+
seen.add(key)
|
|
1138
|
+
edges.append(
|
|
1139
|
+
{
|
|
1140
|
+
"source": source,
|
|
1141
|
+
"target": target,
|
|
1142
|
+
"kind": kind,
|
|
1143
|
+
"relation": relation,
|
|
1144
|
+
"confidence": round(float(confidence), 4),
|
|
1145
|
+
}
|
|
1146
|
+
)
|
|
1147
|
+
|
|
1148
|
+
|
|
1149
|
+
def _repo_doc_node_id(doc_id: str) -> str:
    """Return the graph node id for a document (``doc:<doc_id>``)."""
    return "doc:{}".format(doc_id)
|
|
1151
|
+
|
|
1152
|
+
|
|
1153
|
+
def _repo_section_node_id(doc_id: str, section_id: str) -> str:
    """Return the graph node id for a section (``section:<doc>:<section>``)."""
    return "section:{}:{}".format(doc_id, section_id)
|
|
1155
|
+
|
|
1156
|
+
|
|
1157
|
+
def _repo_entity_node_id(kind: str, canonical: str) -> str:
    """Return the graph node id for an entity (``entity:<kind>:<slug>``).

    The slug comes from ``slugify``; when that yields nothing, the normalized
    search text with spaces turned into hyphens is used instead.
    """
    slug = slugify(canonical)
    if not slug:
        slug = _normalize_search_text(canonical).replace(" ", "-")
    return f"entity:{kind}:{slug}"
|
|
1160
|
+
|
|
1161
|
+
|
|
1162
|
+
def _repo_section_count(nodes: dict[str, dict[str, Any]]) -> int:
    """Count the graph nodes whose ``kind`` is ``"section"``."""
    total = 0
    for node in nodes.values():
        if node["kind"] == "section":
            total += 1
    return total
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
def _normalize_search_text(text: str) -> str:
    """Lower-case ``text`` and reduce it to space-separated ``[a-z0-9]`` runs.

    Every other character (including ``/``, ``-`` and ``_``) acts as a word
    separator and is dropped.
    """
    words = re.findall(r"[a-z0-9]+", text.lower())
    return " ".join(words)
|
|
1169
|
+
|
|
1170
|
+
|
|
1171
|
+
def _repo_search_candidates(
    root: Path,
    config: RepoConfig,
) -> tuple[RepoDocumentStatus, ...]:
    """Return the documents that can be searched: those indexed or stale."""
    searchable_states = {"indexed", "stale"}
    documents = repo_status(root, config=config).documents
    return tuple(
        candidate for candidate in documents if candidate.state in searchable_states
    )
|
|
1177
|
+
|
|
1178
|
+
|
|
1179
|
+
def _read_repo_manifest_status(root: Path) -> tuple[RepoDocumentStatus, ...] | None:
    """Load the document statuses recorded in the repo-level manifest.

    Returns:
        The recorded statuses, or ``None`` when the manifest is missing,
        unreadable, not valid JSON, not a JSON object, written with a
        different ``format_version``, or contains entries that do not
        validate. The manifest is only a cache of earlier results, so every
        such problem is treated as "no previous status", never as an error.
    """
    path = repo_manifest_path(root)
    try:
        # Opening directly also covers a manifest that disappears between an
        # existence check and the read (FileNotFoundError is an OSError).
        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except (OSError, ValueError, TypeError):
        return None

    # A hand-edited or truncated file may hold valid JSON that is not an
    # object; ``payload.get`` would then raise AttributeError.
    if not isinstance(payload, dict):
        return None
    if payload.get("format_version") != REPO_MANIFEST_VERSION:
        return None

    documents = payload.get("documents", [])
    if not isinstance(documents, list):
        return None
    try:
        return tuple(RepoDocumentStatus.model_validate(item) for item in documents)
    except (ValueError, TypeError):
        return None
|
|
1194
|
+
|
|
1195
|
+
|
|
1196
|
+
def repo_status(root: Path, *, config: RepoConfig | None = None) -> RepoStatus:
    """Report the state of every configured document plus orphaned indexes.

    The configuration is loaded from ``root`` when none is supplied. Statuses
    recorded in the previous repo manifest, if any, are passed to the
    per-document check so it can reuse them.
    """
    cfg = config if config else load_repo_config(root)
    discovered = discover_documents(root, cfg)

    recorded: dict[str, RepoDocumentStatus] = {}
    for prior in _read_repo_manifest_status(root) or ():
        recorded[prior.id] = prior

    collected: list[RepoDocumentStatus] = []
    for doc in discovered:
        collected.append(_document_status(root, doc, previous=recorded.get(doc.id)))

    # Index directories that no longer correspond to a configured document.
    discovered_ids = {doc.id for doc in discovered}
    collected.extend(_orphaned_statuses(root, cfg, discovered_ids))

    return RepoStatus(
        root=root,
        config_path=config_path(root),
        documents=tuple(collected),
        primary_doc=cfg.primary_doc,
    )
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
def write_repo_manifest(root: Path, status: RepoStatus) -> Path:
    """Persist ``status`` as the repo-level JSON manifest and return its path.

    The parent directory is created when needed. The file is UTF-8, indented
    by two spaces, keeps non-ASCII characters unescaped, and ends with a
    newline.
    """
    target = repo_manifest_path(root)
    target.parent.mkdir(parents=True, exist_ok=True)

    manifest: dict[str, Any] = {
        "format_version": REPO_MANIFEST_VERSION,
        "cairn_version": __version__,
        "generated_at": datetime.now(UTC).isoformat(),
        "root": str(root),
        "primary_doc": status.primary_doc,
        "documents": [entry.model_dump(mode="json") for entry in status.documents],
    }

    with target.open("w", encoding="utf-8") as handle:
        json.dump(manifest, handle, ensure_ascii=False, indent=2)
        handle.write("\n")
    return target
|
|
1231
|
+
|
|
1232
|
+
|
|
1233
|
+
def _document_status(
    root: Path,
    doc: DiscoveredDocument,
    *,
    previous: RepoDocumentStatus | None = None,
) -> RepoDocumentStatus:
    """Work out the index state of one discovered document.

    Outcomes:

    * ``error``   - the source cannot be hashed or parsed, or the manifest or
      index cannot be read; the message is stored in ``error``.
    * ``missing`` - the document directory has no ``manifest.json`` yet.
    * ``indexed`` / ``stale`` - an index exists and does / does not match the
      current source.

    When ``previous`` describes the same indexed content (same
    ``indexed_hash``) and recorded the raw file hash it was indexed from, the
    decision is made by comparing file hashes alone, without re-parsing the
    source or loading the index.
    """
    manifest_file = doc.out_dir / "manifest.json"
    doc_dir = _relative_posix(root, doc.out_dir)

    try:
        file_digest = _file_hash(doc.source)
    except OSError as exc:
        # The source itself is unreadable, so no hash can be reported.
        return RepoDocumentStatus(
            id=doc.id,
            source=doc.relative_source,
            doc_dir=doc_dir,
            state="error",
            error=str(exc),
        )

    if not manifest_file.exists():
        # Never indexed. The source is still parsed so that the status can
        # carry its content hash (and so that parse failures surface here).
        try:
            unindexed = parser_for_path(doc.source).parse(doc.source, doc_id=doc.id)
            content_hash = unindexed.source_hash
        except Exception as exc:
            return RepoDocumentStatus(
                id=doc.id,
                source=doc.relative_source,
                doc_dir=doc_dir,
                state="error",
                source_file_hash=file_digest,
                error=str(exc),
            )
        return RepoDocumentStatus(
            id=doc.id,
            source=doc.relative_source,
            doc_dir=doc_dir,
            state="missing",
            source_hash=content_hash,
            source_file_hash=file_digest,
        )

    try:
        manifest = read_manifest(doc.out_dir)
    except Exception as exc:
        return RepoDocumentStatus(
            id=doc.id,
            source=doc.relative_source,
            doc_dir=doc_dir,
            state="error",
            source_file_hash=file_digest,
            error=str(exc),
        )

    # Raw file hash the existing index was built from, if known.
    recorded_file_hash: str | None = None
    if previous is not None:
        recorded_file_hash = previous.indexed_source_file_hash

    if (
        previous is not None
        and recorded_file_hash is not None
        and previous.indexed_hash == manifest.source_hash
    ):
        # Fast path: comparing file hashes is enough.
        unchanged = recorded_file_hash == file_digest
        cached_state: DocState = "indexed" if unchanged else "stale"
        cached_source_hash = (
            manifest.source_hash if unchanged else previous.source_hash
        )
        return RepoDocumentStatus(
            id=doc.id,
            source=doc.relative_source,
            doc_dir=doc_dir,
            state=cached_state,
            section_count=previous.section_count,
            source_hash=cached_source_hash,
            indexed_hash=manifest.source_hash,
            source_file_hash=file_digest,
            indexed_source_file_hash=recorded_file_hash,
            indexed_at=manifest.indexed_at,
        )

    # Slow path: parse the source and load the index to compare content.
    try:
        reparsed = parser_for_path(doc.source).parse(doc.source, doc_id=doc.id)
        content_hash = reparsed.source_hash
        index = DocumentIndex.load(doc.out_dir)
    except Exception as exc:
        return RepoDocumentStatus(
            id=doc.id,
            source=doc.relative_source,
            doc_dir=doc_dir,
            state="error",
            source_file_hash=file_digest,
            error=str(exc),
        )

    up_to_date = manifest.source_hash == content_hash
    fresh_state: DocState = "indexed" if up_to_date else "stale"
    # An up-to-date index was, by definition, built from the current file;
    # otherwise keep whatever the previous status recorded (possibly None).
    indexed_file_hash = file_digest if up_to_date else recorded_file_hash

    return RepoDocumentStatus(
        id=doc.id,
        source=doc.relative_source,
        doc_dir=doc_dir,
        state=fresh_state,
        section_count=len(index.tree),
        source_hash=content_hash,
        indexed_hash=manifest.source_hash,
        source_file_hash=file_digest,
        indexed_source_file_hash=indexed_file_hash,
        indexed_at=manifest.indexed_at,
    )
|
|
1347
|
+
|
|
1348
|
+
|
|
1349
|
+
def _orphaned_statuses(
    root: Path,
    config: RepoConfig,
    discovered_ids: set[str],
) -> Iterable[RepoDocumentStatus]:
    """Report index directories that belong to no discovered document.

    Every sub-directory of the documents directory whose name is not in
    ``discovered_ids`` is reported as ``orphaned``, or as ``error`` when its
    manifest or index cannot be read. Entries are ordered by directory name.
    """
    documents_root = cairn_dir(root) / config.documents_dir
    if not documents_root.exists():
        return ()

    orphans: list[RepoDocumentStatus] = []
    for entry in sorted(documents_root.iterdir(), key=lambda item: item.name):
        if not entry.is_dir() or entry.name in discovered_ids:
            continue
        try:
            manifest = read_manifest(entry)
            index = DocumentIndex.load(entry)

            # The manifest may store the source path relative to the root.
            recorded_source = Path(manifest.source_path)
            if recorded_source.is_absolute():
                resolved_source = recorded_source
            else:
                resolved_source = root / recorded_source

            current_file_hash = None
            if resolved_source.exists():
                current_file_hash = _file_hash(resolved_source)

            record = RepoDocumentStatus(
                id=entry.name,
                source=manifest.source_path,
                doc_dir=_relative_posix(root, entry),
                state="orphaned",
                section_count=len(index.tree),
                indexed_hash=manifest.source_hash,
                indexed_source_file_hash=current_file_hash,
                indexed_at=manifest.indexed_at,
            )
        except Exception as exc:
            record = RepoDocumentStatus(
                id=entry.name,
                source="",
                doc_dir=_relative_posix(root, entry),
                state="error",
                error=str(exc),
            )
        orphans.append(record)
    return tuple(orphans)
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
def _choose_primary_doc(status: RepoStatus) -> str | None:
    """Pick the document id to treat as primary.

    The configured ``primary_doc`` wins when it names an indexed or stale
    document; otherwise the first such document is used. Returns ``None``
    when no document is indexed or stale.
    """
    usable_ids = [
        doc.id for doc in status.documents if doc.state in {"indexed", "stale"}
    ]
    preferred = status.primary_doc
    if preferred and preferred in usable_ids:
        return preferred
    return usable_ids[0] if usable_ids else None
|
|
1406
|
+
|
|
1407
|
+
|
|
1408
|
+
def _render_config(config: RepoConfig) -> str:
    """Render ``config`` as the text of the repository TOML config file.

    Scalar settings come first (``primary_doc`` only when set), followed by
    the ``include`` and ``exclude`` arrays with one pattern per line. The
    result ends with a newline.
    """
    locales = ", ".join(_toml_string(locale) for locale in config.preferred_locales)
    rendered = [
        "# Cairn repository documentation index.",
        "# Paths are relative to the repository root.",
        f"documents_dir = {_toml_string(config.documents_dir)}",
        f"enable_markitdown = {str(config.enable_markitdown).lower()}",
        f"search_sections_per_doc = {config.search_sections_per_doc}",
        f"preferred_locales = [{locales}]",
    ]
    if config.primary_doc is not None:
        rendered.append(f"primary_doc = {_toml_string(config.primary_doc)}")

    rendered.append("")
    rendered.append("include = [")
    for pattern in config.include:
        rendered.append(f" {_toml_string(pattern)},")
    rendered.append("]")

    rendered.append("")
    rendered.append("exclude = [")
    for pattern in config.exclude:
        rendered.append(f" {_toml_string(pattern)},")
    rendered.append("]")

    # The trailing empty element makes the joined text end with a newline.
    rendered.append("")
    return "\n".join(rendered)
|
|
1435
|
+
|
|
1436
|
+
|
|
1437
|
+
def _toml_string(value: str) -> str:
    """Return ``value`` as an ASCII-only TOML basic string literal.

    JSON and TOML share the short escapes (``\\n``, ``\\t``, ``\\"``,
    ``\\\\``, ...) and ``\\uXXXX``, so ``json.dumps`` handles quoting and
    control characters. Two cases need different treatment:

    * characters outside the Basic Multilingual Plane: JSON would write them
      as a UTF-16 surrogate pair, which TOML rejects, so they are written as
      ``\\UXXXXXXXX`` instead;
    * DEL (U+007F): JSON leaves it raw, but TOML forbids it unescaped.
    """
    quoted = json.dumps(value, ensure_ascii=False)
    pieces: list[str] = []
    for char in quoted:
        code = ord(char)
        if code < 0x7F:
            pieces.append(char)
        elif code <= 0xFFFF:
            pieces.append(f"\\u{code:04x}")
        else:
            pieces.append(f"\\U{code:08x}")
    return "".join(pieces)
|
|
1439
|
+
|
|
1440
|
+
|
|
1441
|
+
def _relative_posix(root: Path, path: Path) -> str:
    """Return ``path`` relative to ``root`` using forward slashes.

    Both paths are resolved first. ``Path.relative_to`` raises ``ValueError``
    when the resolved ``path`` is not located under the resolved ``root``.
    """
    resolved_path = path.resolve()
    resolved_root = root.resolve()
    return resolved_path.relative_to(resolved_root).as_posix()
|
|
1443
|
+
|
|
1444
|
+
|
|
1445
|
+
def _is_excluded(relative_path: str, patterns: tuple[str, ...]) -> bool:
    """Return whether ``relative_path`` matches any exclude pattern.

    A pattern matches when any of the following holds:

    * ``Path.match`` accepts it (matching from the right for relative
      patterns);
    * ``fnmatchcase`` accepts it against the whole POSIX-style path;
    * it is a simple ``name/**`` pattern and ``name`` is one of the path's
      components (see ``_matches_excluded_dir``).

    Empty patterns are ignored: they cannot match anything, and
    ``Path.match("")`` would raise ``ValueError``.
    """
    rel = Path(relative_path)
    rel_posix = rel.as_posix()
    for pattern in patterns:
        if not pattern:
            continue
        if rel.match(pattern) or fnmatchcase(rel_posix, pattern):
            return True
        if _matches_excluded_dir(rel, pattern):
            return True
    return False
|
|
1454
|
+
|
|
1455
|
+
|
|
1456
|
+
def _matches_excluded_dir(relative_path: Path, pattern: str) -> bool:
    """Match simple ``name/**`` excludes against any component of the path.

    Only patterns of the exact form ``<name>/**`` with a non-empty ``name``
    containing no ``/`` are handled; everything else returns ``False``. The
    name is compared literally with each path component.
    """
    marker = "/**"
    if pattern.endswith(marker):
        dirname = pattern[: -len(marker)]
        if dirname and "/" not in dirname:
            return dirname in relative_path.parts
    return False
|
|
1464
|
+
|
|
1465
|
+
|
|
1466
|
+
def _doc_id_for_relative_path(relative_path: str) -> str:
    """Derive a document id from a repo-relative source path.

    The final suffix is dropped, path separators become hyphens, and the
    result is slugified. Falls back to ``"document"`` when the slug is empty.
    """
    without_suffix = Path(relative_path).with_suffix("")
    flattened = without_suffix.as_posix().replace("/", "-")
    slug = slugify(flattened)
    return slug if slug else "document"
|
|
1469
|
+
|
|
1470
|
+
|
|
1471
|
+
def _file_hash(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is hashed in fixed-size chunks so that large sources are never
    held in memory in full.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        # ``read`` returns b"" at end of file, which ends the iteration.
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
|
1473
|
+
|
|
1474
|
+
|
|
1475
|
+
def _unique_doc_id(base: str, used: set[str]) -> str:
    """Return ``base``, or ``base-N`` with the smallest free ``N >= 2``.

    ``used`` is only consulted, never modified.
    """
    candidate = base
    counter = 1
    while candidate in used:
        counter += 1
        candidate = f"{base}-{counter}"
    return candidate
|
|
1482
|
+
|
|
1483
|
+
|
|
1484
|
+
def _emit(callback: Callable[[str], None] | None, message: str) -> None:
|
|
1485
|
+
if callback is not None:
|
|
1486
|
+
callback(message)
|