docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/cli/app.py ADDED
@@ -0,0 +1,776 @@
1
+ """Typer-based command-line interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Annotated, Literal
9
+
10
+ import typer
11
+
12
+ from cairn import __version__
13
+ from cairn.cli.config import load_embed_config, load_index_config, load_llm_config
14
+ from cairn.embed.base import Embedder
15
+ from cairn.engine.indexer import Indexer
16
+ from cairn.entity.heuristic import HeuristicExtractor
17
+ from cairn.ingest import parser_for_path
18
+ from cairn.inspection import write_inspector
19
+ from cairn.providers import make_embedder, make_summarizer
20
+ from cairn.repo import (
21
+ RepoStatus,
22
+ find_repo_root,
23
+ load_repo_document_index,
24
+ repo_status,
25
+ search_repo_documents,
26
+ sync_repo,
27
+ write_default_config,
28
+ )
29
+ from cairn.summarize.base import Summarizer
30
+ from cairn.tools.base import DocumentIndex
31
+ from cairn.tools.find_mentions import find_mentions as find_mentions_tool
32
+ from cairn.tools.get_related import get_related as get_related_tool
33
+ from cairn.tools.outline import outline as outline_tool
34
+ from cairn.tools.search_keyword import Mode
35
+ from cairn.tools.search_keyword import search_keyword as search_keyword_tool
36
+ from cairn.tools.search_semantic import search_semantic as search_semantic_tool
37
+ from cairn.xref.heuristic import HeuristicXRefExtractor
38
+
39
# Root CLI application; with no arguments it prints the help screen.
app = typer.Typer(
    name="docsgraph",
    help="Local-first documentation graph for AI agents.",
    no_args_is_help=True,
)
# Sub-command groups, mounted below under the names "query" and "mcp".
query_app = typer.Typer(help="Run a single retrieval tool from the command line.")
mcp_app = typer.Typer(help="Generate MCP client configuration snippets.")
app.add_typer(query_app, name="query")
app.add_typer(mcp_app, name="mcp")

# Client names accepted by the `mcp config --client` option.
McpClient = Literal["claude", "cursor", "codex", "goose"]
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Plugin construction
54
+ # ---------------------------------------------------------------------------
55
+
56
+
57
def _make_summarizer(use_fake: bool) -> Summarizer:
    """Build the summarizer plugin via ``cairn.providers.make_summarizer``.

    ``use_fake`` is forwarded unchanged as the factory's only argument.
    """
    summarizer = make_summarizer(use_fake)
    return summarizer
59
+
60
+
61
def _make_embedder(use_fake: bool) -> Embedder:
    """Build the embedder plugin via ``cairn.providers.make_embedder``.

    ``use_fake`` is forwarded unchanged as the factory's only argument.
    """
    embedder = make_embedder(use_fake)
    return embedder
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Commands
67
+ # ---------------------------------------------------------------------------
68
+
69
+
70
@app.command()
def init(
    yes: Annotated[
        bool,
        typer.Option("-y", "--yes", help="Create .cairn/config.toml without prompting."),
    ] = False,
    force: Annotated[
        bool,
        typer.Option("--force", help="Overwrite an existing .cairn/config.toml."),
    ] = False,
    markitdown: Annotated[
        bool,
        typer.Option(
            "--markitdown",
            help="Include MarkItDown-backed Office/data/web globs in config.",
        ),
    ] = False,
) -> None:
    """Initialize Cairn for repository documentation indexing."""
    # The docstring above is shown by Typer as the command help; keep it verbatim.
    repo_root = Path.cwd()
    config_file = repo_root / ".cairn" / "config.toml"

    # An existing config is left untouched unless --force was passed.
    if not force and config_file.exists():
        typer.echo(f"already initialized: {config_file}")
        return

    # Without -y the user has to confirm; declining exits with status 1.
    if not yes:
        confirmed = typer.confirm(f"Create {config_file}?")
        if not confirmed:
            raise typer.Exit(code=1)

    written = write_default_config(
        repo_root,
        force=force,
        enable_markitdown=markitdown,
    )
    typer.echo(f"initialized: {written}")
102
+
103
+
104
@app.command()
def sync(
    fake: Annotated[
        bool,
        typer.Option(
            "--fake",
            help="Use deterministic FakeSummarizer + FakeEmbedder (no network).",
        ),
    ] = False,
    force: Annotated[
        bool,
        typer.Option("--force", help="Rebuild every configured document."),
    ] = False,
) -> None:
    """Index every configured repository document under .cairn/documents/."""
    # Synchronous entry point: the real work lives in the async helper.
    asyncio.run(_run_sync(use_fake=fake, force=force))
120
+
121
+
122
async def _run_sync(use_fake: bool, force: bool) -> None:
    """Sync the current repository's documents and print a one-line summary.

    Progress messages are echoed to stderr; the summary goes to stdout.

    Args:
        use_fake: Passed to the summarizer and embedder factories.
        force: Forwarded to ``sync_repo``.

    Raises:
        typer.Exit: With code 1 when at least one result is not ``ok``.
    """
    repo_root = find_repo_root()
    outcomes = await sync_repo(
        repo_root,
        summarizer=_make_summarizer(use_fake),
        embedder=_make_embedder(use_fake),
        index_config=load_index_config(),
        force=force,
        progress=lambda message: typer.echo(message, err=True),
    )

    total = len(outcomes)
    succeeded = [item for item in outcomes if item.ok]
    successful = len(succeeded)
    failed = total - successful
    # Only successful documents count as rebuilt; the rest were already current.
    rebuilt = sum(1 for item in succeeded if item.rebuilt)
    skipped = successful - rebuilt

    typer.echo(
        f"synced: {successful}/{total} documents "
        f"({rebuilt} rebuilt, {skipped} up to date, {failed} failed)"
    )
    if failed:
        raise typer.Exit(code=1)
143
+
144
+
145
@app.command()
def status(
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print machine-readable JSON."),
    ] = False,
) -> None:
    """Show repository documentation index status."""
    status_obj = repo_status(find_repo_root())
    # --json dumps the status model itself; otherwise render the text table.
    if json_output:
        rendered = status_obj.model_dump_json(indent=2)
    else:
        rendered = _format_repo_status(status_obj)
    typer.echo(rendered)
159
+
160
+
161
@app.command()
def doctor(
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print machine-readable JSON."),
    ] = False,
) -> None:
    """Check repo setup, index freshness, and model configuration."""
    checks = _doctor_checks()
    healthy = all(item["ok"] for item in checks)
    payload = {"ok": healthy, "version": __version__, "checks": checks}

    if json_output:
        report = json.dumps(payload, ensure_ascii=False, indent=2)
    else:
        report = _format_doctor(payload)
    typer.echo(report)

    # The report is always printed first; a failing check then sets exit status 1.
    if not healthy:
        raise typer.Exit(code=1)
182
+
183
+
184
@app.command()
def version() -> None:
    """Print the Cairn version."""
    # __version__ is imported from the top-level ``cairn`` package.
    installed_version = __version__
    typer.echo(installed_version)
188
+
189
+
190
@app.command()
def index(
    source: Annotated[
        Path,
        typer.Argument(exists=True, file_okay=True, dir_okay=False, readable=True),
    ],
    doc_id: Annotated[
        str | None,
        typer.Option(help="Override the document id (defaults to filename stem)."),
    ] = None,
    out: Annotated[
        Path | None,
        typer.Option(help="Output directory. Defaults to .cairn/documents/<doc_id>/."),
    ] = None,
    fake: Annotated[
        bool,
        typer.Option(
            "--fake",
            help="Use deterministic FakeSummarizer + FakeEmbedder (no network).",
        ),
    ] = False,
    force: Annotated[
        bool,
        typer.Option(
            "--force",
            help="Rebuild even if the source file is unchanged since last index.",
        ),
    ] = False,
) -> None:
    """Index a source document — build Tree + Summaries + Vectors."""
    # Synchronous entry point: hand everything to the async implementation.
    asyncio.run(
        _run_index(
            source=source,
            doc_id=doc_id,
            out=out,
            use_fake=fake,
            force=force,
        )
    )
221
+
222
+
223
async def _run_index(
    source: Path,
    doc_id: str | None,
    out: Path | None,
    use_fake: bool,
    force: bool,
) -> None:
    """Index one source file and report where its manifest lives.

    Args:
        source: File to index; the parser is chosen from this path.
        doc_id: Optional explicit document id.
        out: Optional output directory; defaults to
            ``<cwd>/.cairn/documents/<doc_id or source.stem>``.
        use_fake: Passed to the summarizer and embedder factories.
        force: Forwarded to ``Indexer.index_path``.
    """
    parser = parser_for_path(source)

    # The output directory is named after the explicit id, else the file stem.
    resolved_doc_id = doc_id or source.stem
    target_dir = out or Path.cwd() / ".cairn" / "documents" / resolved_doc_id
    target_dir.mkdir(parents=True, exist_ok=True)

    index_cfg = load_index_config()
    indexer = Indexer(
        parser=parser,
        summarizer=_make_summarizer(use_fake),
        embedder=_make_embedder(use_fake),
        entity_extractor=HeuristicExtractor(),
        xref_extractor=HeuristicXRefExtractor(),
        summary_concurrency=index_cfg.summary_concurrency,
        embed_batch_size=index_cfg.embed_batch_size,
        progress=lambda message: typer.echo(message, err=True),
    )

    # NOTE(review): the raw ``doc_id`` (possibly None) is forwarded, not
    # ``resolved_doc_id`` -- confirm index_path applies the same stem default.
    result = await indexer.index_path(
        source,
        out_dir=target_dir,
        doc_id=doc_id,
        force=force,
    )

    outcome = "indexed" if result.rebuilt else "already up to date"
    typer.echo(f"{outcome}: {result.manifest_path}")
253
+
254
+
255
@app.command()
def serve(
    doc_dir: Annotated[
        Path | None,
        typer.Argument(
            file_okay=False,
            dir_okay=True,
            readable=True,
            help=(
                "Built document directory. Omit to serve the current repo's "
                ".cairn documents."
            ),
        ),
    ] = None,
    fake: Annotated[
        bool,
        typer.Option(
            "--fake",
            help="Use FakeEmbedder for query embedding (no network at query time).",
        ),
    ] = False,
    repo: Annotated[
        Path | None,
        typer.Option(
            "--repo",
            file_okay=False,
            dir_okay=True,
            readable=True,
            help="Repository root or child path containing .cairn/config.toml.",
        ),
    ] = None,
) -> None:
    """Start the MCP stdio server against a document or repo index."""
    from cairn.mcp.server import serve_repo_stdio, serve_stdio

    repo_given = repo is not None
    doc_given = doc_dir is not None

    # The two targets are mutually exclusive.
    if repo_given and doc_given:
        typer.echo("error: pass either a document directory or --repo, not both", err=True)
        raise typer.Exit(code=2)

    # Repo mode: either --repo was passed, or nothing was (discover the repo).
    if not doc_given:
        root = find_repo_root(repo) if repo_given else find_repo_root()
        asyncio.run(serve_repo_stdio(root, embedder=_make_embedder(fake)))
        return

    # Single-document mode: the directory has to exist before serving.
    if not (doc_dir.exists() and doc_dir.is_dir()):
        typer.echo(f"error: document directory not found: {doc_dir}", err=True)
        raise typer.Exit(code=2)
    asyncio.run(serve_stdio(doc_dir, embedder=_make_embedder(fake)))
303
+
304
+
305
@mcp_app.command("config")
def mcp_config(
    client: Annotated[
        McpClient,
        typer.Option(
            "--client",
            case_sensitive=False,
            help="Client snippet to print: claude, cursor, codex, or goose.",
        ),
    ] = "claude",
    repo: Annotated[
        Path | None,
        typer.Option(
            "--repo",
            file_okay=False,
            dir_okay=True,
            readable=True,
            help="Repository root. Defaults to the nearest .cairn/config.toml.",
        ),
    ] = None,
    command: Annotated[
        str,
        typer.Option("--command", help="Executable path clients should run."),
    ] = "docsgraph",
    fake: Annotated[
        bool,
        typer.Option("--fake", help="Include --fake for deterministic local smoke tests."),
    ] = False,
) -> None:
    """Print a copy-pasteable MCP stdio configuration snippet."""
    repo_root = find_repo_root(repo)
    # The snippet always launches `serve --repo <root>`, optionally with --fake.
    serve_args = ["serve", "--repo", str(repo_root)]
    if fake:
        serve_args += ["--fake"]
    snippet = _format_mcp_config(client, command=command, args=serve_args)
    typer.echo(snippet)
346
+
347
+
348
@app.command()
def outline(
    doc_dir: Annotated[
        Path,
        typer.Argument(exists=True, file_okay=False, dir_okay=True, readable=True),
    ],
    depth: Annotated[int, typer.Option(min=1, max=6)] = 2,
    focus: Annotated[str | None, typer.Option(help="Restrict to a subtree.")] = None,
) -> None:
    """Print the document outline as JSON."""
    # Synchronous entry point for the async outline helper.
    asyncio.run(_run_outline(doc_dir=doc_dir, depth=depth, focus=focus))
361
+
362
+
363
async def _run_outline(doc_dir: Path, depth: int, focus: str | None) -> None:
    """Load the index in ``doc_dir``, run the outline tool and print its data as JSON."""
    document_index = DocumentIndex.load(doc_dir)
    response = await outline_tool(
        document_index,
        depth=depth,
        focus=focus,
        include=("gist",),
    )
    rendered = json.dumps(response.data, ensure_ascii=False, indent=2)
    typer.echo(rendered)
367
+
368
+
369
@query_app.command("semantic")
def query_semantic(
    doc_dir: Annotated[
        Path,
        typer.Argument(exists=True, file_okay=False, dir_okay=True, readable=True),
    ],
    query: Annotated[str, typer.Argument(help="Query string.")],
    k: Annotated[int, typer.Option(min=1, max=32)] = 8,
    fake: Annotated[bool, typer.Option("--fake")] = False,
) -> None:
    """Run a semantic search and print results as JSON."""
    # Synchronous entry point for the async search helper.
    asyncio.run(
        _run_search_semantic(doc_dir=doc_dir, query=query, k=k, use_fake=fake)
    )
381
+
382
+
383
async def _run_search_semantic(
    doc_dir: Path,
    query: str,
    k: int,
    use_fake: bool,
) -> None:
    """Run the semantic search tool on the index in ``doc_dir`` and print its data as JSON."""
    document_index = DocumentIndex.load(doc_dir)
    query_embedder = _make_embedder(use_fake)
    response = await search_semantic_tool(
        document_index,
        embedder=query_embedder,
        query=query,
        k=k,
    )
    rendered = json.dumps(response.data, ensure_ascii=False, indent=2)
    typer.echo(rendered)
390
+
391
+
392
@query_app.command("repo")
def query_repo(
    query: Annotated[
        str,
        typer.Argument(help="Conceptual query across indexed repo docs."),
    ],
    k: Annotated[int, typer.Option(min=1, max=32)] = 8,
    sections_per_doc: Annotated[
        int | None,
        typer.Option(help="Maximum section hits per document (1-8)."),
    ] = None,
    repo: Annotated[
        Path | None,
        typer.Option(
            "--repo",
            file_okay=False,
            dir_okay=True,
            readable=True,
            help="Repository root or child path containing .cairn/config.toml.",
        ),
    ] = None,
    fake: Annotated[bool, typer.Option("--fake")] = False,
) -> None:
    """Run repo-scoped hybrid search and print results as JSON."""
    # Synchronous entry point for the async repo-search helper.
    asyncio.run(
        _run_search_repo(
            query=query,
            k=k,
            sections_per_doc=sections_per_doc,
            repo=repo,
            use_fake=fake,
        )
    )
417
+
418
+
419
async def _run_search_repo(
    query: str,
    k: int,
    sections_per_doc: int | None,
    repo: Path | None,
    use_fake: bool,
) -> None:
    """Search the repo's indexed documents and print the response's ``data`` as JSON.

    ``repo`` is handed to ``find_repo_root`` to locate the repository root.
    """
    repo_root = find_repo_root(repo)
    response = await search_repo_documents(
        repo_root,
        embedder=_make_embedder(use_fake),
        query=query,
        k=k,
        sections_per_doc=sections_per_doc,
    )
    # Unlike the single-document tools, this response is indexed like a mapping.
    rendered = json.dumps(response["data"], ensure_ascii=False, indent=2)
    typer.echo(rendered)
435
+
436
+
437
@query_app.command("keyword")
def query_keyword(
    doc_dir: Annotated[
        Path,
        typer.Argument(exists=True, file_okay=False, dir_okay=True, readable=True),
    ],
    terms: Annotated[list[str], typer.Argument(help="One or more search terms.")],
    k: Annotated[int, typer.Option(min=1, max=32)] = 12,
    mode: Annotated[str, typer.Option(help="any | all")] = "any",
) -> None:
    """Run a keyword search and print results as JSON."""
    # ``mode`` is validated inside the async helper.
    asyncio.run(
        _run_search_keyword(doc_dir=doc_dir, terms=terms, k=k, mode=mode)
    )
449
+
450
+
451
async def _run_search_keyword(
    doc_dir: Path,
    terms: list[str],
    k: int,
    mode: str,
) -> None:
    """Validate ``mode``, run the keyword search tool and print its data as JSON.

    Raises:
        typer.Exit: With code 2 when ``mode`` is neither ``"any"`` nor ``"all"``.
    """
    allowed_modes = ("any", "all")
    if mode not in allowed_modes:
        typer.echo(f"error: mode must be 'any' or 'all'; got {mode!r}", err=True)
        raise typer.Exit(code=2)

    # The check above narrows the plain string to the tool's ``Mode`` type.
    cast_mode: Mode = mode  # type: ignore[assignment]
    document_index = DocumentIndex.load(doc_dir)
    response = await search_keyword_tool(
        document_index,
        terms=terms,
        k=k,
        mode=cast_mode,
    )
    rendered = json.dumps(response.data, ensure_ascii=False, indent=2)
    typer.echo(rendered)
461
+
462
+
463
@query_app.command("mentions")
def query_mentions(
    doc_dir: Annotated[
        Path,
        typer.Argument(exists=True, file_okay=False, dir_okay=True, readable=True),
    ],
    entity: Annotated[
        str,
        typer.Argument(help="Entity name (canonical or surface form)."),
    ],
    scope: Annotated[
        str | None,
        typer.Option(help="Restrict to a section-id prefix."),
    ] = None,
) -> None:
    """Locate every section that mentions an entity."""
    # Synchronous entry point for the async mentions helper.
    asyncio.run(_run_find_mentions(doc_dir=doc_dir, entity=entity, scope=scope))
477
+
478
+
479
async def _run_find_mentions(
    doc_dir: Path,
    entity: str,
    scope: str | None,
) -> None:
    """Run the find-mentions tool on the index in ``doc_dir`` and print its data as JSON."""
    document_index = DocumentIndex.load(doc_dir)
    response = await find_mentions_tool(document_index, entity=entity, scope=scope)
    rendered = json.dumps(response.data, ensure_ascii=False, indent=2)
    typer.echo(rendered)
485
+
486
+
487
@query_app.command("related")
def query_related(
    doc_dir: Annotated[
        Path,
        typer.Argument(exists=True, file_okay=False, dir_okay=True, readable=True),
    ],
    section_id: Annotated[
        str,
        typer.Argument(help="Section id to find neighbors of."),
    ],
    kinds: Annotated[
        str,
        typer.Option(help="Comma-separated channels: xref,sibling,parent,child"),
    ] = "xref",
    k: Annotated[int, typer.Option(min=1, max=32)] = 8,
) -> None:
    """Return neighbors of a section across the xref graph and tree."""
    # Split the comma-separated option, trimming blanks and dropping empty entries.
    trimmed = (part.strip() for part in kinds.split(","))
    channels = tuple(part for part in trimmed if part)
    asyncio.run(
        _run_get_related(doc_dir=doc_dir, section_id=section_id, kinds=channels, k=k)
    )
503
+
504
+
505
async def _run_get_related(
    doc_dir: Path,
    section_id: str,
    kinds: tuple[str, ...],
    k: int,
) -> None:
    """Run the get-related tool for ``section_id`` and print its data as JSON.

    ``kinds`` is passed through as given; no validation happens here.
    """
    document_index = DocumentIndex.load(doc_dir)
    response = await get_related_tool(  # type: ignore[arg-type]
        document_index,
        id=section_id,
        kinds=kinds,
        k=k,
    )
    rendered = json.dumps(response.data, ensure_ascii=False, indent=2)
    typer.echo(rendered)
511
+
512
+
513
@app.command()
def inspect(
    doc_dir: Annotated[
        Path | None,
        typer.Argument(
            file_okay=False,
            dir_okay=True,
            readable=True,
            help=(
                "Built document directory. Omit to inspect the current repo's "
                "primary Cairn document."
            ),
        ),
    ] = None,
    out: Annotated[
        Path | None,
        typer.Option(
            help=(
                "HTML output path. Defaults to <doc_dir>/inspector.html for "
                "single-doc mode or .cairn/inspector.html for repo mode."
            )
        ),
    ] = None,
    doc: Annotated[
        str | None,
        typer.Option(help="Repo document id to inspect when doc_dir is omitted."),
    ] = None,
) -> None:
    """Generate a standalone HTML inspector for a document index."""
    if doc_dir is not None:
        # Single-document mode: the directory has to exist before loading.
        if not (doc_dir.exists() and doc_dir.is_dir()):
            typer.echo(f"error: document directory not found: {doc_dir}", err=True)
            raise typer.Exit(code=2)
        document_index = DocumentIndex.load(doc_dir)
        destination = out or doc_dir / "inspector.html"
    else:
        # Repo mode: load a document (optionally chosen by --doc) from the repo.
        repo_root = find_repo_root()
        document_index = load_repo_document_index(repo_root, doc_id=doc)
        destination = out or repo_root / ".cairn" / "inspector.html"

    written = write_inspector(document_index, out=destination)
    typer.echo(f"inspector: {written}")
554
+
555
+
556
@app.command()
def bench(
    suite: Annotated[
        Path,
        typer.Argument(exists=True, file_okay=True, dir_okay=False, readable=True),
    ],
    k: Annotated[int, typer.Option(min=1, max=32)] = 8,
    out: Annotated[
        Path | None,
        typer.Option(help="Where to write the JSON report."),
    ] = None,
    fake: Annotated[
        bool,
        typer.Option(
            "--fake",
            help="Use FakeSummarizer + FakeEmbedder (deterministic, offline).",
        ),
    ] = False,
    judge: Annotated[
        bool,
        typer.Option(
            "--judge",
            help="Run LLM-as-judge for QA accuracy (uses CAIRN_LLM_* settings).",
        ),
    ] = False,
) -> None:
    """Run a benchmark suite comparing Cairn against a naive vector-RAG baseline."""
    # Synchronous entry point for the async benchmark helper.
    asyncio.run(
        _run_bench(
            suite_path=suite,
            k=k,
            out=out,
            use_fake=fake,
            use_judge=judge,
        )
    )
581
+
582
+
583
async def _run_bench(
    suite_path: Path,
    k: int,
    out: Path | None,
    use_fake: bool,
    use_judge: bool,
) -> None:
    """Run a benchmark suite, print a Markdown report and write a JSON report.

    Args:
        suite_path: Suite file handed to ``load_suite``.
        k: Forwarded to ``BenchOptions``.
        out: JSON report path. When omitted the report goes to
            ``<system temp dir>/cairn-bench/<suite stem>.json``.
        use_fake: Passed to the summarizer and embedder factories.
        use_judge: When true, build an ``LLMJudge`` from the LLM config.
    """
    import tempfile

    from cairn.bench.dataset import load_suite
    from cairn.bench.judge import LLMJudge
    from cairn.bench.report import format_markdown_report, write_json_report
    from cairn.bench.runner import BenchOptions, BenchRunner

    suite = load_suite(suite_path)

    judge_client: LLMJudge | None = None
    if use_judge:
        cfg = load_llm_config()
        judge_client = LLMJudge(
            base_url=cfg.base_url,
            model=cfg.model,
            api_key=cfg.api_key,
        )

    index_cfg = load_index_config()
    runner = BenchRunner(
        summarizer=_make_summarizer(use_fake),
        embedder=_make_embedder(use_fake),
        judge=judge_client,
        options=BenchOptions(
            k=k,
            summary_concurrency=index_cfg.summary_concurrency,
            embed_batch_size=index_cfg.embed_batch_size,
        ),
        progress=lambda message: typer.echo(message, err=True),
    )

    # The runner works in a scratch directory that is removed afterwards.
    with tempfile.TemporaryDirectory(prefix="cairn-bench-") as work_str:
        work_dir = Path(work_str)
        summary = await runner.run(suite, work_dir=work_dir)

    typer.echo(format_markdown_report(summary))

    # Use the platform temp directory rather than a hard-coded "/tmp", which
    # does not exist on Windows and ignores TMPDIR-style overrides.
    default_report = Path(tempfile.gettempdir()) / "cairn-bench" / f"{suite_path.stem}.json"
    out_path = out or default_report
    write_json_report(summary, out_path)
    typer.echo(f"\njson report written → {out_path}", err=True)
630
+
631
+
632
+ def _format_repo_status(status_obj: RepoStatus) -> str:
633
+ lines = [
634
+ f"Cairn repo: {status_obj.root}",
635
+ f"config: {status_obj.config_path}",
636
+ (
637
+ "documents: "
638
+ f"{status_obj.indexed_count} indexed, "
639
+ f"{status_obj.stale_count} stale, "
640
+ f"{status_obj.missing_count} missing, "
641
+ f"{status_obj.error_count} errors"
642
+ ),
643
+ "",
644
+ "| state | doc | sections | source |",
645
+ "|---|---|---:|---|",
646
+ ]
647
+ for doc in status_obj.documents:
648
+ sections = "" if doc.section_count is None else str(doc.section_count)
649
+ lines.append(f"| {doc.state} | `{doc.id}` | {sections} | {doc.source} |")
650
+ return "\n".join(lines)
651
+
652
+
653
def _doctor_checks() -> list[dict[str, object]]:
    """Collect the health checks reported by the ``doctor`` command.

    Each check is a dict with ``name``, ``ok`` and ``message`` keys. If the
    repository root cannot be found, a single failing ``repo_config`` check is
    returned and nothing else is probed. Otherwise the index status, the
    primary document (when one is set) and the summarizer/embedder
    configuration are checked in that order.

    A failure while loading the model configuration is reported as a failing
    check instead of propagating, so the command can still print its report.
    """
    try:
        root = find_repo_root()
    except Exception as exc:
        # Nothing else can be checked without a repository root.
        return [
            {
                "name": "repo_config",
                "ok": False,
                "message": f"{exc}. Run `docsgraph init -y` first.",
            }
        ]

    checks: list[dict[str, object]] = [
        {
            "name": "repo_config",
            "ok": True,
            "message": f"found {root / '.cairn' / 'config.toml'}",
        }
    ]

    try:
        status_obj = repo_status(root)
        unhealthy = (
            status_obj.missing_count
            + status_obj.stale_count
            + status_obj.error_count
        )
        checks.append(
            {
                "name": "repo_index",
                # Healthy means at least one indexed document and no problems.
                "ok": unhealthy == 0 and status_obj.indexed_count > 0,
                "message": (
                    f"{status_obj.indexed_count} indexed, "
                    f"{status_obj.stale_count} stale, "
                    f"{status_obj.missing_count} missing, "
                    f"{status_obj.error_count} errors"
                ),
            }
        )
        if status_obj.primary_doc:
            primary_ok = any(
                doc.id == status_obj.primary_doc and doc.state == "indexed"
                for doc in status_obj.documents
            )
            checks.append(
                {
                    "name": "primary_doc",
                    "ok": primary_ok,
                    "message": status_obj.primary_doc,
                }
            )
    except Exception as exc:
        checks.append(
            {
                "name": "repo_index",
                "ok": False,
                "message": f"{exc}. Run `docsgraph sync --fake` to build locally.",
            }
        )

    # Model configuration: a loader error becomes a failed check, not a crash.
    try:
        llm = load_llm_config()
    except Exception as exc:
        checks.append({"name": "summarizer", "ok": False, "message": str(exc)})
    else:
        checks.append(
            {
                "name": "summarizer",
                "ok": bool(llm.model and llm.base_url),
                "message": f"{llm.model} at {llm.base_url}",
            }
        )

    try:
        embed = load_embed_config()
    except Exception as exc:
        checks.append({"name": "embedder", "ok": False, "message": str(exc)})
    else:
        checks.append(
            {
                "name": "embedder",
                "ok": bool(embed.model and embed.base_url and embed.dim > 0),
                "message": f"{embed.provider}:{embed.model} dim={embed.dim}",
            }
        )
    return checks
730
+
731
+
732
+ def _format_doctor(payload: dict[str, object]) -> str:
733
+ checks = payload["checks"]
734
+ assert isinstance(checks, list)
735
+ lines = [f"Cairn doctor: {'ok' if payload['ok'] else 'needs attention'}"]
736
+ for item in checks:
737
+ assert isinstance(item, dict)
738
+ marker = "ok" if item["ok"] else "!!"
739
+ lines.append(f"[{marker}] {item['name']}: {item['message']}")
740
+ if not payload["ok"]:
741
+ lines.append("")
742
+ lines.append(
743
+ "Next steps: run `docsgraph init -y`, then `docsgraph sync --fake` "
744
+ "(or use the compatible `cairn` alias)."
745
+ )
746
+ return "\n".join(lines)
747
+
748
+
749
+ def _format_mcp_config(client: McpClient, *, command: str, args: list[str]) -> str:
750
+ server = {"command": command, "args": args}
751
+ if client in {"claude", "cursor"}:
752
+ return json.dumps({"mcpServers": {"cairn": server}}, indent=2)
753
+ if client == "codex":
754
+ quoted_args = ", ".join(json.dumps(arg) for arg in args)
755
+ return "\n".join(
756
+ [
757
+ "[mcp_servers.cairn]",
758
+ f"command = {json.dumps(command)}",
759
+ f"args = [{quoted_args}]",
760
+ ]
761
+ )
762
+ yaml_args = "\n".join(f" - {json.dumps(arg)}" for arg in args)
763
+ return "\n".join(
764
+ [
765
+ "extensions:",
766
+ " cairn:",
767
+ " type: stdio",
768
+ f" command: {json.dumps(command)}",
769
+ " args:",
770
+ yaml_args,
771
+ ]
772
+ )
773
+
774
+
775
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()