docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,688 @@
1
+ Metadata-Version: 2.4
2
+ Name: docsgraph
3
+ Version: 0.1.0a2
4
+ Summary: Local-first documentation graph for AI agents. CodeGraph for docs, exposed through MCP.
5
+ Project-URL: Homepage, https://github.com/jokeuncle/cairn
6
+ Project-URL: Documentation, https://github.com/jokeuncle/cairn/tree/main/docs
7
+ Project-URL: Repository, https://github.com/jokeuncle/cairn
8
+ Project-URL: Issues, https://github.com/jokeuncle/cairn/issues
9
+ Project-URL: Changelog, https://github.com/jokeuncle/cairn/blob/main/CHANGELOG.md
10
+ Author: The Cairn Authors
11
+ License: Apache License
12
+ Version 2.0, January 2004
13
+ http://www.apache.org/licenses/
14
+
15
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
16
+
17
+ 1. Definitions.
18
+
19
+ "License" shall mean the terms and conditions for use, reproduction,
20
+ and distribution as defined by Sections 1 through 9 of this document.
21
+
22
+ "Licensor" shall mean the copyright owner or entity authorized by
23
+ the copyright owner that is granting the License.
24
+
25
+ "Legal Entity" shall mean the union of the acting entity and all
26
+ other entities that control, are controlled by, or are under common
27
+ control with that entity. For the purposes of this definition,
28
+ "control" means (i) the power, direct or indirect, to cause the
29
+ direction or management of such entity, whether by contract or
30
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
31
+ outstanding shares, or (iii) beneficial ownership of such entity.
32
+
33
+ "You" (or "Your") shall mean an individual or Legal Entity
34
+ exercising permissions granted by this License.
35
+
36
+ "Source" form shall mean the preferred form for making modifications,
37
+ including but not limited to software source code, documentation
38
+ source, and configuration files.
39
+
40
+ "Object" form shall mean any form resulting from mechanical
41
+ transformation or translation of a Source form, including but
42
+ not limited to compiled object code, generated documentation,
43
+ and conversions to other media types.
44
+
45
+ "Work" shall mean the work of authorship, whether in Source or
46
+ Object form, made available under the License, as indicated by a
47
+ copyright notice that is included in or attached to the work
48
+ (an example is provided in the Appendix below).
49
+
50
+ "Derivative Works" shall mean any work, whether in Source or Object
51
+ form, that is based on (or derived from) the Work and for which the
52
+ editorial revisions, annotations, elaborations, or other modifications
53
+ represent, as a whole, an original work of authorship. For the purposes
54
+ of this License, Derivative Works shall not include works that remain
55
+ separable from, or merely link (or bind by name) to the interfaces of,
56
+ the Work and Derivative Works thereof.
57
+
58
+ "Contribution" shall mean any work of authorship, including
59
+ the original version of the Work and any modifications or additions
60
+ to that Work or Derivative Works thereof, that is intentionally
61
+ submitted to Licensor for inclusion in the Work by the copyright owner
62
+ or by an individual or Legal Entity authorized to submit on behalf of
63
+ the copyright owner. For the purposes of this definition, "submitted"
64
+ means any form of electronic, verbal, or written communication sent
65
+ to the Licensor or its representatives, including but not limited to
66
+ communication on electronic mailing lists, source code control systems,
67
+ and issue tracking systems that are managed by, or on behalf of, the
68
+ Licensor for the purpose of discussing and improving the Work, but
69
+ excluding communication that is conspicuously marked or otherwise
70
+ designated in writing by the copyright owner as "Not a Contribution."
71
+
72
+ "Contributor" shall mean Licensor and any individual or Legal Entity
73
+ on behalf of whom a Contribution has been received by Licensor and
74
+ subsequently incorporated within the Work.
75
+
76
+ 2. Grant of Copyright License. Subject to the terms and conditions of
77
+ this License, each Contributor hereby grants to You a perpetual,
78
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
79
+ copyright license to reproduce, prepare Derivative Works of,
80
+ publicly display, publicly perform, sublicense, and distribute the
81
+ Work and such Derivative Works in Source or Object form.
82
+
83
+ 3. Grant of Patent License. Subject to the terms and conditions of
84
+ this License, each Contributor hereby grants to You a perpetual,
85
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
86
+ (except as stated in this section) patent license to make, have made,
87
+ use, offer to sell, sell, import, and otherwise transfer the Work,
88
+ where such license applies only to those patent claims licensable
89
+ by such Contributor that are necessarily infringed by their
90
+ Contribution(s) alone or by combination of their Contribution(s)
91
+ with the Work to which such Contribution(s) was submitted. If You
92
+ institute patent litigation against any entity (including a
93
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
94
+ or a Contribution incorporated within the Work constitutes direct
95
+ or contributory patent infringement, then any patent licenses
96
+ granted to You under this License for that Work shall terminate
97
+ as of the date such litigation is filed.
98
+
99
+ 4. Redistribution. You may reproduce and distribute copies of the
100
+ Work or Derivative Works thereof in any medium, with or without
101
+ modifications, and in Source or Object form, provided that You
102
+ meet the following conditions:
103
+
104
+ (a) You must give any other recipients of the Work or
105
+ Derivative Works a copy of this License; and
106
+
107
+ (b) You must cause any modified files to carry prominent notices
108
+ stating that You changed the files; and
109
+
110
+ (c) You must retain, in the Source form of any Derivative Works
111
+ that You distribute, all copyright, patent, trademark, and
112
+ attribution notices from the Source form of the Work,
113
+ excluding those notices that do not pertain to any part of
114
+ the Derivative Works; and
115
+
116
+ (d) If the Work includes a "NOTICE" text file as part of its
117
+ distribution, then any Derivative Works that You distribute must
118
+ include a readable copy of the attribution notices contained
119
+ within such NOTICE file, excluding those notices that do not
120
+ pertain to any part of the Derivative Works, in at least one
121
+ of the following places: within a NOTICE text file distributed
122
+ as part of the Derivative Works; within the Source form or
123
+ documentation, if provided along with the Derivative Works; or,
124
+ within a display generated by the Derivative Works, if and
125
+ wherever such third-party notices normally appear. The contents
126
+ of the NOTICE file are for informational purposes only and
127
+ do not modify the License. You may add Your own attribution
128
+ notices within Derivative Works that You distribute, alongside
129
+ or as an addendum to the NOTICE text from the Work, provided
130
+ that such additional attribution notices cannot be construed
131
+ as modifying the License.
132
+
133
+ You may add Your own copyright statement to Your modifications and
134
+ may provide additional or different license terms and conditions
135
+ for use, reproduction, or distribution of Your modifications, or
136
+ for any such Derivative Works as a whole, provided Your use,
137
+ reproduction, and distribution of the Work otherwise complies with
138
+ the conditions stated in this License.
139
+
140
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
141
+ any Contribution intentionally submitted for inclusion in the Work
142
+ by You to the Licensor shall be under the terms and conditions of
143
+ this License, without any additional terms or conditions.
144
+ Notwithstanding the above, nothing herein shall supersede or modify
145
+ the terms of any separate license agreement you may have executed
146
+ with Licensor regarding such Contributions.
147
+
148
+ 6. Trademarks. This License does not grant permission to use the trade
149
+ names, trademarks, service marks, or product names of the Licensor,
150
+ except as required for describing the origin of the Work and
151
+ reproducing the content of the NOTICE file.
152
+
153
+ 7. Disclaimer of Warranty. Unless required by applicable law or
154
+ agreed to in writing, Licensor provides the Work (and each
155
+ Contributor provides its Contributions) on an "AS IS" BASIS,
156
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
157
+ implied, including, without limitation, any warranties or conditions
158
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
159
+ PARTICULAR PURPOSE. You are solely responsible for determining the
160
+ appropriateness of using or redistributing the Work and assume any
161
+ risks associated with Your exercise of permissions under this License.
162
+
163
+ 8. Limitation of Liability. In no event and under no legal theory,
164
+ whether in tort (including negligence), contract, or otherwise,
165
+ unless required by applicable law (such as deliberate and grossly
166
+ negligent acts) or agreed to in writing, shall any Contributor be
167
+ liable to You for damages, including any direct, indirect, special,
168
+ incidental, or consequential damages of any character arising as a
169
+ result of this License or out of the use or inability to use the
170
+ Work (including but not limited to damages for loss of goodwill,
171
+ work stoppage, computer failure or malfunction, or any and all
172
+ other commercial damages or losses), even if such Contributor
173
+ has been advised of the possibility of such damages.
174
+
175
+ 9. Accepting Warranty or Support. While redistributing the Work or
176
+ Derivative Works thereof, You may choose to offer, and charge a
177
+ fee for, acceptance of support, warranty, indemnity, or other
178
+ liability obligations and/or rights consistent with this License.
179
+ However, in accepting such obligations, You may act only on Your
180
+ own behalf and on Your sole responsibility, not on behalf of any
181
+ other Contributor, and only if You agree to indemnify, defend, and
182
+ hold each Contributor harmless for any liability incurred by, or
183
+ claims asserted against, such Contributor by reason of your
184
+ accepting any such warranty or support.
185
+
186
+ END OF TERMS AND CONDITIONS
187
+
188
+ APPENDIX: How to apply the Apache License to your work.
189
+
190
+ To apply the Apache License to your work, attach the following
191
+ boilerplate notice, with the fields enclosed by brackets "[]"
192
+ replaced with your own identifying information. (Don't include
193
+ the brackets!) The text should be enclosed in the appropriate
194
+ comment syntax for the file format. We also recommend that a
195
+ file or class name and description of purpose be included on the
196
+ same "printed page" as the copyright notice for easier
197
+ identification within third-party archives.
198
+
199
+ Copyright 2026 The Cairn Authors
200
+
201
+ Licensed under the Apache License, Version 2.0 (the "License");
202
+ you may not use this file except in compliance with the License.
203
+ You may obtain a copy of the License at
204
+
205
+ http://www.apache.org/licenses/LICENSE-2.0
206
+
207
+ Unless required by applicable law or agreed to in writing, software
208
+ distributed under the License is distributed on an "AS IS" BASIS,
209
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
210
+ See the License for the specific language governing permissions and
211
+ limitations under the License.
212
+ License-File: LICENSE
213
+ Keywords: agents,ai-agents,documentation,documents,hierarchical,lancedb,markdown,mcp,model-context-protocol,pdf,rag,repository,retrieval,vector-search
214
+ Classifier: Development Status :: 3 - Alpha
215
+ Classifier: Intended Audience :: Developers
216
+ Classifier: Intended Audience :: Information Technology
217
+ Classifier: Intended Audience :: Science/Research
218
+ Classifier: License :: OSI Approved :: Apache Software License
219
+ Classifier: Operating System :: OS Independent
220
+ Classifier: Programming Language :: Python :: 3
221
+ Classifier: Programming Language :: Python :: 3.11
222
+ Classifier: Programming Language :: Python :: 3.12
223
+ Classifier: Programming Language :: Python :: 3.13
224
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
225
+ Classifier: Topic :: Software Development :: Libraries
226
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
227
+ Classifier: Topic :: Text Processing :: Indexing
228
+ Classifier: Typing :: Typed
229
+ Requires-Python: >=3.11
230
+ Requires-Dist: httpx[socks]>=0.27
231
+ Requires-Dist: lancedb>=0.13
232
+ Requires-Dist: markdown-it-py>=3.0
233
+ Requires-Dist: mcp>=1.2
234
+ Requires-Dist: mdit-py-plugins>=0.4
235
+ Requires-Dist: numpy>=1.26
236
+ Requires-Dist: pyarrow>=15.0
237
+ Requires-Dist: pydantic>=2.7
238
+ Requires-Dist: pymupdf>=1.24
239
+ Requires-Dist: python-slugify>=8.0
240
+ Requires-Dist: structlog>=24.1
241
+ Requires-Dist: typer>=0.12
242
+ Provides-Extra: dev
243
+ Requires-Dist: hypothesis>=6.100; extra == 'dev'
244
+ Requires-Dist: mypy>=1.10; extra == 'dev'
245
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
246
+ Requires-Dist: pytest-cov>=5; extra == 'dev'
247
+ Requires-Dist: pytest>=8; extra == 'dev'
248
+ Requires-Dist: respx>=0.21; extra == 'dev'
249
+ Requires-Dist: ruff>=0.5; extra == 'dev'
250
+ Requires-Dist: types-python-slugify; extra == 'dev'
251
+ Provides-Extra: markitdown
252
+ Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
253
+ Description-Content-Type: text/markdown
254
+
255
+ # Cairn
256
+
257
+ > **The DocsGraph for AI agents. CodeGraph helps agents navigate code; Cairn
258
+ > helps them navigate docs. Install it as `docsgraph`; keep the `cairn` name
259
+ > for the product and compatibility alias.**
260
+
261
+ [![CI](https://github.com/jokeuncle/cairn/actions/workflows/ci.yml/badge.svg)](https://github.com/jokeuncle/cairn/actions/workflows/ci.yml)
262
+ [![License](https://img.shields.io/badge/license-Apache_2.0-blue.svg)](LICENSE)
263
+ [![Version](https://img.shields.io/badge/version-0.1.0a2-blue.svg)](CHANGELOG.md)
264
+ [![Python](https://img.shields.io/badge/python-3.11%2B-blue.svg)](https://www.python.org/)
265
+ [![MCP](https://img.shields.io/badge/MCP-native-7c3aed.svg)](https://modelcontextprotocol.io/)
266
+
267
+ ![Cairn demo: repository documentation graph and MCP tools](docs/assets/cairn-demo.svg)
268
+
269
+ Cairn is a **local-first, MCP-native DocsGraph** for software
270
+ repositories and large structured documents. It turns README files, specs,
271
+ ADRs, docs folders, PDFs, and optional MarkItDown-converted Office/data/web
272
+ files into a navigable map: document catalog, hierarchical sections,
273
+ multi-granularity summaries, entity mentions, cross-reference edges, and a
274
+ semantic vector overlay.
275
+
276
+ Instead of dumping whole docs into context or relying on anonymous chunks, an
277
+ agent can ask Cairn to `list_documents`, `search_documents`, inspect an
278
+ `outline`, and drill into exact sections with stable `cairn://` anchors. The
279
+ same engine also works for standalone handbooks, papers, and PDFs.
280
+
281
+ The result: better retrieval accuracy, lower token spend, and a practical MCP
282
+ tool layer between your project documentation and every AI coding agent you
283
+ use. Local-first. Vendor-neutral. Designed for open-source repos.
284
+
285
+ > 🚀 **Alpha — `0.1.0a2`.** Markdown + PDF ingest, all eight MCP tools,
286
+ > the full structure-aware index (tree + summaries + entities + xrefs +
287
+ > vectors), repo-level `init/sync/status`, repo-scoped MCP with
288
+ > `list_documents`, `search_documents`, `repo_context`, `repo_graph`, and
289
+ > `repo_impact`, failure-isolated sync, static graph inspector, Doubao
290
+ > multimodal embeddings, and a benchmark harness with headline numbers. See
291
+ > [`CHANGELOG.md`](CHANGELOG.md) for what's in this
292
+ > release and [`ROADMAP.md`](ROADMAP.md) for what's next.
293
+
294
+ ---
295
+
296
+ ## Why Cairn?
297
+
298
+ | Today | With Cairn |
299
+ |---|---|
300
+ | AI coding agents guess from README snippets or grep. | Agent gets a repo-level documentation map with stable section anchors. |
301
+ | Dump the whole document into context. Burns tokens, dilutes attention. | Agent fetches only what it needs, at the granularity it needs. |
302
+ | Naive RAG splits structure into context-free chunks. | The document's own structure is the index. |
303
+ | Cross-references and entities are lost in chunking. | They are first-class objects. |
304
+ | Locked into one vendor's embeddings / vector DB. | Pluggable everything. Local-first defaults. |
305
+ | Different tool stacks for Claude / Cursor / Cline / Goose. | One MCP server. Any compliant agent works. |
306
+
307
+ For the in-depth motivation, see [`PRODUCT.md`](PRODUCT.md).
308
+ For the technical design, see [`ARCHITECTURE.md`](ARCHITECTURE.md).
309
+ For the public documentation quality contract Cairn optimizes for, see
310
+ [`docs/golden-docs-standard.md`](docs/golden-docs-standard.md).
311
+
312
+ ---
313
+
314
+ ## How It Works (90 seconds)
315
+
316
+ 1. **Discover.** `docsgraph init -y` writes `.cairn/config.toml`; `docsgraph sync`
317
+ discovers README, Markdown docs, ADRs, specs, and PDFs from conservative
318
+ repo globs.
319
+ 2. **Index.** Each document becomes a normal Cairn index: structural tree (T),
320
+ multi-level summaries (S), entity index (E), cross-reference graph (X), and
321
+ vector overlay (V). A bad source file is isolated instead of breaking the
322
+ whole repo sync.
323
+ 3. **Serve.** `docsgraph serve` exposes repo-scoped MCP tools:
324
+ `list_documents`, `search_documents`, plus `outline`, `get_section`,
325
+ `expand`, `search_semantic`, `search_keyword`, `find_mentions`,
326
+ `get_related`, and `read_range` routed by optional `doc`.
327
+ 4. **Navigate.** Your agent searches across the repo, picks a document, drills
328
+ into promising sections, and only fetches full text when justified. Every
329
+ result carries stable anchors for verification.
330
+
331
+ A visual explainer comparing Cairn's approach to RAPTOR, BookRAG, and A-RAG
332
+ lives at [`docs/canvas.html`](docs/canvas.html). Open it in any browser.
333
+
334
+ ---
335
+
336
+ ## Quickstart
337
+
338
+ The fastest way to see Cairn work is to index this repo's own documentation.
339
+ **Zero API keys, zero model downloads** — the `--fake` flag uses deterministic
340
+ in-process plugins so the whole thing runs offline.
341
+
342
+ The PyPI distribution is `docsgraph`; the primary CLI command is `docsgraph`.
343
+ The older `cairn` command is installed as a compatibility alias. Install from PyPI:
344
+
345
+ ```bash
346
+ pip install docsgraph
347
+ ```
348
+
349
+ Or run it without installing:
350
+
351
+ ```bash
352
+ uvx docsgraph --help
353
+ ```
354
+
355
+ ### Repository Workflow
356
+
357
+ Inside any repository:
358
+
359
+ ```bash
360
+ docsgraph init -y
361
+ docsgraph sync --fake
362
+ docsgraph status
363
+ docsgraph query repo "where are docs indexed?" --fake
364
+ docsgraph doctor
365
+ docsgraph mcp config --client claude --fake
366
+ docsgraph serve --fake
367
+ ```
368
+
369
+ `docsgraph doctor` checks repo config, index freshness, primary-doc routing,
370
+ and model settings. `docsgraph mcp config` prints copy-pasteable stdio snippets for
371
+ Claude, Cursor, Codex, and Goose:
372
+
373
+ ```bash
374
+ docsgraph mcp config --client claude
375
+ docsgraph mcp config --client cursor
376
+ docsgraph mcp config --client codex
377
+ docsgraph mcp config --client goose
378
+ ```
379
+
380
+ For local development from source:
381
+
382
+ ```bash
383
+ git clone https://github.com/jokeuncle/cairn.git
384
+ cd cairn
385
+
386
+ python3.11 -m venv .venv
387
+ .venv/bin/pip install -e ".[dev]"
388
+
389
+ # 1. Create .cairn/config.toml with conservative documentation globs.
390
+ .venv/bin/docsgraph init -y
391
+
392
+ # 2. Index README, Markdown docs, and PDFs.
393
+ .venv/bin/docsgraph sync --fake
394
+
395
+ # 3. Inspect freshness and indexed document ids.
396
+ .venv/bin/docsgraph status
397
+
398
+ # 4. Search across all indexed repository docs.
399
+ .venv/bin/docsgraph query repo "where are docs indexed?" --fake
400
+
401
+ # 5. Start the repo-scoped MCP stdio server for Claude Code / Cursor / Cline / Goose.
402
+ .venv/bin/docsgraph serve --fake
403
+ ```
404
+
405
+ Repo mode writes a shareable config plus ignored runtime data:
406
+
407
+ ```text
408
+ .cairn/
409
+ config.toml # commit this if you want a stable repo docs policy
410
+ manifest.json # generated
411
+ documents/ # generated per-document Cairn indexes
412
+ readme/
413
+ architecture/
414
+ docs-specs-mcp-tools/
415
+ ```
416
+
417
+ Repo-scoped MCP adds:
418
+
419
+ | Tool | Use it for |
420
+ |---|---|
421
+ | `list_documents` | See every indexed doc, its source path, freshness, and section count. |
422
+ | `search_documents` | Search across all indexed docs and get globally ranked, explainable section hits with `doc` ids, skipped docs, and stale-doc warnings. |
423
+ | `repo_context` | Get a ready-to-read context pack: ranked hits, selected section text, hit explanations, and a relationship map. |
424
+ | `repo_graph` | Inspect the repo documentation graph: document, section, entity, contains, xref, and mention edges. Cross-document links are exposed through shared entity nodes. |
425
+ | `repo_impact` | Estimate documentation surfaces affected by a document or section change. |
426
+ | normal Cairn tools + `doc` | Drill into a chosen document with `outline`, `get_section`, `search_semantic`, `get_related`, etc. |
427
+
428
+ Repo behavior is intentionally configurable in `.cairn/config.toml`:
429
+
430
+ | Setting | Default | Impact |
431
+ |---|---|---|
432
+ | `include` | README, top-level Markdown/PDF, `docs/**` Markdown/PDF, one-level nested README | Expands or narrows what Cairn treats as repository documentation. Broader globs improve coverage but can index noisy generated files. |
433
+ | `exclude` | `.git`, `.cairn`, `.codegraph`, caches, virtualenvs, build output, `node_modules` | Keeps generated or tool-owned docs out of search. Simple `name/**` directory excludes match at any depth, so `frontend/node_modules/...` and `apps/web/dist/...` are skipped. Add project-specific generated doc folders here. |
434
+ | `enable_markitdown` | `false` | Enables non-Markdown/PDF conversion when the `markitdown` extra is installed. Useful for DOCX/PPTX/XLSX/HTML-heavy repos, but slower and less deterministic than native Markdown/PDF parsing. |
435
+ | `primary_doc` | `readme` | Chooses the default document for normal tools when `doc` is omitted in repo mode. |
436
+ | `search_sections_per_doc` | `1` | Default diversity for `search_documents`. `1` helps agents find the right doc first; raise it when a repo has a few long docs and you want deeper hits from each doc by default. |
437
+ | `preferred_locales` | `[]` | Optional locale preference for repo search, for example `["en"]` or `["zh"]`. When omitted, English queries prefer English or locale-neutral docs without hiding other languages. |
438
+
439
+ MarkItDown integration is local-file only and optional. Cairn uses it as a
440
+ conversion layer, then feeds the generated Markdown into the same canonical
441
+ Markdown parser. This expands coverage to formats such as DOCX, PPTX, XLSX,
442
+ HTML, CSV, JSON, XML, and EPUB without making the base install heavy:
443
+
444
+ ```bash
445
+ .venv/bin/pip install -e ".[markitdown]"
446
+ .venv/bin/docsgraph init -y --force --markitdown
447
+ .venv/bin/docsgraph sync --fake
448
+ ```
449
+
450
+ Generate a standalone graph inspector for the primary repo doc:
451
+
452
+ ```bash
453
+ docsgraph inspect --out /tmp/cairn-repo-inspector.html
454
+ ```
455
+
456
+ ### Single Document Workflow
457
+
458
+ Cairn still works as a focused index for one large document:
459
+
460
+ ```bash
461
+ # Index Cairn's own architecture document.
462
+ .venv/bin/docsgraph index ARCHITECTURE.md --out /tmp/cairn-arch --fake
463
+
464
+ # Get the map — gists only, never full text.
465
+ .venv/bin/docsgraph outline /tmp/cairn-arch --depth 2
466
+
467
+ # Keyword search: every section that mentions "LanceDB".
468
+ .venv/bin/docsgraph query keyword /tmp/cairn-arch LanceDB
469
+
470
+ # Multi-term keyword search with mode=all.
471
+ .venv/bin/docsgraph query keyword /tmp/cairn-arch progressive disclosure --mode all
472
+
473
+ # Generate a standalone graph inspector for the built index.
474
+ .venv/bin/docsgraph inspect /tmp/cairn-arch --out /tmp/cairn-arch/inspector.html
475
+
476
+ # Start a single-document MCP stdio server.
477
+ .venv/bin/docsgraph serve /tmp/cairn-arch --fake
478
+ ```
479
+
480
+ A walkthrough with full output and an MCP-client config snippet is in
481
+ [`examples/hero-demo.md`](examples/hero-demo.md).
482
+
483
+ ### Benchmarks
484
+
485
+ Cairn ships with `cairn-bench` (run as `docsgraph bench`), a small framework that compares Cairn against
486
+ a naive 512-word-chunk vector-RAG baseline (both backed by LanceDB and the
487
+ same embedder, so the comparison is apples-to-apples).
488
+
489
+ Running the starter suite (10 hand-curated questions over Cairn's own
490
+ `ARCHITECTURE.md`) with deterministic in-process plugins:
491
+
492
+ ```bash
493
+ docsgraph bench benchmarks/architecture.toml --fake
494
+ ```
495
+
496
+ | metric | naive vector RAG | Cairn |
497
+ |---|---:|---:|
498
+ | mean recall@8 | 25% | 25% |
499
+ | mean tokens returned | 3,670 | **1,388 (37.8% of naive)** |
500
+
501
+ Caveat — these numbers come from the deterministic `FakeEmbedder` (a
502
+ bag-of-words hash with no semantic understanding). Recall ties because
503
+ neither system has semantics; **the 2.6× token efficiency win is independent
504
+ of the embedder**: it comes from progressive disclosure and section-aware
505
+ retrieval, not from vector quality. Cairn now returns a short `evidence`
506
+ snippet with every semantic hit by default, which raises the token count but
507
+ makes ranking errors easier to inspect. Reproduce these numbers in under a
508
+ second on any machine — and re-run with Ollama (`nomic-embed-text`) or
509
+ Doubao for the real-semantics version. See
510
+ [`benchmarks/README.md`](benchmarks/README.md) for caveats and how to author
511
+ your own suites.
512
+
513
+ Repo-level smoke tests are also public and reproducible:
514
+
515
+ ```bash
516
+ python scripts/eval_repos.py --repo all --refresh --strict
517
+ python scripts/smoke_many_repos.py --limit 37 --strict
518
+ ```
519
+
520
+ The labeled eval set covers `astral-sh/uv`, `pydantic/pydantic-ai`,
521
+ `modelcontextprotocol/python-sdk`, and `fastapi/full-stack-fastapi-template`.
522
+ The broad smoke matrix currently spans 37 public repositories across Python,
523
+ JavaScript/TypeScript, Rust, and Go ecosystems. It is not an accuracy
524
+ leaderboard; it verifies clone/discovery/sync/search/drilldown robustness and
525
+ latency across different documentation shapes.
526
+
527
+ Latest fake-plugin runs on this machine:
528
+
529
+ | suite | result |
530
+ |---|---|
531
+ | `pydantic-ai` labeled eval | 178/178 docs indexed, 8/8 top1, 8/8 top5, 8/8 drilldown |
532
+ | `uv` labeled eval | 89/89 docs indexed, 15/16 top1, 16/16 top3/top5, 16/16 drilldown |
533
+ | `mcp-python-sdk` labeled eval | 17/17 docs indexed, 4/4 top1, 4/4 drilldown |
534
+ | `fastapi-template` labeled eval | 7/7 docs indexed, 4/4 top1, 4/4 drilldown |
535
+ | 37-repo smoke matrix | 2931 docs indexed, 0 sync failures, 185/185 searches with hits, 185/185 drilldowns |
536
+
537
+ `search_documents` uses a general hybrid ranker: dense vector similarity,
538
+ BM25-style sparse evidence, structure-aware field support, weighted query-term
539
+ coverage, path/title identity prior, and local graph-neighborhood propagation.
540
+ Repo search builds a process-local cache and scores dense vectors in batches so
541
+ large documentation sets stay warm-query friendly. On large section sets it
542
+ uses a two-stage path: dense seeds, cheap lexical/path seeds, and graph
543
+ neighbors form a wide shortlist, then the full BM25/graph/explanation ranker
544
+ scores only that candidate set. Cold cache construction loads per-document
545
+ indexes concurrently while preserving per-document failure isolation.
546
+ Search responses expose `ranker.mode`, `total_sections`, and `scored_sections`
547
+ so the performance path is visible to clients and benchmarks.
548
+ Each hit includes a score breakdown and short explanation so agents and humans
549
+ can see whether dense, lexical, sparse, or graph evidence dominated the result.
550
+ Changelog, release-note, and migration-history documents are intent-gated: they
551
+ stay first-class results for release/version/change queries, but broad topic
552
+ queries prefer guides, API docs, and README-style docs when comparable evidence
553
+ exists.
554
+ Search candidates are freshness-aware: repo status records a file-level
555
+ fingerprint, and query responses expose `stale_documents` when source files have
556
+ changed since the last sync.
557
+ `repo_context` composes search, section content, and local relationships into
558
+ one agent-ready payload; `repo_graph` and `repo_impact` expose the documentation
559
+ graph without reimplementing source-code analysis. Pair Cairn with CodeGraph
560
+ when you need AST symbols, callers/callees, or code impact.
561
+ The ranker does not special-case repository names, document ids, or benchmark
562
+ answers.
563
+
564
+ ### Real LLM + real embeddings
565
+
566
+ The `--fake` plugins are great for offline reproducibility, but they have no
567
+ semantic understanding. For production indexing, point Cairn at any
568
+ OpenAI-compatible endpoint. The defaults target a **local Ollama** so you
569
+ keep the local-first promise without paying for API tokens:
570
+
571
+ ```bash
572
+ ollama serve
573
+ ollama pull llama3.2:3b
574
+ ollama pull nomic-embed-text
575
+
576
+ .venv/bin/docsgraph index ARCHITECTURE.md --out /tmp/cairn-arch # no --fake
577
+ ```
578
+
579
+ OpenAI, vLLM, Together, Anyscale, and similar providers all work the same way; override the
580
+ `CAIRN_LLM_*` and `CAIRN_EMBED_*` environment variables.
581
+
582
+ For Doubao's vision embedding model, use the dedicated provider because the
583
+ model is served through Volcengine's `/embeddings/multimodal` endpoint:
584
+
585
+ ```bash
586
+ export CAIRN_LLM_BASE_URL=https://ark.cn-beijing.volces.com/api/v3
587
+ export CAIRN_LLM_MODEL=doubao-seed-2-0-code-preview-260215
588
+ export CAIRN_LLM_API_KEY=...
589
+
590
+ export CAIRN_EMBED_PROVIDER=doubao-vision
591
+ export CAIRN_EMBED_MODEL=doubao-embedding-vision-251215
592
+ export CAIRN_EMBED_API_KEY=...
593
+
594
+ docsgraph index ARCHITECTURE.md --out /tmp/cairn-arch
595
+ ```
596
+
597
+ To run the public-repo eval with the real provider configured by your
598
+ environment instead of the deterministic fake plugins:
599
+
600
+ ```bash
601
+ python scripts/eval_repos.py --repo pydantic-ai \
602
+ --provider env \
603
+ --workdir /tmp/cairn-repo-eval-real \
604
+ --refresh
605
+ ```
606
+
607
+ The eval report includes provider mode, model names, and vector dimension, but
608
+ never prints API keys. Cairn also invalidates old indexes when the summarizer,
609
+ embedder, vector dimension, entity extractor, or xref extractor changes, so
610
+ switching from `--fake` to Doubao rebuilds the affected documents instead of
611
+ quietly reusing incompatible vectors.
612
+
613
+ Useful operational knobs when running against hosted APIs:
614
+
615
+ | variable | default | purpose |
616
+ |---|---:|---|
617
+ | `CAIRN_LLM_TIMEOUT` | `60` | per-request summary timeout in seconds |
618
+ | `CAIRN_LLM_MAX_RETRIES` | `2` | retries for 429/5xx and transport errors |
619
+ | `CAIRN_EMBED_TIMEOUT` | `60` | per-request embedding timeout in seconds |
620
+ | `CAIRN_EMBED_MAX_RETRIES` | `2` | retries for embedding 429/5xx and transport errors |
621
+ | `CAIRN_SUMMARY_CONCURRENCY` | `4` | concurrent summary calls during indexing and benchmarks |
622
+ | `CAIRN_EMBED_BATCH_SIZE` | `32` | sections/chunks per embedding batch |
623
+
624
+ ---
625
+
626
+ ## Inspiration and Lineage
627
+
628
+ Cairn synthesizes three strands of recent research and ships them as a real,
629
+ agent-ready tool:
630
+
631
+ - **[BookRAG](https://arxiv.org/abs/2512.03413)** (Dec 2025): structure-aware
632
+ index combining a hierarchical tree with an entity graph, queried via an
633
+ Information-Foraging-Theory-inspired agent. Cairn implements this vision in
634
+ production-grade form.
635
+ - **[A-RAG](https://arxiv.org/abs/2602.03442)** (Feb 2026): clean agent loop
636
+ with hierarchical retrieval tools (keyword/semantic/chunk). Cairn borrows the
637
+ agent-tool philosophy and replaces A-RAG's chunk-based index with a
638
+ structure-first one.
639
+ - **[RAPTOR](https://arxiv.org/abs/2401.18059)** (ICLR 2024): the seminal
640
+ recursive-summarization tree. Cairn's summary layer takes inspiration from it
641
+ while anchoring summaries to the document's own structure instead of
642
+ clustered chunks.
643
+
644
+ We are deeply grateful to these authors; see the [ADRs](docs/decisions/) for the specific design
645
+ choices we adopted, modified, or declined.
646
+
647
+ ---
648
+
649
+ ## Status & Roadmap
650
+
651
+ | Phase | Status | What |
652
+ |---|---|---|
653
+ | 0 — Foundation | ☑ | Authoritative docs in place (PRODUCT, ARCHITECTURE, CLAUDE, ROADMAP, ADR-0001) |
654
+ | 1 — v0.1 walking skeleton | ☑ | Markdown ingest, Tree + Summaries + Vectors indexes, 5 MCP tools, stdio server, CLI, hero demo |
655
+ | 2 — v0.2 structure-aware retrieval | ☑ | Entities, cross-references, PDF ingest, digest summaries, incremental rebuild, static inspector, `cairn-bench` |
656
+ | 3 — v0.3 repo docs graph | ◐ | Repo `init/sync/status`, repo-scoped MCP, `list_documents`, `search_documents`, `repo_context`, `repo_graph`, `repo_impact`, shareable `.cairn/config.toml`; hosted inspector and telemetry still next |
657
+ | 4 — v0.4 polish for production | ☐ | DOCX/RTF/EPUB, VSCode extension, security review |
658
+ | v1.0 GA | ☐ | All `PRODUCT.md` §7 success criteria met |
659
+
660
+ Full plan: [`ROADMAP.md`](ROADMAP.md). Current test suite: **436 passing**,
661
+ mypy strict clean, ruff clean.
662
+
663
+ Maintainer release gate: [`docs/release-checklist.md`](docs/release-checklist.md).
664
+
665
+ ---
666
+
667
+ ## Contributing
668
+
669
+ Cairn is opinionated. Before opening a PR, please read:
670
+
671
+ 1. [`PRODUCT.md`](PRODUCT.md) — especially the non-goals.
672
+ 2. [`ARCHITECTURE.md`](ARCHITECTURE.md) — the end-state design we're building toward.
673
+ 3. [`CONTRIBUTING.md`](CONTRIBUTING.md) — workflow and PR expectations.
674
+ 4. [`docs/decisions/`](docs/decisions/) — existing ADRs.
675
+
676
+ If you're an AI agent helping a contributor, you'll find your session anchor in
677
+ [`CLAUDE.md`](CLAUDE.md).
678
+
679
+ ---
680
+
681
+ ## License
682
+
683
+ Apache 2.0. See [`LICENSE`](LICENSE).
684
+
685
+ ---
686
+
687
+ *A cairn is a small stack of stones marking a trail through difficult terrain.
688
+ This project is one for AI agents lost in large documents.*