docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/mcp/server.py ADDED
@@ -0,0 +1,363 @@
1
+ """MCP stdio server — wraps the Cairn retrieval tools (document- and repo-scoped).
2
+
3
+ The dispatch function is exposed separately so unit tests can exercise it
4
+ without spawning a stdio transport.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from mcp.server import Server
13
+ from mcp.server.stdio import stdio_server
14
+ from mcp.types import Tool
15
+
16
+ from cairn.core.errors import CairnError, ToolError
17
+ from cairn.embed.base import Embedder
18
+ from cairn.mcp.schemas import CAIRN_TOOLS, REPO_TOOLS
19
+ from cairn.repo import (
20
+ load_repo_document_index,
21
+ repo_context,
22
+ repo_graph,
23
+ repo_impact,
24
+ repo_status,
25
+ search_repo_documents,
26
+ )
27
+ from cairn.tools.base import DocumentIndex
28
+ from cairn.tools.find_mentions import find_mentions as find_mentions_tool
29
+ from cairn.tools.get_related import get_related as get_related_tool
30
+ from cairn.tools.get_section import expand as expand_tool
31
+ from cairn.tools.get_section import get_section as get_section_tool
32
+ from cairn.tools.outline import outline as outline_tool
33
+ from cairn.tools.read_range import read_range as read_range_tool
34
+ from cairn.tools.search_keyword import search_keyword as search_keyword_tool
35
+ from cairn.tools.search_semantic import search_semantic as search_semantic_tool
36
+
37
+ SERVER_NAME = "cairn"
38
+
39
+
40
+ def _new_trace(
41
+ tool: str,
42
+ arguments: dict[str, Any],
43
+ *,
44
+ mode: str,
45
+ repo_root: Path | None = None,
46
+ doc: str | None = None,
47
+ ) -> dict[str, Any]:
48
+ trace: dict[str, Any] = {
49
+ "server": SERVER_NAME,
50
+ "tool": tool,
51
+ "mode": mode,
52
+ "status": "ok",
53
+ "arguments": dict(arguments),
54
+ "steps": [],
55
+ }
56
+ if repo_root is not None:
57
+ trace["repo_root"] = str(repo_root)
58
+ if doc is not None:
59
+ trace["doc"] = doc
60
+ return trace
61
+
62
+
63
+ def _trace_step(trace: dict[str, Any], name: str, **details: Any) -> None:
64
+ step: dict[str, Any] = {"name": name, "status": details.pop("status", "ok")}
65
+ step.update(details)
66
+ trace["steps"].append(step)
67
+
68
+
69
def _success_envelope(
    *,
    data: dict[str, Any],
    tokens_returned: int,
    trace: dict[str, Any],
) -> dict[str, Any]:
    """Build the success envelope and close out *trace*.

    Marks the trace status as ``"ok"``, records a final ``return_result``
    step carrying ``tokens_returned``, and returns the envelope with the
    (mutated) trace attached.
    """
    trace["status"] = "ok"
    _trace_step(trace, "return_result", tokens_returned=tokens_returned)
    return dict(
        ok=True,
        tokens_returned=tokens_returned,
        data=data,
        trace=trace,
    )
83
+
84
+
85
def _error_envelope(
    *,
    error: dict[str, Any],
    trace: dict[str, Any],
) -> dict[str, Any]:
    """Build the failure envelope and close out *trace*.

    Marks the trace status as ``"error"`` and records a final
    ``return_error`` step that echoes the error's ``code`` and ``message``
    (``None`` for either key that is absent from *error*).
    """
    trace["status"] = "error"
    step_details: dict[str, Any] = {
        "status": "error",
        "code": error.get("code"),
        "message": error.get("message"),
    }
    _trace_step(trace, "return_error", **step_details)
    return dict(ok=False, error=error, trace=trace)
99
+
100
+
101
async def dispatch_tool(
    name: str,
    arguments: dict[str, Any] | None,
    index: DocumentIndex,
    embedder: Embedder,
    *,
    trace: dict[str, Any] | None = None,
    mode: str = "document",
) -> dict[str, Any]:
    """Run a named document tool and return the MCP envelope dict.

    On success the envelope is built from the tool response's
    ``tokens_returned`` and ``data``. ``CairnError`` subclasses (including
    the ``ToolError`` raised here for an unknown tool name) are converted to
    the ``{"ok": False, "error": {...}}`` shape documented in
    mcp-tools.md §0, and ``TypeError`` is reported as ``INVALID_INPUT``.
    Other exception types are not caught by this function.

    A fresh trace is created when *trace* is ``None`` (or otherwise falsy);
    *mode* is only used for that freshly created trace.
    """
    args: dict[str, Any] = dict(arguments or {})
    if not trace:
        trace = _new_trace(name, args, mode=mode)
    try:
        _trace_step(trace, "select_tool", tool=name)
        # Built per call so the module-level tool names are resolved at
        # dispatch time.
        handlers = {
            "outline": outline_tool,
            "get_section": get_section_tool,
            "expand": expand_tool,
            "search_semantic": search_semantic_tool,
            "search_keyword": search_keyword_tool,
            "find_mentions": find_mentions_tool,
            "get_related": get_related_tool,
            "read_range": read_range_tool,
        }
        handler = handlers.get(name)
        if handler is None:
            msg = f"unknown tool: {name!r}"
            _trace_step(trace, "select_tool", status="error", tool=name)
            raise ToolError(msg, details={"name": name})

        if name == "search_semantic":
            # Only the semantic search needs the embedder. It is passed as an
            # explicit keyword so a caller-supplied ``embedder`` argument
            # still fails with TypeError instead of silently overriding it.
            resp = await handler(index, embedder=embedder, **args)
        else:
            resp = await handler(index, **args)

        _trace_step(
            trace,
            "execute_tool",
            tool=name,
            tokens_returned=resp.tokens_returned,
        )
        return _success_envelope(
            tokens_returned=resp.tokens_returned,
            data=resp.data,
            trace=trace,
        )
    except CairnError as exc:
        return _error_envelope(error=exc.to_envelope(), trace=trace)
    except TypeError as exc:
        # Pydantic / Python TypeError from wrong kwarg names → 400-like error.
        msg = f"invalid arguments for tool {name!r}: {exc}"
        failure = {
            "code": "INVALID_INPUT",
            "message": msg,
            "details": {"arguments": args},
        }
        return _error_envelope(trace=trace, error=failure)
163
+
164
+
165
def build_server(index: DocumentIndex, embedder: Embedder) -> Server:
    """Create an MCP ``Server`` whose handlers close over *index* and *embedder*.

    Tool listing returns ``CAIRN_TOOLS``; tool calls are forwarded to
    ``dispatch_tool``.
    """
    app: Server = Server(SERVER_NAME)

    # The MCP SDK's decorator helpers are not fully typed; the ignores keep
    # mypy strict happy without polluting the rest of the file.
    @app.list_tools()  # type: ignore[no-untyped-call, untyped-decorator]
    async def _list_tools() -> list[Tool]:
        # Advertise the document-scoped tool schemas.
        return CAIRN_TOOLS

    @app.call_tool()  # type: ignore[untyped-decorator]
    async def _call_tool(
        name: str, arguments: dict[str, Any] | None
    ) -> dict[str, Any]:
        # Delegate to the transport-independent dispatcher.
        envelope = await dispatch_tool(name, arguments, index, embedder)
        return envelope

    return app
182
+
183
+
184
async def dispatch_repo_tool(
    name: str,
    arguments: dict[str, Any] | None,
    repo_root: Path,
    embedder: Embedder,
) -> dict[str, Any]:
    """Run a tool against a repo-scoped Cairn index and return the envelope.

    The repo-level tool names (``list_documents``, ``search_documents``,
    ``repo_context``, ``repo_graph``, ``repo_impact``) are handled here. Any
    other name is treated as a per-document tool: the optional ``doc``
    argument is removed from the arguments, the matching document index is
    loaded, and the call is forwarded to ``dispatch_tool`` with the same
    trace.

    ``CairnError`` is converted to an error envelope and ``TypeError`` is
    reported as ``INVALID_INPUT``; other exception types are not caught here.
    """
    args: dict[str, Any] = dict(arguments or {})
    trace = _new_trace(name, args, mode="repo", repo_root=repo_root)
    try:
        _trace_step(trace, "select_tool", tool=name)

        if name == "list_documents":
            # Only the optional ``state`` filter is read for this tool.
            wanted_state = args.get("state")
            status = repo_status(repo_root)
            listed = [
                entry.model_dump(mode="json")
                for entry in status.documents
                if wanted_state is None or entry.state == wanted_state
            ]
            payload = {
                "root": str(status.root),
                "primary_doc": status.primary_doc,
                "documents": listed,
            }
            _trace_step(
                trace,
                "load_repo_status",
                documents=len(status.documents),
                indexed=status.indexed_count,
                stale=status.stale_count,
                missing=status.missing_count,
                errors=status.error_count,
            )
            _trace_step(
                trace,
                "filter_documents",
                state=wanted_state,
                returned=len(listed),
            )
            return _success_envelope(tokens_returned=0, data=payload, trace=trace)

        if name == "search_documents":
            outcome = await search_repo_documents(
                repo_root, embedder=embedder, **args
            )
            payload = outcome["data"]
            _trace_step(
                trace,
                "search_documents",
                query=args.get("query"),
                hits=len(payload.get("hits", [])),
                searched_documents=payload.get("searched_documents"),
                ranker_mode=payload.get("ranker", {}).get("mode"),
                scored_sections=payload.get("ranker", {}).get("scored_sections"),
            )
            return _success_envelope(
                tokens_returned=outcome["tokens_returned"],
                data=payload,
                trace=trace,
            )

        if name == "repo_context":
            outcome = await repo_context(repo_root, embedder=embedder, **args)
            payload = outcome["data"]
            rel_map = payload.get("relationship_map", {})
            _trace_step(
                trace,
                "search_documents",
                query=args.get("query"),
                hits=len(payload.get("hits", [])),
            )
            _trace_step(
                trace,
                "load_context_sections",
                sections=len(payload.get("context_sections", [])),
                level=args.get("level", "synopsis"),
            )
            _trace_step(
                trace,
                "build_relationship_map",
                nodes=len(rel_map.get("nodes", [])),
                edges=len(rel_map.get("edges", [])),
            )
            return _success_envelope(
                tokens_returned=outcome["tokens_returned"],
                data=payload,
                trace=trace,
            )

        if name == "repo_graph":
            outcome = await repo_graph(repo_root, **args)
            payload = outcome["data"]
            _trace_step(
                trace,
                "build_repo_graph",
                nodes=len(payload.get("nodes", [])),
                edges=len(payload.get("edges", [])),
                doc=args.get("doc"),
            )
            return _success_envelope(
                tokens_returned=outcome["tokens_returned"],
                data=payload,
                trace=trace,
            )

        if name == "repo_impact":
            outcome = await repo_impact(repo_root, **args)
            payload = outcome["data"]
            _trace_step(
                trace,
                "estimate_repo_impact",
                scope=payload.get("scope"),
                affected_surfaces=len(payload.get("affected_surfaces", [])),
                related_sections=len(payload.get("related_sections", [])),
            )
            return _success_envelope(
                tokens_returned=outcome["tokens_returned"],
                data=payload,
                trace=trace,
            )

        # Not a repo-level tool: treat it as a per-document tool. ``doc`` is
        # removed from the arguments so it is not forwarded to the tool.
        doc_id = args.pop("doc", None)
        if not (doc_id is None or isinstance(doc_id, str)):
            msg = "`doc` must be a string when provided"
            _trace_step(trace, "select_document", status="error", doc=doc_id)
            raise ToolError(msg, details={"doc": doc_id})

        trace["mode"] = "repo_document"
        if doc_id is not None:
            trace["doc"] = doc_id
        doc_index = load_repo_document_index(repo_root, doc_id=doc_id)
        _trace_step(trace, "load_document_index", doc=doc_id)
        return await dispatch_tool(
            name,
            args,
            doc_index,
            embedder,
            trace=trace,
            mode="repo_document",
        )
    except CairnError as exc:
        return _error_envelope(error=exc.to_envelope(), trace=trace)
    except TypeError as exc:
        msg = f"invalid arguments for repo tool {name!r}: {exc}"
        failure = {
            "code": "INVALID_INPUT",
            "message": msg,
            "details": {"arguments": args},
        }
        return _error_envelope(trace=trace, error=failure)
328
+
329
+
330
def build_repo_server(repo_root: Path, embedder: Embedder) -> Server:
    """Create an MCP ``Server`` bound to a repo-scoped Cairn documentation index.

    Tool listing returns ``REPO_TOOLS``; tool calls are forwarded to
    ``dispatch_repo_tool`` with *repo_root* and *embedder*.
    """
    app: Server = Server(SERVER_NAME)

    @app.list_tools()  # type: ignore[no-untyped-call, untyped-decorator]
    async def _list_tools() -> list[Tool]:
        # Advertise the repo-scoped tool schemas.
        return REPO_TOOLS

    @app.call_tool()  # type: ignore[untyped-decorator]
    async def _call_tool(
        name: str, arguments: dict[str, Any] | None
    ) -> dict[str, Any]:
        # Delegate to the transport-independent repo dispatcher.
        envelope = await dispatch_repo_tool(name, arguments, repo_root, embedder)
        return envelope

    return app
345
+
346
+
347
async def serve_stdio(doc_dir: Path, *, embedder: Embedder) -> None:
    """Load the index stored under *doc_dir* and serve it over MCP stdio.

    Returns once ``server.run`` returns.
    """
    server = build_server(DocumentIndex.load(doc_dir), embedder)
    async with stdio_server() as (reader, writer):
        await server.run(
            reader,
            writer,
            server.create_initialization_options(),
        )
355
+
356
+
357
async def serve_repo_stdio(repo_root: Path, *, embedder: Embedder) -> None:
    """Serve the repo-scoped Cairn document index at *repo_root* over MCP stdio.

    Returns once ``server.run`` returns.
    """
    server = build_repo_server(repo_root, embedder)
    async with stdio_server() as (reader, writer):
        await server.run(
            reader,
            writer,
            server.create_initialization_options(),
        )
cairn/providers.py ADDED
@@ -0,0 +1,50 @@
1
+ """Runtime provider factories shared by CLI commands and evaluation scripts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from cairn.cli.config import load_embed_config, load_llm_config
6
+ from cairn.embed.base import Embedder
7
+ from cairn.embed.doubao import DoubaoVisionEmbedder
8
+ from cairn.embed.fake import FakeEmbedder
9
+ from cairn.embed.openai_compatible import OpenAICompatibleEmbedder
10
+ from cairn.summarize.base import Summarizer
11
+ from cairn.summarize.fake import FakeSummarizer
12
+ from cairn.summarize.openai_compatible import OpenAICompatibleSummarizer
13
+
14
+
15
def make_summarizer(use_fake: bool) -> Summarizer:
    """Return the summarizer to use at runtime.

    A ``FakeSummarizer`` is returned when *use_fake* is truthy. Otherwise the
    LLM configuration is loaded and an ``OpenAICompatibleSummarizer`` is
    built from its base URL, model, API key, timeout and retry settings.
    """
    if not use_fake:
        settings = load_llm_config()
        return OpenAICompatibleSummarizer(
            base_url=settings.base_url,
            model=settings.model,
            api_key=settings.api_key,
            timeout=settings.timeout,
            max_retries=settings.max_retries,
        )
    return FakeSummarizer()
27
+
28
+
29
def make_embedder(use_fake: bool) -> Embedder:
    """Return the embedder to use at runtime.

    A 64-dimensional ``FakeEmbedder`` is returned when *use_fake* is truthy.
    Otherwise the embedding configuration is loaded: provider
    ``"doubao-vision"`` selects ``DoubaoVisionEmbedder`` and every other
    provider value selects ``OpenAICompatibleEmbedder``. Both are constructed
    with the same keyword settings.
    """
    if use_fake:
        return FakeEmbedder(dim=64)

    settings = load_embed_config()
    embedder_cls = (
        DoubaoVisionEmbedder
        if settings.provider == "doubao-vision"
        else OpenAICompatibleEmbedder
    )
    return embedder_cls(
        base_url=settings.base_url,
        model=settings.model,
        dim=settings.dim,
        api_key=settings.api_key,
        timeout=settings.timeout,
        max_retries=settings.max_retries,
    )
cairn/py.typed ADDED
File without changes