docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,140 @@
1
+ """``get_related`` retrieval tool.
2
+
3
+ Spec: ``docs/specs/mcp-tools.md`` §7.
4
+
5
+ Returns neighbors of a section across two channels:
6
+
7
+ - the tree (``sibling`` / ``parent`` / ``child``)
8
+ - the cross-reference graph (``xref``)
9
+
10
+ Tree neighbors are returned with confidence ``1.0`` and ``relation: null``.
11
+ XRef neighbors carry the extractor's confidence and the edge's ``kind`` as
12
+ the ``relation`` field (``link``, ``textual``, or ``entity``).
13
+
14
+ Results are sorted by confidence descending, then by destination id, and
15
+ truncated to ``k``.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from collections.abc import Sequence
21
+ from typing import Any, Literal
22
+
23
+ from cairn.core.errors import IndexNotFoundError, ToolError
24
+ from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
25
+
26
+ Kind = Literal["xref", "sibling", "parent", "child"]
27
+
28
+ _VALID_KINDS: frozenset[str] = frozenset({"xref", "sibling", "parent", "child"})
29
+
30
+
31
async def get_related(
    index: DocumentIndex,
    *,
    id: str,
    kinds: Sequence[Kind] = ("xref",),
    k: int = 8,
) -> ToolResponse:
    """Collect the neighbors of section ``id`` and return the best ``k``.

    Args:
        index: Document index; its ``tree`` is always consulted and its
            ``xrefs`` only when the ``"xref"`` channel is requested.
        id: Identifier of the section whose neighbors are wanted.
        kinds: Channels to include; every entry must be in ``_VALID_KINDS``.
        k: Maximum number of neighbors to return, from 1 to 32 inclusive.

    Returns:
        A ``ToolResponse`` whose data carries the requested ``id`` and the
        ``neighbors`` list, ordered by confidence (highest first) and then
        by neighbor id, cut to ``k`` entries.

    Raises:
        ToolError: ``k`` is out of range, ``kinds`` is empty, or ``kinds``
            contains a value outside ``_VALID_KINDS``.
        IndexNotFoundError: ``id`` is not present in the tree.
    """
    if k < 1 or 32 < k:
        msg = f"k must be in [1, 32]; got {k}"
        raise ToolError(msg, details={"k": k})
    if not kinds:
        msg = "kinds must contain at least one entry"
        raise ToolError(msg)
    unknown = [kind for kind in kinds if kind not in _VALID_KINDS]
    if unknown:
        msg = f"invalid kinds: {unknown}"
        raise ToolError(msg, details={"invalid": unknown})

    node = index.tree.get(id)
    if node is None:
        msg = f"section not found: {id!r}"
        raise IndexNotFoundError(msg, details={"section_id": id})

    def tree_neighbor(section_id: str, kind: str) -> dict[str, Any]:
        # Tree relations are structural: full confidence, no relation label.
        return _neighbor(
            index,
            section_id=section_id,
            kind=kind,
            relation=None,
            confidence=1.0,
        )

    requested = set(kinds)
    found: list[dict[str, Any]] = []

    # An index without a cross-reference graph contributes no xref
    # neighbors; the request is not treated as an error.
    if "xref" in requested and index.xrefs is not None:
        found.extend(
            _neighbor(
                index,
                section_id=edge.dst,
                kind="xref",
                relation=edge.kind,
                confidence=edge.confidence,
            )
            for edge in index.xrefs.outgoing_from(id)
        )

    if "child" in requested:
        found.extend(
            tree_neighbor(child.id, "child")
            for child in index.tree.children_of(id)
        )

    parent_id = node.parent
    if parent_id is not None:
        if "parent" in requested:
            found.append(tree_neighbor(parent_id, "parent"))
        if "sibling" in requested:
            # Siblings are the parent's children minus the section itself.
            found.extend(
                tree_neighbor(sibling.id, "sibling")
                for sibling in index.tree.children_of(parent_id)
                if sibling.id != id
            )

    ranked = sorted(found, key=lambda n: (-float(n["confidence"]), n["id"]))

    payload: dict[str, Any] = {
        "id": id,
        "neighbors": ranked[:k],
    }
    return ToolResponse(
        data=payload,
        tokens_returned=estimate_tokens_of_payload(payload),
    )
118
+
119
+
120
def _neighbor(
    index: DocumentIndex,
    *,
    section_id: str,
    kind: str,
    relation: str | None,
    confidence: float,
) -> dict[str, Any]:
    """Build the response entry describing one neighbor section.

    Args:
        index: Document index used to look up the title, anchor and summary.
        section_id: Identifier of the neighbor being described.
        kind: Channel the neighbor was found through.
        relation: Relation label to report, or ``None``.
        confidence: Confidence value to report.

    Returns:
        A dict with ``id``, ``title``, ``kind``, ``relation``,
        ``confidence`` and ``anchor``; ``gist`` is added only when the
        section has a summary with a non-empty gist.
    """
    target = index.tree.get(section_id)
    # A neighbor missing from the tree falls back to its id as the title.
    title = section_id if target is None else target.title
    entry: dict[str, Any] = {
        "id": section_id,
        "title": title,
        "kind": kind,
        "relation": relation,
        "confidence": confidence,
        "anchor": index.anchor(section_id),
    }
    summary = index.summaries.get(section_id)
    if summary is None:
        return entry
    gist = summary.gist
    if gist:
        entry["gist"] = gist
    return entry
@@ -0,0 +1,130 @@
1
+ """``get_section`` and ``expand`` retrieval tools.
2
+
3
+ Spec: ``docs/specs/mcp-tools.md`` §2 and §3.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, Literal
9
+
10
+ from cairn.core.errors import IndexNotFoundError, ToolError
11
+ from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens
12
+
13
+ Level = Literal["gist", "synopsis", "digest", "full"]
14
+
15
+ _LEVEL_ORDER: dict[str, int] = {"gist": 0, "synopsis": 1, "digest": 2, "full": 3}
16
+
17
+
18
async def get_section(
    index: DocumentIndex,
    *,
    id: str,
    level: Level = "synopsis",
    include_children: bool = False,
) -> ToolResponse:
    """Return a single section rendered at the requested summary level.

    Args:
        index: Document index holding the tree and the summaries.
        id: Identifier of the section to fetch.
        level: Which rendition of the section to return.
        include_children: Reserved for v0.2; must be left ``False``.

    Returns:
        A ``ToolResponse`` whose data describes the section (title, content,
        anchor, path, whether it has children, and the deeper levels still
        available). ``tokens_returned`` is estimated from the content only.

    Raises:
        ToolError: ``include_children`` is truthy, or ``level`` is unknown.
        IndexNotFoundError: The section, its summary, or the requested
            digest does not exist.
    """
    if include_children:
        msg = "include_children is reserved for v0.2; pass False in v0.1"
        raise ToolError(msg, details={"feature": "include_children"})

    section = index.tree.get(id)
    if section is None:
        msg = f"section not found: {id!r}"
        raise IndexNotFoundError(msg, details={"section_id": id})

    # Resolve the content first: this is also where an unknown level or a
    # missing summary is rejected, before the deeper levels are listed.
    body = _content_at_level(index, id, level, section.raw_text)
    deeper = _levels_deeper_than(level, index, id)

    payload: dict[str, Any] = {
        "doc": index.doc_id,
        "id": section.id,
        "title": section.title,
        "level": level,
        "content": body,
        "anchor": index.anchor(section.id),
        "path": list(section.path),
        "has_children": bool(section.children),
        "next_levels_available": deeper,
    }
    return ToolResponse(data=payload, tokens_returned=estimate_tokens(body))
58
+
59
+
60
async def expand(
    index: DocumentIndex,
    *,
    id: str,
    to: Literal["synopsis", "digest", "full"],
) -> ToolResponse:
    """Fetch section ``id`` at the deeper level ``to``.

    This is a thin alias for ``get_section(index, id=id, level=to)``; it
    exists so that agent prompts can name the progressive-disclosure step
    explicitly. Everything ``get_section`` returns or raises applies here.
    """
    response = await get_section(index, id=id, level=to)
    return response
72
+
73
+
74
def _content_at_level(
    index: DocumentIndex,
    section_id: str,
    level: str,
    raw_text: str,
) -> str:
    """Pick the text to return for ``section_id`` at ``level``.

    Args:
        index: Document index whose ``summaries`` are consulted.
        section_id: Identifier of the section being rendered.
        level: ``"gist"``, ``"synopsis"``, ``"digest"`` or ``"full"``.
        raw_text: The section's raw text, returned as-is for ``"full"``.

    Returns:
        The raw text for ``"full"``, otherwise the matching summary field.

    Raises:
        IndexNotFoundError: The section has no summary, or ``"digest"`` was
            requested and the summary has none.
        ToolError: ``level`` is not one of the known levels.
    """
    # "full" never needs the summaries index.
    if level == "full":
        return raw_text

    summary = index.summaries.get(section_id)
    if summary is None:
        msg = (
            f"section {section_id!r} has no summary set; "
            "the Summaries index may not have been built"
        )
        raise IndexNotFoundError(msg, details={"section_id": section_id})

    if level == "digest":
        digest = summary.digest
        if digest is not None:
            return digest
        msg = (
            f"digest not available for {section_id!r}; "
            "v0.1 generates only gist + synopsis"
        )
        raise IndexNotFoundError(
            msg,
            details={"section_id": section_id, "level": level},
        )
    if level == "synopsis":
        return summary.synopsis
    if level == "gist":
        return summary.gist

    msg = f"unknown level: {level!r}"
    raise ToolError(msg, details={"level": level})
109
+
110
+
111
def _levels_deeper_than(
    current: str,
    index: DocumentIndex,
    section_id: str,
) -> list[str]:
    """List the levels deeper than ``current`` that can actually be served.

    Args:
        current: The level just returned; must be a key of ``_LEVEL_ORDER``.
        index: Document index whose ``summaries`` are consulted.
        section_id: Identifier of the section in question.

    Returns:
        Level names ranked above ``current``, in ``_LEVEL_ORDER`` iteration
        order. ``"full"`` is always included when deeper. Summary-backed
        levels are included only when the section has a summary, and
        ``"digest"`` only when that summary carries a digest.

    Raises:
        KeyError: ``current`` is not a key of ``_LEVEL_ORDER``.
    """
    current_rank = _LEVEL_ORDER[current]
    summary = index.summaries.get(section_id)

    available: list[str] = []
    for name, rank in _LEVEL_ORDER.items():
        if rank <= current_rank:
            continue
        if name == "full":
            # Raw text does not depend on the summaries index.
            available.append(name)
        elif summary is None:
            # Without a summary no summary-backed level can be served;
            # advertising one would lead the caller into a failing request.
            continue
        elif name == "digest" and summary.digest is None:
            continue
        else:
            available.append(name)
    return available
cairn/tools/outline.py ADDED
@@ -0,0 +1,75 @@
1
+ """``outline`` retrieval tool — the cheapest, called first.
2
+
3
+ Spec: ``docs/specs/mcp-tools.md`` §1.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from collections.abc import Container, Sequence
9
+ from typing import Any, Literal
10
+
11
+ from cairn.core.errors import ToolError
12
+ from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
13
+
14
+ IncludeLevel = Literal["gist", "synopsis"]
15
+
16
+ _VALID_INCLUDE: frozenset[str] = frozenset({"gist", "synopsis"})
17
+
18
+
19
async def outline(
    index: DocumentIndex,
    *,
    depth: int = 2,
    focus: str | None = None,
    include: Sequence[IncludeLevel] = ("gist",),
) -> ToolResponse:
    """Return the document outline, cut off at ``depth``, with summaries.

    Full semantics are in docs/specs/mcp-tools.md §1.

    Args:
        index: Document index providing the tree and the summaries.
        depth: How many levels of the tree to include, from 1 to 6.
        focus: Passed through to ``index.tree.outline``; echoed in the
            response.
        include: Summary fields to attach to each outline node; every entry
            must be in ``_VALID_INCLUDE``.

    Returns:
        A ``ToolResponse`` whose data holds ``doc``, ``depth``, ``focus``
        and the annotated ``tree``.

    Raises:
        ToolError: ``depth`` is out of range, ``include`` is empty, or
            ``include`` contains an unknown value.
    """
    if depth > 6 or depth < 1:
        msg = f"depth must be in [1, 6]; got {depth}"
        raise ToolError(msg, details={"depth": depth})

    if not include:
        msg = "include must contain at least one summary level"
        raise ToolError(msg)

    unknown = [item for item in include if item not in _VALID_INCLUDE]
    if unknown:
        msg = f"invalid include values: {unknown}"
        raise ToolError(msg, details={"invalid": unknown})

    tree = index.tree.outline(depth=depth, focus=focus)
    # The outline nodes are annotated in place before being returned.
    _attach_summaries(tree, index, set(include))

    payload: dict[str, Any] = {
        "doc": index.doc_id,
        "depth": depth,
        "focus": focus,
        "tree": tree,
    }
    return ToolResponse(
        data=payload,
        tokens_returned=estimate_tokens_of_payload(payload),
    )
57
+
58
+
59
def _attach_summaries(
    nodes: list[dict[str, Any]],
    index: DocumentIndex,
    include: Container[str],
) -> None:
    """Annotate outline nodes, in place, with the requested summary fields.

    Args:
        nodes: Outline nodes; each has an ``"id"`` and may have
            ``"children"`` holding further nodes of the same shape.
        index: Document index whose ``summaries`` are consulted per node.
        include: Which of ``"gist"`` / ``"synopsis"`` to attach.

    A field is written only when the node has a summary and that field is
    non-empty. Nested ``"children"`` are processed recursively.
    """
    want_gist = "gist" in include
    want_synopsis = "synopsis" in include

    for entry in nodes:
        summary = index.summaries.get(entry["id"])
        if summary is not None:
            if want_gist and summary.gist:
                entry["gist"] = summary.gist
            if want_synopsis and summary.synopsis:
                entry["synopsis"] = summary.synopsis

        # Children are visited whether or not the parent had a summary.
        nested = entry.get("children")
        if nested:
            _attach_summaries(nested, index, include)
@@ -0,0 +1,94 @@
1
+ """``read_range`` retrieval tool.
2
+
3
+ Spec: ``docs/specs/mcp-tools.md`` §8.
4
+
5
+ Reads continuous text across consecutive sections in document order. The
6
+ agent gives ``start_id`` and ``end_id``; the tool concatenates each section
7
+ as ``"## {title}\\n\\n{raw_text}"`` separated by blank lines, truncating at
8
+ the ``max_tokens`` budget. When truncated, ``next_id`` points at the first
9
+ section that wasn't included so the agent can continue.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from typing import Any
15
+
16
+ from cairn.core.errors import IndexNotFoundError, ToolError
17
+ from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens
18
+
19
+
20
async def read_range(
    index: DocumentIndex,
    *,
    start_id: str,
    end_id: str,
    max_tokens: int = 4000,
) -> ToolResponse:
    """Concatenate the sections from ``start_id`` through ``end_id``.

    Args:
        index: Document index; iterating ``index.tree`` yields the sections
            in the order used for the range.
        start_id: Identifier of the first section to read.
        end_id: Identifier of the last section to read (inclusive).
        max_tokens: Token budget for the returned content; at least 1.

    Returns:
        A ``ToolResponse`` whose data holds the joined ``content``, the
        start/end anchors, a ``truncated`` flag and ``next_id`` (the first
        section left out, or ``None``). ``tokens_returned`` is the sum of
        the per-section estimates.

    Raises:
        ToolError: ``max_tokens`` is below 1, or ``start_id`` comes after
            ``end_id``.
        IndexNotFoundError: ``start_id`` or ``end_id`` is not in the tree.
    """
    if max_tokens < 1:
        msg = f"max_tokens must be >= 1; got {max_tokens}"
        raise ToolError(msg, details={"max_tokens": max_tokens})

    sections = list(index.tree)
    ordered_ids = [section.id for section in sections]

    try:
        first = ordered_ids.index(start_id)
    except ValueError as exc:
        msg = f"start_id not found: {start_id!r}"
        raise IndexNotFoundError(msg, details={"section_id": start_id}) from exc

    try:
        last = ordered_ids.index(end_id)
    except ValueError as exc:
        msg = f"end_id not found: {end_id!r}"
        raise IndexNotFoundError(msg, details={"section_id": end_id}) from exc

    if first > last:
        msg = (
            f"start_id {start_id!r} must come before end_id {end_id!r} "
            "in document order"
        )
        raise ToolError(
            msg,
            details={"start_id": start_id, "end_id": end_id},
        )

    chunks: list[str] = []
    spent = 0
    resume_at: str | None = None

    for section in sections[first : last + 1]:
        text = _render_section(section.title, section.raw_text)
        cost = estimate_tokens(text)
        # The budget only applies once something has been emitted: the
        # first section is always returned, even when it alone is over
        # budget, because an empty answer is worse than an oversized one.
        if chunks and spent + cost > max_tokens:
            resume_at = section.id
            break
        chunks.append(text)
        spent += cost

    payload: dict[str, Any] = {
        "doc": index.doc_id,
        "start_id": start_id,
        "end_id": end_id,
        "content": "\n\n".join(chunks),
        "anchor_start": index.anchor(start_id),
        "anchor_end": index.anchor(end_id),
        "truncated": resume_at is not None,
        "next_id": resume_at,
    }
    return ToolResponse(data=payload, tokens_returned=spent)
89
+
90
+
91
def _render_section(title: str, body: str) -> str:
    """Render one section as a level-2 heading followed by its body.

    A body that is empty or whitespace-only is dropped, leaving just the
    heading line. Otherwise the body is appended unchanged after a blank
    line.
    """
    if not body.strip():
        return f"## {title}"
    return f"## {title}\n\n{body}"
@@ -0,0 +1,94 @@
1
+ """``search_keyword`` retrieval tool.
2
+
3
+ Spec: ``docs/specs/mcp-tools.md`` §5.
4
+
5
+ v0.1 uses a linear scan over loaded sections. For documents up to a few
6
+ thousand sections this comfortably stays under the spec's latency target.
7
+ A proper inverted index is a v0.2+ optimization.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any, Literal
13
+
14
+ from cairn.core.errors import ToolError
15
+ from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
16
+
17
+ Mode = Literal["any", "all"]
18
+
19
+ _HEAD_CHARS: int = 200
20
+
21
+
22
async def search_keyword(
    index: DocumentIndex,
    *,
    terms: list[str],
    scope: str | None = None,
    k: int = 12,
    mode: Mode = "any",
) -> ToolResponse:
    """Find sections containing the given terms, ignoring letter case.

    Args:
        index: Document index; iterating ``index.tree`` yields the sections
            to scan.
        terms: Between 1 and 8 non-blank search strings.
        scope: When given, only this section id and ids nested under it
            (``scope + "/"`` prefix) are scanned.
        k: Maximum number of hits to return, from 1 to 32 inclusive.
        mode: ``"any"`` keeps sections matching at least one term;
            ``"all"`` requires every term to match.

    Returns:
        A ``ToolResponse`` whose data echoes ``terms`` and ``mode`` and
        lists the top ``hits``, ordered by score (highest first) and then
        by section id. A hit's score is the sum over terms of
        ``occurrences * len(term)``.

    Raises:
        ToolError: ``terms`` has the wrong size or contains a blank entry,
            ``k`` is out of range, or ``mode`` is unknown.
    """
    if len(terms) < 1 or len(terms) > 8:
        msg = f"terms must contain 1-8 entries; got {len(terms)}"
        raise ToolError(msg, details={"count": len(terms)})
    if not all(term.strip() for term in terms):
        msg = "terms must be non-empty strings"
        raise ToolError(msg)
    if k < 1 or 32 < k:
        msg = f"k must be in [1, 32]; got {k}"
        raise ToolError(msg, details={"k": k})
    if mode != "any" and mode != "all":
        msg = f"mode must be 'any' or 'all'; got {mode!r}"
        raise ToolError(msg, details={"mode": mode})

    needles = [term.lower() for term in terms]

    hits: list[dict[str, Any]] = []
    for node in index.tree:
        if scope is not None and not _matches_scope(node.id, scope):
            continue

        # Title and body are searched together, lower-cased once per node.
        haystack = (node.title + "\n" + node.raw_text).lower()
        counts = [haystack.count(needle) for needle in needles]

        # Matches report the term as the caller spelled it.
        matches: list[dict[str, Any]] = [
            {"term": term, "count": count}
            for term, count in zip(terms, counts, strict=True)
            if count > 0
        ]
        if not matches:
            continue
        if mode == "all" and len(matches) != len(terms):
            continue

        # Terms with zero occurrences contribute nothing to the score.
        score = sum(
            count * len(term)
            for term, count in zip(terms, counts, strict=True)
        )
        hits.append(
            {
                "id": node.id,
                "title": node.title,
                "score": score,
                "anchor": index.anchor(node.id),
                "matches": matches,
                "head": node.raw_text[:_HEAD_CHARS],
            }
        )

    hits.sort(key=lambda hit: (-hit["score"], hit["id"]))

    payload: dict[str, Any] = {
        "terms": terms,
        "mode": mode,
        "hits": hits[:k],
    }
    return ToolResponse(
        data=payload,
        tokens_returned=estimate_tokens_of_payload(payload),
    )
91
+
92
+
93
def _matches_scope(section_id: str, scope: str) -> bool:
    """Tell whether ``section_id`` is ``scope`` itself or nested under it.

    Nesting is decided purely on the id text: the id must start with
    ``scope`` followed by ``"/"``, so ``"ab"`` is not inside scope ``"a"``.
    """
    if section_id == scope:
        return True
    return section_id.startswith(scope + "/")