docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,181 @@
1
+ """``search_semantic`` retrieval tool.
2
+
3
+ Spec: ``docs/specs/mcp-tools.md`` §4.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import re
9
+ from collections.abc import Sequence
10
+ from typing import Any, Literal
11
+
12
+ from cairn.core.errors import ToolError
13
+ from cairn.embed.base import Embedder
14
+ from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
15
+
16
# Names of the optional per-hit fields a caller may request via ``include``.
IncludeField = Literal["synopsis", "head", "evidence"]

# Number of leading ``raw_text`` characters returned as a hit's ``head``.
_HEAD_CHARS: int = 200
# Width, in characters, of the lexical evidence window cut from a section.
_EVIDENCE_CHARS: int = 360
# Upper bound on the number of query terms collected for evidence matching.
_MAX_EVIDENCE_TERMS: int = 40
# Runtime counterpart of ``IncludeField``; used to validate ``include``.
_VALID_INCLUDE: frozenset[str] = frozenset({"synopsis", "head", "evidence"})
22
+
23
+
24
async def search_semantic(
    index: DocumentIndex,
    *,
    embedder: Embedder,
    query: str,
    scope: str | None = None,
    k: int = 8,
    include: Sequence[IncludeField] = ("synopsis", "head", "evidence"),
) -> ToolResponse:
    """Embed ``query`` and return the closest sections from the vector index.

    The embedder is injected rather than looked up so the tool can be
    unit-tested with a fake implementation.

    Args:
        index: Document index providing ``vectors``, ``tree``, ``summaries``
            and ``anchor``.
        embedder: Used to turn ``query`` into a single vector.
        query: Search text; must contain a non-whitespace character.
        scope: Optional prefix forwarded to the vector search as
            ``scope_prefix``.
        k: Number of hits to request; must lie in ``[1, 32]``.
        include: Optional per-hit fields to attach (``synopsis``, ``head``,
            ``evidence``).

    Returns:
        A ``ToolResponse`` whose payload carries ``query``, ``scope``,
        ``hits`` and a ``cursor`` that is always ``None``.

    Raises:
        ToolError: If ``k`` is out of range, ``query`` is blank, ``include``
            holds an unknown value, the embedder returns nothing, or the
            query vector's length differs from the index dimension.
    """
    # --- argument validation -------------------------------------------
    if k < 1 or k > 32:
        msg = f"k must be in [1, 32]; got {k}"
        raise ToolError(msg, details={"k": k})
    if not query.strip():
        msg = "query must not be empty"
        raise ToolError(msg)
    unknown = [field for field in include if field not in _VALID_INCLUDE]
    if unknown:
        msg = f"invalid include values: {unknown}"
        raise ToolError(msg, details={"invalid": unknown})

    # --- embed the query -----------------------------------------------
    embedded = await embedder.embed([query])
    if not embedded:
        msg = "embedder returned no vector for query"
        raise ToolError(msg)
    query_vec = embedded[0]
    if len(query_vec) != index.vectors.dim:
        msg = (
            f"query embedding dim {len(query_vec)} != "
            f"index dim {index.vectors.dim}"
        )
        raise ToolError(msg)

    # --- search and decorate hits --------------------------------------
    hits = await index.vectors.search(query_vec, k=k, scope_prefix=scope)

    wanted = set(include)
    rows: list[dict[str, Any]] = []
    for hit in hits:
        node = index.tree.get(hit.id)
        if node is None:
            # The vector index can reference a section the tree no longer
            # has; drop that hit instead of failing the whole call.
            continue
        summary = index.summaries.get(hit.id)
        row: dict[str, Any] = {
            "id": hit.id,
            "title": node.title,
            "score": hit.score,
            "anchor": index.anchor(hit.id),
        }
        if "synopsis" in wanted and summary is not None:
            synopsis = summary.synopsis
            if synopsis:
                row["synopsis"] = synopsis
        if "head" in wanted:
            row["head"] = node.raw_text[:_HEAD_CHARS]
        if "evidence" in wanted:
            row["evidence"] = _evidence_snippet(node.raw_text, query)
        rows.append(row)

    payload: dict[str, Any] = {
        "query": query,
        "scope": scope,
        "hits": rows,
        "cursor": None,
    }
    return ToolResponse(
        data=payload,
        tokens_returned=estimate_tokens_of_payload(payload),
    )
95
+
96
+
97
def _evidence_snippet(text: str, query: str) -> dict[str, Any]:
    """Return the strongest lexical evidence window for a semantic hit.

    Semantic rank is vector-based, but a short lexical window gives humans and
    agents a cheap explanation of what in the section may have caused the hit.
    When there is no term overlap, fall back to the start of the section.

    Args:
        text: Raw section text; surrounding whitespace is ignored.
        query: The user's query, from which match terms are derived.

    Returns:
        A dict with ``text`` (the snippet, with ``...`` marking truncation),
        ``matched_terms`` (query terms present in the window) and ``span``
        (``start``/``end`` offsets of the window within ``text``). Blank
        input yields an empty snippet with a ``0``/``0`` span.
    """
    clean_text = text.strip()
    if not clean_text:
        return {"text": "", "matched_terms": [], "span": {"start": 0, "end": 0}}

    # Window positions are computed on the stripped copy; this offset maps
    # them back onto the caller's string so the reported span indexes ``text``.
    offset = len(text) - len(text.lstrip())

    terms = _query_terms(query)
    lowered = clean_text.lower()
    if len(lowered) != len(clean_text):
        # ``str.lower`` may expand a character (e.g. U+0130), which would
        # shift every later match offset. Fold per character and keep any
        # character whose lowercase form is not a single code point.
        lowered = "".join(
            ch.lower() if len(ch.lower()) == 1 else ch for ch in clean_text
        )

    best_start = 0
    best_score = 0
    if terms:
        lead = _EVIDENCE_CHARS // 3
        candidate_starts: set[int] = {0}
        for term in terms:
            for match in re.finditer(re.escape(term), lowered):
                # Begin a little before the match so it has leading context.
                candidate_starts.add(max(0, match.start() - lead))
        # Visit candidates in ascending order so that, with the strict ``>``
        # below, equally scored windows resolve to the earliest one.
        for start in sorted(candidate_starts):
            window = lowered[start : start + _EVIDENCE_CHARS]
            score = sum(window.count(term) for term in terms)
            if score > best_score:
                best_score = score
                best_start = start

    start = best_start
    end = min(len(clean_text), start + _EVIDENCE_CHARS)
    snippet = clean_text[start:end]
    if start > 0:
        snippet = "..." + snippet.lstrip()
    if end < len(clean_text):
        snippet = snippet.rstrip() + "..."

    window = lowered[start:end]
    matched = [term for term in terms if term in window]
    return {
        "text": snippet,
        "matched_terms": matched,
        "span": {"start": offset + start, "end": offset + end},
    }
139
+
140
+
141
def _query_terms(query: str) -> list[str]:
    """Split ``query`` into lowercase terms for lexical evidence matching.

    ASCII word tokens are kept when they are at least three characters long
    and not a stop word. Each run of CJK ideographs (two or more characters)
    contributes the whole run plus its 6- down to 2-character n-grams.
    Terms are unique, kept in first-seen order and limited to
    ``_MAX_EVIDENCE_TERMS`` entries.

    Args:
        query: Free-text query; case is ignored.

    Returns:
        The ordered list of distinct terms (possibly empty).
    """
    query = query.lower()
    stop = {
        "a",
        "an",
        "and",
        "are",
        "for",
        "how",
        "is",
        "of",
        "or",
        "the",
        "to",
        "what",
        "where",
    }
    seen: set[str] = set()
    out: list[str] = []

    def add(term: str) -> None:
        if term in seen or len(out) >= _MAX_EVIDENCE_TERMS:
            return
        seen.add(term)
        out.append(term)

    for word in re.findall(r"[A-Za-z0-9_][A-Za-z0-9_-]*", query):
        # The token pattern lets a word end in hyphens ("pre-"); drop them so
        # the term can also match the bare word in the section text.
        word = word.rstrip("-")
        if len(word) < 3 or word in stop:
            continue
        add(word)

    for seq in re.findall(r"[\u3400-\u9fff]+", query):
        if len(seq) < 2:
            continue
        add(seq)
        # CJK queries often have no whitespace; bounded n-grams give the
        # evidence window useful overlap without changing vector ranking.
        for size in range(min(6, len(seq)), 1, -1):
            for start in range(len(seq) - size + 1):
                if len(out) >= _MAX_EVIDENCE_TERMS:
                    # Nothing further can be added; stop generating n-grams.
                    return out
                add(seq[start : start + size])
    return out
cairn/xref/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """Cross-reference extraction — directed edges between sections.
2
+
3
+ Three kinds of edges (per ARCHITECTURE.md §2.4):
4
+
5
+ - ``link``: explicit Markdown anchor links (``[text](#anchor)``)
6
+ - ``textual``: numeric section references (``"§ 2.5"``, ``"Section 3"``)
7
+ - ``entity``: sections that share a high-signal defined entity
8
+
9
+ The :class:`XRefExtractor` Protocol is the seam. The default
10
+ :class:`HeuristicXRefExtractor` produces all three kinds without any model
11
+ dependency, and accepts an optional Entities reader for entity-mediated
12
+ edges. LLM-verified textual references are a v0.2.3+ refinement.
13
+ """
14
+
15
+ from cairn.xref.base import ExtractionEdge, XRefExtractor
16
+ from cairn.xref.fake import FakeXRefExtractor
17
+ from cairn.xref.heuristic import HeuristicXRefExtractor
18
+
19
+ __all__ = [
20
+ "ExtractionEdge",
21
+ "FakeXRefExtractor",
22
+ "HeuristicXRefExtractor",
23
+ "XRefExtractor",
24
+ ]
cairn/xref/base.py ADDED
@@ -0,0 +1,50 @@
1
+ """XRefExtractor Protocol + intermediate ExtractionEdge type."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Awaitable, Iterable
6
+ from typing import Protocol, runtime_checkable
7
+
8
+ from pydantic import BaseModel, ConfigDict, Field
9
+
10
+ from cairn.core.types import Document, Span, XRefKind
11
+ from cairn.index.entities import Entities
12
+
13
+
14
class ExtractionEdge(BaseModel):
    """One observed cross-reference, before deduplication.

    Spans use the same convention as :class:`cairn.entity.base.ExtractionHit`:
    offsets within the *source section's* ``raw_text``. Self-loops (``src ==
    dst``) are dropped by the :class:`cairn.index.xrefs.XRefBuilder`; do not
    emit them.
    """

    # Instances are frozen and reject fields that are not declared below.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Id of the section in which the reference was observed.
    src: str
    # Id of the section being referred to.
    dst: str
    # Edge category; the extractors in this package emit "link", "textual"
    # and "entity".
    kind: XRefKind
    # Extractor's confidence in the edge, validated to lie in [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Location of the reference inside the source section's ``raw_text``.
    span: Span
30
+
31
+
32
+ @runtime_checkable
33
+ class XRefExtractor(Protocol):
34
+ """Pluggable cross-reference extractor.
35
+
36
+ The default :class:`cairn.xref.heuristic.HeuristicXRefExtractor` accepts
37
+ an optional ``Entities`` reader as a constructor argument; the Protocol
38
+ itself only mandates the ``extract`` shape.
39
+ """
40
+
41
+ name: str
42
+
43
+ def extract(
44
+ self,
45
+ document: Document,
46
+ *,
47
+ entities: Entities | None = None,
48
+ ) -> Awaitable[Iterable[ExtractionEdge]]:
49
+ """Return an iterable of cross-reference edges across ``document``."""
50
+ ...
cairn/xref/fake.py ADDED
@@ -0,0 +1,40 @@
1
+ """Deterministic cross-reference extractor for tests.
2
+
3
+ Emits one edge between each consecutive pair of sections (in document
4
+ order). Kind is always ``link``. Useful for tests that exercise builder /
5
+ reader / tool behavior without coupling to extraction heuristics.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterable
11
+
12
+ from cairn.core.types import Document, Span
13
+ from cairn.index.entities import Entities
14
+ from cairn.xref.base import ExtractionEdge
15
+
16
+
17
class FakeXRefExtractor:
    """Linear edge between consecutive sections."""

    name = "fake:linear"

    async def extract(
        self,
        document: Document,
        *,
        entities: Entities | None = None,
    ) -> Iterable[ExtractionEdge]:
        """Return one ``link`` edge from each section to the one after it.

        Args:
            document: Document whose ``sections`` are chained in order.
            entities: Accepted for interface compatibility; not used.

        Returns:
            A list of edges with confidence ``1.0`` and an empty span. It is
            empty when the document has fewer than two sections.
        """
        sections = document.sections
        # Pair every section with its successor instead of indexing by hand.
        return [
            ExtractionEdge(
                src=current.id,
                dst=following.id,
                kind="link",
                confidence=1.0,
                span=Span(start=0, end=0),
            )
            for current, following in zip(sections, sections[1:])
        ]
@@ -0,0 +1,217 @@
1
+ """HeuristicXRefExtractor — regex + entity-graph derivation, no model needed.
2
+
3
+ Combines three sources into one extractor (per ARCHITECTURE.md §2.4):
4
+
5
+ - **link**: explicit anchor links. ``[text](#anchor)`` resolves to a section
6
+ whose ``id`` ends in ``anchor``. Confidence 0.95 when unique, 0.75 when
7
+ multiple candidates exist.
8
+ - **textual**: numeric references like ``§ 2.5`` or ``Section 3.1``. Mapped
9
+ to a section whose ``title`` starts with the same numeric prefix.
10
+ Confidence 0.7.
11
+ - **entity**: pairs of sections that share a high-signal *defined* entity.
12
+ Confidence scales with the number of shared entities, capped at 0.8.
13
+ Computed only when an :class:`~cairn.index.entities.Entities` reader is
14
+ supplied.
15
+
16
+ Self-loops are filtered. Duplicate ``(src, dst, kind)`` triples are
17
+ deduplicated by the :class:`~cairn.index.xrefs.XRefBuilder`; this layer just
18
+ emits ``ExtractionEdge`` records.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import re
24
+ from collections.abc import Iterable, Iterator
25
+ from typing import Final
26
+
27
+ from cairn.core.types import Document, SectionNode, Span
28
+ from cairn.index.entities import Entities
29
+ from cairn.xref.base import ExtractionEdge
30
+
31
+ # Markdown anchor link: [text](#anchor) — anchor uses kebab/slug form.
32
+ _ANCHOR_LINK = re.compile(r"\[[^\]]+\]\(#([^)\s]+)\)")
33
+
34
+ # Section reference: "§ 2.5", "Section 3.1", "Chapter 4.2". Captures the
35
+ # numeric prefix only.
36
+ _SECTION_REF = re.compile(
37
+ r"(?:§\s*|(?:Section|Chapter|§)\s+)(\d+(?:\.\d+)*)\b",
38
+ re.IGNORECASE,
39
+ )
40
+
41
+
42
+ # Confidence scores per ARCHITECTURE.md §2.4.
43
+ _LINK_CONF_UNIQUE: Final = 0.95
44
+ _LINK_CONF_AMBIGUOUS: Final = 0.75
45
+ _TEXTUAL_CONF: Final = 0.7
46
+ _ENTITY_CONF_BASE: Final = 0.3
47
+ _ENTITY_CONF_STEP: Final = 0.2
48
+ _ENTITY_CONF_CAP: Final = 0.8
49
+
50
+
51
+ class HeuristicXRefExtractor:
52
+ """Regex + entity-graph cross-reference extractor."""
53
+
54
+ name = "heuristic:xref-v1"
55
+
56
+ async def extract(
57
+ self,
58
+ document: Document,
59
+ *,
60
+ entities: Entities | None = None,
61
+ ) -> Iterable[ExtractionEdge]:
62
+ sections = document.sections
63
+ if not sections:
64
+ return []
65
+
66
+ # Pre-build lookups used by link + textual extractors.
67
+ anchor_to_ids = _build_anchor_index(sections)
68
+ prefix_to_id = _build_prefix_index(sections)
69
+
70
+ edges: list[ExtractionEdge] = []
71
+ for section in sections:
72
+ edges.extend(_scan_links(section, anchor_to_ids))
73
+ edges.extend(_scan_textual(section, prefix_to_id))
74
+
75
+ if entities is not None:
76
+ edges.extend(_entity_mediated(sections, entities))
77
+
78
+ return edges
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Link extraction
83
+ # ---------------------------------------------------------------------------
84
+
85
+
86
+ def _build_anchor_index(sections: tuple[SectionNode, ...]) -> dict[str, list[str]]:
87
+ """Map heading-anchor → list of section_ids whose last slug equals it."""
88
+ index: dict[str, list[str]] = {}
89
+ for section in sections:
90
+ last = section.id.rsplit("/", 1)[-1]
91
+ index.setdefault(last, []).append(section.id)
92
+ return index
93
+
94
+
95
+ def _scan_links(
96
+ section: SectionNode,
97
+ anchor_to_ids: dict[str, list[str]],
98
+ ) -> Iterator[ExtractionEdge]:
99
+ for match in _ANCHOR_LINK.finditer(section.raw_text):
100
+ anchor = match.group(1).lower()
101
+ candidates = anchor_to_ids.get(anchor, ())
102
+ if not candidates:
103
+ continue
104
+ unique = len(candidates) == 1
105
+ conf = _LINK_CONF_UNIQUE if unique else _LINK_CONF_AMBIGUOUS
106
+ for dst in candidates:
107
+ if dst == section.id:
108
+ continue
109
+ yield ExtractionEdge(
110
+ src=section.id,
111
+ dst=dst,
112
+ kind="link",
113
+ confidence=conf,
114
+ span=Span(start=match.start(1), end=match.end(1)),
115
+ )
116
+
117
+
118
+ # ---------------------------------------------------------------------------
119
+ # Textual extraction
120
+ # ---------------------------------------------------------------------------
121
+
122
+
123
+ def _build_prefix_index(sections: tuple[SectionNode, ...]) -> dict[str, str]:
124
+ """Map numeric title prefix (``"2.5"``) → section_id.
125
+
126
+ Only sections whose title starts with a digit-sequence are indexed.
127
+ First-seen wins on collision.
128
+ """
129
+ index: dict[str, str] = {}
130
+ for section in sections:
131
+ prefix = _numeric_prefix(section.title)
132
+ if prefix and prefix not in index:
133
+ index[prefix] = section.id
134
+ return index
135
+
136
+
137
+ _TITLE_PREFIX = re.compile(r"^\s*(\d+(?:\.\d+)*)\b")
138
+
139
+
140
+ def _numeric_prefix(title: str) -> str | None:
141
+ m = _TITLE_PREFIX.match(title)
142
+ return m.group(1) if m else None
143
+
144
+
145
+ def _scan_textual(
146
+ section: SectionNode,
147
+ prefix_to_id: dict[str, str],
148
+ ) -> Iterator[ExtractionEdge]:
149
+ for match in _SECTION_REF.finditer(section.raw_text):
150
+ number = match.group(1)
151
+ dst = prefix_to_id.get(number)
152
+ if dst is None or dst == section.id:
153
+ continue
154
+ yield ExtractionEdge(
155
+ src=section.id,
156
+ dst=dst,
157
+ kind="textual",
158
+ confidence=_TEXTUAL_CONF,
159
+ span=Span(start=match.start(1), end=match.end(1)),
160
+ )
161
+
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # Entity-mediated extraction
165
+ # ---------------------------------------------------------------------------
166
+
167
+
168
+ def _entity_mediated(
169
+ sections: tuple[SectionNode, ...],
170
+ entities: Entities,
171
+ ) -> Iterator[ExtractionEdge]:
172
+ """Emit edges between sections that share defined entities.
173
+
174
+ For each defined entity that appears in 2+ sections, every ordered
175
+ section pair contributes one (or one extra) shared count. Confidence
176
+ rises with the count: 1 → 0.5, 2 → 0.7, 3+ → 0.8 (cap).
177
+ """
178
+ # shared_counts[(src, dst)] = number of distinct defined entities in common
179
+ shared: dict[tuple[str, str], int] = {}
180
+ sample_span: dict[tuple[str, str], Span] = {}
181
+
182
+ for ent in entities.by_kind("defined"):
183
+ mention_sections = {m.section_id for m in ent.mentions}
184
+ if len(mention_sections) < 2:
185
+ continue
186
+ ordered = sorted(mention_sections)
187
+ for i, src in enumerate(ordered):
188
+ for dst in ordered[i + 1 :]:
189
+ shared[(src, dst)] = shared.get((src, dst), 0) + 1
190
+ shared[(dst, src)] = shared.get((dst, src), 0) + 1
191
+ # Pick *some* span — the first mention in the src section.
192
+ if (src, dst) not in sample_span:
193
+ for m in ent.mentions:
194
+ if m.section_id == src:
195
+ sample_span[(src, dst)] = m.span
196
+ break
197
+ if (dst, src) not in sample_span:
198
+ for m in ent.mentions:
199
+ if m.section_id == dst:
200
+ sample_span[(dst, src)] = m.span
201
+ break
202
+
203
+ for (src, dst), count in shared.items():
204
+ if src == dst:
205
+ continue
206
+ conf = min(
207
+ _ENTITY_CONF_CAP,
208
+ _ENTITY_CONF_BASE + _ENTITY_CONF_STEP * count,
209
+ )
210
+ span = sample_span.get((src, dst), Span(start=0, end=0))
211
+ yield ExtractionEdge(
212
+ src=src,
213
+ dst=dst,
214
+ kind="entity",
215
+ confidence=conf,
216
+ span=span,
217
+ )