docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,18 @@
1
+ """Summarization layer — pluggable LLM-backed summarizers + cache.
2
+
3
+ Used by the index layer (`cairn.index.summaries.SummaryBuilder`) at indexing
4
+ time. Never invoked at query time. See ARCHITECTURE.md §2.2.
5
+ """
6
+
7
+ from cairn.summarize.base import Summarizer, SummaryLevel
8
+ from cairn.summarize.cache import SummaryCache
9
+ from cairn.summarize.fake import FakeSummarizer
10
+ from cairn.summarize.openai_compatible import OpenAICompatibleSummarizer
11
+
12
+ __all__ = [
13
+ "FakeSummarizer",
14
+ "OpenAICompatibleSummarizer",
15
+ "Summarizer",
16
+ "SummaryCache",
17
+ "SummaryLevel",
18
+ ]
@@ -0,0 +1,56 @@
1
+ """Summarizer protocol and level enum.
2
+
3
+ A `Summarizer` produces a single summary string for a section at a given
4
+ granularity level. Pre-computed during indexing; never invoked at query time.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from enum import StrEnum
10
+ from typing import Protocol, runtime_checkable
11
+
12
+
13
+ class SummaryLevel(StrEnum):
14
+ """The three granularity levels Cairn supports.
15
+
16
+ - ``GIST``: ≤ 20 words. The "scent" in IFT terms; used by ``outline``.
17
+ - ``SYNOPSIS``: ≤ 80 words. Used by ``get_section`` (default) and search hits.
18
+ - ``DIGEST``: ≤ 300 words. Used by ``expand`` and ``get_section(level="digest")``.
19
+ """
20
+
21
+ GIST = "gist"
22
+ SYNOPSIS = "synopsis"
23
+ DIGEST = "digest"
24
+
25
+
26
@runtime_checkable
class Summarizer(Protocol):
    """Structural interface that every summarizer backend satisfies.

    Backends should return the same text for the same ``(title, body, level)``
    whenever they can: pin ``temperature=0`` and keep prompts fixed.

    ``name`` feeds the summary cache key, so it has to identify both the
    implementation family and the concrete model; changing either must
    invalidate previously cached summaries.

    Examples of valid ``name`` values::

        "fake:words"
        "openai-compat:gpt-4o-mini"
        "openai-compat:llama3.2:3b"
    """

    # Cache-key component in "<family>:<model>" form (see class docstring).
    name: str

    async def summarize(
        self,
        *,
        title: str,
        body: str,
        level: SummaryLevel,
    ) -> str:
        """Return a summary of ``body`` (whose heading is ``title``) at ``level``.

        The returned text must already respect the word budget of ``level``
        (see ``cairn.summarize.prompts.WORD_BUDGETS``); enforcing it is the
        implementation's responsibility.
        """
        ...
@@ -0,0 +1,66 @@
1
+ """File-system cache for summarizer outputs.
2
+
3
+ Keyed by ``sha256(model || level || section_hash)``. Each entry is a single
4
+ UTF-8 text file under ``<root>/<first2hex>/<remaining>.txt``. Writes are
5
+ atomic: temp-file + rename. Concurrent writers may race; the winner's
6
+ content is kept (acceptable because identical inputs should yield identical
7
+ outputs from a deterministic summarizer).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import os
14
+ from pathlib import Path
15
+
16
+
17
class SummaryCache:
    """Local file-system cache for ``Summarizer`` outputs.

    Each entry is one UTF-8 text file stored at
    ``<root>/<first two hex chars of key>/<rest of key>.txt``.
    """

    def __init__(self, root: Path) -> None:
        """Remember ``root`` as the cache directory; nothing is created yet."""
        self.root = root

    # -- key / path helpers -------------------------------------------------

    @staticmethod
    def key(*, model: str, level: str, section_hash: str) -> str:
        """Compute the cache key for one (model, level, section) tuple.

        The three fields are hashed with SHA-256, separated by NUL bytes so
        that adjacent fields cannot run together into an ambiguous input.
        Returns the hex digest.
        """
        h = hashlib.sha256()
        h.update(model.encode("utf-8"))
        h.update(b"\x00")
        h.update(level.encode("utf-8"))
        h.update(b"\x00")
        h.update(section_hash.encode("utf-8"))
        return h.hexdigest()

    def _path_for(self, key: str) -> Path:
        """Map a key to its entry file, sharded by the key's first two chars."""
        return self.root / key[:2] / f"{key[2:]}.txt"

    # -- public API ---------------------------------------------------------

    def get(self, key: str) -> str | None:
        """Return the cached summary or ``None`` if absent.

        The file is read directly instead of being probed with ``exists()``
        first, so an entry removed between the probe and the read (for
        example by a concurrent ``clear()``) is reported as a miss rather
        than raising.
        """
        try:
            return self._path_for(key).read_text(encoding="utf-8")
        except (FileNotFoundError, NotADirectoryError):
            return None

    def put(self, key: str, value: str) -> None:
        """Write a cache entry atomically.

        The value is written to a temp file in the entry's directory and
        then renamed over the final path. The temp name is unique per call:
        with a shared name, two writers of the same key would write into the
        same file and the slower writer's rename would fail because the temp
        file had already been moved away.
        """
        import uuid

        path = self._path_for(key)
        path.parent.mkdir(parents=True, exist_ok=True)
        tmp = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
        try:
            tmp.write_text(value, encoding="utf-8")
            os.replace(tmp, path)
        finally:
            # No-op after a successful rename; removes the partial temp file
            # when the write or the rename failed.
            tmp.unlink(missing_ok=True)

    def clear(self) -> None:
        """Remove the entire cache directory. Safe if it doesn't exist."""
        if not self.root.exists():
            return
        # Reverse-sorted order visits children before their parent directory,
        # so each directory is already empty when ``rmdir`` reaches it.
        for path in sorted(self.root.rglob("*"), reverse=True):
            if path.is_file():
                path.unlink()
            elif path.is_dir():
                path.rmdir()
        if self.root.exists() and self.root.is_dir():
            self.root.rmdir()
@@ -0,0 +1,43 @@
1
+ """Deterministic, network-free summarizer for tests and dry runs.
2
+
3
+ Not for production use. Output is a word-truncated prefix of the body, which
4
+ preserves enough structure for downstream sanity checks while requiring no
5
+ LLM and no network.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from typing import ClassVar
12
+
13
+ from cairn.summarize.base import SummaryLevel
14
+
15
+ _WORD = re.compile(r"\S+")
16
+
17
+
18
class FakeSummarizer:
    """Offline stand-in that "summarizes" by keeping a prefix of the body.

    Output depends only on the arguments, and no network call is made.
    """

    name = "fake:words"

    # Maximum number of whitespace-delimited words kept at each level.
    _BUDGETS: ClassVar[dict[SummaryLevel, int]] = {
        SummaryLevel.GIST: 15,
        SummaryLevel.SYNOPSIS: 60,
        SummaryLevel.DIGEST: 200,
    }

    async def summarize(
        self,
        *,
        title: str,
        body: str,
        level: SummaryLevel,
    ) -> str:
        """Return the first words of ``body``, capped by the budget of ``level``.

        A body without any words yields the stripped title (or ``"Section"``
        when the title is blank too) followed by a period. When words were
        dropped, the result ends with a horizontal ellipsis.
        """
        limit = self._BUDGETS[level]
        tokens = _WORD.findall(body)
        if not tokens:
            heading = title.strip() or "Section"
            return f"{heading}."
        kept = " ".join(tokens[:limit])
        return f"{kept}…" if len(tokens) > limit else kept
@@ -0,0 +1,148 @@
1
+ """OpenAI-compatible HTTP summarizer.
2
+
3
+ Works with any endpoint that implements the OpenAI ``/v1/chat/completions``
4
+ contract: OpenAI itself, Ollama (``http://localhost:11434/v1``), vLLM,
5
+ Together, Anyscale, etc.
6
+
7
+ This is the **default** summarizer for production indexing in v0.1. It must
8
+ remain usable without proprietary credentials (point it at a local Ollama
9
+ instance) — per CLAUDE.md P4 "local-first must always work".
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import asyncio
15
+ from typing import Any
16
+
17
+ import httpx
18
+
19
+ from cairn.core.errors import IndexBuildError
20
+ from cairn.summarize.base import SummaryLevel
21
+ from cairn.summarize.prompts import SYSTEM_PROMPT, enforce_word_budget, user_prompt
22
+
23
+
24
+ class OpenAICompatibleSummarizer:
25
+ """OpenAI-compatible chat-completions client."""
26
+
27
+ def __init__(
28
+ self,
29
+ *,
30
+ base_url: str = "http://localhost:11434/v1",
31
+ model: str = "llama3.2:3b",
32
+ api_key: str | None = None,
33
+ timeout: float = 60.0,
34
+ temperature: float = 0.0,
35
+ max_retries: int = 2,
36
+ retry_base_delay: float = 0.5,
37
+ ) -> None:
38
+ if max_retries < 0:
39
+ msg = f"max_retries must be >= 0; got {max_retries}"
40
+ raise ValueError(msg)
41
+ if retry_base_delay < 0:
42
+ msg = f"retry_base_delay must be >= 0; got {retry_base_delay}"
43
+ raise ValueError(msg)
44
+ self.base_url = base_url.rstrip("/")
45
+ self.model = model
46
+ self.api_key = api_key
47
+ self.timeout = timeout
48
+ self.temperature = temperature
49
+ self.max_retries = max_retries
50
+ self.retry_base_delay = retry_base_delay
51
+ self.name = f"openai-compat:{model}"
52
+
53
+ async def summarize(
54
+ self,
55
+ *,
56
+ title: str,
57
+ body: str,
58
+ level: SummaryLevel,
59
+ ) -> str:
60
+ headers = {"Content-Type": "application/json"}
61
+ if self.api_key:
62
+ headers["Authorization"] = f"Bearer {self.api_key}"
63
+
64
+ payload: dict[str, Any] = {
65
+ "model": self.model,
66
+ "temperature": self.temperature,
67
+ "messages": [
68
+ {"role": "system", "content": SYSTEM_PROMPT},
69
+ {"role": "user", "content": user_prompt(title, body, level)},
70
+ ],
71
+ }
72
+
73
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
74
+ response = await self._post_with_retries(client, payload, headers)
75
+ data = response.json()
76
+
77
+ try:
78
+ text = str(data["choices"][0]["message"]["content"]).strip()
79
+ except (KeyError, IndexError, TypeError) as exc:
80
+ msg = "summarizer response did not match OpenAI chat-completions shape"
81
+ raise IndexBuildError(msg, details={"response": data}) from exc
82
+
83
+ return enforce_word_budget(text, level)
84
+
85
+ async def _post_with_retries(
86
+ self,
87
+ client: httpx.AsyncClient,
88
+ payload: dict[str, Any],
89
+ headers: dict[str, str],
90
+ ) -> httpx.Response:
91
+ last_exc: httpx.HTTPError | None = None
92
+ for attempt in range(self.max_retries + 1):
93
+ try:
94
+ response = await client.post(
95
+ f"{self.base_url}/chat/completions",
96
+ json=payload,
97
+ headers=headers,
98
+ )
99
+ except httpx.HTTPError as exc:
100
+ last_exc = exc
101
+ if attempt < self.max_retries:
102
+ await self._sleep_before_retry(attempt)
103
+ continue
104
+ msg = f"summarizer request failed: {exc}"
105
+ raise IndexBuildError(
106
+ msg,
107
+ details={
108
+ "model": self.model,
109
+ "base_url": self.base_url,
110
+ "error_type": type(exc).__name__,
111
+ "attempts": attempt + 1,
112
+ },
113
+ ) from exc
114
+
115
+ if response.status_code in (429, 500, 502, 503, 504) and attempt < self.max_retries:
116
+ await self._sleep_before_retry(attempt)
117
+ continue
118
+ if response.status_code >= 400:
119
+ msg = (
120
+ f"summarizer endpoint returned HTTP {response.status_code}: "
121
+ f"{response.text[:200]}"
122
+ )
123
+ raise IndexBuildError(
124
+ msg,
125
+ details={
126
+ "status": response.status_code,
127
+ "model": self.model,
128
+ "base_url": self.base_url,
129
+ "attempts": attempt + 1,
130
+ },
131
+ )
132
+ return response
133
+
134
+ # Unreachable, but keeps strict type-checkers honest if the loop changes.
135
+ msg = "summarizer request failed without a response"
136
+ raise IndexBuildError(
137
+ msg,
138
+ details={
139
+ "model": self.model,
140
+ "base_url": self.base_url,
141
+ "error_type": type(last_exc).__name__ if last_exc else None,
142
+ },
143
+ )
144
+
145
+ async def _sleep_before_retry(self, attempt: int) -> None:
146
+ if self.retry_base_delay == 0:
147
+ return
148
+ await asyncio.sleep(self.retry_base_delay * (2**attempt))
@@ -0,0 +1,73 @@
1
+ """Prompt templates and word budgets for summarization.
2
+
3
+ The prompts are deliberately terse. We trust word budgets more than
4
+ elaborately worded instructions; the budget is enforced at the call site
5
+ regardless of model compliance.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from typing import Final
12
+
13
+ from cairn.summarize.base import SummaryLevel
14
+
15
+ WORD_BUDGETS: Final[dict[SummaryLevel, int]] = {
16
+ SummaryLevel.GIST: 20,
17
+ SummaryLevel.SYNOPSIS: 80,
18
+ SummaryLevel.DIGEST: 300,
19
+ }
20
+
21
+ SYSTEM_PROMPT: Final = (
22
+ "You write structural document summaries for a hierarchical retrieval "
23
+ "system.\n"
24
+ "- Be precise and factual. Do not interpret, extrapolate, or add opinions.\n"
25
+ "- Do not begin with 'This section…', 'The author…', or similar preamble.\n"
26
+ "- Output ONLY the summary text. No headers, labels, or quotation marks.\n"
27
+ "- Stay strictly within the word budget.\n"
28
+ )
29
+
30
+
31
def user_prompt(title: str, body: str, level: SummaryLevel) -> str:
    """Assemble the user-role message for a single summary request.

    The message is a level-specific instruction (carrying that level's word
    budget), then the section title, then the stripped section body. A blank
    body is replaced by a placeholder.
    """
    limit = WORD_BUDGETS[level]
    if level is SummaryLevel.GIST:
        template = (
            "Summarize the section below in a single sentence of at most "
            "{limit} words. Capture the single most important fact or claim."
        )
    elif level is SummaryLevel.SYNOPSIS:
        template = (
            "Summarize the section below in one paragraph of at most "
            "{limit} words. Cover the main idea, key specifics, and what "
            "the reader will learn."
        )
    else:  # DIGEST
        template = (
            "Summarize the section below in 2 to 3 short paragraphs, totaling "
            "at most {limit} words. Preserve structural ordering and any "
            "concrete facts (names, numbers, code identifiers)."
        )
    directive = template.format(limit=limit)

    section_text = body.strip() or "(empty section body)"
    parts = (
        directive,
        f"SECTION TITLE: {title}",
        f"SECTION BODY:\n{section_text}",
    )
    return "\n\n".join(parts)
58
+
59
+
60
+ _WORD = re.compile(r"\S+")
61
+
62
+
63
def enforce_word_budget(text: str, level: SummaryLevel) -> str:
    """Cap ``text`` at the word budget of ``level``, cutting between words.

    Text within budget is returned exactly as given. Otherwise the first
    ``budget`` whitespace-delimited words are joined with single spaces and
    a horizontal ellipsis (``…``) is appended.
    """
    limit = WORD_BUDGETS[level]
    tokens = _WORD.findall(text)
    if len(tokens) > limit:
        return f"{' '.join(tokens[:limit])}…"
    return text
@@ -0,0 +1,31 @@
1
+ """Retrieval tools — the public API consumed by the MCP server.
2
+
3
+ Each tool corresponds 1:1 to an MCP tool documented in
4
+ ``docs/specs/mcp-tools.md``. Tools accept a :class:`DocumentIndex` plus typed
5
+ arguments and return a :class:`ToolResponse`. They do not speak MCP
6
+ themselves; the ``cairn.mcp`` layer translates :class:`ToolResponse` and
7
+ :class:`cairn.core.errors.CairnError` into the MCP wire envelope.
8
+ """
9
+
10
+ from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens
11
+ from cairn.tools.find_mentions import find_mentions
12
+ from cairn.tools.get_related import get_related
13
+ from cairn.tools.get_section import expand, get_section
14
+ from cairn.tools.outline import outline
15
+ from cairn.tools.read_range import read_range
16
+ from cairn.tools.search_keyword import search_keyword
17
+ from cairn.tools.search_semantic import search_semantic
18
+
19
+ __all__ = [
20
+ "DocumentIndex",
21
+ "ToolResponse",
22
+ "estimate_tokens",
23
+ "expand",
24
+ "find_mentions",
25
+ "get_related",
26
+ "get_section",
27
+ "outline",
28
+ "read_range",
29
+ "search_keyword",
30
+ "search_semantic",
31
+ ]
cairn/tools/base.py ADDED
@@ -0,0 +1,126 @@
1
+ """Shared types and helpers for retrieval tools."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel, ConfigDict, Field
9
+
10
+ from cairn.core.errors import IndexBuildError, IndexNotFoundError
11
+ from cairn.index.entities import Entities
12
+ from cairn.index.summaries import Summaries
13
+ from cairn.index.tree import Tree
14
+ from cairn.index.vectors import Vectors
15
+ from cairn.index.xrefs import XRefs
16
+
17
+
18
+ class DocumentIndex:
19
+ """All sub-indexes loaded for a single document.
20
+
21
+ Tree, Summaries, and Vectors are required. Entities (v0.2.0+) and XRefs
22
+ (v0.2.2+) are optional; v0.1/early v0.2 indexes don't have them and
23
+ tools that need them must check ``index.entities`` / ``index.xrefs``
24
+ against ``None``.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ *,
30
+ tree: Tree,
31
+ summaries: Summaries,
32
+ vectors: Vectors,
33
+ entities: Entities | None = None,
34
+ xrefs: XRefs | None = None,
35
+ ) -> None:
36
+ doc_ids = {
37
+ "tree": tree.doc_id,
38
+ "summaries": summaries.doc_id,
39
+ "vectors": vectors.doc_id,
40
+ }
41
+ if entities is not None:
42
+ doc_ids["entities"] = entities.doc_id
43
+ if xrefs is not None:
44
+ doc_ids["xrefs"] = xrefs.doc_id
45
+ if len(set(doc_ids.values())) > 1:
46
+ msg = "sub-index doc_id mismatch: " + ", ".join(
47
+ f"{k}={v!r}" for k, v in doc_ids.items()
48
+ )
49
+ raise IndexBuildError(msg, details=doc_ids)
50
+
51
+ self.tree = tree
52
+ self.summaries = summaries
53
+ self.vectors = vectors
54
+ self.entities = entities
55
+ self.xrefs = xrefs
56
+ self.doc_id = tree.doc_id
57
+
58
+ @classmethod
59
+ def load(cls, doc_dir: Path) -> DocumentIndex:
60
+ """Load all sub-indexes from a single document directory.
61
+
62
+ Entities and XRefs are optional: older indexes don't have them, and
63
+ we degrade gracefully rather than refuse to load.
64
+ """
65
+ entities: Entities | None
66
+ try:
67
+ entities = Entities.load(doc_dir)
68
+ except IndexNotFoundError:
69
+ entities = None
70
+ xrefs: XRefs | None
71
+ try:
72
+ xrefs = XRefs.load(doc_dir)
73
+ except IndexNotFoundError:
74
+ xrefs = None
75
+ return cls(
76
+ tree=Tree.load(doc_dir),
77
+ summaries=Summaries.load(doc_dir),
78
+ vectors=Vectors.load(doc_dir),
79
+ entities=entities,
80
+ xrefs=xrefs,
81
+ )
82
+
83
+ def anchor(self, section_id: str) -> str:
84
+ """Build the canonical ``cairn://`` anchor for a section."""
85
+ return f"cairn://{self.doc_id}/{section_id}"
86
+
87
+
88
class ToolResponse(BaseModel):
    """Payload returned by a tool call that succeeded.

    Failures never travel through this model: a tool raises
    :class:`cairn.core.errors.CairnError` instead, and the MCP server turns
    that into the structured envelope described in
    ``docs/specs/mcp-tools.md`` §0.
    """

    # Instances are immutable and reject fields that are not declared below.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Tool-specific result body.
    data: dict[str, Any]
    # Token count reported for this response; must be non-negative.
    tokens_returned: int = Field(ge=0)
100
+
101
+
102
def estimate_tokens(text: str) -> int:
    """Approximate how many tokens ``text`` costs.

    Uses 1.3 tokens per whitespace-separated word, truncated to an integer.
    This is a rough figure for budget reporting, not a replacement for a
    real tokenizer. An empty string costs 0; any other input costs at least 1.
    """
    if text:
        word_count = len(text.split())
        return max(1, int(word_count * 1.3))
    return 0
112
+
113
+
114
def estimate_tokens_of_payload(payload: Any) -> int:
    """Estimate the token cost of all string content nested inside ``payload``.

    The strings are first flattened into one space-joined text, and that
    text is then measured with ``estimate_tokens``.
    """
    combined = _flatten_text(payload)
    return estimate_tokens(combined)
117
+
118
+
119
+ def _flatten_text(obj: Any) -> str:
120
+ if isinstance(obj, str):
121
+ return obj
122
+ if isinstance(obj, dict):
123
+ return " ".join(_flatten_text(v) for v in obj.values())
124
+ if isinstance(obj, list | tuple):
125
+ return " ".join(_flatten_text(item) for item in obj)
126
+ return ""
@@ -0,0 +1,93 @@
1
+ """``find_mentions`` retrieval tool.
2
+
3
+ Spec: ``docs/specs/mcp-tools.md`` §6.
4
+
5
+ Returns every section where a named entity occurs, with stable anchors back
6
+ into the source. When the entity is unknown to the index, returns a
7
+ successful envelope with an empty ``mentions`` array — "no mentions" is a
8
+ valid answer, not an error condition.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from collections.abc import Sequence
14
+ from typing import Any, Literal
15
+
16
+ from cairn.core.errors import IndexNotFoundError, ToolError
17
+ from cairn.core.types import EntityKind
18
+ from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
19
+
20
+ Kind = Literal["term", "code", "proper", "defined"]
21
+
22
+
23
async def find_mentions(
    index: DocumentIndex,
    *,
    entity: str,
    scope: str | None = None,
    kinds: Sequence[Kind] | None = None,
) -> ToolResponse:
    """Report each section in which ``entity`` is mentioned.

    The entity is resolved through the entities sub-index, restricted to
    ``kinds`` when given. ``scope`` limits results to that section and the
    sections nested under it. An entity the index does not know produces a
    successful response with an empty ``mentions`` list.

    Raises:
        ToolError: if ``entity`` is empty or only whitespace.
        IndexNotFoundError: if the document has no entities sub-index.
    """
    if not entity.strip():
        msg = "entity must be a non-empty string"
        raise ToolError(msg)

    registry = index.entities
    if registry is None:
        msg = (
            "entities sub-index not built for this document; "
            "re-index with v0.2 to enable find_mentions"
        )
        raise IndexNotFoundError(msg, details={"missing": "entities"})

    kind_filter: tuple[EntityKind, ...] | None = None if kinds is None else tuple(kinds)

    found = registry.lookup(entity, kinds=kind_filter)
    if found is None:
        empty: dict[str, Any] = {
            "entity": entity,
            "canonical": None,
            "kind": None,
            "mentions": [],
        }
        return ToolResponse(data=empty, tokens_returned=0)

    hits: list[dict[str, Any]] = []
    for mention in found.mentions:
        section_id = mention.section_id
        in_scope = scope is None or _matches_scope(section_id, scope)
        if not in_scope:
            continue
        node = index.tree.get(section_id)
        if node is None:
            # The mention points at a section the tree no longer has; drop
            # it instead of failing the whole call.
            continue
        hits.append(
            {
                "section_id": section_id,
                "title": node.title,
                "anchor": index.anchor(section_id),
                "span": [mention.span.start, mention.span.end],
            }
        )

    result: dict[str, Any] = {
        "entity": entity,
        "canonical": found.canonical,
        "kind": found.kind,
        "mentions": hits,
    }
    return ToolResponse(
        data=result,
        tokens_returned=estimate_tokens_of_payload(result),
    )
90
+
91
+
92
def _matches_scope(section_id: str, scope: str) -> bool:
    """Tell whether ``section_id`` is ``scope`` itself or lies beneath it.

    "Beneath" means the id starts with ``scope`` followed by a ``/``, so a
    sibling that merely shares a name prefix does not match.
    """
    if section_id == scope:
        return True
    return section_id.startswith(f"{scope}/")