docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""``get_related`` retrieval tool.
|
|
2
|
+
|
|
3
|
+
Spec: ``docs/specs/mcp-tools.md`` §7.
|
|
4
|
+
|
|
5
|
+
Returns neighbors of a section across two channels:
|
|
6
|
+
|
|
7
|
+
- the tree (``sibling`` / ``parent`` / ``child``)
|
|
8
|
+
- the cross-reference graph (``xref``)
|
|
9
|
+
|
|
10
|
+
Tree neighbors are returned with confidence ``1.0`` and ``relation: null``.
|
|
11
|
+
XRef neighbors carry the extractor's confidence and the edge's ``kind`` as
|
|
12
|
+
the ``relation`` field (``link``, ``textual``, or ``entity``).
|
|
13
|
+
|
|
14
|
+
Results are sorted by confidence descending, then by destination id, and
|
|
15
|
+
truncated to ``k``.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from collections.abc import Sequence
|
|
21
|
+
from typing import Any, Literal
|
|
22
|
+
|
|
23
|
+
from cairn.core.errors import IndexNotFoundError, ToolError
|
|
24
|
+
from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
|
|
25
|
+
|
|
26
|
+
Kind = Literal["xref", "sibling", "parent", "child"]
|
|
27
|
+
|
|
28
|
+
_VALID_KINDS: frozenset[str] = frozenset({"xref", "sibling", "parent", "child"})
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
async def get_related(
    index: DocumentIndex,
    *,
    id: str,
    kinds: Sequence[Kind] = ("xref",),
    k: int = 8,
) -> ToolResponse:
    """Collect neighbors of section ``id`` and return the top ``k`` of them.

    Candidates are gathered per requested channel in a fixed order
    (``xref``, ``child``, ``parent``, ``sibling``), then ordered by
    confidence descending with the neighbor id as tie-breaker, and cut to
    ``k`` entries.

    Args:
        index: Document index providing ``tree``, ``xrefs`` and the data
            consumed by ``_neighbor``.
        id: Section whose neighbors are requested.
        kinds: Channels to include; each must be one of ``_VALID_KINDS``.
        k: Maximum number of neighbors to return, between 1 and 32.

    Returns:
        A ``ToolResponse`` whose data holds the queried ``id`` and the
        ``neighbors`` list.

    Raises:
        ToolError: ``k`` is out of range, ``kinds`` is empty, or ``kinds``
            contains an unknown value.
        IndexNotFoundError: ``id`` is not present in the tree.
    """
    if k > 32 or k < 1:
        msg = f"k must be in [1, 32]; got {k}"
        raise ToolError(msg, details={"k": k})
    if not kinds:
        msg = "kinds must contain at least one entry"
        raise ToolError(msg)
    unknown = [entry for entry in kinds if entry not in _VALID_KINDS]
    if unknown:
        msg = f"invalid kinds: {unknown}"
        raise ToolError(msg, details={"invalid": unknown})

    origin = index.tree.get(id)
    if origin is None:
        msg = f"section not found: {id!r}"
        raise IndexNotFoundError(msg, details={"section_id": id})

    def tree_entry(section_id: str, kind: str) -> dict[str, Any]:
        # Tree relations carry no relation label and full confidence.
        return _neighbor(
            index,
            section_id=section_id,
            kind=kind,
            relation=None,
            confidence=1.0,
        )

    requested = set(kinds)
    found: list[dict[str, Any]] = []

    # The xref channel is skipped entirely when the index has no xref graph.
    if "xref" in requested and index.xrefs is not None:
        found.extend(
            _neighbor(
                index,
                section_id=edge.dst,
                kind="xref",
                relation=edge.kind,
                confidence=edge.confidence,
            )
            for edge in index.xrefs.outgoing_from(id)
        )

    if "child" in requested:
        found.extend(
            tree_entry(sub.id, "child") for sub in index.tree.children_of(id)
        )

    if "parent" in requested and origin.parent is not None:
        found.append(tree_entry(origin.parent, "parent"))

    if "sibling" in requested and origin.parent is not None:
        # Siblings are the parent's children minus the queried section.
        found.extend(
            tree_entry(peer.id, "sibling")
            for peer in index.tree.children_of(origin.parent)
            if peer.id != id
        )

    ranked = sorted(
        found,
        key=lambda item: (-float(item["confidence"]), item["id"]),
    )

    payload: dict[str, Any] = {"id": id, "neighbors": ranked[:k]}
    return ToolResponse(
        data=payload,
        tokens_returned=estimate_tokens_of_payload(payload),
    )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _neighbor(
    index: DocumentIndex,
    *,
    section_id: str,
    kind: str,
    relation: str | None,
    confidence: float,
) -> dict[str, Any]:
    """Build the response entry describing one neighbor section.

    The title falls back to ``section_id`` when the tree has no node for
    it. A ``gist`` key is added only when a summary exists and its gist is
    non-empty.
    """
    target = index.tree.get(section_id)
    if target is None:
        title = section_id
    else:
        title = target.title

    entry: dict[str, Any] = {
        "id": section_id,
        "title": title,
        "kind": kind,
        "relation": relation,
        "confidence": confidence,
        "anchor": index.anchor(section_id),
    }

    record = index.summaries.get(section_id)
    gist = None if record is None else record.gist
    if gist:
        entry["gist"] = gist
    return entry
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""``get_section`` and ``expand`` retrieval tools.
|
|
2
|
+
|
|
3
|
+
Spec: ``docs/specs/mcp-tools.md`` §2 and §3.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
from cairn.core.errors import IndexNotFoundError, ToolError
|
|
11
|
+
from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens
|
|
12
|
+
|
|
13
|
+
Level = Literal["gist", "synopsis", "digest", "full"]
|
|
14
|
+
|
|
15
|
+
_LEVEL_ORDER: dict[str, int] = {"gist": 0, "synopsis": 1, "digest": 2, "full": 3}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
async def get_section(
    index: DocumentIndex,
    *,
    id: str,
    level: Level = "synopsis",
    include_children: bool = False,
) -> ToolResponse:
    """Return a single section rendered at the requested summary level.

    Args:
        index: Document index to read from.
        id: Section to fetch.
        level: Which rendition of the section to return.
        include_children: Reserved for v0.2; must stay ``False``.

    Returns:
        A ``ToolResponse`` carrying the section metadata, its content at
        ``level``, and the deeper levels that can still be requested.
        ``tokens_returned`` is estimated from the content alone.

    Raises:
        ToolError: ``include_children`` is truthy.
        IndexNotFoundError: ``id`` is not present in the tree.
    """
    if include_children:
        msg = "include_children is reserved for v0.2; pass False in v0.1"
        raise ToolError(msg, details={"feature": "include_children"})

    section = index.tree.get(id)
    if section is None:
        msg = f"section not found: {id!r}"
        raise IndexNotFoundError(msg, details={"section_id": id})

    body = _content_at_level(index, id, level, section.raw_text)
    deeper = _levels_deeper_than(level, index, id)

    result: dict[str, Any] = {
        "doc": index.doc_id,
        "id": section.id,
        "title": section.title,
        "level": level,
        "content": body,
        "anchor": index.anchor(section.id),
        "path": list(section.path),
        "has_children": bool(section.children),
        "next_levels_available": deeper,
    }
    return ToolResponse(data=result, tokens_returned=estimate_tokens(body))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
async def expand(
    index: DocumentIndex,
    *,
    id: str,
    to: Literal["synopsis", "digest", "full"],
) -> ToolResponse:
    """Fetch section ``id`` at the deeper level ``to``.

    This is a thin alias: it delegates to ``get_section(index, id=id,
    level=to)`` and returns that response unchanged. It exists so agent
    prompts can express progressive disclosure as an explicit step.
    """
    response = await get_section(index, id=id, level=to)
    return response
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _content_at_level(
    index: DocumentIndex,
    section_id: str,
    level: str,
    raw_text: str,
) -> str:
    """Pick the text for ``section_id`` that corresponds to ``level``.

    ``"full"`` returns ``raw_text`` without consulting the summaries. Every
    other level is read from the section's summary record.

    Raises:
        IndexNotFoundError: The section has no summary record, or
            ``"digest"`` was requested but the record's digest is ``None``.
        ToolError: ``level`` is not a known level name.
    """
    if level == "full":
        return raw_text

    record = index.summaries.get(section_id)
    if record is None:
        msg = (
            f"section {section_id!r} has no summary set; "
            "the Summaries index may not have been built"
        )
        raise IndexNotFoundError(msg, details={"section_id": section_id})

    if level == "gist":
        return record.gist
    if level == "synopsis":
        return record.synopsis

    if level != "digest":
        msg = f"unknown level: {level!r}"
        raise ToolError(msg, details={"level": level})

    if record.digest is not None:
        return record.digest

    msg = (
        f"digest not available for {section_id!r}; "
        "v0.1 generates only gist + synopsis"
    )
    raise IndexNotFoundError(
        msg,
        details={"section_id": section_id, "level": level},
    )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _levels_deeper_than(
    current: str,
    index: DocumentIndex,
    section_id: str,
) -> list[str]:
    """List the levels deeper than ``current`` that can actually be served.

    Levels are taken from ``_LEVEL_ORDER`` in its declared order and kept
    only when a follow-up request for them would succeed:

    - ``"full"`` is always offered.
    - Summary-backed levels are offered only when the section has a summary
      record. ``_content_at_level`` raises ``IndexNotFoundError`` for them
      otherwise, so advertising them would mislead the caller.
    - ``"digest"`` is additionally dropped when the record's digest is
      ``None``.

    Args:
        current: Level the caller already has; must be a key of
            ``_LEVEL_ORDER``.
        index: Document index whose ``summaries`` are consulted.
        section_id: Section being inspected.

    Returns:
        The available deeper level names, shallowest first.

    Raises:
        KeyError: ``current`` is not a key of ``_LEVEL_ORDER``.
    """
    current_rank = _LEVEL_ORDER[current]
    summary = index.summaries.get(section_id)

    available: list[str] = []
    for name, rank in _LEVEL_ORDER.items():
        if rank <= current_rank:
            continue
        if name == "full":
            available.append(name)
        elif summary is None:
            # No summary record: no summary-backed level can be served.
            continue
        elif name == "digest" and summary.digest is None:
            continue
        else:
            available.append(name)
    return available
|
cairn/tools/outline.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""``outline`` retrieval tool — the cheapest, called first.
|
|
2
|
+
|
|
3
|
+
Spec: ``docs/specs/mcp-tools.md`` §1.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from collections.abc import Container, Sequence
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
from cairn.core.errors import ToolError
|
|
12
|
+
from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
|
|
13
|
+
|
|
14
|
+
IncludeLevel = Literal["gist", "synopsis"]
|
|
15
|
+
|
|
16
|
+
_VALID_INCLUDE: frozenset[str] = frozenset({"gist", "synopsis"})
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
async def outline(
    index: DocumentIndex,
    *,
    depth: int = 2,
    focus: str | None = None,
    include: Sequence[IncludeLevel] = ("gist",),
) -> ToolResponse:
    """Build the document outline and decorate it with summaries.

    The outline forest comes from ``index.tree.outline(depth=..., focus=...)``
    and is then annotated in place with the summary levels named in
    ``include``. See docs/specs/mcp-tools.md §1 for full semantics.

    Args:
        index: Document index to read from.
        depth: Outline depth, between 1 and 6.
        focus: Passed through to the tree's ``outline`` call.
        include: Summary levels to attach; each must be in ``_VALID_INCLUDE``.

    Returns:
        A ``ToolResponse`` whose data holds ``doc``, ``depth``, ``focus``
        and the annotated ``tree``.

    Raises:
        ToolError: ``depth`` is out of range, ``include`` is empty, or
            ``include`` contains an unknown value.
    """
    if depth > 6 or depth < 1:
        msg = f"depth must be in [1, 6]; got {depth}"
        raise ToolError(msg, details={"depth": depth})

    if not include:
        msg = "include must contain at least one summary level"
        raise ToolError(msg)

    rejected = [entry for entry in include if entry not in _VALID_INCLUDE]
    if rejected:
        msg = f"invalid include values: {rejected}"
        raise ToolError(msg, details={"invalid": rejected})

    tree_view = index.tree.outline(depth=depth, focus=focus)
    wanted = set(include)
    _attach_summaries(tree_view, index, wanted)

    payload: dict[str, Any] = {
        "doc": index.doc_id,
        "depth": depth,
        "focus": focus,
        "tree": tree_view,
    }
    token_estimate = estimate_tokens_of_payload(payload)
    return ToolResponse(data=payload, tokens_returned=token_estimate)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _attach_summaries(
    nodes: list[dict[str, Any]],
    index: DocumentIndex,
    include: Container[str],
) -> None:
    """Annotate outline nodes in place with the requested summary fields.

    Every node, and every node reachable through a non-empty ``"children"``
    entry, is visited parent-first. A ``gist`` or ``synopsis`` key is set
    only when that level is in ``include`` and the section's summary holds
    a non-empty value for it.
    """
    # Explicit stack instead of recursion; reversing keeps the visit order
    # parent-first and left-to-right.
    pending = list(nodes)[::-1]
    while pending:
        entry = pending.pop()
        record = index.summaries.get(entry["id"])
        if record is not None:
            if "gist" in include and record.gist:
                entry["gist"] = record.gist
            if "synopsis" in include and record.synopsis:
                entry["synopsis"] = record.synopsis
        kids = entry.get("children", [])
        if kids:
            pending.extend(list(kids)[::-1])
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""``read_range`` retrieval tool.
|
|
2
|
+
|
|
3
|
+
Spec: ``docs/specs/mcp-tools.md`` §8.
|
|
4
|
+
|
|
5
|
+
Reads continuous text across consecutive sections in document order. The
|
|
6
|
+
agent gives ``start_id`` and ``end_id``; the tool concatenates each section
|
|
7
|
+
as ``"## {title}\\n\\n{raw_text}"`` separated by blank lines, truncating at
|
|
8
|
+
the ``max_tokens`` budget. When truncated, ``next_id`` points at the first
|
|
9
|
+
section that wasn't included so the agent can continue.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from cairn.core.errors import IndexNotFoundError, ToolError
|
|
17
|
+
from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
async def read_range(
    index: DocumentIndex,
    *,
    start_id: str,
    end_id: str,
    max_tokens: int = 4000,
) -> ToolResponse:
    """Concatenate consecutive sections from ``start_id`` through ``end_id``.

    Sections are taken in the iteration order of ``index.tree``, rendered
    with ``_render_section`` and joined with blank lines. Rendering stops
    before the first section that would push the running token estimate
    above ``max_tokens``; that section's id is reported as ``next_id``.

    Args:
        index: Document index to read from.
        start_id: First section of the range.
        end_id: Last section of the range (inclusive).
        max_tokens: Token budget for the concatenated content; at least 1.

    Returns:
        A ``ToolResponse`` with the joined content, start/end anchors, and
        the ``truncated`` / ``next_id`` continuation markers.
        ``tokens_returned`` is the sum of the per-section estimates.

    Raises:
        ToolError: ``max_tokens`` is below 1, or ``start_id`` comes after
            ``end_id`` in document order.
        IndexNotFoundError: ``start_id`` or ``end_id`` is not in the tree.
    """
    if max_tokens < 1:
        msg = f"max_tokens must be >= 1; got {max_tokens}"
        raise ToolError(msg, details={"max_tokens": max_tokens})

    ordered = list(index.tree)
    ordered_ids = [entry.id for entry in ordered]

    try:
        first = ordered_ids.index(start_id)
    except ValueError as exc:
        msg = f"start_id not found: {start_id!r}"
        raise IndexNotFoundError(msg, details={"section_id": start_id}) from exc

    try:
        last = ordered_ids.index(end_id)
    except ValueError as exc:
        msg = f"end_id not found: {end_id!r}"
        raise IndexNotFoundError(msg, details={"section_id": end_id}) from exc

    if first > last:
        msg = (
            f"start_id {start_id!r} must come before end_id {end_id!r} "
            "in document order"
        )
        raise ToolError(
            msg,
            details={"start_id": start_id, "end_id": end_id},
        )

    chunks: list[str] = []
    used = 0
    resume_at: str | None = None

    for entry in ordered[first : last + 1]:
        text = _render_section(entry.title, entry.raw_text)
        cost = estimate_tokens(text)
        over_budget = used + cost > max_tokens
        # The first section is always emitted, even when it alone exceeds
        # the budget: one oversized chunk beats an empty answer.
        if over_budget and chunks:
            resume_at = entry.id
            break
        chunks.append(text)
        used += cost

    content = "\n\n".join(chunks)

    payload: dict[str, Any] = {
        "doc": index.doc_id,
        "start_id": start_id,
        "end_id": end_id,
        "content": content,
        "anchor_start": index.anchor(start_id),
        "anchor_end": index.anchor(end_id),
        "truncated": resume_at is not None,
        "next_id": resume_at,
    }
    return ToolResponse(data=payload, tokens_returned=used)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _render_section(title: str, body: str) -> str:
    """Render a section as a ``##`` heading followed by its body.

    A body that is empty or whitespace-only yields the heading alone;
    otherwise the body is appended unchanged after a blank line.
    """
    return f"## {title}" if not body.strip() else f"## {title}\n\n{body}"
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""``search_keyword`` retrieval tool.
|
|
2
|
+
|
|
3
|
+
Spec: ``docs/specs/mcp-tools.md`` §5.
|
|
4
|
+
|
|
5
|
+
v0.1 uses a linear scan over loaded sections. For documents up to a few
|
|
6
|
+
thousand sections this comfortably stays under the spec's latency target.
|
|
7
|
+
A proper inverted index is a v0.2+ optimization.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any, Literal
|
|
13
|
+
|
|
14
|
+
from cairn.core.errors import ToolError
|
|
15
|
+
from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
|
|
16
|
+
|
|
17
|
+
Mode = Literal["any", "all"]
|
|
18
|
+
|
|
19
|
+
_HEAD_CHARS: int = 200
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
async def search_keyword(
    index: DocumentIndex,
    *,
    terms: list[str],
    scope: str | None = None,
    k: int = 12,
    mode: Mode = "any",
) -> ToolResponse:
    """Run a case-insensitive substring search over every section.

    Each section's title and raw text are lowercased and scanned for each
    lowercased term. A section's score is the sum over terms of
    ``occurrences * len(term)``. Hits are ordered by score descending, then
    by section id, and cut to ``k``.

    Args:
        index: Document index to scan.
        terms: One to eight non-blank search terms.
        scope: When given, only sections accepted by ``_matches_scope`` are
            scanned.
        k: Maximum number of hits to return, between 1 and 32.
        mode: ``"any"`` keeps sections matching at least one term;
            ``"all"`` requires every term to match.

    Returns:
        A ``ToolResponse`` whose data echoes ``terms`` and ``mode`` and
        lists the ``hits``.

    Raises:
        ToolError: ``terms`` has the wrong size or contains a blank entry,
            ``k`` is out of range, or ``mode`` is not recognised.
    """
    if len(terms) < 1 or len(terms) > 8:
        msg = f"terms must contain 1-8 entries; got {len(terms)}"
        raise ToolError(msg, details={"count": len(terms)})
    if not all(term.strip() for term in terms):
        msg = "terms must be non-empty strings"
        raise ToolError(msg)
    if k > 32 or k < 1:
        msg = f"k must be in [1, 32]; got {k}"
        raise ToolError(msg, details={"k": k})
    if mode != "any" and mode != "all":
        msg = f"mode must be 'any' or 'all'; got {mode!r}"
        raise ToolError(msg, details={"mode": mode})

    needles = [term.lower() for term in terms]

    hits: list[dict[str, Any]] = []
    for section in index.tree:
        if scope is not None and not _matches_scope(section.id, scope):
            continue

        haystack = "\n".join((section.title, section.raw_text)).lower()
        counts = [haystack.count(needle) for needle in needles]
        found: list[dict[str, Any]] = [
            {"term": term, "count": seen}
            for term, seen in zip(terms, counts, strict=True)
            if seen > 0
        ]

        if not found:
            continue
        if mode == "all" and len(found) != len(terms):
            continue

        # Terms with zero occurrences contribute nothing to the score.
        score = sum(
            seen * len(term) for term, seen in zip(terms, counts, strict=True)
        )
        hits.append(
            {
                "id": section.id,
                "title": section.title,
                "score": score,
                "anchor": index.anchor(section.id),
                "matches": found,
                "head": section.raw_text[:_HEAD_CHARS],
            }
        )

    hits.sort(key=lambda hit: (-hit["score"], hit["id"]))

    payload: dict[str, Any] = {
        "terms": terms,
        "mode": mode,
        "hits": hits[:k],
    }
    return ToolResponse(
        data=payload,
        tokens_returned=estimate_tokens_of_payload(payload),
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _matches_scope(section_id: str, scope: str) -> bool:
    """Tell whether ``section_id`` is ``scope`` itself or lies beneath it.

    "Beneath" means the id starts with ``scope`` followed by ``"/"``, so a
    sibling that merely shares a textual prefix is not accepted.
    """
    if section_id == scope:
        return True
    return section_id.startswith(scope + "/")
|