docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/index/tree.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Tree sub-index — persistence and queries for the structural backbone.
|
|
2
|
+
|
|
3
|
+
`TreeBuilder` writes a deterministic ``tree.json`` from a parsed
|
|
4
|
+
:class:`Document`. `Tree` loads and queries it.
|
|
5
|
+
|
|
6
|
+
The tree is the primary navigation structure (ARCHITECTURE.md §2.1). All other
|
|
7
|
+
sub-indexes key into the ``section_id`` namespace it defines.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
from collections.abc import Iterator
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Final
|
|
17
|
+
|
|
18
|
+
from cairn.core.errors import IndexBuildError, IndexNotFoundError
|
|
19
|
+
from cairn.core.types import Document, SectionNode, Span
|
|
20
|
+
|
|
21
|
+
# Name of the serialized tree file inside a document directory.
TREE_FILENAME: Final[str] = "tree.json"
# Schema version stamped into ``tree.json`` under the "format_version" key.
TREE_FORMAT_VERSION: Final[int] = 1
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TreeBuilder:
    """Writes the structural tree of a Document to ``tree.json``."""

    def build(self, document: Document, *, out_dir: Path) -> Path:
        """Serialize ``document.sections`` into ``out_dir/tree.json``.

        Args:
            document: The parsed document. Its `sections` must form a valid
                forest (unique ids; every non-root section's `parent` exists;
                every referenced `child` exists; no parent cycles).
            out_dir: Directory to write into. Created if it does not exist.

        Returns:
            The path to the written ``tree.json``.

        Raises:
            IndexBuildError: If the sections fail validation. Nothing is
                written in that case because validation runs first.
        """
        self._validate_tree(document)
        out_dir.mkdir(parents=True, exist_ok=True)
        path = out_dir / TREE_FILENAME

        payload: dict[str, Any] = {
            "format_version": TREE_FORMAT_VERSION,
            "doc_id": document.id,
            "source_path": str(document.source_path),
            "source_hash": document.source_hash,
            "indexed_at": document.indexed_at.isoformat(),
            "cairn_version": document.cairn_version,
            "sections": [_section_to_dict(s) for s in document.sections],
        }

        with path.open("w", encoding="utf-8") as fh:
            # Keys keep insertion order (sort_keys=False) so the layout above
            # is exactly what lands on disk.
            json.dump(payload, fh, ensure_ascii=False, indent=2, sort_keys=False)
            fh.write("\n")
        return path

    @staticmethod
    def _validate_tree(document: Document) -> None:
        """Check that ``document.sections`` is a well-formed forest.

        Verifies, in order: section ids are unique; every `parent` and every
        entry of `children` names a known section; following `parent` links
        from any section terminates (no cycles).

        Raises:
            IndexBuildError: On the first violation found.
        """
        parent_of: dict[str, str | None] = {}
        for section in document.sections:
            if section.id in parent_of:
                msg = f"duplicate section id in document: {section.id!r}"
                raise IndexBuildError(msg, details={"section_id": section.id})
            parent_of[section.id] = section.parent

        for section in document.sections:
            if section.parent is not None and section.parent not in parent_of:
                msg = (
                    f"section {section.id!r} references unknown parent "
                    f"{section.parent!r}"
                )
                raise IndexBuildError(
                    msg,
                    details={"section_id": section.id, "parent": section.parent},
                )
            for child in section.children:
                if child not in parent_of:
                    msg = (
                        f"section {section.id!r} references unknown child "
                        f"{child!r}"
                    )
                    raise IndexBuildError(
                        msg,
                        details={"section_id": section.id, "child": child},
                    )

        # A parent chain that revisits a section never reaches a root; walking
        # it (as an ancestors query does) would not terminate. All parents are
        # known to exist at this point, so the lookups below cannot fail.
        for section in document.sections:
            visited = {section.id}
            ancestor = section.parent
            while ancestor is not None:
                if ancestor in visited:
                    msg = f"section {section.id!r} is part of a parent cycle"
                    raise IndexBuildError(
                        msg,
                        details={"section_id": section.id, "parent": ancestor},
                    )
                visited.add(ancestor)
                ancestor = parent_of[ancestor]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class Tree:
    """Loaded tree index. Read-only queries against the structural backbone.

    Holds the sections in the order they were given, an id lookup table, and
    the subset of sections whose `parent` is ``None``.
    """

    def __init__(
        self,
        sections: tuple[SectionNode, ...],
        *,
        doc_id: str,
        source_hash: str,
        indexed_at: datetime,
    ) -> None:
        self._sections = sections
        self._by_id: dict[str, SectionNode] = {s.id: s for s in sections}
        self._roots: tuple[SectionNode, ...] = tuple(
            s for s in sections if s.parent is None
        )
        self.doc_id = doc_id
        self.source_hash = source_hash
        self.indexed_at = indexed_at

    # -- construction --------------------------------------------------------

    @classmethod
    def load(cls, doc_dir: Path) -> Tree:
        """Load ``tree.json`` from a document directory.

        Raises:
            IndexNotFoundError: If the file does not exist, or its
                ``format_version`` differs from ``TREE_FORMAT_VERSION``.
        """
        path = doc_dir / TREE_FILENAME
        if not path.exists():
            msg = f"tree.json not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(path)})

        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)

        format_version = payload.get("format_version")
        if format_version != TREE_FORMAT_VERSION:
            msg = (
                f"unsupported tree format version: {format_version!r} "
                f"(expected {TREE_FORMAT_VERSION})"
            )
            raise IndexNotFoundError(msg, details={"path": str(path)})

        sections = tuple(_section_from_dict(d) for d in payload["sections"])
        return cls(
            sections,
            doc_id=payload["doc_id"],
            source_hash=payload["source_hash"],
            indexed_at=datetime.fromisoformat(payload["indexed_at"]),
        )

    # -- queries -------------------------------------------------------------

    def get(self, section_id: str) -> SectionNode | None:
        """Look up a section by id. Returns ``None`` if absent."""
        return self._by_id.get(section_id)

    def require(self, section_id: str) -> SectionNode:
        """Look up a section by id, raising :class:`IndexNotFoundError`."""
        node = self.get(section_id)
        if node is None:
            msg = f"section not found: {section_id!r}"
            raise IndexNotFoundError(msg, details={"section_id": section_id})
        return node

    def __contains__(self, section_id: object) -> bool:
        return isinstance(section_id, str) and section_id in self._by_id

    def __len__(self) -> int:
        return len(self._sections)

    def __iter__(self) -> Iterator[SectionNode]:
        """Yield every section in document order."""
        return iter(self._sections)

    def roots(self) -> tuple[SectionNode, ...]:
        """Top-level sections (those with `parent is None`)."""
        return self._roots

    def children_of(self, section_id: str) -> tuple[SectionNode, ...]:
        """Direct children of a section, in document order.

        Raises:
            IndexNotFoundError: If ``section_id`` is unknown.
        """
        node = self.require(section_id)
        return tuple(self._by_id[cid] for cid in node.children)

    def descendants_of(self, section_id: str) -> Iterator[SectionNode]:
        """Depth-first traversal of a section's descendants (excluding self).

        The id is validated when this method is called, not when the returned
        iterator is first advanced, so an unknown id fails at the call site.

        Raises:
            IndexNotFoundError: If ``section_id`` is unknown.
        """
        node = self.require(section_id)
        return self._walk_descendants(node)

    def _walk_descendants(self, node: SectionNode) -> Iterator[SectionNode]:
        """Yield the descendants of ``node`` in pre-order."""
        # Children are pushed reversed so the first child is popped first.
        stack: list[str] = list(reversed(node.children))
        while stack:
            current = self._by_id[stack.pop()]
            yield current
            stack.extend(reversed(current.children))

    def ancestors_of(self, section_id: str) -> Iterator[SectionNode]:
        """Walk parents from the section up to the root (excluding self).

        The id is validated when this method is called, not when the returned
        iterator is first advanced, so an unknown id fails at the call site.

        Raises:
            IndexNotFoundError: If ``section_id`` is unknown.
        """
        node = self.require(section_id)
        return self._walk_ancestors(node)

    def _walk_ancestors(self, node: SectionNode) -> Iterator[SectionNode]:
        """Yield the parent of ``node``, then that parent's parent, and so on."""
        current = node.parent
        while current is not None:
            parent_node = self._by_id[current]
            yield parent_node
            current = parent_node.parent

    def outline(
        self,
        *,
        depth: int = 2,
        focus: str | None = None,
    ) -> list[dict[str, Any]]:
        """Return a nested outline suitable for the ``outline`` MCP tool.

        Each node has: ``id``, ``title``, ``level``, ``children`` (recursively),
        plus ``truncated: True`` when descendants exist beyond ``depth``.
        Summaries are **not** attached here — that is the MCP tool's job after
        joining with the Summaries sub-index.

        Args:
            depth: How many heading levels to include, in ``[1, 6]``. Without
                ``focus`` it is measured from level 0; with ``focus`` it is
                measured from the level just above the focused section.
            focus: Optional section id; when given, the outline contains only
                that section's subtree.

        Raises:
            IndexNotFoundError: If ``depth`` is outside ``[1, 6]`` or ``focus``
                names an unknown section.
        """
        if depth < 1 or depth > 6:
            msg = f"depth must be in [1, 6]; got {depth}"
            raise IndexNotFoundError(msg)

        if focus is None:
            roots = self._roots
            base_level = 0
        else:
            focused = self.require(focus)
            roots = (focused,)
            base_level = focused.level - 1

        return [self._outline_node(s, depth, base_level) for s in roots]

    def _outline_node(
        self,
        node: SectionNode,
        depth: int,
        base_level: int,
    ) -> dict[str, Any]:
        """Build the outline entry for ``node``, recursing while depth remains."""
        # Levels still available below this node; <= 0 means stop descending.
        remaining = depth - (node.level - base_level)
        children_payload: list[dict[str, Any]] = []
        truncated = False
        if remaining > 0 and node.children:
            for child_id in node.children:
                child = self._by_id[child_id]
                children_payload.append(self._outline_node(child, depth, base_level))
        elif node.children:
            truncated = True

        payload: dict[str, Any] = {
            "id": node.id,
            "title": node.title,
            "level": node.level,
            "children": children_payload,
        }
        # The key is only present when something was cut off.
        if truncated:
            payload["truncated"] = True
        return payload
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
# ---------------------------------------------------------------------------
|
|
246
|
+
# (de)serialization
|
|
247
|
+
# ---------------------------------------------------------------------------
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _section_to_dict(s: SectionNode) -> dict[str, Any]:
    """Flatten one section into the JSON-ready mapping stored in ``tree.json``.

    Tuple-like fields (`children`, `path`) become lists and the span becomes
    a ``{"start", "end"}`` mapping.
    """
    record: dict[str, Any] = {
        "id": s.id,
        "title": s.title,
        "level": s.level,
        "parent": s.parent,
    }
    record["children"] = list(s.children)
    record["span"] = {"start": s.span.start, "end": s.span.end}
    record["path"] = list(s.path)
    record["raw_text"] = s.raw_text
    return record
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _section_from_dict(d: dict[str, Any]) -> SectionNode:
    """Rebuild a :class:`SectionNode` from one entry of the ``sections`` list.

    Inverse of the serialization used when writing ``tree.json``: list fields
    are turned back into tuples and the span mapping into a :class:`Span`.
    A missing key surfaces as ``KeyError``.
    """
    span_data = d["span"]
    fields: dict[str, Any] = {
        key: d[key] for key in ("id", "title", "level", "parent")
    }
    fields["children"] = tuple(d["children"])
    fields["span"] = Span(start=span_data["start"], end=span_data["end"])
    fields["path"] = tuple(d["path"])
    fields["raw_text"] = d["raw_text"]
    return SectionNode(**fields)
|
cairn/index/vectors.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""Vectors sub-index — dense embeddings over LanceDB.
|
|
2
|
+
|
|
3
|
+
Storage layout::
|
|
4
|
+
|
|
5
|
+
<doc_dir>/
|
|
6
|
+
├── vectors.lance/ # LanceDB connect root
|
|
7
|
+
│ └── data.lance/ # table holding (id, vector)
|
|
8
|
+
└── vectors_manifest.json # embedder name, dim, build metadata
|
|
9
|
+
|
|
10
|
+
LanceDB is the v0.1 default per ARCHITECTURE.md §7. We use the sync API and
|
|
11
|
+
wrap blocking calls in ``asyncio.to_thread`` to satisfy our async-by-default
|
|
12
|
+
public surface without adopting LanceDB's still-evolving native async API.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import json
|
|
19
|
+
import math
|
|
20
|
+
import re
|
|
21
|
+
import shutil
|
|
22
|
+
from datetime import UTC, datetime
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Any, Final
|
|
25
|
+
|
|
26
|
+
import lancedb
|
|
27
|
+
import pyarrow as pa
|
|
28
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
29
|
+
|
|
30
|
+
from cairn.core.errors import IndexBuildError, IndexNotFoundError
|
|
31
|
+
from cairn.core.types import Document, SectionNode
|
|
32
|
+
from cairn.embed.base import Embedder
|
|
33
|
+
|
|
34
|
+
# Directory (inside a document directory) that LanceDB connects to.
VECTORS_DB_DIRNAME: Final[str] = "vectors.lance"
# Name of the single table holding the (id, vector) rows.
VECTORS_TABLE_NAME: Final[str] = "data"
# Sidecar JSON file recording embedder name, dimension and build metadata.
VECTORS_MANIFEST_FILENAME: Final[str] = "vectors_manifest.json"
# Schema version stamped into the manifest under "format_version".
VECTORS_FORMAT_VERSION: Final[int] = 1

# Accepted shape of a search scope prefix: starts with a lowercase letter or
# digit, followed by lowercase letters, digits, '_', '/' or '-'.
_SCOPE_PATTERN = re.compile(r"^[a-z0-9][a-z0-9_/-]*$")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class VectorHit(BaseModel):
    """A single match returned by a vector search: section id plus score."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Id of the matched section.
    id: str
    # Similarity score, constrained to the closed interval [0, 1].
    score: float = Field(ge=0.0, le=1.0)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class VectorEntry(BaseModel):
    """A stored section vector as read back from the index."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Id of the section the vector belongs to.
    id: str
    # The embedding components.
    vector: list[float]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def embedding_text(node: SectionNode) -> str:
    """Build the string that gets embedded for ``node``.

    The title is always part of the text so heading information enters the
    embedding. When the whitespace-stripped body is empty the title is
    returned on its own; otherwise title and body are joined by a blank line.
    """
    stripped = node.raw_text.strip()
    return f"{node.title}\n\n{stripped}" if stripped else node.title
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def l2_normalize(vec: list[float]) -> list[float]:
    """Scale ``vec`` to unit Euclidean length and return it as a new list.

    A vector whose norm is exactly zero (including the empty vector) cannot be
    scaled, so an unchanged copy is returned instead.
    """
    magnitude = math.sqrt(sum([component * component for component in vec]))
    if magnitude != 0.0:
        return [component / magnitude for component in vec]
    return list(vec)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class VectorBuilder:
    """Embed and persist section-level vectors for a Document."""

    def __init__(
        self,
        embedder: Embedder,
        *,
        batch_size: int = 32,
    ) -> None:
        """Store the embedder and the number of texts sent per embed call.

        Raises:
            IndexBuildError: If ``batch_size`` is less than 1.
        """
        if batch_size < 1:
            raise IndexBuildError(f"batch_size must be >= 1; got {batch_size}")
        self.embedder = embedder
        self.batch_size = batch_size

    async def build(self, document: Document, *, out_dir: Path) -> Path:
        """Embed every section and write ``vectors.lance/`` + manifest.

        Sections are embedded in batches of ``batch_size``; each vector is
        L2-normalized before being stored. The table write runs in a worker
        thread via ``asyncio.to_thread``.

        Returns the path to the manifest file.

        Raises:
            IndexBuildError: If the embedder returns the wrong number of
                vectors for a batch, or a vector of unexpected dimension.
        """
        out_dir.mkdir(parents=True, exist_ok=True)

        ids: list[str] = []
        texts: list[str] = []
        for section in document.sections:
            ids.append(section.id)
            texts.append(embedding_text(section))

        vectors: list[list[float]] = []
        for start in range(0, len(texts), self.batch_size):
            chunk = texts[start : start + self.batch_size]
            vectors.extend(await self._embed_batch(chunk))

        await asyncio.to_thread(
            self._write_table, out_dir / VECTORS_DB_DIRNAME, ids, vectors
        )

        manifest_path = out_dir / VECTORS_MANIFEST_FILENAME
        generated_at = datetime.now(UTC).isoformat()
        manifest = {
            "format_version": VECTORS_FORMAT_VERSION,
            "doc_id": document.id,
            "embedder": self.embedder.name,
            "dim": self.embedder.dim,
            "section_count": len(ids),
            "generated_at": generated_at,
        }
        with manifest_path.open("w", encoding="utf-8") as fh:
            json.dump(manifest, fh, ensure_ascii=False, indent=2)
            fh.write("\n")
        return manifest_path

    async def _embed_batch(self, batch: list[str]) -> list[list[float]]:
        """Embed one batch, check count and dimension, and normalize each vector."""
        raw = await self.embedder.embed(batch)
        if len(raw) != len(batch):
            raise IndexBuildError(
                f"embedder returned {len(raw)} vectors for batch of {len(batch)}"
            )
        normalized: list[list[float]] = []
        for vec in raw:
            if len(vec) != self.embedder.dim:
                raise IndexBuildError(
                    f"embedder returned dim={len(vec)} but expected "
                    f"dim={self.embedder.dim}"
                )
            normalized.append(l2_normalize(vec))
        return normalized

    def _write_table(
        self,
        db_dir: Path,
        ids: list[str],
        vectors: list[list[float]],
    ) -> None:
        """Recreate the LanceDB table at ``db_dir`` and insert all rows."""
        # Full rebuild: drop whatever an earlier build left behind so the
        # table is created against a clean directory.
        if db_dir.exists():
            shutil.rmtree(db_dir)

        vector_type = pa.list_(pa.float32(), self.embedder.dim)
        schema = pa.schema(
            [pa.field("id", pa.string()), pa.field("vector", vector_type)]
        )
        table = lancedb.connect(str(db_dir)).create_table(
            VECTORS_TABLE_NAME, schema=schema
        )
        # The table is still created (empty) when there are no sections.
        if not ids:
            return
        table.add(
            [
                {"id": section_id, "vector": vector}
                for section_id, vector in zip(ids, vectors, strict=True)
            ]
        )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class Vectors:
    """Loaded vectors index. Cosine-similarity search via LanceDB."""

    def __init__(
        self,
        table: Any,
        *,
        doc_id: str,
        embedder: str,
        dim: int,
    ) -> None:
        """Wrap an opened LanceDB table together with its manifest metadata."""
        self._table = table
        self.doc_id = doc_id
        self.embedder = embedder
        self.dim = dim

    @classmethod
    def load(cls, doc_dir: Path) -> Vectors:
        """Load vectors index from a document directory.

        Raises:
            IndexNotFoundError: If the manifest or the ``vectors.lance``
                directory is missing, or the manifest's ``format_version``
                differs from ``VECTORS_FORMAT_VERSION``.
        """
        manifest_path = doc_dir / VECTORS_MANIFEST_FILENAME
        db_dir = doc_dir / VECTORS_DB_DIRNAME
        if not manifest_path.exists():
            msg = f"vectors manifest not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(manifest_path)})
        if not db_dir.exists():
            msg = f"vectors.lance directory not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(db_dir)})

        with manifest_path.open("r", encoding="utf-8") as fh:
            manifest = json.load(fh)

        version = manifest.get("format_version")
        if version != VECTORS_FORMAT_VERSION:
            msg = (
                f"unsupported vectors format version: {version!r} "
                f"(expected {VECTORS_FORMAT_VERSION})"
            )
            raise IndexNotFoundError(msg, details={"path": str(manifest_path)})

        db = lancedb.connect(str(db_dir))
        table = db.open_table(VECTORS_TABLE_NAME)
        return cls(
            table,
            doc_id=manifest["doc_id"],
            embedder=manifest["embedder"],
            dim=manifest["dim"],
        )

    async def search(
        self,
        query: list[float],
        *,
        k: int = 8,
        scope_prefix: str | None = None,
    ) -> list[VectorHit]:
        """Return up to ``k`` nearest sections by cosine similarity.

        When ``scope_prefix`` is given, results are restricted to sections
        whose id equals the prefix or begins with ``f"{prefix}/"``.

        The query is L2-normalized before searching, and the blocking LanceDB
        call runs in a worker thread.

        Raises:
            IndexBuildError: If ``k < 1``, the query length differs from the
                index dimension, or ``scope_prefix`` contains characters
                outside lowercase alphanumerics, '-', '_' and '/'.
        """
        if k < 1:
            msg = f"k must be >= 1; got {k}"
            raise IndexBuildError(msg)
        if len(query) != self.dim:
            msg = f"query dim {len(query)} != index dim {self.dim}"
            raise IndexBuildError(msg)
        # The prefix is interpolated into a SQL predicate below, so its
        # character set is restricted here.
        if scope_prefix is not None and not _SCOPE_PATTERN.match(scope_prefix):
            msg = (
                f"invalid scope_prefix {scope_prefix!r}; only lowercase "
                "alphanumeric, '-', '_', '/' allowed"
            )
            raise IndexBuildError(msg)

        normalized = l2_normalize(query)
        return await asyncio.to_thread(
            self._sync_search, normalized, k, scope_prefix
        )

    def _sync_search(
        self,
        vec: list[float],
        k: int,
        scope_prefix: str | None,
    ) -> list[VectorHit]:
        """Run the blocking LanceDB query and convert rows to hits.

        ``scope_prefix`` is expected to have been validated by ``search``.
        """
        q = self._table.search(vec).distance_type("cosine")
        if scope_prefix is not None:
            # Prefix match expressed as a string range instead of LIKE:
            # '_' is a legal prefix character but a single-character wildcard
            # in LIKE, so "a_b" would also match ids under "a-b/". "0" is the
            # character immediately after "/", hence the ids that start with
            # "<prefix>/" are exactly those in ["<prefix>/", "<prefix>0").
            lower = f"{scope_prefix}/"
            upper = f"{scope_prefix}0"
            predicate = (
                f"id = '{scope_prefix}' OR (id >= '{lower}' AND id < '{upper}')"
            )
            q = q.where(predicate, prefilter=True)
        rows = q.limit(k).to_list()

        hits: list[VectorHit] = []
        for row in rows:
            distance = float(row["_distance"])
            # Convert distance to similarity and clamp into [0, 1], the range
            # VectorHit.score accepts.
            score = max(0.0, min(1.0, 1.0 - distance))
            hits.append(VectorHit(id=str(row["id"]), score=score))
        return hits

    async def count(self) -> int:
        """Total number of indexed sections."""
        return await asyncio.to_thread(self._table.count_rows)

    async def entries(self) -> list[VectorEntry]:
        """Return every stored vector.

        Repo-scoped search uses this to build a process-local flat index once,
        then answers repeated MCP queries without reopening every per-document
        LanceDB table.
        """
        return await asyncio.to_thread(self._sync_entries)

    def _sync_entries(self) -> list[VectorEntry]:
        """Read the whole table and convert each row to a ``VectorEntry``."""
        rows = self._table.to_arrow().to_pylist()
        return [
            VectorEntry(
                id=str(row["id"]),
                vector=[float(value) for value in row["vector"]],
            )
            for row in rows
        ]
|