docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""OpenAI-compatible HTTP embedder.
|
|
2
|
+
|
|
3
|
+
Works with any endpoint that implements the OpenAI ``/v1/embeddings``
|
|
4
|
+
contract: OpenAI itself, Ollama (``http://localhost:11434/v1``), vLLM,
|
|
5
|
+
Together, Anyscale, etc.
|
|
6
|
+
|
|
7
|
+
Default configuration points at a local Ollama instance running the
|
|
8
|
+
``nomic-embed-text`` model (768 dims) — chosen for the same reason as the
|
|
9
|
+
summarizer default: zero API keys, mature stack, runs on a laptop.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import httpx
|
|
18
|
+
|
|
19
|
+
from cairn.core.errors import IndexBuildError
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class OpenAICompatibleEmbedder:
    """Embeddings client for endpoints that speak the OpenAI ``/embeddings`` API.

    Each :meth:`embed` call opens a short-lived ``httpx.AsyncClient``, POSTs
    the whole batch to ``{base_url}/embeddings`` and validates the reply.
    Transport errors, HTTP error statuses, undecodable bodies and replies of
    the wrong shape are all reported as :class:`IndexBuildError`.
    """

    # HTTP statuses that are retried while attempts remain.
    _RETRYABLE_STATUS = (429, 500, 502, 503, 504)

    def __init__(
        self,
        *,
        base_url: str = "http://localhost:11434/v1",
        model: str = "nomic-embed-text",
        dim: int = 768,
        api_key: str | None = None,
        timeout: float = 60.0,
        max_retries: int = 2,
        retry_base_delay: float = 0.5,
    ) -> None:
        """Store the client configuration.

        Args:
            base_url: Endpoint root; trailing slashes are stripped.
            model: Model identifier sent in every request.
            dim: Vector length every returned embedding must have.
            api_key: Sent as a bearer token when truthy; omitted otherwise.
            timeout: Passed to ``httpx.AsyncClient`` as its timeout.
            max_retries: Extra attempts after the first request.
            retry_base_delay: Base of the exponential back-off; ``0``
                disables sleeping between attempts.

        Raises:
            ValueError: If ``dim < 1``, ``max_retries < 0`` or
                ``retry_base_delay < 0``.
        """
        if dim < 1:
            msg = f"dim must be >= 1; got {dim}"
            raise ValueError(msg)
        if max_retries < 0:
            msg = f"max_retries must be >= 0; got {max_retries}"
            raise ValueError(msg)
        if retry_base_delay < 0:
            msg = f"retry_base_delay must be >= 0; got {retry_base_delay}"
            raise ValueError(msg)
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.dim = dim
        self.api_key = api_key
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_base_delay = retry_base_delay
        self.name = f"openai-compat:{model}"

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` in a single request and return one vector per input.

        An empty ``texts`` returns ``[]`` without any network traffic.

        Raises:
            IndexBuildError: If the request fails, the body is not valid
                JSON, the payload does not have the ``data[*].embedding``
                shape, the number of vectors differs from the number of
                inputs, or any vector's length differs from ``self.dim``.
        """
        if not texts:
            return []

        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        payload: dict[str, Any] = {
            "model": self.model,
            "input": texts,
        }

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await self._post_with_retries(client, payload, headers)
            try:
                data = response.json()
            except ValueError as exc:
                # json.JSONDecodeError and UnicodeDecodeError both derive from
                # ValueError; report them like every other endpoint failure
                # instead of leaking a raw decoding error to the caller.
                msg = "embedder response was not valid JSON"
                raise IndexBuildError(
                    msg,
                    details={
                        "status": response.status_code,
                        "model": self.model,
                        "base_url": self.base_url,
                        "body": response.text[:200],
                    },
                ) from exc

        try:
            vectors = [list(item["embedding"]) for item in data["data"]]
        except (KeyError, TypeError, IndexError) as exc:
            msg = "embedder response did not match OpenAI embeddings shape"
            raise IndexBuildError(msg, details={"response": data}) from exc

        if len(vectors) != len(texts):
            msg = (
                f"embedder returned {len(vectors)} vectors for "
                f"{len(texts)} inputs"
            )
            raise IndexBuildError(msg)
        for i, vec in enumerate(vectors):
            if len(vec) != self.dim:
                msg = (
                    f"embedder returned dim={len(vec)} but client expects "
                    f"dim={self.dim} (model {self.model!r}, index {i})"
                )
                raise IndexBuildError(msg)

        return vectors

    async def _post_with_retries(
        self,
        client: httpx.AsyncClient,
        payload: dict[str, Any],
        headers: dict[str, str],
    ) -> httpx.Response:
        """POST ``payload`` to the embeddings URL, retrying transient failures.

        Up to ``max_retries + 1`` attempts are made. ``httpx.HTTPError`` and
        the statuses in ``_RETRYABLE_STATUS`` are retried while attempts
        remain; any other status ``>= 400`` fails immediately.

        Raises:
            IndexBuildError: When the final attempt raises or returns an
                error status.
        """
        last_exc: httpx.HTTPError | None = None
        for attempt in range(self.max_retries + 1):
            try:
                response = await client.post(
                    f"{self.base_url}/embeddings",
                    json=payload,
                    headers=headers,
                )
            except httpx.HTTPError as exc:
                last_exc = exc
                if attempt < self.max_retries:
                    await self._sleep_before_retry(attempt)
                    continue
                msg = f"embedder request failed: {exc}"
                raise IndexBuildError(
                    msg,
                    details={
                        "model": self.model,
                        "base_url": self.base_url,
                        "error_type": type(exc).__name__,
                        "attempts": attempt + 1,
                    },
                ) from exc

            if (
                response.status_code in self._RETRYABLE_STATUS
                and attempt < self.max_retries
            ):
                await self._sleep_before_retry(attempt)
                continue
            if response.status_code >= 400:
                msg = (
                    f"embedder endpoint returned HTTP {response.status_code}: "
                    f"{response.text[:200]}"
                )
                raise IndexBuildError(
                    msg,
                    details={
                        "status": response.status_code,
                        "model": self.model,
                        "base_url": self.base_url,
                        "attempts": attempt + 1,
                    },
                )
            return response

        # Every loop iteration returns, raises or continues, and the last
        # iteration cannot continue; this is a defensive fallback only.
        msg = "embedder request failed without a response"
        raise IndexBuildError(
            msg,
            details={
                "model": self.model,
                "base_url": self.base_url,
                "error_type": type(last_exc).__name__ if last_exc else None,
            },
        )

    async def _sleep_before_retry(self, attempt: int) -> None:
        """Sleep ``retry_base_delay * 2**attempt`` seconds; no-op when the base is 0."""
        if self.retry_base_delay == 0:
            return
        await asyncio.sleep(self.retry_base_delay * (2**attempt))
|
cairn/engine/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Engine layer — orchestrates the three sub-index builders + top-level manifest."""
|
|
2
|
+
|
|
3
|
+
from cairn.engine.indexer import Indexer, IndexResult
|
|
4
|
+
from cairn.engine.manifest import (
|
|
5
|
+
MANIFEST_FILENAME,
|
|
6
|
+
MANIFEST_FORMAT_VERSION,
|
|
7
|
+
Manifest,
|
|
8
|
+
read_manifest,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"MANIFEST_FILENAME",
|
|
13
|
+
"MANIFEST_FORMAT_VERSION",
|
|
14
|
+
"IndexResult",
|
|
15
|
+
"Indexer",
|
|
16
|
+
"Manifest",
|
|
17
|
+
"read_manifest",
|
|
18
|
+
]
|
cairn/engine/indexer.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
"""Indexer — single entry point that builds all three v0.1 sub-indexes.
|
|
2
|
+
|
|
3
|
+
Parses a source document, runs ``TreeBuilder`` synchronously, then
|
|
4
|
+
``SummaryBuilder`` and ``VectorBuilder`` asynchronously, and finally writes
|
|
5
|
+
the top-level ``manifest.json`` that ties the artifacts together.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Callable, Sequence
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from datetime import UTC, datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from cairn import __version__
|
|
16
|
+
from cairn.core.errors import IndexNotFoundError
|
|
17
|
+
from cairn.core.types import Document
|
|
18
|
+
from cairn.embed.base import Embedder
|
|
19
|
+
from cairn.engine.manifest import (
|
|
20
|
+
MANIFEST_FILENAME,
|
|
21
|
+
MANIFEST_FORMAT_VERSION,
|
|
22
|
+
Manifest,
|
|
23
|
+
SubIndexEntry,
|
|
24
|
+
read_manifest,
|
|
25
|
+
write_manifest,
|
|
26
|
+
)
|
|
27
|
+
from cairn.entity.base import EntityExtractor
|
|
28
|
+
from cairn.index.entities import (
|
|
29
|
+
ENTITIES_FILENAME,
|
|
30
|
+
ENTITIES_FORMAT_VERSION,
|
|
31
|
+
Entities,
|
|
32
|
+
EntityBuilder,
|
|
33
|
+
)
|
|
34
|
+
from cairn.index.summaries import (
|
|
35
|
+
SUMMARIES_FILENAME,
|
|
36
|
+
SUMMARIES_FORMAT_VERSION,
|
|
37
|
+
SummaryBuilder,
|
|
38
|
+
)
|
|
39
|
+
from cairn.index.tree import TREE_FILENAME, TreeBuilder
|
|
40
|
+
from cairn.index.vectors import (
|
|
41
|
+
VECTORS_FORMAT_VERSION,
|
|
42
|
+
VECTORS_MANIFEST_FILENAME,
|
|
43
|
+
VectorBuilder,
|
|
44
|
+
)
|
|
45
|
+
from cairn.index.xrefs import (
|
|
46
|
+
XREFS_FILENAME,
|
|
47
|
+
XREFS_FORMAT_VERSION,
|
|
48
|
+
XRefBuilder,
|
|
49
|
+
)
|
|
50
|
+
from cairn.ingest.base import Parser
|
|
51
|
+
from cairn.summarize.base import Summarizer, SummaryLevel
|
|
52
|
+
from cairn.summarize.cache import SummaryCache
|
|
53
|
+
from cairn.xref.base import XRefExtractor
|
|
54
|
+
|
|
55
|
+
# Builder version recorded for the "tree" sub-index entry in the manifest; an
# existing manifest whose tree entry carries a different value is not reused.
_TREE_BUILDER_VERSION = 1
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass(frozen=True)
class IndexResult:
    """What :meth:`Indexer.index_path` reports back to its caller.

    A ``rebuilt`` value of ``False`` means the manifest already present in the
    output directory matched the source hash and every producer fingerprint,
    so nothing was rebuilt and the existing index was left in place.
    """

    # Location of the manifest describing the (new or reused) index.
    manifest_path: Path
    # True when the builders actually ran; False for a no-op.
    rebuilt: bool
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class Indexer:
    """Runs the sub-index builders for one document and writes its manifest.

    The tree, summaries and vectors sub-indexes are produced on every build.
    The entities and xrefs sub-indexes are produced only when the matching
    extractor was handed to the constructor.
    """

    def __init__(
        self,
        *,
        parser: Parser,
        summarizer: Summarizer,
        embedder: Embedder,
        entity_extractor: EntityExtractor | None = None,
        xref_extractor: XRefExtractor | None = None,
        summary_cache: SummaryCache | None = None,
        summary_concurrency: int = 4,
        embed_batch_size: int = 32,
        progress: Callable[[str], None] | None = None,
    ) -> None:
        """Remember the collaborators and tuning knobs used by later builds.

        ``progress``, when given, receives a short status string at each
        build milestone.
        """
        self.parser = parser
        self.summarizer = summarizer
        self.embedder = embedder
        self.entity_extractor = entity_extractor
        self.xref_extractor = xref_extractor
        self.summary_cache = summary_cache
        self.summary_concurrency = summary_concurrency
        self.embed_batch_size = embed_batch_size
        self.progress = progress

    async def index_path(
        self,
        source: Path,
        *,
        out_dir: Path,
        doc_id: str | None = None,
        summary_levels: Sequence[SummaryLevel] = (
            SummaryLevel.GIST,
            SummaryLevel.SYNOPSIS,
            SummaryLevel.DIGEST,
        ),
        force: bool = False,
    ) -> IndexResult:
        """Parse ``source`` and build its index under ``out_dir``.

        The source is always parsed. Unless ``force`` is set, the manifest
        already in ``out_dir`` is then compared against the parsed document's
        source hash and the configured producers; on a match the existing
        index is reused and the result has ``rebuilt=False``. Otherwise all
        builders run and the result has ``rebuilt=True``.
        """
        document = self.parser.parse(source, doc_id=doc_id)

        # ``and`` short-circuits, so the manifest is not consulted when forced.
        reusable = not force and _existing_matches(
            out_dir,
            document.source_hash,
            summarizer=self.summarizer,
            embedder=self.embedder,
            summary_levels=summary_levels,
            entity_extractor=self.entity_extractor,
            xref_extractor=self.xref_extractor,
        )
        if reusable:
            return IndexResult(
                manifest_path=out_dir / MANIFEST_FILENAME,
                rebuilt=False,
            )

        written = await self.index_document(
            document,
            out_dir=out_dir,
            summary_levels=summary_levels,
        )
        return IndexResult(manifest_path=written, rebuilt=True)

    async def index_document(
        self,
        document: Document,
        *,
        out_dir: Path,
        summary_levels: Sequence[SummaryLevel] = (
            SummaryLevel.GIST,
            SummaryLevel.SYNOPSIS,
            SummaryLevel.DIGEST,
        ),
    ) -> Path:
        """Build every configured sub-index for ``document``; return the manifest path.

        Builders run in a fixed order: tree, summaries, vectors, then
        entities and xrefs when their extractors are configured. The
        manifest is written last.
        """
        out_dir.mkdir(parents=True, exist_ok=True)

        # --- tree ---------------------------------------------------------
        self._emit("tree: writing")
        TreeBuilder().build(document, out_dir=out_dir)
        self._emit("tree: done")

        # --- summaries ----------------------------------------------------
        self._emit("summaries: starting")

        def report_summary_progress(done, total):
            return self._emit(f"summaries: {done}/{total}")

        summary_builder = SummaryBuilder(
            self.summarizer,
            cache=self.summary_cache,
            concurrency=self.summary_concurrency,
            progress=report_summary_progress,
        )
        await summary_builder.build(
            document, out_dir=out_dir, levels=summary_levels
        )
        self._emit("summaries: done")

        # --- vectors ------------------------------------------------------
        self._emit("vectors: starting")
        vector_builder = VectorBuilder(
            self.embedder, batch_size=self.embed_batch_size
        )
        await vector_builder.build(document, out_dir=out_dir)
        self._emit("vectors: done")

        tree_entry = SubIndexEntry(
            path=TREE_FILENAME,
            builder_version=_TREE_BUILDER_VERSION,
        )
        summaries_entry = SubIndexEntry(
            path=SUMMARIES_FILENAME,
            builder_version=SUMMARIES_FORMAT_VERSION,
            model=self.summarizer.name,
            levels=[lvl.value for lvl in summary_levels],
        )
        vectors_entry = SubIndexEntry(
            path=VECTORS_MANIFEST_FILENAME,
            builder_version=VECTORS_FORMAT_VERSION,
            embedder=self.embedder.name,
            dim=self.embedder.dim,
        )
        subindexes: dict[str, SubIndexEntry] = {
            "tree": tree_entry,
            "summaries": summaries_entry,
            "vectors": vectors_entry,
        }

        # --- entities (optional) -------------------------------------------
        entities_reader: Entities | None = None
        entity_extractor = self.entity_extractor
        if entity_extractor is not None:
            self._emit("entities: starting")
            entity_builder = EntityBuilder(entity_extractor)
            await entity_builder.build(document, out_dir=out_dir)
            self._emit("entities: done")
            subindexes["entities"] = SubIndexEntry(
                path=ENTITIES_FILENAME,
                builder_version=ENTITIES_FORMAT_VERSION,
                extractor=entity_extractor.name,
            )
            # Read the freshly written sub-index back from disk so the xref
            # step below works from its canonical on-disk form.
            entities_reader = Entities.load(out_dir)

        # --- xrefs (optional) ----------------------------------------------
        xref_extractor = self.xref_extractor
        if xref_extractor is not None:
            self._emit("xrefs: starting")
            xref_builder = XRefBuilder(xref_extractor)
            await xref_builder.build(
                document, out_dir=out_dir, entities=entities_reader
            )
            self._emit("xrefs: done")
            subindexes["xrefs"] = SubIndexEntry(
                path=XREFS_FILENAME,
                builder_version=XREFS_FORMAT_VERSION,
                extractor=xref_extractor.name,
            )

        # --- manifest -------------------------------------------------------
        manifest = Manifest(
            format_version=MANIFEST_FORMAT_VERSION,
            doc_id=document.id,
            cairn_version=__version__,
            source_path=str(document.source_path),
            source_hash=document.source_hash,
            indexed_at=datetime.now(UTC),
            subindexes=subindexes,
        )
        self._emit("manifest: writing")
        manifest_path = write_manifest(out_dir, manifest)
        self._emit("manifest: done")
        return manifest_path

    def _emit(self, message: str) -> None:
        """Forward ``message`` to the progress callback, if one was configured."""
        callback = self.progress
        if callback is None:
            return
        callback(message)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _existing_matches(
    out_dir: Path,
    source_hash: str,
    *,
    summarizer: Summarizer,
    embedder: Embedder,
    summary_levels: Sequence[SummaryLevel],
    entity_extractor: EntityExtractor | None,
    xref_extractor: XRefExtractor | None,
) -> bool:
    """Return ``True`` when the existing index matches source and producers.

    The manifest in ``out_dir`` must load, carry the same ``source_hash``,
    and have tree/summaries/vectors entries whose builder versions and
    producer identifiers equal the requested ones. Entities and xrefs
    entries are only checked when the corresponding extractor is given.

    A missing, unsupported, undecodable or schema-invalid manifest counts as
    "no match" so the caller rebuilds instead of crashing.
    """
    try:
        existing = read_manifest(out_dir)
    except (IndexNotFoundError, ValueError):
        # read_manifest raises IndexNotFoundError for a missing file or an
        # unsupported format version. ValueError covers json.JSONDecodeError
        # and UnicodeDecodeError from a truncated/corrupt file, as well as
        # pydantic's ValidationError (a ValueError subclass).
        return False
    if existing.source_hash != source_hash:
        return False

    tree = existing.subindexes.get("tree")
    if tree is None or tree.builder_version != _TREE_BUILDER_VERSION:
        return False

    summaries = existing.subindexes.get("summaries")
    if (
        summaries is None
        or summaries.builder_version != SUMMARIES_FORMAT_VERSION
        or summaries.model != summarizer.name
        or summaries.levels != [lvl.value for lvl in summary_levels]
    ):
        return False

    vectors = existing.subindexes.get("vectors")
    if (
        vectors is None
        or vectors.builder_version != VECTORS_FORMAT_VERSION
        or vectors.embedder != embedder.name
        or vectors.dim != embedder.dim
    ):
        return False

    if entity_extractor is not None:
        entities = existing.subindexes.get("entities")
        if (
            entities is None
            or entities.builder_version != ENTITIES_FORMAT_VERSION
            or entities.extractor != entity_extractor.name
        ):
            return False

    if xref_extractor is not None:
        xrefs = existing.subindexes.get("xrefs")
        if (
            xrefs is None
            or xrefs.builder_version != XREFS_FORMAT_VERSION
            or xrefs.extractor != xref_extractor.name
        ):
            return False

    return True
|
cairn/engine/manifest.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Top-level document manifest.
|
|
2
|
+
|
|
3
|
+
Per ARCHITECTURE.md §5, a document directory holds one ``manifest.json`` that
|
|
4
|
+
records source provenance, sub-index file pointers, builder versions, and
|
|
5
|
+
the model identifiers that produced each artifact. The manifest is the
|
|
6
|
+
contract: any file it references must exist; orphans are reapable.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Final
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
17
|
+
|
|
18
|
+
from cairn.core.errors import IndexNotFoundError
|
|
19
|
+
|
|
20
|
+
# File name of the top-level manifest inside a document directory.
MANIFEST_FILENAME: Final = "manifest.json"
|
|
21
|
+
# The only manifest format version read_manifest accepts; any other value is rejected.
MANIFEST_FORMAT_VERSION: Final = 1
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SubIndexEntry(BaseModel):
    """Manifest record pointing at a single sub-index artifact.

    Instances are immutable and reject unknown fields.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # --- required ---------------------------------------------------------
    path: str = Field(description="path relative to the document directory")
    builder_version: int = Field(ge=1)

    # --- provenance (each set only by the sub-indexes it applies to) -------
    model: str | None = None
    embedder: str | None = None
    extractor: str | None = None
    dim: int | None = None
    levels: list[str] | None = None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Manifest(BaseModel):
    """Top-level record tying a document's source to its sub-index artifacts.

    Instances are immutable and reject unknown fields.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Schema version of this manifest file.
    format_version: int
    # Identifier of the indexed document.
    doc_id: str
    # Version of the package that wrote the manifest.
    cairn_version: str
    # Source provenance: where the document came from and its content hash.
    source_path: str
    source_hash: str
    # Timestamp recorded when the manifest was created.
    indexed_at: datetime
    # Sub-index name -> pointer entry.
    subindexes: dict[str, SubIndexEntry]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def write_manifest(out_dir: Path, manifest: Manifest) -> Path:
    """Write ``manifest.json`` into ``out_dir`` deterministically.

    The JSON is first written to a sibling temporary file and then moved
    over the final path, so an interrupted write cannot leave a truncated
    ``manifest.json`` behind.

    Args:
        out_dir: Document directory; created (with parents) when missing.
        manifest: Manifest to serialize.

    Returns:
        Path of the written ``manifest.json``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / MANIFEST_FILENAME

    payload: dict[str, Any] = manifest.model_dump(mode="json")
    text = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"

    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = path.with_name(f"{path.name}.tmp")
    try:
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp file is gone and this is a
        # no-op; after a failure it removes the partial file.
        tmp_path.unlink(missing_ok=True)
    return path
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def read_manifest(doc_dir: Path) -> Manifest:
    """Load and validate ``manifest.json`` from ``doc_dir``.

    Args:
        doc_dir: Document directory expected to contain the manifest.

    Returns:
        The validated :class:`Manifest`.

    Raises:
        IndexNotFoundError: If the file is missing, its JSON root is not an
            object, or its ``format_version`` is not
            ``MANIFEST_FORMAT_VERSION``.
    """
    path = doc_dir / MANIFEST_FILENAME
    if not path.exists():
        msg = f"manifest.json not found in {doc_dir}"
        raise IndexNotFoundError(msg, details={"path": str(path)})

    with path.open("r", encoding="utf-8") as fh:
        payload = json.load(fh)

    # Valid JSON is not necessarily an object; without this guard a list or
    # scalar root would fail on ``payload.get`` with an AttributeError.
    if not isinstance(payload, dict):
        msg = (
            f"manifest.json in {doc_dir} must contain a JSON object; "
            f"got {type(payload).__name__}"
        )
        raise IndexNotFoundError(msg, details={"path": str(path)})

    version = payload.get("format_version")
    if version != MANIFEST_FORMAT_VERSION:
        msg = (
            f"unsupported manifest format version: {version!r} "
            f"(expected {MANIFEST_FORMAT_VERSION})"
        )
        raise IndexNotFoundError(msg, details={"path": str(path)})

    return Manifest.model_validate(payload)
|
cairn/entity/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Entity extraction — pluggable extractors that mine entities from a Document.
|
|
2
|
+
|
|
3
|
+
Used by ``cairn.index.entities.EntityBuilder`` at indexing time. The
|
|
4
|
+
heuristic extractor is the v0.2.0 default; an LLM-backed extractor for
|
|
5
|
+
``term`` and ``proper`` kinds is planned for v0.2.1.
|
|
6
|
+
|
|
7
|
+
Per ARCHITECTURE.md §2.3, entities come in four kinds — ``term``, ``code``,
|
|
8
|
+
``proper``, ``defined``. The heuristic extractor covers ``code`` and
|
|
9
|
+
``defined`` without any model dependency.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from cairn.entity.base import EntityExtractor, ExtractionHit
|
|
13
|
+
from cairn.entity.fake import FakeEntityExtractor
|
|
14
|
+
from cairn.entity.heuristic import HeuristicExtractor
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"EntityExtractor",
|
|
18
|
+
"ExtractionHit",
|
|
19
|
+
"FakeEntityExtractor",
|
|
20
|
+
"HeuristicExtractor",
|
|
21
|
+
]
|
cairn/entity/base.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""EntityExtractor protocol + intermediate extraction hit type.
|
|
2
|
+
|
|
3
|
+
Extractors emit a sequence of :class:`ExtractionHit` — one per *occurrence*.
|
|
4
|
+
The :class:`cairn.index.entities.EntityBuilder` deduplicates hits by
|
|
5
|
+
``(canonical, kind)`` into :class:`cairn.core.types.Entity` records.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Awaitable, Iterable
|
|
11
|
+
from typing import Protocol, runtime_checkable
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict
|
|
14
|
+
|
|
15
|
+
from cairn.core.types import Document, EntityKind, Span
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ExtractionHit(BaseModel):
    """A single occurrence of a candidate entity found by an extractor.

    ``span`` is expressed as offsets into the owning section's ``raw_text``
    rather than into the source document. The Entities sub-index keeps spans
    in that same coordinate space, so readers can interpret them without
    knowing where section territories begin or end.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Section in which the occurrence was observed.
    section_id: str
    # Canonical name of the entity.
    canonical: str
    # Surface form recorded for this occurrence.
    surface_form: str
    kind: EntityKind
    # Offsets within the section's raw_text.
    span: Span
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@runtime_checkable
class EntityExtractor(Protocol):
    """Structural interface every entity extractor must satisfy.

    ``extract`` is declared with an awaitable return type. LLM-backed
    extractors can be genuinely asynchronous, while synchronous ones
    (heuristic, regex-based) hand back an awaitable that is already resolved.
    """

    # Identifier of the extractor implementation.
    name: str

    def extract(
        self,
        document: Document,
    ) -> Awaitable[Iterable[ExtractionHit]]:
        """Produce the extraction hits found anywhere in ``document``."""
        ...
|
cairn/entity/fake.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Deterministic entity extractor for tests.
|
|
2
|
+
|
|
3
|
+
Returns one predictable hit per section, derived from the section id. Used by tests that
|
|
4
|
+
care about the downstream builder/index/tool behavior, not the extraction
|
|
5
|
+
heuristics themselves.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Iterable
|
|
11
|
+
|
|
12
|
+
from cairn.core.types import Document, Span
|
|
13
|
+
from cairn.entity.base import ExtractionHit
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class FakeEntityExtractor:
    """Emits exactly one ``defined`` hit for every section of the document.

    The canonical name is the last ``/``-separated segment of the section id
    with hyphens turned into spaces; the same text is used as surface form.
    """

    name = "fake:per-section"

    async def extract(self, document: Document) -> Iterable[ExtractionHit]:
        """Return a list holding one hit per entry in ``document.sections``."""

        def hit_for(section) -> ExtractionHit:
            label = section.id.rsplit("/", 1)[-1].replace("-", " ")
            # The span starts at 0 and is capped at the section text length.
            span_end = min(len(label), len(section.raw_text))
            return ExtractionHit(
                section_id=section.id,
                canonical=label,
                surface_form=label,
                kind="defined",
                span=Span(start=0, end=span_end),
            )

        return [hit_for(section) for section in document.sections]
|