docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,155 @@
1
+ """OpenAI-compatible HTTP embedder.
2
+
3
+ Works with any endpoint that implements the OpenAI ``/v1/embeddings``
4
+ contract: OpenAI itself, Ollama (``http://localhost:11434/v1``), vLLM,
5
+ Together, Anyscale, etc.
6
+
7
+ Default configuration points at a local Ollama instance running the
8
+ ``nomic-embed-text`` model (768 dims) — chosen for the same reason as the
9
+ summarizer default: zero API keys, mature stack, runs on a laptop.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import asyncio
15
+ from typing import Any
16
+
17
+ import httpx
18
+
19
+ from cairn.core.errors import IndexBuildError
20
+
21
+
22
class OpenAICompatibleEmbedder:
    """Embeddings client for any OpenAI-compatible ``/embeddings`` endpoint.

    Every failure that originates from the remote endpoint (transport error,
    HTTP error status, non-JSON body, unexpected payload shape, wrong vector
    count or dimension) is reported as :class:`IndexBuildError`.

    Attributes:
        base_url: Endpoint root with any trailing ``/`` stripped.
        model: Model identifier sent with every request.
        dim: Length every returned vector must have.
        api_key: Optional bearer token; when falsy no ``Authorization``
            header is sent.
        timeout: Value handed to ``httpx.AsyncClient(timeout=...)``.
        max_retries: Number of extra attempts made after the first one.
        retry_base_delay: Base of the exponential backoff between attempts;
            ``0`` disables sleeping.
        name: ``"openai-compat:<model>"``.
    """

    def __init__(
        self,
        *,
        base_url: str = "http://localhost:11434/v1",
        model: str = "nomic-embed-text",
        dim: int = 768,
        api_key: str | None = None,
        timeout: float = 60.0,
        max_retries: int = 2,
        retry_base_delay: float = 0.5,
    ) -> None:
        """Validate and store the client configuration.

        Raises:
            ValueError: If ``dim < 1``, ``max_retries < 0`` or
                ``retry_base_delay < 0``.
        """
        if dim < 1:
            msg = f"dim must be >= 1; got {dim}"
            raise ValueError(msg)
        if max_retries < 0:
            msg = f"max_retries must be >= 0; got {max_retries}"
            raise ValueError(msg)
        if retry_base_delay < 0:
            msg = f"retry_base_delay must be >= 0; got {retry_base_delay}"
            raise ValueError(msg)
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.dim = dim
        self.api_key = api_key
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_base_delay = retry_base_delay
        self.name = f"openai-compat:{model}"

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` with a single request and return one vector each.

        An empty ``texts`` returns ``[]`` without contacting the endpoint.

        Raises:
            IndexBuildError: If the request ultimately fails, the body is not
                valid JSON, the payload lacks ``data[*].embedding``, the
                number of vectors differs from ``len(texts)``, or any vector
                length differs from ``self.dim``.
        """
        if not texts:
            return []

        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        payload: dict[str, Any] = {
            "model": self.model,
            "input": texts,
        }

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await self._post_with_retries(client, payload, headers)
            try:
                data = response.json()
            except ValueError as exc:
                # A 2xx answer is not guaranteed to be JSON (e.g. an HTML page
                # from a proxy). Surface it as the same error type as every
                # other endpoint failure instead of a raw decode error.
                msg = "embedder response was not valid JSON"
                raise IndexBuildError(
                    msg,
                    details={
                        "status": response.status_code,
                        "model": self.model,
                        "base_url": self.base_url,
                        "body": response.text[:200],
                    },
                ) from exc

        try:
            vectors = [list(item["embedding"]) for item in data["data"]]
        except (KeyError, TypeError, IndexError) as exc:
            msg = "embedder response did not match OpenAI embeddings shape"
            raise IndexBuildError(msg, details={"response": data}) from exc

        if len(vectors) != len(texts):
            msg = (
                f"embedder returned {len(vectors)} vectors for "
                f"{len(texts)} inputs"
            )
            raise IndexBuildError(msg)
        for i, vec in enumerate(vectors):
            if len(vec) != self.dim:
                msg = (
                    f"embedder returned dim={len(vec)} but client expects "
                    f"dim={self.dim} (model {self.model!r}, index {i})"
                )
                raise IndexBuildError(msg)

        return vectors

    async def _post_with_retries(
        self,
        client: httpx.AsyncClient,
        payload: dict[str, Any],
        headers: dict[str, str],
    ) -> httpx.Response:
        """POST ``payload`` to ``<base_url>/embeddings`` with bounded retries.

        Up to ``max_retries + 1`` attempts are made. An attempt is retried
        when ``httpx`` raises an ``HTTPError`` or the status is one of
        429/500/502/503/504; any other status >= 400 fails immediately.

        Returns:
            The first response whose status is below 400.

        Raises:
            IndexBuildError: When the last attempt raised or returned an
                error status.
        """
        last_exc: httpx.HTTPError | None = None
        for attempt in range(self.max_retries + 1):
            try:
                response = await client.post(
                    f"{self.base_url}/embeddings",
                    json=payload,
                    headers=headers,
                )
            except httpx.HTTPError as exc:
                last_exc = exc
                if attempt < self.max_retries:
                    await self._sleep_before_retry(attempt)
                    continue
                msg = f"embedder request failed: {exc}"
                raise IndexBuildError(
                    msg,
                    details={
                        "model": self.model,
                        "base_url": self.base_url,
                        "error_type": type(exc).__name__,
                        "attempts": attempt + 1,
                    },
                ) from exc

            retryable = response.status_code in (429, 500, 502, 503, 504)
            if retryable and attempt < self.max_retries:
                await self._sleep_before_retry(attempt)
                continue
            if response.status_code >= 400:
                msg = (
                    f"embedder endpoint returned HTTP {response.status_code}: "
                    f"{response.text[:200]}"
                )
                raise IndexBuildError(
                    msg,
                    details={
                        "status": response.status_code,
                        "model": self.model,
                        "base_url": self.base_url,
                        "attempts": attempt + 1,
                    },
                )
            return response

        # Defensive fallback: every loop iteration above returns, raises or
        # continues, and the constructor guarantees at least one iteration.
        msg = "embedder request failed without a response"
        raise IndexBuildError(
            msg,
            details={
                "model": self.model,
                "base_url": self.base_url,
                "error_type": type(last_exc).__name__ if last_exc else None,
            },
        )

    async def _sleep_before_retry(self, attempt: int) -> None:
        """Sleep ``retry_base_delay * 2**attempt``; no-op when the base is 0."""
        if self.retry_base_delay == 0:
            return
        await asyncio.sleep(self.retry_base_delay * (2**attempt))
@@ -0,0 +1,18 @@
1
+ """Engine layer — orchestrates the three sub-index builders + top-level manifest."""
2
+
3
+ from cairn.engine.indexer import Indexer, IndexResult
4
+ from cairn.engine.manifest import (
5
+ MANIFEST_FILENAME,
6
+ MANIFEST_FORMAT_VERSION,
7
+ Manifest,
8
+ read_manifest,
9
+ )
10
+
11
+ __all__ = [
12
+ "MANIFEST_FILENAME",
13
+ "MANIFEST_FORMAT_VERSION",
14
+ "IndexResult",
15
+ "Indexer",
16
+ "Manifest",
17
+ "read_manifest",
18
+ ]
@@ -0,0 +1,298 @@
1
+ """Indexer — single entry point that builds every configured sub-index.
2
+
3
+ Parses a source document, runs ``TreeBuilder`` synchronously, then
4
+ ``SummaryBuilder`` and ``VectorBuilder`` asynchronously, and finally writes
5
+ the top-level ``manifest.json`` that ties the artifacts together.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Callable, Sequence
11
+ from dataclasses import dataclass
12
+ from datetime import UTC, datetime
13
+ from pathlib import Path
14
+
15
+ from cairn import __version__
16
+ from cairn.core.errors import IndexNotFoundError
17
+ from cairn.core.types import Document
18
+ from cairn.embed.base import Embedder
19
+ from cairn.engine.manifest import (
20
+ MANIFEST_FILENAME,
21
+ MANIFEST_FORMAT_VERSION,
22
+ Manifest,
23
+ SubIndexEntry,
24
+ read_manifest,
25
+ write_manifest,
26
+ )
27
+ from cairn.entity.base import EntityExtractor
28
+ from cairn.index.entities import (
29
+ ENTITIES_FILENAME,
30
+ ENTITIES_FORMAT_VERSION,
31
+ Entities,
32
+ EntityBuilder,
33
+ )
34
+ from cairn.index.summaries import (
35
+ SUMMARIES_FILENAME,
36
+ SUMMARIES_FORMAT_VERSION,
37
+ SummaryBuilder,
38
+ )
39
+ from cairn.index.tree import TREE_FILENAME, TreeBuilder
40
+ from cairn.index.vectors import (
41
+ VECTORS_FORMAT_VERSION,
42
+ VECTORS_MANIFEST_FILENAME,
43
+ VectorBuilder,
44
+ )
45
+ from cairn.index.xrefs import (
46
+ XREFS_FILENAME,
47
+ XREFS_FORMAT_VERSION,
48
+ XRefBuilder,
49
+ )
50
+ from cairn.ingest.base import Parser
51
+ from cairn.summarize.base import Summarizer, SummaryLevel
52
+ from cairn.summarize.cache import SummaryCache
53
+ from cairn.xref.base import XRefExtractor
54
+
55
+ _TREE_BUILDER_VERSION = 1
56
+
57
+
58
@dataclass(frozen=True)
class IndexResult:
    """What :meth:`Indexer.index_path` reports back to its caller.

    ``rebuilt`` is ``True`` when the builders ran and a new manifest was
    written. It is ``False`` when the existing index in the output directory
    already matched the source hash and producer fingerprints, so nothing
    was touched.
    """

    # Location of the document's ``manifest.json``.
    manifest_path: Path
    # Whether this call actually (re)built the index.
    rebuilt: bool
69
+
70
+
71
class Indexer:
    """Drive every configured sub-index builder for a single document.

    The tree, summaries and vectors sub-indexes are produced on every build.
    The entities sub-index is produced only when an ``entity_extractor`` is
    supplied, and the xrefs sub-index only when an ``xref_extractor`` is
    supplied.
    """

    def __init__(
        self,
        *,
        parser: Parser,
        summarizer: Summarizer,
        embedder: Embedder,
        entity_extractor: EntityExtractor | None = None,
        xref_extractor: XRefExtractor | None = None,
        summary_cache: SummaryCache | None = None,
        summary_concurrency: int = 4,
        embed_batch_size: int = 32,
        progress: Callable[[str], None] | None = None,
    ) -> None:
        """Store the collaborators and tuning knobs; nothing is validated."""
        # Required producers.
        self.parser = parser
        self.summarizer = summarizer
        self.embedder = embedder
        # Optional producers — ``None`` skips the matching sub-index.
        self.entity_extractor = entity_extractor
        self.xref_extractor = xref_extractor
        # Settings forwarded verbatim to the summary / vector builders.
        self.summary_cache = summary_cache
        self.summary_concurrency = summary_concurrency
        self.embed_batch_size = embed_batch_size
        # Optional sink for human-readable progress messages.
        self.progress = progress

    async def index_path(
        self,
        source: Path,
        *,
        out_dir: Path,
        doc_id: str | None = None,
        summary_levels: Sequence[SummaryLevel] = (
            SummaryLevel.GIST,
            SummaryLevel.SYNOPSIS,
            SummaryLevel.DIGEST,
        ),
        force: bool = False,
    ) -> IndexResult:
        """Parse ``source`` and build its index under ``out_dir``.

        The source is always parsed. Unless ``force`` is set, the manifest
        already present in ``out_dir`` is compared against the parsed
        document's ``source_hash`` and the configured producers; on a match
        the builders are skipped and ``rebuilt`` is ``False``.
        """
        document = self.parser.parse(source, doc_id=doc_id)

        reusable = False
        if not force:
            reusable = _existing_matches(
                out_dir,
                document.source_hash,
                summarizer=self.summarizer,
                embedder=self.embedder,
                summary_levels=summary_levels,
                entity_extractor=self.entity_extractor,
                xref_extractor=self.xref_extractor,
            )
        if reusable:
            return IndexResult(
                manifest_path=out_dir / MANIFEST_FILENAME,
                rebuilt=False,
            )

        written = await self.index_document(
            document,
            out_dir=out_dir,
            summary_levels=summary_levels,
        )
        return IndexResult(manifest_path=written, rebuilt=True)

    async def index_document(
        self,
        document: Document,
        *,
        out_dir: Path,
        summary_levels: Sequence[SummaryLevel] = (
            SummaryLevel.GIST,
            SummaryLevel.SYNOPSIS,
            SummaryLevel.DIGEST,
        ),
    ) -> Path:
        """Build all configured sub-indexes for ``document`` and the manifest.

        Builders run in a fixed order — tree, summaries, vectors, then the
        optional entities and xrefs — and the manifest is written last.

        Returns:
            The path returned by ``write_manifest``.
        """
        out_dir.mkdir(parents=True, exist_ok=True)

        def _report_summaries(done: int, total: int) -> None:
            self._emit(f"summaries: {done}/{total}")

        self._emit("tree: writing")
        TreeBuilder().build(document, out_dir=out_dir)
        self._emit("tree: done")

        self._emit("summaries: starting")
        summary_builder = SummaryBuilder(
            self.summarizer,
            cache=self.summary_cache,
            concurrency=self.summary_concurrency,
            progress=_report_summaries,
        )
        await summary_builder.build(
            document, out_dir=out_dir, levels=summary_levels
        )
        self._emit("summaries: done")

        self._emit("vectors: starting")
        vector_builder = VectorBuilder(
            self.embedder, batch_size=self.embed_batch_size
        )
        await vector_builder.build(document, out_dir=out_dir)
        self._emit("vectors: done")

        # Manifest entries are added in build order.
        subindexes: dict[str, SubIndexEntry] = {}
        subindexes["tree"] = SubIndexEntry(
            path=TREE_FILENAME,
            builder_version=_TREE_BUILDER_VERSION,
        )
        subindexes["summaries"] = SubIndexEntry(
            path=SUMMARIES_FILENAME,
            builder_version=SUMMARIES_FORMAT_VERSION,
            model=self.summarizer.name,
            levels=[lvl.value for lvl in summary_levels],
        )
        subindexes["vectors"] = SubIndexEntry(
            path=VECTORS_MANIFEST_FILENAME,
            builder_version=VECTORS_FORMAT_VERSION,
            embedder=self.embedder.name,
            dim=self.embedder.dim,
        )

        entities_reader: Entities | None = None
        if self.entity_extractor is not None:
            self._emit("entities: starting")
            entity_builder = EntityBuilder(self.entity_extractor)
            await entity_builder.build(document, out_dir=out_dir)
            self._emit("entities: done")
            subindexes["entities"] = SubIndexEntry(
                path=ENTITIES_FILENAME,
                builder_version=ENTITIES_FORMAT_VERSION,
                extractor=self.entity_extractor.name,
            )
            # Load what was just written so the xref step works from the
            # on-disk (canonical) form of the Entities sub-index.
            entities_reader = Entities.load(out_dir)

        if self.xref_extractor is not None:
            self._emit("xrefs: starting")
            xref_builder = XRefBuilder(self.xref_extractor)
            await xref_builder.build(
                document, out_dir=out_dir, entities=entities_reader
            )
            self._emit("xrefs: done")
            subindexes["xrefs"] = SubIndexEntry(
                path=XREFS_FILENAME,
                builder_version=XREFS_FORMAT_VERSION,
                extractor=self.xref_extractor.name,
            )

        manifest = Manifest(
            format_version=MANIFEST_FORMAT_VERSION,
            doc_id=document.id,
            cairn_version=__version__,
            source_path=str(document.source_path),
            source_hash=document.source_hash,
            indexed_at=datetime.now(UTC),
            subindexes=subindexes,
        )
        self._emit("manifest: writing")
        manifest_path = write_manifest(out_dir, manifest)
        self._emit("manifest: done")
        return manifest_path

    def _emit(self, message: str) -> None:
        """Forward ``message`` to the progress callback, if one was given."""
        if self.progress is None:
            return
        self.progress(message)
240
+
241
+
242
def _existing_matches(
    out_dir: Path,
    source_hash: str,
    *,
    summarizer: Summarizer,
    embedder: Embedder,
    summary_levels: Sequence[SummaryLevel],
    entity_extractor: EntityExtractor | None,
    xref_extractor: XRefExtractor | None,
) -> bool:
    """Return ``True`` when the index in ``out_dir`` can be reused as-is.

    The existing index is reusable only if all of the following hold:

    * ``manifest.json`` can be read and validated;
    * its ``source_hash`` equals ``source_hash``;
    * the tree, summaries and vectors entries exist with the current builder
      versions and the same summarizer name / levels / embedder name / dim;
    * when ``entity_extractor`` / ``xref_extractor`` are given, the matching
      entry exists with the current builder version and extractor name;
    * every checked entry's ``path`` exists under ``out_dir``.

    Any failure means "rebuild"; this function never raises for a missing,
    unreadable-as-JSON or schema-invalid manifest.
    """
    try:
        existing = read_manifest(out_dir)
    except (IndexNotFoundError, ValueError):
        # ``json.JSONDecodeError`` and pydantic's ``ValidationError`` are
        # both ``ValueError`` subclasses: a truncated or hand-edited manifest
        # should trigger a rebuild rather than abort indexing.
        return False
    if existing.source_hash != source_hash:
        return False

    tree = existing.subindexes.get("tree")
    summaries = existing.subindexes.get("summaries")
    vectors = existing.subindexes.get("vectors")
    if tree is None or tree.builder_version != _TREE_BUILDER_VERSION:
        return False
    if (
        summaries is None
        or summaries.builder_version != SUMMARIES_FORMAT_VERSION
        or summaries.model != summarizer.name
        or summaries.levels != [lvl.value for lvl in summary_levels]
    ):
        return False
    if (
        vectors is None
        or vectors.builder_version != VECTORS_FORMAT_VERSION
        or vectors.embedder != embedder.name
        or vectors.dim != embedder.dim
    ):
        return False

    # Entries whose artifact must still be present on disk.
    required = [tree, summaries, vectors]

    if entity_extractor is not None:
        entities = existing.subindexes.get("entities")
        if (
            entities is None
            or entities.builder_version != ENTITIES_FORMAT_VERSION
            or entities.extractor != entity_extractor.name
        ):
            return False
        required.append(entities)
    if xref_extractor is not None:
        xrefs = existing.subindexes.get("xrefs")
        if (
            xrefs is None
            or xrefs.builder_version != XREFS_FORMAT_VERSION
            or xrefs.extractor != xref_extractor.name
        ):
            return False
        required.append(xrefs)

    # A matching manifest is not enough: if an artifact it points to was
    # deleted, the index is unusable and must be rebuilt.
    return all((out_dir / entry.path).exists() for entry in required)
@@ -0,0 +1,83 @@
1
+ """Top-level document manifest.
2
+
3
+ Per ARCHITECTURE.md §5, a document directory holds one ``manifest.json`` that
4
+ records source provenance, sub-index file pointers, builder versions, and
5
+ the model identifiers that produced each artifact. The manifest is the
6
+ contract: any file it references must exist; orphans are reapable.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Any, Final
15
+
16
+ from pydantic import BaseModel, ConfigDict, Field
17
+
18
+ from cairn.core.errors import IndexNotFoundError
19
+
20
+ MANIFEST_FILENAME: Final = "manifest.json"
21
+ MANIFEST_FORMAT_VERSION: Final = 1
22
+
23
+
24
class SubIndexEntry(BaseModel):
    """Manifest record pointing at a single sub-index artifact.

    Instances are immutable and reject unknown fields. Only ``path`` and
    ``builder_version`` are mandatory; the remaining fields default to
    ``None`` and are filled in by whichever producer they apply to.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    path: str = Field(description="path relative to the document directory")
    # Must be >= 1.
    builder_version: int = Field(ge=1)

    # --- provenance of the artifact (all optional) ---
    model: str | None = None
    embedder: str | None = None
    extractor: str | None = None
    dim: int | None = None
    levels: list[str] | None = None
37
+
38
+
39
class Manifest(BaseModel):
    """Schema of a document directory's top-level ``manifest.json``.

    Instances are immutable and reject unknown fields.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Manifest schema version.
    format_version: int
    # Identity and provenance of the indexed source.
    doc_id: str
    cairn_version: str
    source_path: str
    source_hash: str
    # When the index was built.
    indexed_at: datetime
    # Sub-index name -> pointer entry.
    subindexes: dict[str, SubIndexEntry]
51
+
52
+
53
def write_manifest(out_dir: Path, manifest: Manifest) -> Path:
    """Write ``manifest.json`` into ``out_dir`` and return its path.

    ``out_dir`` is created if needed. The manifest is serialised first, then
    written to a temporary sibling file and moved over the final name, so an
    interrupted or failed write never leaves a truncated ``manifest.json``
    behind (an existing manifest stays intact until the new one is complete).

    Args:
        out_dir: Document directory that receives the manifest.
        manifest: Manifest to serialise (JSON, UTF-8, 2-space indent,
            trailing newline).

    Returns:
        Path of the written ``manifest.json``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / MANIFEST_FILENAME

    # Serialise before touching the filesystem: a serialisation failure must
    # not clobber a previously valid manifest.
    payload: dict[str, Any] = manifest.model_dump(mode="json")
    text = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"

    tmp_path = path.with_name(f"{path.name}.tmp")
    try:
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp name no longer exists; after a
        # failure this removes the partial file.
        tmp_path.unlink(missing_ok=True)
    return path
63
+
64
+
65
def read_manifest(doc_dir: Path) -> Manifest:
    """Load and validate ``manifest.json`` from ``doc_dir``.

    Args:
        doc_dir: Document directory expected to contain the manifest.

    Returns:
        The validated :class:`Manifest`.

    Raises:
        IndexNotFoundError: If the file is missing, is not valid JSON, is not
            a JSON object, or declares a ``format_version`` other than
            ``MANIFEST_FORMAT_VERSION``.

    Schema violations in an otherwise well-formed manifest are reported by
    ``Manifest.model_validate`` and are not wrapped here.
    """
    path = doc_dir / MANIFEST_FILENAME
    try:
        # Open directly instead of checking ``exists()`` first, so a file
        # removed between check and open cannot slip through as a raw OSError.
        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except (FileNotFoundError, NotADirectoryError) as exc:
        msg = f"manifest.json not found in {doc_dir}"
        raise IndexNotFoundError(msg, details={"path": str(path)}) from exc
    except json.JSONDecodeError as exc:
        msg = f"manifest.json in {doc_dir} is not valid JSON: {exc}"
        raise IndexNotFoundError(msg, details={"path": str(path)}) from exc

    # Valid JSON need not be an object; treat anything else as "no version"
    # so it fails the version check below instead of raising AttributeError.
    version = payload.get("format_version") if isinstance(payload, dict) else None
    if version != MANIFEST_FORMAT_VERSION:
        msg = (
            f"unsupported manifest format version: {version!r} "
            f"(expected {MANIFEST_FORMAT_VERSION})"
        )
        raise IndexNotFoundError(msg, details={"path": str(path)})

    return Manifest.model_validate(payload)
@@ -0,0 +1,21 @@
1
+ """Entity extraction — pluggable extractors that mine entities from a Document.
2
+
3
+ Used by ``cairn.index.entities.EntityBuilder`` at indexing time. The
4
+ heuristic extractor is the v0.2.0 default; an LLM-backed extractor for
5
+ ``term`` and ``proper`` kinds is planned for v0.2.1.
6
+
7
+ Per ARCHITECTURE.md §2.3, entities come in four kinds — ``term``, ``code``,
8
+ ``proper``, ``defined``. The heuristic extractor covers ``code`` and
9
+ ``defined`` without any model dependency.
10
+ """
11
+
12
+ from cairn.entity.base import EntityExtractor, ExtractionHit
13
+ from cairn.entity.fake import FakeEntityExtractor
14
+ from cairn.entity.heuristic import HeuristicExtractor
15
+
16
+ __all__ = [
17
+ "EntityExtractor",
18
+ "ExtractionHit",
19
+ "FakeEntityExtractor",
20
+ "HeuristicExtractor",
21
+ ]
cairn/entity/base.py ADDED
@@ -0,0 +1,52 @@
1
+ """EntityExtractor protocol + intermediate extraction hit type.
2
+
3
+ Extractors emit a sequence of :class:`ExtractionHit` — one per *occurrence*.
4
+ The :class:`cairn.index.entities.EntityBuilder` deduplicates hits by
5
+ ``(canonical, kind)`` into :class:`cairn.core.types.Entity` records.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Awaitable, Iterable
11
+ from typing import Protocol, runtime_checkable
12
+
13
+ from pydantic import BaseModel, ConfigDict
14
+
15
+ from cairn.core.types import Document, EntityKind, Span
16
+
17
+
18
class ExtractionHit(BaseModel):
    """A single occurrence of a candidate entity found by an extractor.

    ``span`` is expressed relative to the owning section's ``raw_text``
    rather than the whole source document. The Entities sub-index keeps spans
    in that same section-local coordinate space, so readers never need the
    section's position in the source to interpret them.

    Instances are immutable and reject unknown fields.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Section in which the occurrence was observed.
    section_id: str
    # Normalised form used for de-duplication, and the text as it appeared.
    canonical: str
    surface_form: str
    kind: EntityKind
    # Offsets within the section's ``raw_text``.
    span: Span
34
+
35
+
36
@runtime_checkable
class EntityExtractor(Protocol):
    """Structural interface every entity extractor must satisfy.

    The contract is asynchronous: ``extract`` returns an awaitable. LLM-backed
    extractors can do real async work; synchronous ones (heuristic or
    regex-based) simply hand back an awaitable that is already resolved.
    """

    # Identifier of the extractor implementation.
    name: str

    def extract(
        self,
        document: Document,
    ) -> Awaitable[Iterable[ExtractionHit]]:
        """Produce the extraction hits found anywhere in ``document``."""
        ...
cairn/entity/fake.py ADDED
@@ -0,0 +1,34 @@
1
+ """Deterministic entity extractor for tests.
2
+
3
+ Returns one deterministic hit per section of the input document. Used by tests that
4
+ care about the downstream builder/index/tool behavior, not the extraction
5
+ heuristics themselves.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterable
11
+
12
+ from cairn.core.types import Document, Span
13
+ from cairn.entity.base import ExtractionHit
14
+
15
+
16
class FakeEntityExtractor:
    """Test double that emits exactly one ``defined`` hit per section.

    For each section the canonical text (also used as the surface form) is
    the last ``/``-separated segment of the section id with hyphens replaced
    by spaces. The span starts at 0 and ends at the shorter of the canonical
    text's length and the section's ``raw_text`` length.
    """

    name = "fake:per-section"

    async def extract(self, document: Document) -> Iterable[ExtractionHit]:
        """Return one hit for every entry of ``document.sections``, in order."""

        def _hit_for(section) -> ExtractionHit:
            label = section.id.rpartition("/")[2].replace("-", " ")
            span_end = min(len(label), len(section.raw_text))
            return ExtractionHit(
                section_id=section.id,
                canonical=label,
                surface_form=label,
                kind="defined",
                span=Span(start=0, end=span_end),
            )

        return [_hit_for(section) for section in document.sections]