docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""HeuristicExtractor — regex-based, no model needed.
|
|
2
|
+
|
|
3
|
+
Covers two of the four ``EntityKind`` values:
|
|
4
|
+
|
|
5
|
+
- **code**: identifiers inside fenced code blocks and inline ``` `code` ```
|
|
6
|
+
spans. Filters out common language keywords (Python, JS) and identifiers
|
|
7
|
+
shorter than three characters.
|
|
8
|
+
- **defined**: text inside ``**bold**`` markdown markers that looks like a
|
|
9
|
+
term (no sentence-level punctuation, ≤ 80 chars).
|
|
10
|
+
|
|
11
|
+
Span coordinates are offsets within the *section's* ``raw_text``. The
|
|
12
|
+
:class:`cairn.index.entities.EntityBuilder` preserves this convention end
|
|
13
|
+
to end.
|
|
14
|
+
|
|
15
|
+
LLM-based extraction for ``term`` and ``proper`` kinds is the v0.2.1
|
|
16
|
+
follow-up; this v0.2.0 extractor is fully offline.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import re
|
|
22
|
+
from collections.abc import Iterable, Iterator
|
|
23
|
+
from typing import Final
|
|
24
|
+
|
|
25
|
+
from cairn.core.types import Document, Span
|
|
26
|
+
from cairn.entity.base import ExtractionHit
|
|
27
|
+
|
|
28
|
+
_FENCED_CODE = re.compile(r"```[^\n]*\n(.*?)\n```", re.DOTALL)
|
|
29
|
+
_INLINE_CODE = re.compile(r"`([^`\n]+)`")
|
|
30
|
+
_BOLD = re.compile(r"\*\*([^*\n]+)\*\*")
|
|
31
|
+
_IDENTIFIER = re.compile(r"[A-Za-z_][A-Za-z0-9_]{2,}")
|
|
32
|
+
|
|
33
|
+
# Common stopwords that show up in code blocks but carry no entity meaning.
|
|
34
|
+
# Conservative: only drops obvious language keywords + built-in names.
|
|
35
|
+
_CODE_STOPWORDS: Final[frozenset[str]] = frozenset(
|
|
36
|
+
{
|
|
37
|
+
# Python keywords / builtins
|
|
38
|
+
"and", "as", "assert", "async", "await", "break", "class", "continue",
|
|
39
|
+
"def", "del", "elif", "else", "except", "finally", "for", "from",
|
|
40
|
+
"global", "if", "import", "in", "is", "lambda", "nonlocal", "not",
|
|
41
|
+
"or", "pass", "raise", "return", "try", "while", "with", "yield",
|
|
42
|
+
"True", "False", "None", "self", "cls",
|
|
43
|
+
"int", "str", "float", "bool", "list", "dict", "tuple", "set",
|
|
44
|
+
"type", "any", "all", "len", "range", "print",
|
|
45
|
+
# JS/TS keywords
|
|
46
|
+
"const", "let", "var", "function", "this", "new", "throw", "void",
|
|
47
|
+
"typeof", "instanceof", "switch", "case", "default", "extends",
|
|
48
|
+
# Common short words that masquerade as identifiers
|
|
49
|
+
"the", "that", "into",
|
|
50
|
+
}
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Maximum length of a "defined" entity in characters.
|
|
54
|
+
_DEFINED_MAX_LEN: Final = 80
|
|
55
|
+
|
|
56
|
+
# Tokens that disqualify a bold span from being a defined entity.
|
|
57
|
+
_SENTENCE_MARKERS: Final = frozenset({".", ",", ";", ":", "!", "?"})
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class HeuristicExtractor:
    """Entity extractor driven purely by regular expressions."""

    # Identifier for this extractor implementation.
    name = "heuristic:regex-v1"

    async def extract(self, document: Document) -> Iterable[ExtractionHit]:
        """Scan every section of ``document`` and return all hits as a list.

        Sections are visited in ``document.sections`` order; the hits of one
        section are appended in the order the section scanner yields them.
        """
        collected: list[ExtractionHit] = []
        for section in document.sections:
            collected.extend(_scan_section(section.id, section.raw_text))
        return collected
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _scan_section(section_id: str, text: str) -> Iterator[ExtractionHit]:
    """Yield all hits for one section: code hits first, then defined hits."""
    for scanner in (_scan_code, _scan_defined):
        yield from scanner(section_id, text)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _identifier_hits(
    section_id: str, snippet: str, offset: int
) -> Iterator[ExtractionHit]:
    """Yield a ``code`` hit for each non-stopword identifier in ``snippet``.

    ``offset`` is the position of ``snippet`` inside the section text, so the
    emitted spans are section-relative and slice exactly to the identifier.
    """
    for ident in _IDENTIFIER.finditer(snippet):
        name = ident.group()
        if name in _CODE_STOPWORDS:
            continue
        yield ExtractionHit(
            section_id=section_id,
            canonical=name,
            surface_form=name,
            kind="code",
            span=Span(
                start=offset + ident.start(),
                end=offset + ident.end(),
            ),
        )


def _scan_code(section_id: str, text: str) -> Iterator[ExtractionHit]:
    """Yield ``code`` hits for identifiers in fenced blocks and inline spans.

    Hits from fenced code blocks come first, in text order, followed by hits
    from inline code spans. Every identifier inside an inline span is emitted
    as its own hit, including those in multi-word spans. Inline matches that
    start inside a fenced block are skipped so fenced content is not scanned
    twice.

    Args:
        section_id: Identifier copied onto every emitted hit.
        text: The section text; spans are offsets into this string.
    """
    # Match fences once and reuse them for both the body scan and the
    # inline-span exclusion ranges.
    fences = list(_FENCED_CODE.finditer(text))

    for fence in fences:
        yield from _identifier_hits(section_id, fence.group(1), fence.start(1))

    fence_ranges = [(m.start(), m.end()) for m in fences]
    for inline in _INLINE_CODE.finditer(text):
        if _inside_any(inline.start(), fence_ranges):
            continue
        yield from _identifier_hits(section_id, inline.group(1), inline.start(1))
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _scan_defined(section_id: str, text: str) -> Iterator[ExtractionHit]:
    """Yield ``defined`` hits for bold spans that look like a term.

    A bold span qualifies when its stripped content is non-empty, at most
    ``_DEFINED_MAX_LEN`` characters long, and contains none of the characters
    in ``_SENTENCE_MARKERS``.

    The emitted span covers exactly the stripped term, so slicing the section
    text with it gives the hit's ``surface_form`` even when the bold markers
    enclose padding whitespace (``** term **``).

    Args:
        section_id: Identifier copied onto every emitted hit.
        text: The section text; spans are offsets into this string.
    """
    for bold in _BOLD.finditer(text):
        inner_raw = bold.group(1)
        inner = inner_raw.strip()
        if not inner or len(inner) > _DEFINED_MAX_LEN:
            continue
        if any(marker in inner for marker in _SENTENCE_MARKERS):
            continue
        # Skip leading whitespace inside the markers so that the span and the
        # stripped surface form describe the same characters.
        leading = len(inner_raw) - len(inner_raw.lstrip())
        start = bold.start(1) + leading
        yield ExtractionHit(
            section_id=section_id,
            canonical=inner,
            surface_form=inner,
            kind="defined",
            span=Span(start=start, end=start + len(inner)),
        )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _inside_any(pos: int, ranges: list[tuple[int, int]]) -> bool:
|
|
148
|
+
return any(start <= pos < end for start, end in ranges)
|
cairn/index/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Index layer — the five sub-indexes (Tree, Summaries, Entities, XRefs, Vectors)."""
|
|
2
|
+
|
|
3
|
+
from cairn.index.entities import ENTITIES_FILENAME, Entities, EntityBuilder
|
|
4
|
+
from cairn.index.summaries import (
|
|
5
|
+
SUMMARIES_FILENAME,
|
|
6
|
+
Summaries,
|
|
7
|
+
SummaryBuilder,
|
|
8
|
+
section_hash,
|
|
9
|
+
)
|
|
10
|
+
from cairn.index.tree import TREE_FILENAME, Tree, TreeBuilder
|
|
11
|
+
from cairn.index.vectors import (
|
|
12
|
+
VECTORS_DB_DIRNAME,
|
|
13
|
+
VECTORS_MANIFEST_FILENAME,
|
|
14
|
+
VectorBuilder,
|
|
15
|
+
VectorHit,
|
|
16
|
+
Vectors,
|
|
17
|
+
)
|
|
18
|
+
from cairn.index.xrefs import XREFS_FILENAME, XRefBuilder, XRefs
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"ENTITIES_FILENAME",
|
|
22
|
+
"SUMMARIES_FILENAME",
|
|
23
|
+
"TREE_FILENAME",
|
|
24
|
+
"VECTORS_DB_DIRNAME",
|
|
25
|
+
"VECTORS_MANIFEST_FILENAME",
|
|
26
|
+
"XREFS_FILENAME",
|
|
27
|
+
"Entities",
|
|
28
|
+
"EntityBuilder",
|
|
29
|
+
"Summaries",
|
|
30
|
+
"SummaryBuilder",
|
|
31
|
+
"Tree",
|
|
32
|
+
"TreeBuilder",
|
|
33
|
+
"VectorBuilder",
|
|
34
|
+
"VectorHit",
|
|
35
|
+
"Vectors",
|
|
36
|
+
"XRefBuilder",
|
|
37
|
+
"XRefs",
|
|
38
|
+
"section_hash",
|
|
39
|
+
]
|
cairn/index/entities.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""Entities sub-index — extractor outputs, deduplicated and persisted.
|
|
2
|
+
|
|
3
|
+
The :class:`EntityBuilder` runs an :class:`cairn.entity.base.EntityExtractor`
|
|
4
|
+
over a Document, aggregates ``ExtractionHit`` records by ``(canonical, kind)``
|
|
5
|
+
into :class:`cairn.core.types.Entity`, and writes ``entities.json``.
|
|
6
|
+
|
|
7
|
+
The :class:`Entities` reader exposes lookup by canonical / surface form /
|
|
8
|
+
section, with optional ``kind`` filtering — exactly what the
|
|
9
|
+
``find_mentions`` retrieval tool needs.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from collections import defaultdict
|
|
16
|
+
from collections.abc import Iterable, Iterator
|
|
17
|
+
from datetime import UTC, datetime
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any, Final
|
|
20
|
+
|
|
21
|
+
from cairn.core.errors import IndexBuildError, IndexNotFoundError
|
|
22
|
+
from cairn.core.types import Document, Entity, EntityKind, Mention, Span
|
|
23
|
+
from cairn.entity.base import EntityExtractor, ExtractionHit
|
|
24
|
+
|
|
25
|
+
# File name of the persisted entities sub-index inside a document directory.
ENTITIES_FILENAME: Final = "entities.json"
# Value written to, and required in, the payload's "format_version" field.
ENTITIES_FORMAT_VERSION: Final = 1
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class EntityBuilder:
|
|
30
|
+
"""Run an extractor, aggregate hits, persist ``entities.json``."""
|
|
31
|
+
|
|
32
|
+
def __init__(self, extractor: EntityExtractor) -> None:
|
|
33
|
+
self.extractor = extractor
|
|
34
|
+
|
|
35
|
+
async def build(self, document: Document, *, out_dir: Path) -> Path:
|
|
36
|
+
out_dir.mkdir(parents=True, exist_ok=True)
|
|
37
|
+
path = out_dir / ENTITIES_FILENAME
|
|
38
|
+
|
|
39
|
+
hits = await self.extractor.extract(document)
|
|
40
|
+
entities = _aggregate(hits)
|
|
41
|
+
now = datetime.now(UTC)
|
|
42
|
+
|
|
43
|
+
payload: dict[str, Any] = {
|
|
44
|
+
"format_version": ENTITIES_FORMAT_VERSION,
|
|
45
|
+
"doc_id": document.id,
|
|
46
|
+
"extractor": self.extractor.name,
|
|
47
|
+
"generated_at": now.isoformat(),
|
|
48
|
+
"entities": [_entity_to_dict(e) for e in entities],
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
with path.open("w", encoding="utf-8") as fh:
|
|
52
|
+
json.dump(payload, fh, ensure_ascii=False, indent=2)
|
|
53
|
+
fh.write("\n")
|
|
54
|
+
return path
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Entities:
    """Loaded entities sub-index. Read-only queries.

    Entities can be looked up by canonical name, by surface form, by the
    section they are mentioned in, or by kind.
    """

    def __init__(
        self,
        entities: tuple[Entity, ...],
        *,
        doc_id: str,
        extractor: str,
    ) -> None:
        self._all = entities
        self.doc_id = doc_id
        self.extractor = extractor

        # Indexes
        self._by_canonical: dict[tuple[str, str], Entity] = {
            (e.canonical, e.kind): e for e in entities
        }
        self._by_surface: dict[str, list[Entity]] = defaultdict(list)
        self._by_section: dict[str, list[Entity]] = defaultdict(list)
        for ent in entities:
            for sf in ent.surface_forms:
                self._by_surface[sf].append(ent)
            # An entity mentioned several times in one section is indexed once
            # for that section; dict.fromkeys keeps first-mention order.
            for section_id in dict.fromkeys(m.section_id for m in ent.mentions):
                self._by_section[section_id].append(ent)

    @classmethod
    def load(cls, doc_dir: Path) -> Entities:
        """Load ``entities.json`` from a document directory.

        Raises:
            IndexNotFoundError: When the file is missing, or when its
                ``format_version`` differs from ``ENTITIES_FORMAT_VERSION``.
        """
        path = doc_dir / ENTITIES_FILENAME
        if not path.exists():
            msg = f"entities.json not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(path)})

        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)

        version = payload.get("format_version")
        if version != ENTITIES_FORMAT_VERSION:
            msg = (
                f"unsupported entities format version: {version!r} "
                f"(expected {ENTITIES_FORMAT_VERSION})"
            )
            raise IndexNotFoundError(msg, details={"path": str(path)})

        entities = tuple(_entity_from_dict(d) for d in payload["entities"])
        return cls(
            entities,
            doc_id=payload["doc_id"],
            extractor=payload["extractor"],
        )

    # -- queries -----------------------------------------------------------

    def __len__(self) -> int:
        return len(self._all)

    def __iter__(self) -> Iterator[Entity]:
        return iter(self._all)

    def lookup(
        self,
        name: str,
        *,
        kinds: tuple[EntityKind, ...] | None = None,
    ) -> Entity | None:
        """Return the first matching entity by canonical or any surface form.

        Precedence: canonical match before surface-form match. When ``kinds``
        is supplied, only entities of those kinds are considered. Returns
        ``None`` when nothing matches.
        """
        for ent in self._candidates(name):
            if kinds is None or ent.kind in kinds:
                return ent
        return None

    def lookup_all(
        self,
        name: str,
        *,
        kinds: tuple[EntityKind, ...] | None = None,
    ) -> list[Entity]:
        """Return every matching entity (across kinds) for ``name``.

        Results follow the same precedence as :meth:`lookup` and contain each
        ``(canonical, kind)`` pair at most once.
        """
        seen: set[tuple[str, str]] = set()
        out: list[Entity] = []
        for ent in self._candidates(name):
            key = (ent.canonical, ent.kind)
            if key in seen:
                continue
            if kinds is None or ent.kind in kinds:
                out.append(ent)
                seen.add(key)
        return out

    def by_section(self, section_id: str) -> list[Entity]:
        """Return the entities mentioned in ``section_id``, each one once."""
        return list(self._by_section.get(section_id, ()))

    def by_kind(self, kind: EntityKind) -> list[Entity]:
        """Return all entities of the given kind, in stored order."""
        return [e for e in self._all if e.kind == kind]

    def _candidates(self, name: str) -> Iterator[Entity]:
        # Canonical hits first, across all kinds, in extractor order.
        for ent in self._all:
            if ent.canonical == name:
                yield ent
        # Then surface-form matches that were not already yielded above.
        for ent in self._by_surface.get(name, ()):
            if ent.canonical != name:
                yield ent
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# ---------------------------------------------------------------------------
|
|
168
|
+
# Aggregation
|
|
169
|
+
# ---------------------------------------------------------------------------
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _aggregate(hits: Iterable[ExtractionHit]) -> list[Entity]:
    """Group extraction hits by ``(canonical, kind)`` into entities.

    Entities appear in the order their key was first seen. Within an entity,
    surface forms are unique in first-seen order and every hit contributes one
    mention.

    Raises:
        IndexBuildError: When a hit has an empty canonical.
    """
    # A plain dict keeps first-seen key order.
    buckets: dict[tuple[str, str], _Acc] = {}

    for hit in hits:
        if not hit.canonical:
            msg = "extractor emitted an empty canonical"
            raise IndexBuildError(msg)
        key = (hit.canonical, hit.kind)
        if key not in buckets:
            buckets[key] = _Acc(canonical=hit.canonical, kind=hit.kind)
        bucket = buckets[key]
        bucket.surface_forms[hit.surface_form] = None
        bucket.mentions.append(Mention(section_id=hit.section_id, span=hit.span))

    return [bucket.freeze() for bucket in buckets.values()]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class _Acc:
|
|
194
|
+
__slots__ = ("canonical", "kind", "mentions", "surface_forms")
|
|
195
|
+
|
|
196
|
+
def __init__(self, *, canonical: str, kind: EntityKind) -> None:
|
|
197
|
+
self.canonical = canonical
|
|
198
|
+
self.kind = kind
|
|
199
|
+
# dict[str, None] is the cheapest insertion-ordered set in Python.
|
|
200
|
+
self.surface_forms: dict[str, None] = {}
|
|
201
|
+
self.mentions: list[Mention] = []
|
|
202
|
+
|
|
203
|
+
def freeze(self) -> Entity:
|
|
204
|
+
return Entity(
|
|
205
|
+
canonical=self.canonical,
|
|
206
|
+
surface_forms=tuple(self.surface_forms.keys()),
|
|
207
|
+
kind=self.kind,
|
|
208
|
+
mentions=tuple(self.mentions),
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
# Serialization
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _entity_to_dict(e: Entity) -> dict[str, Any]:
|
|
218
|
+
return {
|
|
219
|
+
"canonical": e.canonical,
|
|
220
|
+
"surface_forms": list(e.surface_forms),
|
|
221
|
+
"kind": e.kind,
|
|
222
|
+
"mentions": [
|
|
223
|
+
{
|
|
224
|
+
"section_id": m.section_id,
|
|
225
|
+
"span": {"start": m.span.start, "end": m.span.end},
|
|
226
|
+
}
|
|
227
|
+
for m in e.mentions
|
|
228
|
+
],
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _entity_from_dict(d: dict[str, Any]) -> Entity:
    """Rebuild an :class:`Entity` from its serialised dictionary form."""
    mentions: list[Mention] = []
    for raw in d["mentions"]:
        span = Span(start=raw["span"]["start"], end=raw["span"]["end"])
        mentions.append(Mention(section_id=raw["section_id"], span=span))

    return Entity(
        canonical=d["canonical"],
        surface_forms=tuple(d["surface_forms"]),
        kind=d["kind"],
        mentions=tuple(mentions),
    )
|
cairn/index/summaries.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""Summaries sub-index — per-section multi-granularity summaries.
|
|
2
|
+
|
|
3
|
+
``SummaryBuilder`` runs at indexing time and writes ``summaries.json``;
|
|
4
|
+
``Summaries`` loads that file and serves queries. Both honor the
|
|
5
|
+
:class:`cairn.core.types.SummarySet` contract.
|
|
6
|
+
|
|
7
|
+
The cache layer (``cairn.summarize.cache.SummaryCache``) is keyed by
|
|
8
|
+
``(summarizer.name, level, section_hash)``. Switching to a different
|
|
9
|
+
summarizer transparently invalidates prior cache entries; editing a section
|
|
10
|
+
invalidates only that section's entries.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import hashlib
|
|
17
|
+
import json
|
|
18
|
+
from collections.abc import Awaitable, Callable, Iterator, Sequence
|
|
19
|
+
from datetime import UTC, datetime
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any, Final
|
|
22
|
+
|
|
23
|
+
from cairn.core.errors import IndexBuildError, IndexNotFoundError
|
|
24
|
+
from cairn.core.types import Document, SectionNode, SummarySet
|
|
25
|
+
from cairn.summarize.base import Summarizer, SummaryLevel
|
|
26
|
+
from cairn.summarize.cache import SummaryCache
|
|
27
|
+
|
|
28
|
+
# File name of the persisted summaries sub-index inside a document directory.
SUMMARIES_FILENAME: Final = "summaries.json"
# Value written to, and required in, the payload's "format_version" field.
SUMMARIES_FORMAT_VERSION: Final = 1
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def section_hash(node: SectionNode) -> str:
    """Return the hex SHA-256 fingerprint of a section's title and text.

    The digest covers ``title``, a single NUL byte, then ``raw_text``, each
    encoded as UTF-8, so a change to either part changes the fingerprint.
    """
    title_bytes = node.title.encode("utf-8")
    text_bytes = node.raw_text.encode("utf-8")
    return hashlib.sha256(title_bytes + b"\x00" + text_bytes).hexdigest()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SummaryBuilder:
    """Generate summaries for a document and persist ``summaries.json``."""

    def __init__(
        self,
        summarizer: Summarizer,
        *,
        cache: SummaryCache | None = None,
        concurrency: int = 4,
        progress: Callable[[int, int], None] | None = None,
    ) -> None:
        """Store the collaborators after validating ``concurrency``.

        Args:
            summarizer: Produces the summary text for a section.
            cache: Optional cache consulted before, and filled after, each
                summarizer call.
            concurrency: Maximum number of summarizer calls in flight.
            progress: Optional callback receiving ``(completed, total)``.

        Raises:
            IndexBuildError: When ``concurrency`` is below 1.
        """
        if concurrency < 1:
            msg = f"concurrency must be ≥1; got {concurrency}"
            raise IndexBuildError(msg)
        self.summarizer = summarizer
        self.cache = cache
        self.concurrency = concurrency
        self.progress = progress

    async def build(
        self,
        document: Document,
        *,
        out_dir: Path,
        levels: Sequence[SummaryLevel] = (
            SummaryLevel.GIST,
            SummaryLevel.SYNOPSIS,
            SummaryLevel.DIGEST,
        ),
    ) -> Path:
        """Summarize every section of ``document`` and write the result file.

        Args:
            document: The parsed document.
            out_dir: Directory to write into; created when absent.
            levels: Granularity levels to generate. Order is kept and
                duplicates are dropped. Defaults to gist, synopsis and digest.

        Returns:
            Path to the written ``summaries.json``.

        Raises:
            IndexBuildError: When ``levels`` is empty.
        """
        ordered_levels = _dedupe_preserve_order(levels)
        if not ordered_levels:
            msg = "at least one SummaryLevel is required"
            raise IndexBuildError(msg)

        out_dir.mkdir(parents=True, exist_ok=True)
        path = out_dir / SUMMARIES_FILENAME
        now = datetime.now(UTC)
        semaphore = asyncio.Semaphore(self.concurrency)
        progress_lock = asyncio.Lock()
        completed = 0
        total = len(document.sections) * len(ordered_levels)

        async def mark_progress() -> None:
            # Count one finished (section, level) unit and report it on the
            # first unit, the last unit, and every ``step``-th unit between.
            nonlocal completed
            async with progress_lock:
                completed += 1
                if self.progress is None:
                    return
                step = max(5, total // 20)
                at_edge = completed == 1 or completed == total
                if at_edge or completed % step == 0:
                    self.progress(completed, total)

        # One coroutine per section; gather returns records in section order.
        records = await asyncio.gather(
            *(
                self._summarize_section(
                    section,
                    ordered_levels,
                    semaphore,
                    now,
                    mark_progress=mark_progress,
                )
                for section in document.sections
            )
        )

        payload: dict[str, Any] = {
            "format_version": SUMMARIES_FORMAT_VERSION,
            "doc_id": document.id,
            "model": self.summarizer.name,
            "levels": [lvl.value for lvl in ordered_levels],
            "generated_at": now.isoformat(),
            "summaries": list(records),
        }

        with path.open("w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
            fh.write("\n")
        return path

    async def _summarize_section(
        self,
        node: SectionNode,
        levels: Sequence[SummaryLevel],
        semaphore: asyncio.Semaphore,
        now: datetime,
        mark_progress: Callable[[], Awaitable[None]],
    ) -> dict[str, Any]:
        """Produce the ``summaries.json`` record for one section.

        Levels are handled one after another. A cached text is used as is;
        otherwise the summarizer is called while holding ``semaphore`` and the
        new text is stored in the cache. Levels that were not requested are
        recorded as ``""`` (gist, synopsis) or ``None`` (digest).
        """
        sh = section_hash(node)
        results: dict[str, str] = {}

        for level in levels:
            cache_key: str | None = None
            if self.cache is not None:
                cache_key = SummaryCache.key(
                    model=self.summarizer.name,
                    level=level.value,
                    section_hash=sh,
                )
                cached = self.cache.get(cache_key)
                if cached is not None:
                    # Cache hit: no summarizer call, but still counts as done.
                    results[level.value] = cached
                    await mark_progress()
                    continue

            # Only the summarizer call itself is limited by the semaphore.
            async with semaphore:
                text = await self.summarizer.summarize(
                    title=node.title,
                    body=node.raw_text,
                    level=level,
                )
            results[level.value] = text
            await mark_progress()
            if self.cache is not None and cache_key is not None:
                self.cache.put(cache_key, text)

        return {
            "section_id": node.id,
            "section_hash": sh,
            "gist": results.get(SummaryLevel.GIST.value, ""),
            "synopsis": results.get(SummaryLevel.SYNOPSIS.value, ""),
            "digest": results.get(SummaryLevel.DIGEST.value),
            "model": self.summarizer.name,
            "generated_at": now.isoformat(),
        }
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class Summaries:
    """Loaded summaries index, queried read-only by section id."""

    def __init__(
        self,
        sets: tuple[SummarySet, ...],
        *,
        doc_id: str,
        model: str,
    ) -> None:
        self._all = sets
        self.doc_id = doc_id
        self.model = model
        # Section-id lookup table; a later duplicate id replaces an earlier one.
        self._by_id: dict[str, SummarySet] = {}
        for summary in sets:
            self._by_id[summary.section_id] = summary

    @classmethod
    def load(cls, doc_dir: Path) -> Summaries:
        """Load ``summaries.json`` from a document directory.

        Raises:
            IndexNotFoundError: When the file is missing, or when its
                ``format_version`` differs from ``SUMMARIES_FORMAT_VERSION``.
        """
        path = doc_dir / SUMMARIES_FILENAME
        if not path.exists():
            msg = f"summaries.json not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(path)})

        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)

        version = payload.get("format_version")
        if version != SUMMARIES_FORMAT_VERSION:
            msg = (
                f"unsupported summaries format version: {version!r} "
                f"(expected {SUMMARIES_FORMAT_VERSION})"
            )
            raise IndexNotFoundError(msg, details={"path": str(path)})

        loaded: list[SummarySet] = []
        for record in payload["summaries"]:
            loaded.append(
                SummarySet(
                    section_id=record["section_id"],
                    gist=record["gist"],
                    synopsis=record["synopsis"],
                    # "digest" may be absent from a record.
                    digest=record.get("digest"),
                    model=record["model"],
                    section_hash=record["section_hash"],
                    generated_at=datetime.fromisoformat(record["generated_at"]),
                )
            )
        return cls(tuple(loaded), doc_id=payload["doc_id"], model=payload["model"])

    # -- queries -----------------------------------------------------------

    def get(self, section_id: str) -> SummarySet | None:
        """Return the summary set for ``section_id``, or ``None``."""
        return self._by_id.get(section_id)

    def require(self, section_id: str) -> SummarySet:
        """Return the summary set for ``section_id``.

        Raises:
            IndexNotFoundError: When the section has no summary.
        """
        found = self.get(section_id)
        if found is not None:
            return found
        msg = f"summary not found for section: {section_id!r}"
        raise IndexNotFoundError(msg, details={"section_id": section_id})

    def __contains__(self, section_id: object) -> bool:
        if not isinstance(section_id, str):
            return False
        return section_id in self._by_id

    def __len__(self) -> int:
        return len(self._all)

    def __iter__(self) -> Iterator[SummarySet]:
        return iter(self._all)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
# ---------------------------------------------------------------------------
|
|
258
|
+
# helpers
|
|
259
|
+
# ---------------------------------------------------------------------------
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _dedupe_preserve_order(levels: Sequence[SummaryLevel]) -> list[SummaryLevel]:
|
|
263
|
+
seen: set[SummaryLevel] = set()
|
|
264
|
+
out: list[SummaryLevel] = []
|
|
265
|
+
for lvl in levels:
|
|
266
|
+
if lvl not in seen:
|
|
267
|
+
seen.add(lvl)
|
|
268
|
+
out.append(lvl)
|
|
269
|
+
return out
|