docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,148 @@
1
+ """HeuristicExtractor — regex-based, no model needed.
2
+
3
+ Covers two of the four ``EntityKind`` values:
4
+
5
+ - **code**: identifiers inside fenced code blocks and inline ``` `code` ```
6
+ spans. Filters out common language keywords (Python, JS) and identifiers
7
+ shorter than three characters.
8
+ - **defined**: text inside ``**bold**`` markdown markers that looks like a
9
+ term (no sentence-level punctuation, ≤ 80 chars).
10
+
11
+ Span coordinates are offsets within the *section's* ``raw_text``. The
12
+ :class:`cairn.index.entities.EntityBuilder` preserves this convention end
13
+ to end.
14
+
15
+ LLM-based extraction for ``term`` and ``proper`` kinds is the v0.2.1
16
+ follow-up; this v0.2.0 extractor is fully offline.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import re
22
+ from collections.abc import Iterable, Iterator
23
+ from typing import Final
24
+
25
+ from cairn.core.types import Document, Span
26
+ from cairn.entity.base import ExtractionHit
27
+
28
# A fenced block: an opening ``` line (with an optional info string), a
# lazily matched body captured as group 1, and a closing ``` that directly
# follows a newline. DOTALL lets the body span several lines.
_FENCED_CODE = re.compile(r"```[^\n]*\n(.*?)\n```", re.DOTALL)
# A single-line inline code span; group 1 is the text between the backticks.
_INLINE_CODE = re.compile(r"`([^`\n]+)`")
# A single-line ``**bold**`` span; group 1 is the text between the markers
# and cannot itself contain ``*``.
_BOLD = re.compile(r"\*\*([^*\n]+)\*\*")
# ASCII identifier of at least three characters (one leading letter or
# underscore followed by two or more word characters).
_IDENTIFIER = re.compile(r"[A-Za-z_][A-Za-z0-9_]{2,}")

# Common stopwords that show up in code blocks but carry no entity meaning.
# Conservative: only drops obvious language keywords + built-in names.
# NOTE: entries shorter than three characters ("as", "if", "in", "is", "or")
# can never be produced by ``_IDENTIFIER``; they are listed for completeness.
_CODE_STOPWORDS: Final[frozenset[str]] = frozenset(
    {
        # Python keywords / builtins
        "and", "as", "assert", "async", "await", "break", "class", "continue",
        "def", "del", "elif", "else", "except", "finally", "for", "from",
        "global", "if", "import", "in", "is", "lambda", "nonlocal", "not",
        "or", "pass", "raise", "return", "try", "while", "with", "yield",
        "True", "False", "None", "self", "cls",
        "int", "str", "float", "bool", "list", "dict", "tuple", "set",
        "type", "any", "all", "len", "range", "print",
        # JS/TS keywords
        "const", "let", "var", "function", "this", "new", "throw", "void",
        "typeof", "instanceof", "switch", "case", "default", "extends",
        # Common short words that masquerade as identifiers
        "the", "that", "into",
    }
)

# Maximum length of a "defined" entity in characters (measured after the
# bold text has been stripped of surrounding whitespace).
_DEFINED_MAX_LEN: Final = 80

# Characters that disqualify a bold span from being a defined entity when
# they appear anywhere inside it.
_SENTENCE_MARKERS: Final = frozenset({".", ",", ";", ":", "!", "?"})
58
+
59
+
60
class HeuristicExtractor:
    """Offline entity extractor driven purely by regular expressions."""

    name = "heuristic:regex-v1"

    async def extract(self, document: Document) -> Iterable[ExtractionHit]:
        """Collect the hits of every section of ``document``.

        Sections are visited in document order; the result is a fully
        materialized list, with each section's hits in scan order.
        """
        return [
            found
            for sec in document.sections
            for found in _scan_section(sec.id, sec.raw_text)
        ]
71
+
72
+
73
def _scan_section(section_id: str, text: str) -> Iterator[ExtractionHit]:
    """Yield all hits for one section: code identifiers first, then bold terms."""
    for scanner in (_scan_code, _scan_defined):
        yield from scanner(section_id, text)
76
+
77
+
78
def _scan_code(section_id: str, text: str) -> Iterator[ExtractionHit]:
    """Yield a ``code`` hit for every identifier found inside code markup.

    Fenced blocks are scanned first, followed by inline code spans that do
    not begin inside a fenced block. Identifiers listed in
    ``_CODE_STOPWORDS`` are dropped. Span offsets are relative to ``text``.

    Args:
        section_id: Identifier copied onto every emitted hit.
        text: The section text to scan.
    """
    fence_ranges: list[tuple[int, int]] = []
    for fence in _FENCED_CODE.finditer(text):
        # Record the whole fence (markers included) so the inline pass can
        # skip spans that start inside it without re-running the regex.
        fence_ranges.append((fence.start(), fence.end()))
        yield from _identifier_hits(section_id, fence.group(1), fence.start(1))

    for inline in _INLINE_CODE.finditer(text):
        if _inside_any(inline.start(), fence_ranges):
            continue
        # Multi-word inline spans are scanned as well; every identifier in
        # them becomes its own hit.
        yield from _identifier_hits(section_id, inline.group(1), inline.start(1))


def _identifier_hits(
    section_id: str, body: str, offset: int
) -> Iterator[ExtractionHit]:
    """Yield one ``code`` hit per non-stopword identifier in ``body``.

    ``offset`` is the position of ``body`` within the section text; it is
    added to each match position so spans stay section-relative.
    """
    for ident in _IDENTIFIER.finditer(body):
        name = ident.group()
        if name in _CODE_STOPWORDS:
            continue
        yield ExtractionHit(
            section_id=section_id,
            canonical=name,
            surface_form=name,
            kind="code",
            span=Span(
                start=offset + ident.start(),
                end=offset + ident.end(),
            ),
        )
128
+
129
+
130
def _scan_defined(section_id: str, text: str) -> Iterator[ExtractionHit]:
    """Yield a ``defined`` hit for each ``**bold**`` span that looks like a term.

    A bold span qualifies when, after stripping surrounding whitespace, it is
    non-empty, at most ``_DEFINED_MAX_LEN`` characters long, and contains none
    of the characters in ``_SENTENCE_MARKERS``.

    The emitted span covers exactly the stripped term, so slicing ``text``
    with it reproduces ``surface_form`` even when the bold markers enclose
    leading or trailing whitespace.

    Args:
        section_id: Identifier copied onto every emitted hit.
        text: The section text to scan.
    """
    for bold in _BOLD.finditer(text):
        inner_raw = bold.group(1)
        inner = inner_raw.strip()
        if not inner or len(inner) > _DEFINED_MAX_LEN:
            continue
        if any(marker in inner for marker in _SENTENCE_MARKERS):
            continue
        # Shift the start past any leading whitespace removed by strip() so
        # the span and the surface form describe the same characters.
        leading = len(inner_raw) - len(inner_raw.lstrip())
        start = bold.start(1) + leading
        yield ExtractionHit(
            section_id=section_id,
            canonical=inner,
            surface_form=inner,
            kind="defined",
            span=Span(start=start, end=start + len(inner)),
        )
145
+
146
+
147
def _inside_any(pos: int, ranges: list[tuple[int, int]]) -> bool:
    """Return True when ``pos`` lies in any half-open ``[start, end)`` range."""
    for lo, hi in ranges:
        if lo <= pos < hi:
            return True
    return False
@@ -0,0 +1,39 @@
1
+ """Index layer — the five sub-indexes (Tree, Summaries, Entities, XRefs, Vectors)."""
2
+
3
+ from cairn.index.entities import ENTITIES_FILENAME, Entities, EntityBuilder
4
+ from cairn.index.summaries import (
5
+ SUMMARIES_FILENAME,
6
+ Summaries,
7
+ SummaryBuilder,
8
+ section_hash,
9
+ )
10
+ from cairn.index.tree import TREE_FILENAME, Tree, TreeBuilder
11
+ from cairn.index.vectors import (
12
+ VECTORS_DB_DIRNAME,
13
+ VECTORS_MANIFEST_FILENAME,
14
+ VectorBuilder,
15
+ VectorHit,
16
+ Vectors,
17
+ )
18
+ from cairn.index.xrefs import XREFS_FILENAME, XRefBuilder, XRefs
19
+
20
+ __all__ = [
21
+ "ENTITIES_FILENAME",
22
+ "SUMMARIES_FILENAME",
23
+ "TREE_FILENAME",
24
+ "VECTORS_DB_DIRNAME",
25
+ "VECTORS_MANIFEST_FILENAME",
26
+ "XREFS_FILENAME",
27
+ "Entities",
28
+ "EntityBuilder",
29
+ "Summaries",
30
+ "SummaryBuilder",
31
+ "Tree",
32
+ "TreeBuilder",
33
+ "VectorBuilder",
34
+ "VectorHit",
35
+ "Vectors",
36
+ "XRefBuilder",
37
+ "XRefs",
38
+ "section_hash",
39
+ ]
@@ -0,0 +1,244 @@
1
+ """Entities sub-index — extractor outputs, deduplicated and persisted.
2
+
3
+ The :class:`EntityBuilder` runs an :class:`cairn.entity.base.EntityExtractor`
4
+ over a Document, aggregates ``ExtractionHit`` records by ``(canonical, kind)``
5
+ into :class:`cairn.core.types.Entity`, and writes ``entities.json``.
6
+
7
+ The :class:`Entities` reader exposes lookup by canonical / surface form /
8
+ section, with optional ``kind`` filtering — exactly what the
9
+ ``find_mentions`` retrieval tool needs.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from collections import defaultdict
16
+ from collections.abc import Iterable, Iterator
17
+ from datetime import UTC, datetime
18
+ from pathlib import Path
19
+ from typing import Any, Final
20
+
21
+ from cairn.core.errors import IndexBuildError, IndexNotFoundError
22
+ from cairn.core.types import Document, Entity, EntityKind, Mention, Span
23
+ from cairn.entity.base import EntityExtractor, ExtractionHit
24
+
25
# File name of the persisted entities sub-index inside a document directory.
ENTITIES_FILENAME: Final = "entities.json"
# Written as ``format_version`` by the builder; ``Entities.load`` rejects any
# payload whose ``format_version`` differs from this value.
ENTITIES_FORMAT_VERSION: Final = 1
27
+
28
+
29
class EntityBuilder:
    """Drive an extractor over a document and write ``entities.json``."""

    def __init__(self, extractor: EntityExtractor) -> None:
        self.extractor = extractor

    async def build(self, document: Document, *, out_dir: Path) -> Path:
        """Extract, aggregate and persist the entities of ``document``.

        Args:
            document: The document handed to the extractor.
            out_dir: Destination directory; created (with parents) if absent.

        Returns:
            Path of the written ``entities.json``.
        """
        out_dir.mkdir(parents=True, exist_ok=True)
        target = out_dir / ENTITIES_FILENAME

        raw_hits = await self.extractor.extract(document)
        aggregated = _aggregate(raw_hits)
        stamp = datetime.now(UTC)

        payload: dict[str, Any] = {
            "format_version": ENTITIES_FORMAT_VERSION,
            "doc_id": document.id,
            "extractor": self.extractor.name,
            "generated_at": stamp.isoformat(),
            "entities": [_entity_to_dict(entity) for entity in aggregated],
        }

        with target.open("w", encoding="utf-8") as sink:
            json.dump(payload, sink, ensure_ascii=False, indent=2)
            # Finish the file with a newline.
            sink.write("\n")
        return target
55
+
56
+
57
class Entities:
    """Loaded entities sub-index. Read-only queries.

    Lookup tables are built once in ``__init__``:

    - ``_by_canonical``: ``(canonical, kind)`` -> entity.
    - ``_by_name``: canonical name -> entities with that canonical, in input
      order (an entity per kind may share the same canonical).
    - ``_by_surface``: surface form -> entities listing that form.
    - ``_by_section``: section id -> entities mentioned in that section, each
      entity listed once per section regardless of how often it is mentioned.
    """

    def __init__(
        self,
        entities: tuple[Entity, ...],
        *,
        doc_id: str,
        extractor: str,
    ) -> None:
        self._all = entities
        self.doc_id = doc_id
        self.extractor = extractor

        # Indexes
        self._by_canonical: dict[tuple[str, str], Entity] = {
            (e.canonical, e.kind): e for e in entities
        }
        self._by_name: dict[str, list[Entity]] = defaultdict(list)
        self._by_surface: dict[str, list[Entity]] = defaultdict(list)
        self._by_section: dict[str, list[Entity]] = defaultdict(list)
        for ent in entities:
            self._by_name[ent.canonical].append(ent)
            for sf in ent.surface_forms:
                self._by_surface[sf].append(ent)
            # An entity usually has several mentions per section; register it
            # only once per section so by_section() returns no duplicates.
            listed_sections: set[str] = set()
            for mention in ent.mentions:
                section_id = mention.section_id
                if section_id in listed_sections:
                    continue
                listed_sections.add(section_id)
                self._by_section[section_id].append(ent)

    @classmethod
    def load(cls, doc_dir: Path) -> Entities:
        """Load ``entities.json`` from a document directory.

        Raises:
            IndexNotFoundError: If the file is missing or its
                ``format_version`` is not ``ENTITIES_FORMAT_VERSION``.
        """
        path = doc_dir / ENTITIES_FILENAME
        if not path.exists():
            msg = f"entities.json not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(path)})

        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)

        version = payload.get("format_version")
        if version != ENTITIES_FORMAT_VERSION:
            msg = (
                f"unsupported entities format version: {version!r} "
                f"(expected {ENTITIES_FORMAT_VERSION})"
            )
            raise IndexNotFoundError(msg, details={"path": str(path)})

        entities = tuple(_entity_from_dict(d) for d in payload["entities"])
        return cls(
            entities,
            doc_id=payload["doc_id"],
            extractor=payload["extractor"],
        )

    # -- queries -----------------------------------------------------------

    def __len__(self) -> int:
        return len(self._all)

    def __iter__(self) -> Iterator[Entity]:
        return iter(self._all)

    def lookup(
        self,
        name: str,
        *,
        kinds: tuple[EntityKind, ...] | None = None,
    ) -> Entity | None:
        """Return the first matching entity by canonical or any surface form.

        Precedence: canonical match before surface-form match. When ``kinds``
        is supplied, only entities of those kinds are considered. Returns
        ``None`` when nothing matches.
        """
        for ent in self._candidates(name):
            if kinds is None or ent.kind in kinds:
                return ent
        return None

    def lookup_all(
        self,
        name: str,
        *,
        kinds: tuple[EntityKind, ...] | None = None,
    ) -> list[Entity]:
        """Return every matching entity (across kinds) for ``name``.

        Results keep the candidate order (canonical matches first) and hold
        at most one entity per ``(canonical, kind)`` pair.
        """
        seen: set[tuple[str, str]] = set()
        out: list[Entity] = []
        for ent in self._candidates(name):
            key = (ent.canonical, ent.kind)
            if key in seen:
                continue
            if kinds is None or ent.kind in kinds:
                out.append(ent)
                seen.add(key)
        return out

    def by_section(self, section_id: str) -> list[Entity]:
        """Return the entities mentioned in ``section_id`` (a fresh list)."""
        return list(self._by_section.get(section_id, ()))

    def by_kind(self, kind: EntityKind) -> list[Entity]:
        """Return all entities of the given kind, in input order."""
        return [e for e in self._all if e.kind == kind]

    def _candidates(self, name: str) -> Iterator[Entity]:
        """Yield canonical matches for ``name``, then surface-form matches."""
        # Canonical hits first, across all kinds, in extractor order.
        # ``.get`` keeps the defaultdicts from growing on misses.
        yield from self._by_name.get(name, ())
        # Then surface-form matches that were not already canonical hits.
        for ent in self._by_surface.get(name, ()):
            if ent.canonical != name:
                yield ent
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Aggregation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
def _aggregate(hits: Iterable[ExtractionHit]) -> list[Entity]:
    """Fold ``ExtractionHit`` stream into a flat list of :class:`Entity`.

    Hits are grouped by ``(canonical, kind)``. Each group collects its
    distinct surface forms (first-seen order) and one mention per hit. The
    returned entities follow the order in which each group was first seen.

    Raises:
        IndexBuildError: If a hit has an empty ``canonical``.
    """
    # Dicts preserve insertion order, so the mapping alone records the
    # first-seen order of the groups.
    by_key: dict[tuple[str, str], _Acc] = {}

    for hit in hits:
        if not hit.canonical:
            msg = "extractor emitted an empty canonical"
            raise IndexBuildError(msg)
        key = (hit.canonical, hit.kind)
        acc = by_key.get(key)
        if acc is None:
            acc = by_key[key] = _Acc(canonical=hit.canonical, kind=hit.kind)
        acc.surface_forms[hit.surface_form] = None
        acc.mentions.append(Mention(section_id=hit.section_id, span=hit.span))

    return [acc.freeze() for acc in by_key.values()]
191
+
192
+
193
class _Acc:
    """Mutable accumulator for one ``(canonical, kind)`` group of hits."""

    __slots__ = ("canonical", "kind", "mentions", "surface_forms")

    def __init__(self, *, canonical: str, kind: EntityKind) -> None:
        self.mentions: list[Mention] = []
        # A dict with None values serves as an insertion-ordered set.
        self.surface_forms: dict[str, None] = {}
        self.kind = kind
        self.canonical = canonical

    def freeze(self) -> Entity:
        """Snapshot the accumulated state as an :class:`Entity` with tuples."""
        forms = tuple(self.surface_forms)
        found = tuple(self.mentions)
        return Entity(
            canonical=self.canonical,
            surface_forms=forms,
            kind=self.kind,
            mentions=found,
        )
210
+
211
+
212
+ # ---------------------------------------------------------------------------
213
+ # Serialization
214
+ # ---------------------------------------------------------------------------
215
+
216
+
217
def _entity_to_dict(e: Entity) -> dict[str, Any]:
    """Convert an entity into the JSON-ready mapping stored in ``entities.json``."""
    mention_rows: list[dict[str, Any]] = []
    for item in e.mentions:
        bounds = {"start": item.span.start, "end": item.span.end}
        mention_rows.append({"section_id": item.section_id, "span": bounds})

    record: dict[str, Any] = {}
    record["canonical"] = e.canonical
    record["surface_forms"] = list(e.surface_forms)
    record["kind"] = e.kind
    record["mentions"] = mention_rows
    return record
230
+
231
+
232
def _entity_from_dict(d: dict[str, Any]) -> Entity:
    """Rebuild an :class:`Entity` from one record of ``entities.json``."""
    canonical = d["canonical"]
    forms = tuple(d["surface_forms"])
    kind = d["kind"]

    restored: list[Mention] = []
    for row in d["mentions"]:
        section_id = row["section_id"]
        span = Span(start=row["span"]["start"], end=row["span"]["end"])
        restored.append(Mention(section_id=section_id, span=span))

    return Entity(
        canonical=canonical,
        surface_forms=forms,
        kind=kind,
        mentions=tuple(restored),
    )
@@ -0,0 +1,269 @@
1
+ """Summaries sub-index — per-section multi-granularity summaries.
2
+
3
+ ``SummaryBuilder`` runs at indexing time and writes ``summaries.json``;
4
+ ``Summaries`` loads that file and serves queries. Both honor the
5
+ :class:`cairn.core.types.SummarySet` contract.
6
+
7
+ The cache layer (``cairn.summarize.cache.SummaryCache``) is keyed by
8
+ ``(summarizer.name, level, section_hash)``. Switching to a different
9
+ summarizer transparently invalidates prior cache entries; editing a section
10
+ invalidates only that section's entries.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ import hashlib
17
+ import json
18
+ from collections.abc import Awaitable, Callable, Iterator, Sequence
19
+ from datetime import UTC, datetime
20
+ from pathlib import Path
21
+ from typing import Any, Final
22
+
23
+ from cairn.core.errors import IndexBuildError, IndexNotFoundError
24
+ from cairn.core.types import Document, SectionNode, SummarySet
25
+ from cairn.summarize.base import Summarizer, SummaryLevel
26
+ from cairn.summarize.cache import SummaryCache
27
+
28
# File name of the persisted summaries sub-index inside a document directory.
SUMMARIES_FILENAME: Final = "summaries.json"
# Written as ``format_version`` by the builder; ``Summaries.load`` rejects any
# payload whose ``format_version`` differs from this value.
SUMMARIES_FORMAT_VERSION: Final = 1
30
+
31
+
32
def section_hash(node: SectionNode) -> str:
    """Return the hex fingerprint used to invalidate a section's cached summaries.

    The digest is ``sha256(title || NUL || raw_text)`` over the UTF-8
    encodings, so a change to either the title or the body yields a new hash.
    """
    title_bytes = node.title.encode("utf-8")
    body_bytes = node.raw_text.encode("utf-8")
    return hashlib.sha256(b"\x00".join((title_bytes, body_bytes))).hexdigest()
43
+
44
+
45
class SummaryBuilder:
    """Asynchronously generate and persist summaries for a Document.

    Sections are summarized concurrently; the levels of one section are
    produced one after another. At most ``concurrency`` summarizer calls are
    in flight at any time. Cached summaries (when a cache is supplied) are
    reused without calling the summarizer.
    """

    def __init__(
        self,
        summarizer: Summarizer,
        *,
        cache: SummaryCache | None = None,
        concurrency: int = 4,
        progress: Callable[[int, int], None] | None = None,
    ) -> None:
        """Configure the builder.

        Args:
            summarizer: Produces the summary text for a section and level.
            cache: Optional cache consulted before, and filled after, each
                summarizer call.
            concurrency: Maximum number of simultaneous summarizer calls.
            progress: Optional callback invoked as ``progress(done, total)``.

        Raises:
            IndexBuildError: If ``concurrency`` is below 1.
        """
        if concurrency < 1:
            msg = f"concurrency must be ≥1; got {concurrency}"
            raise IndexBuildError(msg)
        self.summarizer = summarizer
        self.cache = cache
        self.concurrency = concurrency
        self.progress = progress

    async def build(
        self,
        document: Document,
        *,
        out_dir: Path,
        levels: Sequence[SummaryLevel] = (
            SummaryLevel.GIST,
            SummaryLevel.SYNOPSIS,
            SummaryLevel.DIGEST,
        ),
    ) -> Path:
        """Summarize every section in ``document`` and write ``summaries.json``.

        If summarizing any section fails, the remaining in-flight section
        tasks are cancelled before the error is re-raised, so no summarizer
        calls keep running after ``build`` has returned control.

        Args:
            document: The parsed document.
            out_dir: Directory to write into. Created if absent.
            levels: Which granularity levels to generate. Order is preserved
                and duplicates are dropped. Defaults to all three levels
                (gist, synopsis, digest) since v0.2.4.

        Returns:
            Path to the written ``summaries.json``.

        Raises:
            IndexBuildError: If ``levels`` is empty.
        """
        ordered_levels = _dedupe_preserve_order(levels)
        if not ordered_levels:
            msg = "at least one SummaryLevel is required"
            raise IndexBuildError(msg)

        out_dir.mkdir(parents=True, exist_ok=True)
        path = out_dir / SUMMARIES_FILENAME
        now = datetime.now(UTC)
        semaphore = asyncio.Semaphore(self.concurrency)
        progress_lock = asyncio.Lock()
        completed = 0
        total = len(document.sections) * len(ordered_levels)

        async def mark_progress() -> None:
            nonlocal completed
            async with progress_lock:
                completed += 1
                if self.progress is None:
                    return
                # Throttle the callback: report the first and the last unit,
                # plus every ``step``-th unit in between.
                step = max(5, total // 20)
                if completed != 1 and completed != total and completed % step != 0:
                    return
                self.progress(completed, total)

        async def for_section(node: SectionNode) -> dict[str, Any]:
            return await self._summarize_section(
                node,
                ordered_levels,
                semaphore,
                now,
                mark_progress=mark_progress,
            )

        tasks = [
            asyncio.ensure_future(for_section(s)) for s in document.sections
        ]
        try:
            records = await asyncio.gather(*tasks)
        except Exception:
            # gather() does not cancel the sibling tasks when one of them
            # fails; stop them explicitly and wait for them to settle so they
            # do not go on calling the summarizer in the background.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise

        payload: dict[str, Any] = {
            "format_version": SUMMARIES_FORMAT_VERSION,
            "doc_id": document.id,
            "model": self.summarizer.name,
            "levels": [lvl.value for lvl in ordered_levels],
            "generated_at": now.isoformat(),
            "summaries": list(records),
        }

        with path.open("w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
            fh.write("\n")
        return path

    async def _summarize_section(
        self,
        node: SectionNode,
        levels: Sequence[SummaryLevel],
        semaphore: asyncio.Semaphore,
        now: datetime,
        mark_progress: Callable[[], Awaitable[None]],
    ) -> dict[str, Any]:
        """Produce the ``summaries.json`` record for a single section.

        For each requested level the cache is tried first; on a miss the
        summarizer is called under ``semaphore`` and the result is stored in
        the cache. ``mark_progress`` is awaited once per level either way.

        Returns:
            A record with ``gist`` and ``synopsis`` defaulting to ``""`` and
            ``digest`` defaulting to ``None`` when the level was not requested.
        """
        sh = section_hash(node)
        results: dict[str, str] = {}

        for level in levels:
            cache_key: str | None = None
            cached: str | None = None
            if self.cache is not None:
                cache_key = SummaryCache.key(
                    model=self.summarizer.name,
                    level=level.value,
                    section_hash=sh,
                )
                cached = self.cache.get(cache_key)

            if cached is not None:
                results[level.value] = cached
                await mark_progress()
                continue

            # Only the summarizer call itself counts against the limit.
            async with semaphore:
                text = await self.summarizer.summarize(
                    title=node.title,
                    body=node.raw_text,
                    level=level,
                )
            results[level.value] = text
            await mark_progress()
            if self.cache is not None and cache_key is not None:
                self.cache.put(cache_key, text)

        return {
            "section_id": node.id,
            "section_hash": sh,
            "gist": results.get(SummaryLevel.GIST.value, ""),
            "synopsis": results.get(SummaryLevel.SYNOPSIS.value, ""),
            "digest": results.get(SummaryLevel.DIGEST.value),
            "model": self.summarizer.name,
            "generated_at": now.isoformat(),
        }
185
+
186
+
187
class Summaries:
    """Read-only view over a loaded ``summaries.json``, keyed by section id."""

    def __init__(
        self,
        sets: tuple[SummarySet, ...],
        *,
        doc_id: str,
        model: str,
    ) -> None:
        self.doc_id = doc_id
        self.model = model
        self._all = sets
        # When two sets share a section id, the later one wins.
        self._by_id: dict[str, SummarySet] = {}
        for entry in sets:
            self._by_id[entry.section_id] = entry

    @classmethod
    def load(cls, doc_dir: Path) -> Summaries:
        """Read ``summaries.json`` from ``doc_dir`` and build the index.

        Raises:
            IndexNotFoundError: If the file is missing or its
                ``format_version`` is not ``SUMMARIES_FORMAT_VERSION``.
        """
        path = doc_dir / SUMMARIES_FILENAME
        if not path.exists():
            msg = f"summaries.json not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(path)})

        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)

        version = payload.get("format_version")
        if version != SUMMARIES_FORMAT_VERSION:
            msg = (
                f"unsupported summaries format version: {version!r} "
                f"(expected {SUMMARIES_FORMAT_VERSION})"
            )
            raise IndexNotFoundError(msg, details={"path": str(path)})

        loaded: list[SummarySet] = []
        for record in payload["summaries"]:
            loaded.append(
                SummarySet(
                    section_id=record["section_id"],
                    gist=record["gist"],
                    synopsis=record["synopsis"],
                    digest=record.get("digest"),
                    model=record["model"],
                    section_hash=record["section_hash"],
                    generated_at=datetime.fromisoformat(record["generated_at"]),
                )
            )
        return cls(tuple(loaded), doc_id=payload["doc_id"], model=payload["model"])

    # -- queries -----------------------------------------------------------

    def get(self, section_id: str) -> SummarySet | None:
        """Return the summaries of ``section_id``, or ``None`` when unknown."""
        return self._by_id.get(section_id)

    def require(self, section_id: str) -> SummarySet:
        """Return the summaries of ``section_id`` or raise ``IndexNotFoundError``."""
        found = self.get(section_id)
        if found is not None:
            return found
        msg = f"summary not found for section: {section_id!r}"
        raise IndexNotFoundError(msg, details={"section_id": section_id})

    def __contains__(self, section_id: object) -> bool:
        if not isinstance(section_id, str):
            return False
        return section_id in self._by_id

    def __len__(self) -> int:
        return len(self._all)

    def __iter__(self) -> Iterator[SummarySet]:
        return iter(self._all)
255
+
256
+
257
+ # ---------------------------------------------------------------------------
258
+ # helpers
259
+ # ---------------------------------------------------------------------------
260
+
261
+
262
def _dedupe_preserve_order(levels: Sequence[SummaryLevel]) -> list[SummaryLevel]:
    """Return ``levels`` without repeats, keeping each first occurrence in order."""
    # dict keys are unique and insertion-ordered, which gives exactly the
    # "first occurrence wins" de-duplication.
    return list(dict.fromkeys(levels))