docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/ingest/pdf.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"""PDF parser (pymupdf baseline).
|
|
2
|
+
|
|
3
|
+
Two extraction paths:
|
|
4
|
+
|
|
5
|
+
1. **Outline-based** (preferred). When the PDF carries an outline /
|
|
6
|
+
bookmarks (``doc.get_toc()``), use it directly: each entry becomes a
|
|
7
|
+
``SectionNode`` with the same hierarchical slug convention as the
|
|
8
|
+
Markdown parser.
|
|
9
|
+
2. **Heuristic fallback**. When no outline exists, look at text spans
|
|
10
|
+
with their font sizes; treat blocks whose size exceeds 1.3x the
|
|
11
|
+
median block size as level-1 headings (no nesting). The fallback is
|
|
12
|
+
honest about its limits — for serious PDFs you'll want to add a
|
|
13
|
+
table-of-contents (``pdftk update_info`` or any PDF editor) before
|
|
14
|
+
indexing.
|
|
15
|
+
|
|
16
|
+
Spans use the same convention as ``MarkdownParser``: byte offsets into a
|
|
17
|
+
canonicalized full-document text built by concatenating per-page text
|
|
18
|
+
with ``\\n\\n`` separators. ``source_path`` is preserved for traceability.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import hashlib
|
|
24
|
+
import statistics
|
|
25
|
+
from collections import defaultdict
|
|
26
|
+
from datetime import UTC, datetime
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
import fitz
|
|
31
|
+
from slugify import slugify
|
|
32
|
+
|
|
33
|
+
from cairn import __version__
|
|
34
|
+
from cairn.core.errors import ParseError
|
|
35
|
+
from cairn.core.types import Document, SectionNode, Span
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class PdfParser:
    """pymupdf-backed PDF parser.

    Opens a PDF with ``fitz``, extracts per-page text, and builds sections
    from the PDF outline when one exists, otherwise from a font-size
    heuristic.
    """

    name = "pdf"
    extensions: tuple[str, ...] = (".pdf",)

    def parse(
        self,
        source: Path | bytes | str,
        *,
        doc_id: str | None = None,
    ) -> Document:
        """Parse a PDF into a ``Document``.

        Args:
            source: A filesystem ``Path``, raw PDF ``bytes``, or a ``str``
                (which is UTF-8 encoded and then treated like ``bytes``).
            doc_id: Explicit document id. Optional for ``Path`` sources
                (defaults to the slugified filename stem); required otherwise.

        Returns:
            A ``Document`` whose ``source_hash`` is the SHA-256 of the
            extracted, concatenated text (not of the raw PDF bytes).

        Raises:
            ParseError: If the PDF cannot be opened, or no document id can be
                determined.
        """
        source_path, resolved_doc_id, doc = self._open(source, doc_id)

        try:
            pages_text, page_offsets, full_text = _extract_text(doc)
            # Prefer the detailed TOC; fall back to the simple one, then to none.
            toc = doc.get_toc(simple=False) or doc.get_toc() or []
            if toc:
                sections = _sections_from_outline(toc, pages_text, page_offsets, full_text)
            else:
                sections = _sections_from_heuristic(doc, pages_text, page_offsets, full_text)
        finally:
            # The heuristic path still reads from ``doc``, so close only here.
            doc.close()

        text_bytes = full_text.encode("utf-8")
        return Document(
            id=resolved_doc_id,
            source_path=source_path,
            source_hash=hashlib.sha256(text_bytes).hexdigest(),
            sections=tuple(sections),
            indexed_at=datetime.now(UTC),
            cairn_version=__version__,
        )

    @staticmethod
    def _open(
        source: Path | bytes | str,
        doc_id: str | None,
    ) -> tuple[Path, str, Any]:
        """Open ``source`` and resolve the document id.

        Returns:
            ``(source_path, doc_id, open_fitz_document)``. For in-memory
            sources the path is the synthetic ``<in-memory:{doc_id}>``.
            The caller owns the returned document and must close it.

        Raises:
            ParseError: If the PDF cannot be opened, if ``doc_id`` is missing
                for a non-path source, or if no slug can be derived from the
                filename stem.
        """
        if isinstance(source, Path):
            try:
                doc = fitz.open(str(source))
            except Exception as exc:  # pymupdf raises a variety of errors
                msg = f"could not open PDF: {source}"
                raise ParseError(msg, details={"path": str(source)}) from exc
            try:
                resolved_id = doc_id or _slug_or_raise(source.stem, ctx="filename stem")
            except ParseError:
                # Do not leak the already-open document when the id cannot be derived.
                doc.close()
                raise
            return source, resolved_id, doc

        if doc_id is None:
            msg = "doc_id is required when source is not a path"
            raise ParseError(msg)

        if isinstance(source, str):
            source = source.encode("utf-8")
        try:
            doc = fitz.open(stream=source, filetype="pdf")
        except Exception as exc:
            msg = "could not open PDF from bytes"
            raise ParseError(msg) from exc
        return Path(f"<in-memory:{doc_id}>"), doc_id, doc
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ---------------------------------------------------------------------------
|
|
101
|
+
# Text + offsets
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _extract_text(doc: Any) -> tuple[list[str], list[int], str]:
|
|
106
|
+
"""Return per-page text, per-page byte offset, and the concatenated text."""
|
|
107
|
+
pages: list[str] = [page.get_text("text") or "" for page in doc]
|
|
108
|
+
full_text = "\n\n".join(pages)
|
|
109
|
+
offsets: list[int] = []
|
|
110
|
+
running = 0
|
|
111
|
+
for i, page_text in enumerate(pages):
|
|
112
|
+
offsets.append(running)
|
|
113
|
+
running += len(page_text.encode("utf-8"))
|
|
114
|
+
if i < len(pages) - 1:
|
|
115
|
+
running += len(b"\n\n")
|
|
116
|
+
return pages, offsets, full_text
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
# Outline-based section building
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _sections_from_outline(
    toc: list[Any],
    pages_text: list[str],
    page_offsets: list[int],
    full_text: str,
) -> list[SectionNode]:
    """Build sections from a PDF outline.

    ``toc`` is the output of ``doc.get_toc(simple=False)`` — each row is
    ``[level, title, page, dest_dict]`` (1-indexed page). The ``simple``
    variant drops the dest dict but is otherwise identical.

    Args:
        toc: Raw outline rows.
        pages_text: Per-page text; only its length is used here.
        page_offsets: Byte offset of each page inside ``full_text``.
        full_text: The concatenated document text.

    Returns:
        One ``SectionNode`` per usable outline entry, in outline order. Each
        span starts at the beginning of the entry's page and runs to the next
        entry at the same or a shallower level; ``raw_text`` stops at the next
        entry of any level. Returns ``[]`` when the outline has no usable
        rows or the document has no pages.
    """
    entries = _normalize_toc(toc)
    n_pages = len(pages_text)
    if not entries or n_pages == 0:
        return []

    # Encode once; every offset below indexes into this byte string.
    text_bytes = full_text.encode("utf-8")
    total_bytes = len(text_bytes)

    # For each entry, the byte offset where its page starts. Out-of-range
    # page numbers are clamped onto the first/last page.
    territory_start_byte: list[int] = [
        page_offsets[max(0, min(entry["page"] - 1, n_pages - 1))] for entry in entries
    ]

    # Territory end: where the next entry at the same-or-shallower level begins.
    territory_end_byte: list[int] = []
    for i, entry in enumerate(entries):
        end = total_bytes
        for j in range(i + 1, len(entries)):
            if entries[j]["level"] <= entry["level"]:
                end = territory_start_byte[j]
                break
        # An outline whose entries are not in page order could otherwise
        # yield end < start; collapse that to an empty span instead.
        territory_end_byte.append(max(territory_start_byte[i], end))

    # Hierarchical slug IDs + parent/child links — same convention as Markdown.
    metadata: list[tuple[str, str, int, str | None, tuple[str, ...]]] = []
    stack: list[tuple[int, str, str]] = []  # (level, section_id, title) of open ancestors
    sibling_counts: dict[tuple[str, str], int] = defaultdict(int)

    for entry in entries:
        level = entry["level"]
        title = entry["title"]
        # Drop ancestors that are at the same or a deeper level than this entry.
        while stack and stack[-1][0] >= level:
            stack.pop()
        parent_id = stack[-1][1] if stack else None
        slug = _safe_slug(title)
        key = (parent_id or "", slug)
        sibling_counts[key] += 1
        count = sibling_counts[key]
        # Repeated titles under one parent get a numeric suffix from the 2nd on.
        unique_slug = slug if count == 1 else f"{slug}-{count}"
        section_id = f"{parent_id}/{unique_slug}" if parent_id else unique_slug
        path = (*(t for _, _, t in stack), title)
        metadata.append((section_id, title, level, parent_id, path))
        stack.append((level, section_id, title))

    children_map: dict[str, list[str]] = defaultdict(list)
    for sid, _t, _l, parent_id, _p in metadata:
        if parent_id is not None:
            children_map[parent_id].append(sid)

    sections: list[SectionNode] = []
    for idx, (section_id, title, level, parent_id, path) in enumerate(metadata):
        span_start = territory_start_byte[idx]
        span_end = territory_end_byte[idx]
        # raw_text excludes descendant bodies: stop at the next ANY-level entry.
        raw_end = span_end
        if idx + 1 < len(entries):
            raw_end = max(span_start, min(raw_end, territory_start_byte[idx + 1]))
        raw_text = text_bytes[span_start:raw_end].decode("utf-8", errors="replace")

        sections.append(
            SectionNode(
                id=section_id,
                title=title,
                level=min(6, max(1, level)),
                parent=parent_id,
                children=tuple(children_map.get(section_id, ())),
                span=Span(start=span_start, end=span_end),
                path=path,
                raw_text=raw_text,
            )
        )
    return sections
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _normalize_toc(toc: list[Any]) -> list[dict[str, Any]]:
|
|
213
|
+
"""Flatten pymupdf's TOC into ``[{level, title, page}]`` dicts."""
|
|
214
|
+
out: list[dict[str, Any]] = []
|
|
215
|
+
for row in toc:
|
|
216
|
+
if not row or len(row) < 3:
|
|
217
|
+
continue
|
|
218
|
+
level = int(row[0])
|
|
219
|
+
title = str(row[1]).strip()
|
|
220
|
+
page = int(row[2])
|
|
221
|
+
if not title:
|
|
222
|
+
continue
|
|
223
|
+
out.append({"level": level, "title": title, "page": page})
|
|
224
|
+
return out
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ---------------------------------------------------------------------------
|
|
228
|
+
# Heuristic fallback (no outline)
|
|
229
|
+
# ---------------------------------------------------------------------------
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _sections_from_heuristic(
    doc: Any,
    pages_text: list[str],
    page_offsets: list[int],
    full_text: str,
) -> list[SectionNode]:
    """Best-effort: treat large-font lines as level-1 headings.

    A line is a heading candidate when its largest span font size is at
    least 1.3x the median size of all non-blank spans and its text is
    non-empty and at most 200 characters long.

    The fallback only emits level-1 sections. Authors with no PDF outline
    should add one before indexing serious documents.

    Args:
        doc: The open pymupdf document.
        pages_text: Per-page plain text, index-aligned with the pages of ``doc``.
        page_offsets: Byte offset of each page inside ``full_text``.
        full_text: The concatenated document text.

    Returns:
        Sections ordered by byte offset, each running to the next heading (or
        to the end of the text). When no sized text or no heading is found,
        the result of ``_placeholder_section(full_text)``.
    """
    span_sizes: list[float] = []
    # (page index, largest span size on the line, stripped line text)
    candidates: list[tuple[int, float, str]] = []

    # Single pass: ``get_text("dict")`` is expensive, so gather both the size
    # statistics and the candidate lines from one extraction per page.
    for page_idx, page in enumerate(doc):
        for block in page.get_text("dict").get("blocks", []):
            for line in block.get("lines", []):
                spans = line.get("spans", [])
                if not spans:
                    continue
                for span in spans:
                    size = float(span.get("size", 0.0))
                    if size > 0 and span.get("text", "").strip():
                        span_sizes.append(size)
                title = "".join(s.get("text", "") for s in spans).strip()
                if not title or len(title) > 200:
                    continue
                max_size = max(float(s.get("size", 0.0)) for s in spans)
                candidates.append((page_idx, max_size, title))

    if not span_sizes:
        # Document has no extractable text. Return a single placeholder section.
        return _placeholder_section(full_text)

    # Median size approximates body text; headings sit well above it.
    threshold = statistics.median(span_sizes) * 1.3

    headings: list[tuple[str, int]] = []  # (title, byte_offset)
    # Per-page search cursor, so a heading repeated on one page resolves to
    # its own occurrence instead of always matching the first one.
    cursors: dict[int, int] = defaultdict(int)
    for page_idx, max_size, title in candidates:
        if max_size < threshold:
            continue
        page_text = pages_text[page_idx]
        offset_in_page = page_text.find(title, cursors[page_idx])
        if offset_in_page >= 0:
            cursors[page_idx] = offset_in_page + len(title)
        else:
            # Extraction order may differ from reading order; retry page-wide.
            offset_in_page = page_text.find(title)
            if offset_in_page < 0:
                continue
        # Convert the character position into a byte offset in full_text.
        byte_offset = page_offsets[page_idx] + len(page_text[:offset_in_page].encode("utf-8"))
        headings.append((title, byte_offset))

    if not headings:
        return _placeholder_section(full_text)

    # Deduplicate by title + offset (keeping first-seen order), then sort by
    # offset to enforce document order.
    unique = sorted(dict.fromkeys(headings), key=lambda item: item[1])

    text_bytes = full_text.encode("utf-8")
    total_bytes = len(text_bytes)
    sections: list[SectionNode] = []
    sibling_counts: dict[str, int] = defaultdict(int)
    for i, (title, byte_offset) in enumerate(unique):
        slug = _safe_slug(title)
        sibling_counts[slug] += 1
        count = sibling_counts[slug]
        # Repeated titles get a numeric suffix from the second occurrence on.
        section_id = slug if count == 1 else f"{slug}-{count}"
        end = unique[i + 1][1] if i + 1 < len(unique) else total_bytes
        sections.append(
            SectionNode(
                id=section_id,
                title=title,
                level=1,
                parent=None,
                children=(),
                span=Span(start=byte_offset, end=end),
                path=(title,),
                raw_text=text_bytes[byte_offset:end].decode("utf-8", errors="replace"),
            )
        )
    return sections
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _placeholder_section(full_text: str) -> list[SectionNode]:
|
|
325
|
+
"""When extraction yields nothing useful, return a single virtual section."""
|
|
326
|
+
text_bytes = full_text.encode("utf-8")
|
|
327
|
+
if not text_bytes:
|
|
328
|
+
return []
|
|
329
|
+
return [
|
|
330
|
+
SectionNode(
|
|
331
|
+
id="document",
|
|
332
|
+
title="Document",
|
|
333
|
+
level=1,
|
|
334
|
+
parent=None,
|
|
335
|
+
children=(),
|
|
336
|
+
span=Span(start=0, end=len(text_bytes)),
|
|
337
|
+
path=("Document",),
|
|
338
|
+
raw_text=full_text,
|
|
339
|
+
)
|
|
340
|
+
]
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
# ---------------------------------------------------------------------------
|
|
344
|
+
# Helpers
|
|
345
|
+
# ---------------------------------------------------------------------------
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def _safe_slug(text: str) -> str:
    """Slugify ``text``, falling back to ``"section"`` when the slug is empty."""
    candidate = slugify(text)
    return candidate if candidate else "section"
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _slug_or_raise(text: str, *, ctx: str) -> str:
    """Slugify ``text`` or fail loudly.

    ``ctx`` names where ``text`` came from and is only used in the error
    message.

    Raises:
        ParseError: If slugifying ``text`` produces an empty string.
    """
    candidate = slugify(text)
    if candidate:
        return candidate
    msg = f"could not derive a slug from {ctx}: {text!r}"
    raise ParseError(msg)
|