docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/ingest/pdf.py ADDED
@@ -0,0 +1,357 @@
1
+ """PDF parser (pymupdf baseline).
2
+
3
+ Two extraction paths:
4
+
5
+ 1. **Outline-based** (preferred). When the PDF carries an outline /
6
+ bookmarks (``doc.get_toc()``), use it directly: each entry becomes a
7
+ ``SectionNode`` with the same hierarchical slug convention as the
8
+ Markdown parser.
9
+ 2. **Heuristic fallback**. When no outline exists, look at text spans
10
+ with their font sizes; treat blocks whose size exceeds 1.3x the
11
+ median block size as level-1 headings (no nesting). The fallback is
12
+ honest about its limits — for serious PDFs you'll want to add a
13
+ table-of-contents (``pdftk update_info`` or any PDF editor) before
14
+ indexing.
15
+
16
+ Spans use the same convention as ``MarkdownParser``: byte offsets into a
17
+ canonicalized full-document text built by concatenating per-page text
18
+ with ``\\n\\n`` separators. ``source_path`` is preserved for traceability.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import hashlib
24
+ import statistics
25
+ from collections import defaultdict
26
+ from datetime import UTC, datetime
27
+ from pathlib import Path
28
+ from typing import Any
29
+
30
+ import fitz
31
+ from slugify import slugify
32
+
33
+ from cairn import __version__
34
+ from cairn.core.errors import ParseError
35
+ from cairn.core.types import Document, SectionNode, Span
36
+
37
+
38
class PdfParser:
    """pymupdf-backed PDF parser.

    Sections are built from the PDF outline when ``get_toc`` returns one,
    and from the font-size heuristic otherwise.
    """

    name = "pdf"
    extensions: tuple[str, ...] = (".pdf",)

    def parse(
        self,
        source: Path | bytes | str,
        *,
        doc_id: str | None = None,
    ) -> Document:
        """Parse a PDF into a ``Document``.

        Args:
            source: A ``Path`` to a PDF file, or the PDF content as ``bytes``.
                A ``str`` is UTF-8 encoded and handled like ``bytes``.
            doc_id: Document identifier. Optional for ``Path`` sources (the
                slugified filename stem is used instead); required otherwise.

        Returns:
            The parsed ``Document``. Its ``source_hash`` is the SHA-256 of
            the extracted text, not of the raw PDF bytes.

        Raises:
            ParseError: If the PDF cannot be opened or no document id can be
                determined.
        """
        source_path, resolved_doc_id, doc = self._open(source, doc_id)

        # The handle is only needed while extracting; always release it.
        try:
            pages_text, page_offsets, full_text = _extract_text(doc)
            toc = doc.get_toc(simple=False) or doc.get_toc() or []
            if toc:
                sections = _sections_from_outline(toc, pages_text, page_offsets, full_text)
            else:
                sections = _sections_from_heuristic(doc, pages_text, page_offsets, full_text)
        finally:
            doc.close()

        text_bytes = full_text.encode("utf-8")
        return Document(
            id=resolved_doc_id,
            source_path=source_path,
            source_hash=hashlib.sha256(text_bytes).hexdigest(),
            sections=tuple(sections),
            indexed_at=datetime.now(UTC),
            cairn_version=__version__,
        )

    @staticmethod
    def _open(
        source: Path | bytes | str,
        doc_id: str | None,
    ) -> tuple[Path, str, Any]:
        """Open ``source`` and return ``(source_path, doc_id, document)``.

        The caller owns the returned document and must close it. When this
        method raises, no document is left open.

        Raises:
            ParseError: If opening fails, if a non-path source comes without
                a ``doc_id``, or if no slug can be derived from the filename.
        """
        if isinstance(source, Path):
            try:
                doc = fitz.open(str(source))
            except Exception as exc:  # pymupdf raises a variety of errors
                msg = f"could not open PDF: {source}"
                raise ParseError(msg, details={"path": str(source)}) from exc
            try:
                resolved_id = doc_id or _slug_or_raise(source.stem, ctx="filename stem")
            except Exception:
                # Do not leak the already-opened handle when the id is unusable.
                doc.close()
                raise
            return source, resolved_id, doc

        if doc_id is None:
            msg = "doc_id is required when source is not a path"
            raise ParseError(msg)

        if isinstance(source, str):
            source = source.encode("utf-8")
        try:
            doc = fitz.open(stream=source, filetype="pdf")
        except Exception as exc:
            msg = "could not open PDF from bytes"
            raise ParseError(msg) from exc
        return Path(f"<in-memory:{doc_id}>"), doc_id, doc
98
+
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # Text + offsets
102
+ # ---------------------------------------------------------------------------
103
+
104
+
105
def _extract_text(doc: Any) -> tuple[list[str], list[int], str]:
    """Collect the text of every page plus where each page starts.

    Returns a triple ``(pages, offsets, full_text)``: the text of each page
    (empty string when a page yields nothing), the UTF-8 byte offset of each
    page inside ``full_text``, and ``full_text`` itself, which is the pages
    joined by a blank line.
    """
    separator = "\n\n"
    separator_size = len(separator.encode("utf-8"))

    page_texts: list[str] = []
    page_starts: list[int] = []
    cursor = 0
    for page in doc:
        text = page.get_text("text") or ""
        page_texts.append(text)
        page_starts.append(cursor)
        # Next page begins after this page's bytes and one separator.
        cursor += len(text.encode("utf-8")) + separator_size

    return page_texts, page_starts, separator.join(page_texts)
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # Outline-based section building
121
+ # ---------------------------------------------------------------------------
122
+
123
+
124
def _sections_from_outline(
    toc: list[Any],
    pages_text: list[str],
    page_offsets: list[int],
    full_text: str,
) -> list[SectionNode]:
    """Build sections from a PDF outline.

    ``toc`` is the output of ``doc.get_toc(simple=False)`` — each row is
    ``[level, title, page, dest_dict]`` (1-indexed page). The ``simple``
    variant drops the dest dict but is otherwise identical.

    Each section starts at the byte offset of the page its entry points to.
    Its span runs to the start of the next entry at the same or a shallower
    level (or to the end of the text); its ``raw_text`` stops earlier, at the
    start of the very next entry, so descendant bodies are excluded.

    Returns an empty list when the outline has no usable entries or the
    document has no pages.
    """
    entries = _normalize_toc(toc)
    if not entries or not pages_text:
        return []

    # Encode once; every section slices this same buffer.
    text_bytes = full_text.encode("utf-8")
    total_bytes = len(text_bytes)
    last_page = len(pages_text) - 1

    # Byte offset where each entry's page starts; page numbers are clamped
    # into the valid range.
    starts: list[int] = [
        page_offsets[max(0, min(entry["page"] - 1, last_page))] for entry in entries
    ]

    # Territory end: where the next entry at the same-or-shallower level begins.
    ends: list[int] = []
    for i, entry in enumerate(entries):
        end = next(
            (
                starts[j]
                for j in range(i + 1, len(entries))
                if entries[j]["level"] <= entry["level"]
            ),
            total_bytes,
        )
        # An outline whose pages are not in ascending order must not yield
        # a span with end < start.
        ends.append(max(end, starts[i]))

    # Hierarchical slug IDs + parent/child links — same convention as Markdown.
    metadata: list[tuple[str, str, int, str | None, tuple[str, ...]]] = []
    stack: list[tuple[int, str, str]] = []  # (level, section_id, title)
    sibling_counts: dict[tuple[str, str], int] = defaultdict(int)

    for entry in entries:
        level = entry["level"]
        title = entry["title"]
        # Drop ancestors that are not strictly shallower than this entry.
        while stack and stack[-1][0] >= level:
            stack.pop()
        parent_id = stack[-1][1] if stack else None
        slug = _safe_slug(title)
        key = (parent_id or "", slug)
        sibling_counts[key] += 1
        count = sibling_counts[key]
        # Repeated titles under one parent get a numeric suffix from 2 upward.
        unique_slug = slug if count == 1 else f"{slug}-{count}"
        section_id = f"{parent_id}/{unique_slug}" if parent_id else unique_slug
        path = (*(t for _, _, t in stack), title)
        metadata.append((section_id, title, level, parent_id, path))
        stack.append((level, section_id, title))

    children_map: dict[str, list[str]] = defaultdict(list)
    for sid, _t, _l, parent_id, _p in metadata:
        if parent_id is not None:
            children_map[parent_id].append(sid)

    sections: list[SectionNode] = []
    for idx, (section_id, title, level, parent_id, path) in enumerate(metadata):
        span_start = starts[idx]
        span_end = ends[idx]
        # raw_text excludes descendant bodies: stop at the next ANY-level entry.
        raw_end = span_end
        if idx + 1 < len(entries):
            raw_end = min(raw_end, starts[idx + 1])
        raw_text = text_bytes[span_start:raw_end].decode("utf-8", errors="replace")

        sections.append(
            SectionNode(
                id=section_id,
                title=title,
                level=min(6, max(1, level)),
                parent=parent_id,
                children=tuple(children_map.get(section_id, ())),
                span=Span(start=span_start, end=span_end),
                path=path,
                raw_text=raw_text,
            )
        )
    return sections
210
+
211
+
212
def _normalize_toc(toc: list[Any]) -> list[dict[str, Any]]:
    """Convert pymupdf TOC rows into ``{level, title, page}`` dicts.

    Rows that are empty or have fewer than three items are ignored, as are
    rows whose title is blank after stripping whitespace. Anything past the
    third item of a row is discarded.
    """
    normalized: list[dict[str, Any]] = []
    for row in toc:
        if row and len(row) >= 3:
            raw_level, raw_title, raw_page = row[0], row[1], row[2]
            entry = {
                "level": int(raw_level),
                "title": str(raw_title).strip(),
                "page": int(raw_page),
            }
            if entry["title"]:
                normalized.append(entry)
    return normalized
225
+
226
+
227
+ # ---------------------------------------------------------------------------
228
+ # Heuristic fallback (no outline)
229
+ # ---------------------------------------------------------------------------
230
+
231
+
232
def _sections_from_heuristic(
    doc: Any,
    pages_text: list[str],
    page_offsets: list[int],
    full_text: str,
) -> list[SectionNode]:
    """Best-effort: take lines whose font size reaches 1.3x the median as H1.

    The fallback only emits level-1 sections. Authors with no PDF outline
    should add one before indexing serious documents.

    A line is a heading candidate when its largest span size is at least
    1.3 times the median size of all non-blank spans, its joined text is
    non-empty and at most 200 characters, and that text can be located in
    the page's plain text. Each section runs from its heading to the next
    heading (or to the end of the text). When nothing qualifies, the result
    of ``_placeholder_section`` is returned instead.
    """
    span_sizes: list[float] = []
    # (page index, largest span size on the line, joined line text)
    candidates: list[tuple[int, float, str]] = []

    # Single pass over the layout: gather the size statistics and the
    # possible heading lines together, so each page's "dict" extraction is
    # computed only once.
    for page_idx, page in enumerate(doc):
        for block in page.get_text("dict").get("blocks", []):
            for line in block.get("lines", []):
                spans = line.get("spans", [])
                if not spans:
                    continue
                for span in spans:
                    size = float(span.get("size", 0.0))
                    if size > 0 and span.get("text", "").strip():
                        span_sizes.append(size)
                title = "".join(s.get("text", "") for s in spans).strip()
                if not title or len(title) > 200:
                    continue
                max_size = max(float(s.get("size", 0.0)) for s in spans)
                candidates.append((page_idx, max_size, title))

    if not span_sizes:
        # Document has no extractable text. Return a single placeholder section.
        return _placeholder_section(full_text)

    threshold = statistics.median(span_sizes) * 1.3

    headings: list[tuple[str, int]] = []  # (title, byte_offset)
    # Per-page search position, so a heading text that repeats on one page
    # resolves to its next occurrence instead of always to the first one.
    search_from: dict[int, int] = defaultdict(int)
    for page_idx, max_size, title in candidates:
        if max_size < threshold:
            continue
        page_text = pages_text[page_idx]
        cursor = search_from[page_idx]
        offset_in_page = page_text.find(title, cursor)
        if offset_in_page < 0:
            # Layout order and plain-text order may disagree; retry from
            # the top of the page.
            offset_in_page = page_text.find(title)
        if offset_in_page < 0:
            continue
        search_from[page_idx] = max(cursor, offset_in_page + len(title))
        byte_offset = page_offsets[page_idx] + len(
            page_text[:offset_in_page].encode("utf-8")
        )
        headings.append((title, byte_offset))

    if not headings:
        return _placeholder_section(full_text)

    # Deduplicate by title + offset; sort by offset to enforce document order.
    unique = sorted(dict.fromkeys(headings), key=lambda item: item[1])

    text_bytes = full_text.encode("utf-8")
    total_bytes = len(text_bytes)
    sections: list[SectionNode] = []
    sibling_counts: dict[str, int] = defaultdict(int)
    for i, (title, byte_offset) in enumerate(unique):
        slug = _safe_slug(title)
        sibling_counts[slug] += 1
        count = sibling_counts[slug]
        # Repeated titles get a numeric suffix from 2 upward.
        section_id = slug if count == 1 else f"{slug}-{count}"
        end = unique[i + 1][1] if i + 1 < len(unique) else total_bytes
        sections.append(
            SectionNode(
                id=section_id,
                title=title,
                level=1,
                parent=None,
                children=(),
                span=Span(start=byte_offset, end=end),
                path=(title,),
                raw_text=text_bytes[byte_offset:end].decode("utf-8", errors="replace"),
            )
        )
    return sections
322
+
323
+
324
def _placeholder_section(full_text: str) -> list[SectionNode]:
    """Wrap the whole text in one virtual level-1 section.

    Used when no real structure could be extracted. Returns an empty list
    when ``full_text`` encodes to zero bytes; otherwise a one-element list
    whose section spans the entire UTF-8 encoded text.
    """
    byte_length = len(full_text.encode("utf-8"))
    if byte_length == 0:
        return []

    label = "Document"
    whole_document = SectionNode(
        id="document",
        title=label,
        level=1,
        parent=None,
        children=(),
        span=Span(start=0, end=byte_length),
        path=(label,),
        raw_text=full_text,
    )
    return [whole_document]
341
+
342
+
343
+ # ---------------------------------------------------------------------------
344
+ # Helpers
345
+ # ---------------------------------------------------------------------------
346
+
347
+
348
def _safe_slug(text: str) -> str:
    """Slugify ``text``, falling back to ``"section"`` when the slug is empty."""
    candidate = slugify(text)
    return candidate if candidate else "section"
350
+
351
+
352
def _slug_or_raise(text: str, *, ctx: str) -> str:
    """Slugify ``text`` or fail loudly.

    Args:
        text: The string to turn into a slug.
        ctx: Short description of where ``text`` came from; it is only used
            in the error message.

    Raises:
        ParseError: If slugifying ``text`` yields an empty string.
    """
    if slug := slugify(text):
        return slug
    raise ParseError(f"could not derive a slug from {ctx}: {text!r}")