docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/index/xrefs.py ADDED
@@ -0,0 +1,195 @@
1
+ """XRefs sub-index — cross-references (X), the document graph.
2
+
3
+ The :class:`XRefBuilder` runs an :class:`cairn.xref.base.XRefExtractor`
4
+ over a Document (plus an optional Entities reader), deduplicates
5
+ ``(src, dst, kind)`` triples by keeping the highest-confidence span, and
6
+ writes ``refs.json``.
7
+
8
+ The :class:`XRefs` reader exposes outgoing/incoming/by-kind queries. Edges
9
+ are directed; a backward edge between two sections is its own record.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from collections import defaultdict
16
+ from collections.abc import Iterable, Iterator
17
+ from datetime import UTC, datetime
18
+ from pathlib import Path
19
+ from typing import Any, Final
20
+
21
+ from cairn.core.errors import IndexBuildError, IndexNotFoundError
22
+ from cairn.core.types import Document, Span, XRef, XRefKind
23
+ from cairn.index.entities import Entities
24
+ from cairn.xref.base import ExtractionEdge, XRefExtractor
25
+
26
+ XREFS_FILENAME: Final = "refs.json"
27
+ XREFS_FORMAT_VERSION: Final = 1
28
+
29
+
30
class XRefBuilder:
    """Extract cross-reference edges for a document and persist ``refs.json``."""

    def __init__(self, extractor: XRefExtractor) -> None:
        self.extractor = extractor

    async def build(
        self,
        document: Document,
        *,
        out_dir: Path,
        entities: Entities | None = None,
    ) -> Path:
        """Run the extractor over ``document`` and write the aggregated edges.

        Args:
            document: Document handed to the extractor.
            out_dir: Directory receiving ``refs.json``; created if missing.
            entities: Optional entities reader forwarded to the extractor.

        Returns:
            Path of the file that was written.
        """
        out_dir.mkdir(parents=True, exist_ok=True)
        target = out_dir / XREFS_FILENAME

        extracted = await self.extractor.extract(document, entities=entities)
        deduplicated = _aggregate(extracted)
        generated_at = datetime.now(UTC).isoformat()

        payload: dict[str, Any] = {
            "format_version": XREFS_FORMAT_VERSION,
            "doc_id": document.id,
            "extractor": self.extractor.name,
            "generated_at": generated_at,
            "refs": [_xref_to_dict(ref) for ref in deduplicated],
        }

        # Pretty-printed JSON terminated by a single trailing newline.
        rendered = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
        target.write_text(rendered, encoding="utf-8")
        return target
62
+
63
+
64
class XRefs:
    """Read-only, in-memory view over the edges stored in ``refs.json``."""

    def __init__(
        self,
        refs: tuple[XRef, ...],
        *,
        doc_id: str,
        extractor: str,
    ) -> None:
        self._all = refs
        self.doc_id = doc_id
        self.extractor = extractor

        # Adjacency indexes keyed by section id; each bucket keeps ``refs`` order.
        outgoing: dict[str, list[XRef]] = defaultdict(list)
        incoming: dict[str, list[XRef]] = defaultdict(list)
        for edge in refs:
            outgoing[edge.src].append(edge)
            incoming[edge.dst].append(edge)
        self._outgoing = outgoing
        self._incoming = incoming

    @classmethod
    def load(cls, doc_dir: Path) -> XRefs:
        """Load ``refs.json`` from ``doc_dir``.

        Raises:
            IndexNotFoundError: If the file does not exist or its
                ``format_version`` differs from ``XREFS_FORMAT_VERSION``.
        """
        path = doc_dir / XREFS_FILENAME
        if not path.exists():
            msg = f"refs.json not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(path)})

        payload = json.loads(path.read_text(encoding="utf-8"))

        found_version = payload.get("format_version")
        if found_version != XREFS_FORMAT_VERSION:
            msg = (
                f"unsupported refs format version: {found_version!r} "
                f"(expected {XREFS_FORMAT_VERSION})"
            )
            raise IndexNotFoundError(msg, details={"path": str(path)})

        loaded = tuple(map(_xref_from_dict, payload["refs"]))
        return cls(loaded, doc_id=payload["doc_id"], extractor=payload["extractor"])

    # -- queries -----------------------------------------------------------

    def __len__(self) -> int:
        return len(self._all)

    def __iter__(self) -> Iterator[XRef]:
        return iter(self._all)

    def outgoing_from(
        self, section_id: str, *, kinds: tuple[XRefKind, ...] | None = None
    ) -> list[XRef]:
        """Edges leaving ``section_id``, highest confidence first (ties by ``dst``)."""
        # ``.get`` avoids creating an empty bucket in the defaultdict on a miss.
        matches = (
            edge
            for edge in self._outgoing.get(section_id, ())
            if kinds is None or edge.kind in kinds
        )
        return sorted(matches, key=lambda edge: (-edge.confidence, edge.dst))

    def incoming_to(
        self, section_id: str, *, kinds: tuple[XRefKind, ...] | None = None
    ) -> list[XRef]:
        """Edges arriving at ``section_id``, highest confidence first (ties by ``src``)."""
        matches = (
            edge
            for edge in self._incoming.get(section_id, ())
            if kinds is None or edge.kind in kinds
        )
        return sorted(matches, key=lambda edge: (-edge.confidence, edge.src))

    def by_kind(self, kind: XRefKind) -> list[XRef]:
        """All edges of ``kind``, in stored order."""
        return [edge for edge in self._all if edge.kind == kind]
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Aggregation
136
+ # ---------------------------------------------------------------------------
137
+
138
+
139
def _aggregate(edges: Iterable[ExtractionEdge]) -> list[XRef]:
    """Deduplicate ``(src, dst, kind)``; keep the highest-confidence span.

    Self-loops (``src == dst``) are dropped. On equal confidence the edge seen
    first wins. The result is ordered by first appearance of each triple.

    Args:
        edges: Raw edges emitted by an extractor.

    Returns:
        One ``XRef`` per distinct ``(src, dst, kind)`` triple.

    Raises:
        IndexBuildError: If any edge has an empty ``src`` or ``dst``.
    """
    best: dict[tuple[str, str, str], XRef] = {}

    for edge in edges:
        # Validate before the self-loop test: two empty endpoints compare equal
        # and would otherwise be discarded silently as a self-loop.
        if not edge.src or not edge.dst:
            msg = "extractor emitted an edge with empty endpoint id"
            raise IndexBuildError(msg)
        if edge.src == edge.dst:
            continue

        key = (edge.src, edge.dst, edge.kind)
        current = best.get(key)
        if current is None or edge.confidence > current.confidence:
            # Re-assigning an existing key keeps its original position, so the
            # dict itself preserves first-seen order.
            best[key] = XRef(
                src=edge.src,
                dst=edge.dst,
                kind=edge.kind,
                confidence=edge.confidence,
                span=edge.span,
            )

    return list(best.values())
171
+
172
+
173
+ # ---------------------------------------------------------------------------
174
+ # Serialization
175
+ # ---------------------------------------------------------------------------
176
+
177
+
178
+ def _xref_to_dict(r: XRef) -> dict[str, Any]:
179
+ return {
180
+ "src": r.src,
181
+ "dst": r.dst,
182
+ "kind": r.kind,
183
+ "confidence": r.confidence,
184
+ "span": {"start": r.span.start, "end": r.span.end},
185
+ }
186
+
187
+
188
def _xref_from_dict(d: dict[str, Any]) -> XRef:
    """Rebuild an ``XRef`` from one entry of the ``refs`` list in ``refs.json``."""
    src, dst, kind, confidence = (d[k] for k in ("src", "dst", "kind", "confidence"))
    bounds = d["span"]
    return XRef(
        src=src,
        dst=dst,
        kind=kind,
        confidence=confidence,
        span=Span(start=bounds["start"], end=bounds["end"]),
    )
@@ -0,0 +1,36 @@
1
+ """Ingestion layer — parsers from source formats into the canonical Document AST."""
2
+
3
+ from cairn.ingest.base import Parser
4
+ from cairn.ingest.markdown import MarkdownParser
5
+ from cairn.ingest.markitdown import MarkItDownParser
6
+ from cairn.ingest.pdf import PdfParser
7
+
8
+ __all__ = ["MarkItDownParser", "MarkdownParser", "Parser", "PdfParser"]
9
+
10
+
11
def parser_for_path(path) -> Parser:  # type: ignore[no-untyped-def]
    """Return a parser instance chosen by the lower-cased suffix of ``path``.

    Raises :class:`cairn.core.errors.ConfigError` for unknown extensions.
    """
    from pathlib import Path

    from cairn.core.errors import ConfigError

    target = Path(path)
    suffix = target.suffix.lower()

    # First match wins, in this fixed order.
    for parser_cls in (MarkdownParser, PdfParser, MarkItDownParser):
        if suffix in parser_cls.extensions:
            return parser_cls()

    msg = f"no parser registered for extension {suffix!r}"
    raise ConfigError(msg, details={"path": str(target), "extension": suffix})
30
+
31
+
32
def supported_extensions() -> frozenset[str]:
    """Return the union of extensions claimed by the three ingest parsers."""
    claimed: set[str] = set()
    for parser_cls in (MarkdownParser, PdfParser, MarkItDownParser):
        claimed.update(parser_cls.extensions)
    return frozenset(claimed)
cairn/ingest/base.py ADDED
@@ -0,0 +1,46 @@
1
+ """Parser protocol — the contract every ingestion plugin must satisfy."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Protocol, runtime_checkable
7
+
8
+ from cairn.core.types import Document
9
+
10
+
11
@runtime_checkable
class Parser(Protocol):
    """Structural contract for a source-format parser.

    Implementations live in `cairn.plugins.*` or `cairn.ingest.*`. Each one is
    expected to keep the heading hierarchy intact, produce stable slug-based
    section IDs, and fill in byte spans that point into the original source.

    The complete list of hard rules is in ARCHITECTURE.md §2.1.
    """

    name: str
    """Config identifier of the parser (e.g. ``markdown``, ``pdf``)."""

    extensions: tuple[str, ...]
    """Claimed file extensions, each with its leading dot, e.g. ``(".md",)``."""

    def parse(
        self,
        source: Path | bytes | str,
        *,
        doc_id: str | None = None,
    ) -> Document:
        """Turn ``source`` into a canonical :class:`Document`.

        Args:
            source: A path, raw bytes, or text.
            doc_id: Explicit document identifier. Mandatory when ``source``
                is bytes or text; when ``source`` is a path and ``doc_id`` is
                omitted, the parser derives it from the filename stem.

        Returns:
            A fully populated :class:`Document` with section tree and spans.
        """
        ...
@@ -0,0 +1,244 @@
1
+ """Markdown parser — Markdown source → canonical Document AST.
2
+
3
+ Preserves heading hierarchy, generates stable hierarchical slug-based section
4
+ IDs, computes byte spans, and emits ``raw_text`` that excludes descendant
5
+ section bodies (per ARCHITECTURE.md §2.2).
6
+
7
+ Front-matter (YAML, TOML) is parsed and discarded. Content preceding the first
8
+ heading is discarded; if a document has no headings, an empty section list is
9
+ returned (callers may choose to treat this as a parse error).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+ from collections import defaultdict
16
+ from datetime import UTC, datetime
17
+ from pathlib import Path
18
+
19
+ from markdown_it import MarkdownIt
20
+ from markdown_it.token import Token
21
+ from mdit_py_plugins.front_matter import front_matter_plugin
22
+ from slugify import slugify
23
+
24
+ from cairn import __version__
25
+ from cairn.core.errors import ParseError
26
+ from cairn.core.types import Document, SectionNode, Span
27
+
28
+
29
class MarkdownParser:
    """CommonMark-compliant Markdown parser with front-matter and tables."""

    name = "markdown"
    extensions: tuple[str, ...] = (".md", ".markdown", ".mdown", ".mkd")

    def __init__(self) -> None:
        # CommonMark preset with raw HTML disabled, plus front-matter and tables.
        md = MarkdownIt("commonmark", {"html": False})
        md.use(front_matter_plugin)
        md.enable(["table"])
        self._md = md

    def parse(
        self,
        source: Path | bytes | str,
        *,
        doc_id: str | None = None,
    ) -> Document:
        """Parse Markdown into a :class:`Document`.

        Args:
            source: A path to read as UTF-8, UTF-8 encoded bytes, or text.
            doc_id: Explicit document id. Required for bytes/text sources; for
                a path it defaults to the slug of the filename stem.

        Returns:
            A document whose ``source_hash`` is the SHA-256 of the UTF-8
            encoded text and whose sections are built from the headings.

        Raises:
            ParseError: If the source cannot be read or decoded as UTF-8, if
                ``doc_id`` is missing for an in-memory source, or if no slug
                can be derived from the filename stem.
        """
        source_path, text, derived_doc_id = self._resolve_source(source, doc_id)

        text_bytes = text.encode("utf-8")
        line_offsets = _compute_line_offsets(text_bytes)
        tokens = self._md.parse(text)
        headings = _extract_headings(tokens)
        sections = _build_sections(headings, text_bytes, line_offsets)

        return Document(
            id=derived_doc_id,
            source_path=source_path,
            source_hash=hashlib.sha256(text_bytes).hexdigest(),
            sections=tuple(sections),
            indexed_at=datetime.now(UTC),
            cairn_version=__version__,
        )

    @staticmethod
    def _resolve_source(
        source: Path | bytes | str,
        doc_id: str | None,
    ) -> tuple[Path, str, str]:
        """Normalize ``source`` into ``(source_path, text, doc_id)``.

        In-memory sources get the placeholder path ``<in-memory:{doc_id}>``.

        Raises:
            ParseError: On read failure, invalid UTF-8, a missing ``doc_id``
                for an in-memory source, or an un-sluggable filename stem.
        """
        if isinstance(source, Path):
            # NOTE(review): read_text() opens in text mode, so CRLF line endings
            # are translated to "\n" before hashing and span computation. Confirm
            # that spans are never applied to the raw on-disk bytes.
            try:
                text = source.read_text(encoding="utf-8")
            except OSError as exc:
                msg = f"could not read source file: {source}"
                raise ParseError(msg, details={"path": str(source)}) from exc
            except UnicodeDecodeError as exc:
                # Surface encoding problems as the package's own error type
                # instead of leaking a raw UnicodeDecodeError.
                msg = f"source file is not valid UTF-8: {source}"
                raise ParseError(msg, details={"path": str(source)}) from exc
            resolved_id = doc_id or _slug_or_raise(source.stem, ctx="filename stem")
            return source, text, resolved_id

        if doc_id is None:
            msg = "doc_id is required when source is not a path"
            raise ParseError(msg)

        if isinstance(source, bytes):
            try:
                text = source.decode("utf-8")
            except UnicodeDecodeError as exc:
                msg = "source bytes are not valid UTF-8"
                raise ParseError(msg, details={"doc_id": doc_id}) from exc
        else:
            text = source
        return Path(f"<in-memory:{doc_id}>"), text, doc_id
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Internal helpers
88
+ # ---------------------------------------------------------------------------
89
+
90
+
91
+ def _compute_line_offsets(text_bytes: bytes) -> list[int]:
92
+ """Return byte offsets where each line begins.
93
+
94
+ The list has length ``num_lines + 1``: the final element equals
95
+ ``len(text_bytes)``, acting as a virtual one-past-the-end line start so
96
+ callers can address EOF uniformly.
97
+ """
98
+ offsets = [0]
99
+ for i, byte in enumerate(text_bytes):
100
+ if byte == 0x0A: # newline
101
+ offsets.append(i + 1)
102
+ offsets.append(len(text_bytes))
103
+ return offsets
104
+
105
+
106
+ def _heading_title(inline_tok: Token) -> str:
107
+ """Extract plain text title from a heading's inline token.
108
+
109
+ Concatenates the text content of leaf ``text`` children, dropping markup.
110
+ Falls back to the raw inline content if no children are present.
111
+ """
112
+ if inline_tok.children is None:
113
+ return inline_tok.content
114
+ parts: list[str] = []
115
+ for child in inline_tok.children:
116
+ if child.type in ("text", "code_inline"):
117
+ parts.append(child.content)
118
+ return "".join(parts).strip() or inline_tok.content.strip()
119
+
120
+
121
def _extract_headings(tokens: list[Token]) -> list[tuple[int, str, int, int]]:
    """Collect ``(level, title, line_start, line_end_excl)`` for every heading.

    A ``heading_open`` token is skipped when it has no source map or when the
    token right after it is not an ``inline`` token.
    """
    found: list[tuple[int, str, int, int]] = []
    # Pair each token with its successor; the last token is paired with None.
    for tok, follower in zip(tokens, [*tokens[1:], None]):
        if tok.type != "heading_open" or tok.map is None:
            continue
        level = int(tok.tag[1])  # tag is "h1".."h6"; second char is the level
        line_start, line_end_excl = tok.map
        if follower is None or follower.type != "inline":
            continue
        found.append((level, _heading_title(follower), line_start, line_end_excl))
    return found
135
+
136
+
137
def _slug_or_raise(text: str, *, ctx: str) -> str:
    """Slugify ``text``; raise ``ParseError`` naming ``ctx`` if the slug is empty."""
    if slug := slugify(text):
        return slug
    msg = f"could not derive a slug from {ctx}: {text!r}"
    raise ParseError(msg)
143
+
144
+
145
def _safe_slug(text: str) -> str:
    """Slugify ``text``, substituting ``section`` when the slug comes out empty."""
    slug = slugify(text)
    return slug if slug else "section"
148
+
149
+
150
def _build_sections(
    headings: list[tuple[int, str, int, int]],
    text_bytes: bytes,
    line_offsets: list[int],
) -> list[SectionNode]:
    """Assemble SectionNode objects from heading metadata.

    For every heading, in document order:

    * ``span`` runs from the heading's first line to the next heading whose
      level is the same or shallower (or to EOF).
    * ``raw_text`` runs from the line after the heading to the next heading
      of any level, so it excludes descendant section bodies.
    * ``id`` is ``parent_id/slug`` (or just ``slug`` at the root). A repeated
      slug under one parent gets a ``-N`` suffix, and the suffix is advanced
      until the id is unused in the document.

    Args:
        headings: ``(level, title, line_start, line_end_excl)`` tuples in
            document order.
        text_bytes: UTF-8 encoded source the line numbers refer to.
        line_offsets: Byte offset of each line start, with an EOF sentinel as
            the final element.

    Returns:
        One ``SectionNode`` per heading, in document order; empty if there
        are no headings.
    """
    if not headings:
        return []

    n = len(headings)
    total_bytes = len(text_bytes)
    eof_line = len(line_offsets) - 1

    # Territory: end at next heading with level <= current_level (else EOF).
    territory_end_line: list[int] = []
    for i, (level, _, _, _) in enumerate(headings):
        end_line = eof_line
        for j in range(i + 1, n):
            if headings[j][0] <= level:
                end_line = headings[j][2]
                break
        territory_end_line.append(end_line)

    # Raw text end: next heading at ANY level (else territory end).
    raw_text_end_line = [
        headings[i + 1][2] if i + 1 < n else territory_end_line[i] for i in range(n)
    ]

    # Hierarchical IDs, parents, paths.
    metadata: list[tuple[str, str | None, tuple[str, ...]]] = []
    stack: list[tuple[int, str, str]] = []  # (level, id, title)
    sibling_counters: dict[tuple[str, str], int] = defaultdict(int)
    used_ids: set[str] = set()

    for level, title, _line_start, _line_end in headings:
        # Pop back to the nearest strictly-shallower ancestor.
        while stack and stack[-1][0] >= level:
            stack.pop()

        parent_id = stack[-1][1] if stack else None
        slug = _safe_slug(title)
        key = (parent_id or "", slug)

        # A generated "-N" suffix can coincide with a sibling's natural slug
        # (e.g. "Foo", "Foo", "Foo 2" -> "foo", "foo-2", "foo-2"), so keep
        # advancing the counter until the id is unused.
        while True:
            sibling_counters[key] += 1
            count = sibling_counters[key]
            unique_slug = slug if count == 1 else f"{slug}-{count}"
            section_id = f"{parent_id}/{unique_slug}" if parent_id else unique_slug
            if section_id not in used_ids:
                break
        used_ids.add(section_id)

        path = (*(t for _, _, t in stack), title)
        metadata.append((section_id, parent_id, path))
        stack.append((level, section_id, title))

    children_map: dict[str, list[str]] = defaultdict(list)
    for sid, parent_id, _path in metadata:
        if parent_id is not None:
            children_map[parent_id].append(sid)

    sections: list[SectionNode] = []
    for heading, meta, span_end_line, raw_end_line in zip(
        headings, metadata, territory_end_line, raw_text_end_line
    ):
        level, title, line_start, line_end_excl = heading
        section_id, parent_id, path = meta

        span = Span(
            start=_line_to_byte(line_offsets, line_start, total_bytes),
            end=_line_to_byte(line_offsets, span_end_line, total_bytes),
        )

        # Body text starts on the line after the heading itself.
        raw_start = _line_to_byte(line_offsets, line_end_excl, total_bytes)
        raw_end = _line_to_byte(line_offsets, raw_end_line, total_bytes)
        raw_text = text_bytes[raw_start:raw_end].decode("utf-8")

        sections.append(
            SectionNode(
                id=section_id,
                title=title,
                level=level,
                parent=parent_id,
                children=tuple(children_map.get(section_id, ())),
                span=span,
                path=path,
                raw_text=raw_text,
            )
        )

    return sections
236
+
237
+
238
+ def _line_to_byte(line_offsets: list[int], line: int, total_bytes: int) -> int:
239
+ """Resolve a 0-indexed line number to its starting byte offset."""
240
+ if line >= len(line_offsets):
241
+ return total_bytes
242
+ if line < 0:
243
+ return 0
244
+ return line_offsets[line]
@@ -0,0 +1,145 @@
1
+ """Optional MarkItDown-backed parser for office/data/web document formats.
2
+
3
+ MarkItDown converts many source formats into Markdown intended for LLM and
4
+ text-analysis pipelines. Cairn uses it as a local-file conversion layer, then
5
+ delegates structure extraction to :class:`MarkdownParser`.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ from importlib.metadata import PackageNotFoundError, version
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ from slugify import slugify
16
+
17
+ from cairn.core.errors import ParseError
18
+ from cairn.core.types import Document
19
+ from cairn.ingest.markdown import MarkdownParser
20
+
21
+
22
class MarkItDownParser:
    """Parser that converts a local file to Markdown, then parses that Markdown."""

    name = "markitdown"
    extensions: tuple[str, ...] = (
        ".docx",
        ".pptx",
        ".xlsx",
        ".xls",
        ".html",
        ".htm",
        ".csv",
        ".json",
        ".xml",
        ".epub",
        ".msg",
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".wav",
        ".mp3",
        ".zip",
    )

    def __init__(self) -> None:
        self._markdown = MarkdownParser()

    def parse(
        self,
        source: Path | bytes | str,
        *,
        doc_id: str | None = None,
    ) -> Document:
        """Convert ``source`` with MarkItDown and parse the resulting Markdown.

        The returned document has ``source_path`` set to the resolved path and
        ``source_hash`` set to the SHA-256 of the file bytes, the MarkItDown
        version and the converted Markdown, separated by NUL bytes.

        Raises:
            ParseError: If ``source`` is not a ``Path``, conversion fails or
                yields only whitespace, or the file cannot be read.
        """
        if not isinstance(source, Path):
            msg = "MarkItDownParser only accepts local file paths"
            raise ParseError(msg)

        source_path = source.resolve()
        stem = source_path.stem
        resolved_doc_id = doc_id or _slug_or_raise(stem, ctx="filename stem")

        markdown = self._convert_local(source_path)
        if not markdown.strip():
            msg = f"MarkItDown produced empty Markdown for: {source_path}"
            raise ParseError(msg, details={"path": str(source_path)})
        markdown = _ensure_heading(markdown, stem)

        parsed = self._markdown.parse(markdown, doc_id=resolved_doc_id)

        try:
            source_bytes = source_path.read_bytes()
        except OSError as exc:
            msg = f"could not read converted source file: {source_path}"
            raise ParseError(msg, details={"path": str(source_path)}) from exc

        # Feed the three parts incrementally; the digest equals hashing them
        # joined with a single NUL byte between each.
        digest = hashlib.sha256()
        digest.update(source_bytes)
        digest.update(b"\x00")
        digest.update(_markitdown_version().encode("utf-8"))
        digest.update(b"\x00")
        digest.update(markdown.encode("utf-8"))

        return parsed.model_copy(
            update={
                "source_path": source_path,
                "source_hash": digest.hexdigest(),
            }
        )

    @staticmethod
    def _convert_local(path: Path) -> str:
        """Run MarkItDown on ``path`` and return the Markdown text it produced.

        Raises:
            ParseError: If ``markitdown`` is not importable, conversion raises,
                or the result carries no string text.
        """
        try:
            from markitdown import MarkItDown
        except ImportError as exc:
            msg = (
                "MarkItDown support is not installed. Install with "
                "`pip install 'cairn[markitdown]'` or "
                "`uv pip install -e '.[markitdown]'`."
            )
            raise ParseError(msg, details={"path": str(path)}) from exc

        try:
            converter: Any = MarkItDown(enable_plugins=False)
            # Use ``convert_local`` when the converter has it, else ``convert``.
            method_name = "convert_local" if hasattr(converter, "convert_local") else "convert"
            result = getattr(converter, method_name)(str(path))
        except Exception as exc:
            msg = f"MarkItDown could not convert local file: {path}"
            raise ParseError(msg, details={"path": str(path)}) from exc

        # Read ``text_content`` first and fall back to ``markdown``.
        text = getattr(result, "text_content", None)
        if text is None:
            text = getattr(result, "markdown", None)
        if isinstance(text, str):
            return text
        msg = "MarkItDown returned no Markdown text content"
        raise ParseError(msg, details={"path": str(path)})
120
+
121
+
122
def _slug_or_raise(text: str, *, ctx: str) -> str:
    """Return the slug of ``text``, or raise ``ParseError`` (citing ``ctx``) if empty."""
    candidate = slugify(text)
    if candidate:
        return candidate
    msg = f"could not derive a slug from {ctx}: {text!r}"
    raise ParseError(msg)
128
+
129
+
130
+ def _markitdown_version() -> str:
131
+ try:
132
+ return version("markitdown")
133
+ except PackageNotFoundError:
134
+ return "not-installed"
135
+
136
+
137
+ def _ensure_heading(markdown: str, fallback_title: str) -> str:
138
+ for line in markdown.splitlines():
139
+ stripped = line.strip()
140
+ if stripped.startswith("# "):
141
+ return markdown
142
+ if stripped:
143
+ break
144
+ title = fallback_title.replace("_", " ").replace("-", " ").strip() or "Document"
145
+ return f"# {title}\n\n{markdown}"