docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/index/xrefs.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""XRefs sub-index — cross-references (X), the document graph.
|
|
2
|
+
|
|
3
|
+
The :class:`XRefBuilder` runs an :class:`cairn.xref.base.XRefExtractor`
|
|
4
|
+
over a Document (plus an optional Entities reader), deduplicates
|
|
5
|
+
``(src, dst, kind)`` triples by keeping the highest-confidence span, and
|
|
6
|
+
writes ``refs.json``.
|
|
7
|
+
|
|
8
|
+
The :class:`XRefs` reader exposes outgoing/incoming/by-kind queries. Edges
|
|
9
|
+
are directed; a backward edge between two sections is its own record.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from collections import defaultdict
|
|
16
|
+
from collections.abc import Iterable, Iterator
|
|
17
|
+
from datetime import UTC, datetime
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any, Final
|
|
20
|
+
|
|
21
|
+
from cairn.core.errors import IndexBuildError, IndexNotFoundError
|
|
22
|
+
from cairn.core.types import Document, Span, XRef, XRefKind
|
|
23
|
+
from cairn.index.entities import Entities
|
|
24
|
+
from cairn.xref.base import ExtractionEdge, XRefExtractor
|
|
25
|
+
|
|
26
|
+
XREFS_FILENAME: Final = "refs.json"
|
|
27
|
+
XREFS_FORMAT_VERSION: Final = 1
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class XRefBuilder:
    """Build and persist the cross-reference sub-index of one document.

    The builder drives the configured extractor, collapses duplicate edges
    through ``_aggregate``, and serializes the survivors to ``refs.json``.
    """

    def __init__(self, extractor: XRefExtractor) -> None:
        self.extractor = extractor

    async def build(
        self,
        document: Document,
        *,
        out_dir: Path,
        entities: Entities | None = None,
    ) -> Path:
        """Extract, aggregate, and write the edges found in ``document``.

        Args:
            document: Document handed to the extractor.
            out_dir: Directory that receives ``refs.json``; created
                (including parents) when it does not exist yet.
            entities: Optional entities reader forwarded to the extractor.

        Returns:
            The path of the written ``refs.json`` file.
        """
        out_dir.mkdir(parents=True, exist_ok=True)
        target = out_dir / XREFS_FILENAME

        raw_edges = await self.extractor.extract(document, entities=entities)
        unique_refs = _aggregate(raw_edges)
        timestamp = datetime.now(UTC)

        payload: dict[str, Any] = {
            "format_version": XREFS_FORMAT_VERSION,
            "doc_id": document.id,
            "extractor": self.extractor.name,
            "generated_at": timestamp.isoformat(),
            "refs": [_xref_to_dict(ref) for ref in unique_refs],
        }

        with target.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
            # Terminate the file with a newline after the JSON document.
            handle.write("\n")
        return target
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class XRefs:
    """Read-only view over the cross-reference edges loaded for a document.

    Edges are indexed by source and by destination section id at
    construction time so directional lookups do not scan the full edge list.
    """

    def __init__(
        self,
        refs: tuple[XRef, ...],
        *,
        doc_id: str,
        extractor: str,
    ) -> None:
        self.doc_id = doc_id
        self.extractor = extractor
        self._all = refs

        # One adjacency map per direction, keyed by section id.
        by_source: dict[str, list[XRef]] = defaultdict(list)
        by_target: dict[str, list[XRef]] = defaultdict(list)
        for edge in refs:
            by_source[edge.src].append(edge)
            by_target[edge.dst].append(edge)
        self._outgoing = by_source
        self._incoming = by_target

    @classmethod
    def load(cls, doc_dir: Path) -> XRefs:
        """Load ``refs.json`` from ``doc_dir``.

        Raises:
            IndexNotFoundError: If the file is missing, or if its
                ``format_version`` differs from ``XREFS_FORMAT_VERSION``.
        """
        path = doc_dir / XREFS_FILENAME
        if not path.exists():
            msg = f"refs.json not found in {doc_dir}"
            raise IndexNotFoundError(msg, details={"path": str(path)})

        with path.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)

        found_version = payload.get("format_version")
        if found_version != XREFS_FORMAT_VERSION:
            msg = (
                f"unsupported refs format version: {found_version!r} "
                f"(expected {XREFS_FORMAT_VERSION})"
            )
            raise IndexNotFoundError(msg, details={"path": str(path)})

        loaded = tuple(_xref_from_dict(entry) for entry in payload["refs"])
        return cls(loaded, doc_id=payload["doc_id"], extractor=payload["extractor"])

    # -- queries -----------------------------------------------------------

    def __len__(self) -> int:
        return len(self._all)

    def __iter__(self) -> Iterator[XRef]:
        return iter(self._all)

    def outgoing_from(
        self, section_id: str, *, kinds: tuple[XRefKind, ...] | None = None
    ) -> list[XRef]:
        """Return edges leaving ``section_id``, highest confidence first.

        Ties are ordered by destination id. ``kinds`` optionally restricts
        the result to edges whose kind is in the given tuple.
        """
        matching = (
            edge
            for edge in self._outgoing.get(section_id, ())
            if kinds is None or edge.kind in kinds
        )
        return sorted(matching, key=lambda edge: (-edge.confidence, edge.dst))

    def incoming_to(
        self, section_id: str, *, kinds: tuple[XRefKind, ...] | None = None
    ) -> list[XRef]:
        """Return edges arriving at ``section_id``, highest confidence first.

        Ties are ordered by source id. ``kinds`` optionally restricts the
        result to edges whose kind is in the given tuple.
        """
        matching = (
            edge
            for edge in self._incoming.get(section_id, ())
            if kinds is None or edge.kind in kinds
        )
        return sorted(matching, key=lambda edge: (-edge.confidence, edge.src))

    def by_kind(self, kind: XRefKind) -> list[XRef]:
        """Return every edge of the given kind, in stored order."""
        return [edge for edge in self._all if edge.kind == kind]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# Aggregation
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _aggregate(edges: Iterable[ExtractionEdge]) -> list[XRef]:
|
|
140
|
+
"""Deduplicate ``(src, dst, kind)``; keep highest-confidence span."""
|
|
141
|
+
by_key: dict[tuple[str, str, str], XRef] = {}
|
|
142
|
+
insertion_order: list[tuple[str, str, str]] = []
|
|
143
|
+
|
|
144
|
+
for edge in edges:
|
|
145
|
+
if edge.src == edge.dst:
|
|
146
|
+
continue
|
|
147
|
+
if not edge.src or not edge.dst:
|
|
148
|
+
msg = "extractor emitted an edge with empty endpoint id"
|
|
149
|
+
raise IndexBuildError(msg)
|
|
150
|
+
key = (edge.src, edge.dst, edge.kind)
|
|
151
|
+
existing = by_key.get(key)
|
|
152
|
+
if existing is None:
|
|
153
|
+
by_key[key] = XRef(
|
|
154
|
+
src=edge.src,
|
|
155
|
+
dst=edge.dst,
|
|
156
|
+
kind=edge.kind,
|
|
157
|
+
confidence=edge.confidence,
|
|
158
|
+
span=edge.span,
|
|
159
|
+
)
|
|
160
|
+
insertion_order.append(key)
|
|
161
|
+
elif edge.confidence > existing.confidence:
|
|
162
|
+
by_key[key] = XRef(
|
|
163
|
+
src=edge.src,
|
|
164
|
+
dst=edge.dst,
|
|
165
|
+
kind=edge.kind,
|
|
166
|
+
confidence=edge.confidence,
|
|
167
|
+
span=edge.span,
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
return [by_key[key] for key in insertion_order]
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
# Serialization
|
|
175
|
+
# ---------------------------------------------------------------------------
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _xref_to_dict(r: XRef) -> dict[str, Any]:
|
|
179
|
+
return {
|
|
180
|
+
"src": r.src,
|
|
181
|
+
"dst": r.dst,
|
|
182
|
+
"kind": r.kind,
|
|
183
|
+
"confidence": r.confidence,
|
|
184
|
+
"span": {"start": r.span.start, "end": r.span.end},
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _xref_from_dict(d: dict[str, Any]) -> XRef:
    """Rebuild an ``XRef`` from one entry of the ``refs`` list in ``refs.json``."""
    # Scalars are read first, then the nested span mapping.
    scalars = {field: d[field] for field in ("src", "dst", "kind", "confidence")}
    span = Span(start=d["span"]["start"], end=d["span"]["end"])
    return XRef(**scalars, span=span)
|
cairn/ingest/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Ingestion layer — parsers from source formats into the canonical Document AST."""
|
|
2
|
+
|
|
3
|
+
from cairn.ingest.base import Parser
|
|
4
|
+
from cairn.ingest.markdown import MarkdownParser
|
|
5
|
+
from cairn.ingest.markitdown import MarkItDownParser
|
|
6
|
+
from cairn.ingest.pdf import PdfParser
|
|
7
|
+
|
|
8
|
+
__all__ = ["MarkItDownParser", "MarkdownParser", "Parser", "PdfParser"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def parser_for_path(path) -> Parser:  # type: ignore[no-untyped-def]
    """Return a fresh parser instance matching the extension of ``path``.

    The extension is compared case-insensitively. Parsers are consulted in
    the order Markdown, PDF, MarkItDown; the first one claiming the
    extension wins.

    Raises :class:`cairn.core.errors.ConfigError` for unknown extensions.
    """
    from pathlib import Path

    from cairn.core.errors import ConfigError

    target = Path(path)
    suffix = target.suffix.lower()

    for parser_cls in (MarkdownParser, PdfParser, MarkItDownParser):
        if suffix in parser_cls.extensions:
            return parser_cls()

    raise ConfigError(
        f"no parser registered for extension {suffix!r}",
        details={"path": str(target), "extension": suffix},
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def supported_extensions() -> frozenset[str]:
    """Return the union of the extensions claimed by all ingest parsers."""
    claimed: set[str] = set()
    for parser_cls in (MarkdownParser, PdfParser, MarkItDownParser):
        claimed.update(parser_cls.extensions)
    return frozenset(claimed)
|
cairn/ingest/base.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Parser protocol — the contract every ingestion plugin must satisfy."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Protocol, runtime_checkable
|
|
7
|
+
|
|
8
|
+
from cairn.core.types import Document
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@runtime_checkable
class Parser(Protocol):
    """A source-format parser.

    Implementations live in `cairn.plugins.*` or `cairn.ingest.*`. They must
    preserve heading hierarchy, emit stable slug-based section IDs, and
    populate byte spans into the original source.

    The protocol is ``runtime_checkable``, so ``isinstance(obj, Parser)``
    works; such a check only verifies that the members exist, not that
    their signatures or types match.

    See ARCHITECTURE.md §2.1 for the full set of hard rules.
    """

    name: str
    """Identifier used in config (e.g. ``markdown``, ``pdf``)."""

    extensions: tuple[str, ...]
    """File extensions this parser claims, with leading dot. e.g. ``(".md",)``."""

    def parse(
        self,
        source: Path | bytes | str,
        *,
        doc_id: str | None = None,
    ) -> Document:
        """Parse ``source`` into a canonical :class:`Document`.

        Args:
            source: A path, raw bytes, or text. An implementation may
                support only a subset of these (the MarkItDown parser, for
                example, accepts paths only).
            doc_id: Optional explicit document identifier. Required when
                ``source`` is bytes or text. When a path is given and
                ``doc_id`` is omitted, the parser derives it from the
                filename stem.

        Returns:
            A fully populated :class:`Document` with section tree and spans.
        """
        ...
|
cairn/ingest/markdown.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""Markdown parser — Markdown source → canonical Document AST.
|
|
2
|
+
|
|
3
|
+
Preserves heading hierarchy, generates stable hierarchical slug-based section
|
|
4
|
+
IDs, computes byte spans, and emits ``raw_text`` that excludes descendant
|
|
5
|
+
section bodies (per ARCHITECTURE.md §2.2).
|
|
6
|
+
|
|
7
|
+
Front-matter (YAML, TOML) is parsed and discarded. Content preceding the first
|
|
8
|
+
heading is discarded; if a document has no headings, an empty section list is
|
|
9
|
+
returned (callers may choose to treat this as a parse error).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
from collections import defaultdict
|
|
16
|
+
from datetime import UTC, datetime
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from markdown_it import MarkdownIt
|
|
20
|
+
from markdown_it.token import Token
|
|
21
|
+
from mdit_py_plugins.front_matter import front_matter_plugin
|
|
22
|
+
from slugify import slugify
|
|
23
|
+
|
|
24
|
+
from cairn import __version__
|
|
25
|
+
from cairn.core.errors import ParseError
|
|
26
|
+
from cairn.core.types import Document, SectionNode, Span
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class MarkdownParser:
|
|
30
|
+
"""CommonMark-compliant Markdown parser with front-matter and tables."""
|
|
31
|
+
|
|
32
|
+
name = "markdown"
|
|
33
|
+
extensions: tuple[str, ...] = (".md", ".markdown", ".mdown", ".mkd")
|
|
34
|
+
|
|
35
|
+
def __init__(self) -> None:
|
|
36
|
+
md = MarkdownIt("commonmark", {"html": False})
|
|
37
|
+
md.use(front_matter_plugin)
|
|
38
|
+
md.enable(["table"])
|
|
39
|
+
self._md = md
|
|
40
|
+
|
|
41
|
+
def parse(
|
|
42
|
+
self,
|
|
43
|
+
source: Path | bytes | str,
|
|
44
|
+
*,
|
|
45
|
+
doc_id: str | None = None,
|
|
46
|
+
) -> Document:
|
|
47
|
+
source_path, text, derived_doc_id = self._resolve_source(source, doc_id)
|
|
48
|
+
|
|
49
|
+
text_bytes = text.encode("utf-8")
|
|
50
|
+
line_offsets = _compute_line_offsets(text_bytes)
|
|
51
|
+
tokens = self._md.parse(text)
|
|
52
|
+
headings = _extract_headings(tokens)
|
|
53
|
+
sections = _build_sections(headings, text_bytes, line_offsets)
|
|
54
|
+
|
|
55
|
+
return Document(
|
|
56
|
+
id=derived_doc_id,
|
|
57
|
+
source_path=source_path,
|
|
58
|
+
source_hash=hashlib.sha256(text_bytes).hexdigest(),
|
|
59
|
+
sections=tuple(sections),
|
|
60
|
+
indexed_at=datetime.now(UTC),
|
|
61
|
+
cairn_version=__version__,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
@staticmethod
|
|
65
|
+
def _resolve_source(
|
|
66
|
+
source: Path | bytes | str,
|
|
67
|
+
doc_id: str | None,
|
|
68
|
+
) -> tuple[Path, str, str]:
|
|
69
|
+
if isinstance(source, Path):
|
|
70
|
+
try:
|
|
71
|
+
text = source.read_text(encoding="utf-8")
|
|
72
|
+
except OSError as exc:
|
|
73
|
+
msg = f"could not read source file: {source}"
|
|
74
|
+
raise ParseError(msg, details={"path": str(source)}) from exc
|
|
75
|
+
resolved_id = doc_id or _slug_or_raise(source.stem, ctx="filename stem")
|
|
76
|
+
return source, text, resolved_id
|
|
77
|
+
|
|
78
|
+
if doc_id is None:
|
|
79
|
+
msg = "doc_id is required when source is not a path"
|
|
80
|
+
raise ParseError(msg)
|
|
81
|
+
|
|
82
|
+
text = source.decode("utf-8") if isinstance(source, bytes) else source
|
|
83
|
+
return Path(f"<in-memory:{doc_id}>"), text, doc_id
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# Internal helpers
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _compute_line_offsets(text_bytes: bytes) -> list[int]:
|
|
92
|
+
"""Return byte offsets where each line begins.
|
|
93
|
+
|
|
94
|
+
The list has length ``num_lines + 1``: the final element equals
|
|
95
|
+
``len(text_bytes)``, acting as a virtual one-past-the-end line start so
|
|
96
|
+
callers can address EOF uniformly.
|
|
97
|
+
"""
|
|
98
|
+
offsets = [0]
|
|
99
|
+
for i, byte in enumerate(text_bytes):
|
|
100
|
+
if byte == 0x0A: # newline
|
|
101
|
+
offsets.append(i + 1)
|
|
102
|
+
offsets.append(len(text_bytes))
|
|
103
|
+
return offsets
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _heading_title(inline_tok: Token) -> str:
|
|
107
|
+
"""Extract plain text title from a heading's inline token.
|
|
108
|
+
|
|
109
|
+
Concatenates the text content of leaf ``text`` children, dropping markup.
|
|
110
|
+
Falls back to the raw inline content if no children are present.
|
|
111
|
+
"""
|
|
112
|
+
if inline_tok.children is None:
|
|
113
|
+
return inline_tok.content
|
|
114
|
+
parts: list[str] = []
|
|
115
|
+
for child in inline_tok.children:
|
|
116
|
+
if child.type in ("text", "code_inline"):
|
|
117
|
+
parts.append(child.content)
|
|
118
|
+
return "".join(parts).strip() or inline_tok.content.strip()
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _extract_headings(tokens: list[Token]) -> list[tuple[int, str, int, int]]:
|
|
122
|
+
"""Return ``(level, title, line_start, line_end_excl)`` for each heading."""
|
|
123
|
+
headings: list[tuple[int, str, int, int]] = []
|
|
124
|
+
for i, tok in enumerate(tokens):
|
|
125
|
+
if tok.type != "heading_open" or tok.map is None:
|
|
126
|
+
continue
|
|
127
|
+
level = int(tok.tag[1])
|
|
128
|
+
line_start, line_end_excl = tok.map
|
|
129
|
+
# Inline token always follows heading_open.
|
|
130
|
+
if i + 1 >= len(tokens) or tokens[i + 1].type != "inline":
|
|
131
|
+
continue
|
|
132
|
+
title = _heading_title(tokens[i + 1])
|
|
133
|
+
headings.append((level, title, line_start, line_end_excl))
|
|
134
|
+
return headings
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _slug_or_raise(text: str, *, ctx: str) -> str:
    """Slugify ``text``; raise ``ParseError`` when the slug comes out empty.

    ``ctx`` names the origin of ``text`` and is only used in the error message.
    """
    candidate = slugify(text)
    if candidate:
        return candidate
    raise ParseError(f"could not derive a slug from {ctx}: {text!r}")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _safe_slug(text: str) -> str:
    """Slugify ``text``, substituting ``section`` when the slug is empty."""
    candidate = slugify(text)
    return candidate if candidate else "section"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _build_sections(
|
|
151
|
+
headings: list[tuple[int, str, int, int]],
|
|
152
|
+
text_bytes: bytes,
|
|
153
|
+
line_offsets: list[int],
|
|
154
|
+
) -> list[SectionNode]:
|
|
155
|
+
"""Assemble SectionNode objects from heading metadata."""
|
|
156
|
+
if not headings:
|
|
157
|
+
return []
|
|
158
|
+
|
|
159
|
+
n = len(headings)
|
|
160
|
+
total_bytes = len(text_bytes)
|
|
161
|
+
|
|
162
|
+
# Territory: end at next heading with level <= current_level (else EOF).
|
|
163
|
+
territory_end_line: list[int] = []
|
|
164
|
+
for i, (level, _, _, _) in enumerate(headings):
|
|
165
|
+
next_terr = len(line_offsets) - 1
|
|
166
|
+
for j in range(i + 1, n):
|
|
167
|
+
if headings[j][0] <= level:
|
|
168
|
+
next_terr = headings[j][2]
|
|
169
|
+
break
|
|
170
|
+
territory_end_line.append(next_terr)
|
|
171
|
+
|
|
172
|
+
# Raw text end: next heading at ANY level (else territory end).
|
|
173
|
+
raw_text_end_line: list[int] = []
|
|
174
|
+
for i in range(n):
|
|
175
|
+
if i + 1 < n:
|
|
176
|
+
raw_text_end_line.append(headings[i + 1][2])
|
|
177
|
+
else:
|
|
178
|
+
raw_text_end_line.append(territory_end_line[i])
|
|
179
|
+
|
|
180
|
+
# Hierarchical IDs, parents, paths.
|
|
181
|
+
metadata: list[tuple[str, str, int, str | None, tuple[str, ...]]] = []
|
|
182
|
+
stack: list[tuple[int, str, str]] = [] # (level, id, title)
|
|
183
|
+
sibling_counters: dict[tuple[str, str], int] = defaultdict(int)
|
|
184
|
+
|
|
185
|
+
for level, title, _line_start, _line_end in headings:
|
|
186
|
+
while stack and stack[-1][0] >= level:
|
|
187
|
+
stack.pop()
|
|
188
|
+
|
|
189
|
+
parent_id = stack[-1][1] if stack else None
|
|
190
|
+
slug = _safe_slug(title)
|
|
191
|
+
key = (parent_id or "", slug)
|
|
192
|
+
sibling_counters[key] += 1
|
|
193
|
+
count = sibling_counters[key]
|
|
194
|
+
unique_slug = slug if count == 1 else f"{slug}-{count}"
|
|
195
|
+
|
|
196
|
+
section_id = f"{parent_id}/{unique_slug}" if parent_id else unique_slug
|
|
197
|
+
path = (*(t for _, _, t in stack), title)
|
|
198
|
+
|
|
199
|
+
metadata.append((section_id, title, level, parent_id, path))
|
|
200
|
+
stack.append((level, section_id, title))
|
|
201
|
+
|
|
202
|
+
children_map: dict[str, list[str]] = defaultdict(list)
|
|
203
|
+
for sid, _title, _level, parent_id, _path in metadata:
|
|
204
|
+
if parent_id is not None:
|
|
205
|
+
children_map[parent_id].append(sid)
|
|
206
|
+
|
|
207
|
+
sections: list[SectionNode] = []
|
|
208
|
+
for idx, (level, _title_h, _line_start, line_end_excl) in enumerate(headings):
|
|
209
|
+
section_id, title, _lvl, parent_id, path = metadata[idx]
|
|
210
|
+
|
|
211
|
+
span_start_line = headings[idx][2]
|
|
212
|
+
span_end_line = territory_end_line[idx]
|
|
213
|
+
span = Span(
|
|
214
|
+
start=_line_to_byte(line_offsets, span_start_line, total_bytes),
|
|
215
|
+
end=_line_to_byte(line_offsets, span_end_line, total_bytes),
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
raw_start = _line_to_byte(line_offsets, line_end_excl, total_bytes)
|
|
219
|
+
raw_end = _line_to_byte(line_offsets, raw_text_end_line[idx], total_bytes)
|
|
220
|
+
raw_text = text_bytes[raw_start:raw_end].decode("utf-8")
|
|
221
|
+
|
|
222
|
+
sections.append(
|
|
223
|
+
SectionNode(
|
|
224
|
+
id=section_id,
|
|
225
|
+
title=title,
|
|
226
|
+
level=level,
|
|
227
|
+
parent=parent_id,
|
|
228
|
+
children=tuple(children_map.get(section_id, ())),
|
|
229
|
+
span=span,
|
|
230
|
+
path=path,
|
|
231
|
+
raw_text=raw_text,
|
|
232
|
+
)
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
return sections
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _line_to_byte(line_offsets: list[int], line: int, total_bytes: int) -> int:
|
|
239
|
+
"""Resolve a 0-indexed line number to its starting byte offset."""
|
|
240
|
+
if line >= len(line_offsets):
|
|
241
|
+
return total_bytes
|
|
242
|
+
if line < 0:
|
|
243
|
+
return 0
|
|
244
|
+
return line_offsets[line]
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Optional MarkItDown-backed parser for office/data/web document formats.
|
|
2
|
+
|
|
3
|
+
MarkItDown converts many source formats into Markdown intended for LLM and
|
|
4
|
+
text-analysis pipelines. Cairn uses it as a local-file conversion layer, then
|
|
5
|
+
delegates structure extraction to :class:`MarkdownParser`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from slugify import slugify
|
|
16
|
+
|
|
17
|
+
from cairn.core.errors import ParseError
|
|
18
|
+
from cairn.core.types import Document
|
|
19
|
+
from cairn.ingest.markdown import MarkdownParser
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class MarkItDownParser:
    """Parse non-Markdown local files by converting them with MarkItDown.

    The converted Markdown is handed to :class:`MarkdownParser`, so section
    structure and spans describe the converted text, not the original file.
    """

    name = "markitdown"
    extensions: tuple[str, ...] = (
        ".docx",
        ".pptx",
        ".xlsx",
        ".xls",
        ".html",
        ".htm",
        ".csv",
        ".json",
        ".xml",
        ".epub",
        ".msg",
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".wav",
        ".mp3",
        ".zip",
    )

    def __init__(self) -> None:
        self._markdown = MarkdownParser()

    def parse(
        self,
        source: Path | bytes | str,
        *,
        doc_id: str | None = None,
    ) -> Document:
        """Convert the file at ``source`` to Markdown and parse the result.

        Only ``Path`` input is accepted. The returned document carries the
        resolved file path and a hash covering the original bytes, the
        MarkItDown version, and the converted Markdown.

        Raises:
            ParseError: For non-path input, conversion failures, empty
                conversion output, or an unreadable source file.
        """
        if not isinstance(source, Path):
            msg = "MarkItDownParser only accepts local file paths"
            raise ParseError(msg)

        source_path = source.resolve()
        stem = source_path.stem
        resolved_doc_id = doc_id or _slug_or_raise(stem, ctx="filename stem")

        converted = self._convert_local(source_path)
        if not converted.strip():
            msg = f"MarkItDown produced empty Markdown for: {source_path}"
            raise ParseError(msg, details={"path": str(source_path)})
        converted = _ensure_heading(converted, stem)

        parsed = self._markdown.parse(converted, doc_id=resolved_doc_id)

        try:
            original_bytes = source_path.read_bytes()
        except OSError as exc:
            msg = f"could not read converted source file: {source_path}"
            raise ParseError(msg, details={"path": str(source_path)}) from exc

        # Hash the three parts separated by NUL bytes, fed incrementally so
        # no joined copy of the inputs is built.
        digest = hashlib.sha256()
        digest.update(original_bytes)
        digest.update(b"\x00")
        digest.update(_markitdown_version().encode("utf-8"))
        digest.update(b"\x00")
        digest.update(converted.encode("utf-8"))

        overrides = {
            "source_path": source_path,
            "source_hash": digest.hexdigest(),
        }
        return parsed.model_copy(update=overrides)

    @staticmethod
    def _convert_local(path: Path) -> str:
        """Run MarkItDown on ``path`` and return the Markdown text.

        MarkItDown is imported lazily so the dependency stays optional.
        """
        try:
            from markitdown import MarkItDown
        except ImportError as exc:
            # NOTE(review): the hint names the distribution `cairn`; confirm
            # it matches the name this package is published under.
            msg = (
                "MarkItDown support is not installed. Install with "
                "`pip install 'cairn[markitdown]'` or "
                "`uv pip install -e '.[markitdown]'`."
            )
            raise ParseError(msg, details={"path": str(path)}) from exc

        try:
            converter: Any = MarkItDown(enable_plugins=False)
            # Use ``convert_local`` when the converter offers it; otherwise
            # fall back to the generic ``convert`` entry point.
            run = (
                converter.convert_local
                if hasattr(converter, "convert_local")
                else converter.convert
            )
            result = run(str(path))
        except Exception as exc:
            msg = f"MarkItDown could not convert local file: {path}"
            raise ParseError(msg, details={"path": str(path)}) from exc

        # Take the first of the two result attributes that is not None.
        text = None
        for attr in ("text_content", "markdown"):
            text = getattr(result, attr, None)
            if text is not None:
                break
        if not isinstance(text, str):
            msg = "MarkItDown returned no Markdown text content"
            raise ParseError(msg, details={"path": str(path)})
        return text
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _slug_or_raise(text: str, *, ctx: str) -> str:
    """Return the slug of ``text``, or raise ``ParseError`` if it is empty.

    ``ctx`` describes the origin of ``text`` for the error message.
    """
    candidate = slugify(text)
    if candidate:
        return candidate
    raise ParseError(f"could not derive a slug from {ctx}: {text!r}")
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _markitdown_version() -> str:
|
|
131
|
+
try:
|
|
132
|
+
return version("markitdown")
|
|
133
|
+
except PackageNotFoundError:
|
|
134
|
+
return "not-installed"
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _ensure_heading(markdown: str, fallback_title: str) -> str:
|
|
138
|
+
for line in markdown.splitlines():
|
|
139
|
+
stripped = line.strip()
|
|
140
|
+
if stripped.startswith("# "):
|
|
141
|
+
return markdown
|
|
142
|
+
if stripped:
|
|
143
|
+
break
|
|
144
|
+
title = fallback_title.replace("_", " ").replace("-", " ").strip() or "Document"
|
|
145
|
+
return f"# {title}\n\n{markdown}"
|