fylepy 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. fyle/__init__.py +46 -0
  2. fyle/_core/__init__.py +5 -0
  3. fyle/_core/api.py +164 -0
  4. fyle/_core/chunking.py +107 -0
  5. fyle/_core/document.py +345 -0
  6. fyle/_core/fetcher.py +68 -0
  7. fyle/_core/registry.py +107 -0
  8. fyle/_core/sniffer.py +251 -0
  9. fyle/_readers/__init__.py +32 -0
  10. fyle/_readers/_md_structure.py +208 -0
  11. fyle/_readers/_whisper.py +126 -0
  12. fyle/_readers/archive/__init__.py +8 -0
  13. fyle/_readers/archive/stdlib.py +513 -0
  14. fyle/_readers/audio/__init__.py +9 -0
  15. fyle/_readers/audio/faster_whisper.py +162 -0
  16. fyle/_readers/base.py +70 -0
  17. fyle/_readers/csv/__init__.py +6 -0
  18. fyle/_readers/csv/stdlib.py +119 -0
  19. fyle/_readers/docx/__init__.py +6 -0
  20. fyle/_readers/docx/mammoth.py +130 -0
  21. fyle/_readers/html/__init__.py +6 -0
  22. fyle/_readers/html/markdownify.py +113 -0
  23. fyle/_readers/image/__init__.py +18 -0
  24. fyle/_readers/image/stdlib.py +136 -0
  25. fyle/_readers/markdown/__init__.py +6 -0
  26. fyle/_readers/markdown/stdlib.py +61 -0
  27. fyle/_readers/pdf/__init__.py +2 -0
  28. fyle/_readers/pdf/pymupdf4llm.py +202 -0
  29. fyle/_readers/pptx/__init__.py +7 -0
  30. fyle/_readers/pptx/python_pptx.py +306 -0
  31. fyle/_readers/sqlite/__init__.py +8 -0
  32. fyle/_readers/sqlite/stdlib.py +366 -0
  33. fyle/_readers/text/__init__.py +7 -0
  34. fyle/_readers/text/stdlib.py +76 -0
  35. fyle/_readers/video/__init__.py +10 -0
  36. fyle/_readers/video/scenedetect.py +330 -0
  37. fyle/_readers/xlsx/__init__.py +6 -0
  38. fyle/_readers/xlsx/openpyxl.py +158 -0
  39. fyle/errors.py +42 -0
  40. fyle/sqlite.py +175 -0
  41. fylepy-0.1.0.dist-info/METADATA +272 -0
  42. fylepy-0.1.0.dist-info/RECORD +44 -0
  43. fylepy-0.1.0.dist-info/WHEEL +4 -0
  44. fylepy-0.1.0.dist-info/licenses/LICENSE +21 -0
fyle/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ """fyle — open anything, get clean Markdown for LLMs.
2
+
3
+ Public surface: three entry points (``open`` / ``read`` / ``readers``), the
4
+ data model (``Document`` / ``Page`` / ``Table`` / ``Image`` / ``Meta`` /
5
+ ``Chunk``), and four exception types.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from importlib.metadata import PackageNotFoundError, version as _pkg_version
10
+
11
+ from ._core.api import open, read, readers
12
+ from ._core.document import Chunk, Document, Image, Meta, Page, Table
13
+ from .errors import (
14
+ DownloadError,
15
+ ParseError,
16
+ ReaderNotFoundError,
17
+ UnsupportedFormatError,
18
+ )
19
+
20
+ __all__ = [
21
+ # Entry points
22
+ "open",
23
+ "read",
24
+ "readers",
25
+ # Data model (exposed for type hints / isinstance checks)
26
+ "Document",
27
+ "Page",
28
+ "Table",
29
+ "Image",
30
+ "Meta",
31
+ "Chunk",
32
+ # Exceptions
33
+ "UnsupportedFormatError",
34
+ "ParseError",
35
+ "ReaderNotFoundError",
36
+ "DownloadError",
37
+ ]
38
+
39
+ # ``pyproject.toml`` is the single source of truth for the version string.
40
+ # Read it from the installed package metadata at runtime; fall back to a
41
+ # clearly-fake value when running from an uninstalled source tree (e.g.
42
+ # ``PYTHONPATH=src python -c 'import fyle'``).
43
+ try:
44
+ __version__ = _pkg_version("fyle")
45
+ except PackageNotFoundError: # pragma: no cover - only hit without install
46
+ __version__ = "0.0.0+unknown"
fyle/_core/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Internal core: data models, sniffer, registry, fetcher, chunking.
2
+
3
+ Not part of the public API. Everything users need is re-exported from the
4
+ top-level ``fyle`` namespace; names under ``fyle._core`` may change at any time.
5
+ """
fyle/_core/api.py ADDED
@@ -0,0 +1,164 @@
1
+ """Top-level API — ``fyle.open`` / ``fyle.read`` / ``fyle.readers``."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+ from typing import IO, Any, Optional, Union
6
+ from urllib.parse import urlparse
7
+
8
+ from . import fetcher, registry, sniffer
9
+ from .document import Document
10
+
11
+ Src = Union[str, Path, bytes, bytearray, IO[Any]]
12
+
13
_readers_loaded: bool = False


def _ensure_readers() -> None:
    """Run reader registration and startup validation exactly once.

    Idempotent: the module-level ``_readers_loaded`` flag short-circuits
    every call after the first.
    """
    global _readers_loaded
    if not _readers_loaded:
        # Imported lazily to avoid a circular import during ``fyle`` package init.
        from .. import _readers  # noqa: F401

        registry.validate()
        _readers_loaded = True
26
+
27
+
28
+ def _normalize(src: Src) -> tuple[bytes, Optional[str], Optional[str], Optional[str]]:
29
+ """Normalise ``src`` to ``(bytes, source_name, content_type, source_path)``.
30
+
31
+ Dispatcher responsibility: every reader receives plain ``bytes``, so
32
+ individual readers never have to handle polymorphic inputs.
33
+
34
+ ``source_path`` is the absolute filesystem path of a local file source,
35
+ or ``None`` for URL / bytes / file-like inputs. Most readers ignore it;
36
+ the archive reader needs it to decide where to extract.
37
+ """
38
+ # URL.
39
+ if isinstance(src, str) and (src.startswith("http://") or src.startswith("https://")):
40
+ data, ct = fetcher.fetch(src)
41
+ parsed_path = urlparse(src).path
42
+ name = parsed_path.rsplit("/", 1)[-1] if parsed_path else None
43
+ return data, (name or None), ct, None
44
+
45
+ # Local filesystem path.
46
+ if isinstance(src, (str, Path)):
47
+ p = Path(src)
48
+ try:
49
+ resolved = str(p.resolve())
50
+ except OSError:
51
+ resolved = str(p)
52
+ return p.read_bytes(), p.name, None, resolved
53
+
54
+ # bytes / bytearray.
55
+ if isinstance(src, (bytes, bytearray)):
56
+ return bytes(src), None, None, None
57
+
58
+ # File-like object.
59
+ if hasattr(src, "read"):
60
+ try:
61
+ seekable = bool(src.seekable()) if hasattr(src, "seekable") else False
62
+ except Exception:
63
+ seekable = False
64
+ if seekable:
65
+ try:
66
+ src.seek(0)
67
+ except Exception:
68
+ pass
69
+ data = src.read()
70
+ if isinstance(data, str):
71
+ data = data.encode("utf-8")
72
+ raw_name = getattr(src, "name", None)
73
+ if isinstance(raw_name, bytes):
74
+ try:
75
+ raw_name = raw_name.decode("utf-8", errors="ignore")
76
+ except Exception:
77
+ raw_name = None
78
+ if isinstance(raw_name, str):
79
+ name = Path(raw_name).name
80
+ # A file-like object whose ``name`` is a real filesystem path
81
+ # lets us surface an absolute ``source_path`` too.
82
+ try:
83
+ candidate = Path(raw_name)
84
+ source_path = str(candidate.resolve()) if candidate.exists() else None
85
+ except OSError:
86
+ source_path = None
87
+ else:
88
+ name = None
89
+ source_path = None
90
+ return bytes(data), name, None, source_path
91
+
92
+ raise TypeError(f"Unsupported src type: {type(src).__name__}")
93
+
94
+
95
def open(src: Src, *, reader: Optional[str] = None) -> Document:
    """Open a document and return a ``Document``.

    ``src`` accepts: a local path (``str`` / ``Path``), ``bytes``, a file-like
    object, or an ``http(s)://`` URL. Pass ``reader=<name>`` to force a
    specific reader (see ``fyle.readers()`` for the list of available names).

    Raises:
        TypeError: when ``src`` is none of the supported input types
            (propagated from ``_normalize``).
    """
    _ensure_readers()
    data, source_name, content_type, source_path = _normalize(src)
    fmt = sniffer.detect(data, source_name=source_name, content_type=content_type)
    # NOTE(review): presumably ``registry.resolve`` raises for an unknown
    # ``reader`` name — confirm against fyle/_core/registry.py.
    reader_cls = registry.resolve(fmt, reader)
    doc = reader_cls().read(data, source_name=source_name, source_path=source_path)
    # Fill in the final meta fields that only the dispatcher can know.
    # Readers may have pre-filled ``format`` / ``size``; only backfill blanks.
    doc.meta.reader = reader_cls.name
    if not doc.meta.format:
        doc.meta.format = fmt
    if not doc.meta.size:
        doc.meta.size = len(data)
    # Fine-grained subtype. ``format`` is the reader family (e.g. ``image``,
    # ``text``); ``ext`` pins down the concrete subtype (``png`` vs ``jpeg``,
    # ``py`` vs ``json``). Filled centrally so every reader gets it for free.
    if doc.meta.ext is None and source_name:
        suffix = Path(source_name).suffix.lower().lstrip(".")
        doc.meta.ext = suffix or None
    # Normalise ``title`` to a filename stem. Two independent sources can
    # leave ``title`` already ending in ``.ext``:
    # 1. A reader falls back to ``source_name`` (full filename) when the
    #    document has no embedded title field.
    # 2. Some producers embed a title string that itself includes the
    #    filename extension (common for PDFs generated from ``save as``).
    # Either way, pairing such a ``title`` with the separately-stored
    # ``ext`` would double the suffix in the file-level header
    # (``report.pdf.pdf``). Strip the redundant suffix centrally, case-
    # insensitively, while preserving the title's original casing.
    if doc.meta.title and doc.meta.ext:
        suffix_with_dot = "." + doc.meta.ext.lower()
        if doc.meta.title.lower().endswith(suffix_with_dot):
            # ``or`` keeps titles that are *only* the suffix (e.g. ".pdf")
            # intact instead of collapsing them to the empty string.
            doc.meta.title = doc.meta.title[: -len(suffix_with_dot)] or doc.meta.title
    # Surface the original URL for remote sources so the LLM-ready header
    # can tell the model *where* the file came from — the domain alone
    # (arxiv.org / github.com / a vendor's docs site) is a strong
    # semantic signal. Local filesystem paths are intentionally not
    # surfaced here (privacy: avoids leaking ``/Users/<user>/...`` into
    # any payload the user later shares, logs, or forwards to a hosted
    # model API). bytes / file-like inputs have no URL to surface.
    if isinstance(src, str) and (
        src.startswith("http://") or src.startswith("https://")
    ):
        doc.meta.source = src
    return doc
145
+
146
+
147
def read(src: Src, *, reader: Optional[str] = None) -> str:
    """One-call convenience: parse ``src`` and return the LLM-ready string.

    Equivalent to ``str(open(src, reader=reader))`` — the file-level header
    plus the Markdown content, which is what most callers of a one-liner
    actually want. For the raw content without the header, use
    ``open(src).text``.
    """
    doc = open(src, reader=reader)
    return str(doc)
155
+
156
+
157
def readers() -> dict[str, list[str]]:
    """Return the readers available in the current environment.

    Maps each format name to its reader names; the default reader for each
    format is suffixed with ``*``. Example:
    ``{"pdf": ["pymupdf4llm*", "pdfplumber", "pypdf"], ...}``.

    Triggers reader registration on first use, so the result reflects the
    optional dependencies actually installed.
    """
    _ensure_readers()
    return registry.list_all()
fyle/_core/chunking.py ADDED
@@ -0,0 +1,107 @@
1
+ """Token estimation and paragraph-boundary chunking.
2
+
3
+ - Token estimation: prefer ``tiktoken`` with the ``cl100k_base`` encoding;
4
+ fall back to ~4 chars/token when tiktoken is unavailable.
5
+ - Chunking: aggregate paragraphs (split on ``\n\n``) under a ``max_tokens``
6
+ soft limit; fill ``overlap`` by back-filling whole trailing paragraphs.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import Iterator, Optional, TYPE_CHECKING
11
+
12
+ if TYPE_CHECKING:
13
+ from .document import Chunk, Document
14
+
15
# Lazily-initialised tokenizer backend: a tiktoken Encoding instance, the
# string "fallback", or None before first use.
_ENCODING: object = None


def _get_encoding():
    """Return the cached tokenizer backend, initialising it on first call."""
    global _ENCODING
    if _ENCODING is not None:
        return _ENCODING
    try:
        import tiktoken

        _ENCODING = tiktoken.get_encoding("cl100k_base")
    except Exception:
        # tiktoken missing (or broken): remember to use the heuristic.
        _ENCODING = "fallback"
    return _ENCODING


def estimate_tokens(text: str) -> int:
    """Estimate the token count of ``text``.

    Exact with tiktoken's ``cl100k_base``; otherwise a ~4 chars/token
    heuristic with a floor of 1 for any non-empty string.
    """
    if not text:
        return 0
    backend = _get_encoding()
    if backend != "fallback":
        return len(backend.encode(text))
    return max(1, len(text) // 4)
38
+
39
+
40
def chunk_document(
    doc: "Document", *, max_tokens: int = 4000, overlap: int = 200
) -> Iterator["Chunk"]:
    """Split a ``Document`` on paragraph boundaries.

    - No hard cuts: if adding the next paragraph would overflow ``max_tokens``,
      yield the current chunk first. A single paragraph larger than
      ``max_tokens`` is therefore still emitted whole, as its own oversized
      chunk.
    - ``overlap``: back-fill trailing paragraphs of the just-yielded chunk
      until the accumulated overlap reaches roughly ``overlap`` tokens.
    - ``page_range``: derived from the source page numbers of the paragraphs
      in the chunk; ``None`` for formats without native pagination.

    Raises:
        ValueError: if ``max_tokens <= 0``, ``overlap < 0``, or
            ``overlap >= max_tokens``.
    """
    from .document import Chunk

    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")

    # 1. Split every page.text into paragraphs, tagged with their source page number.
    paragraphs: list[tuple[str, Optional[int]]] = []
    for page in doc.pages:
        page_num = page.number
        for para in page.text.split("\n\n"):
            para = para.strip()
            if para:
                paragraphs.append((para, page_num))

    if not paragraphs:
        return

    # Accumulator state: paragraphs of the chunk being built, their source
    # pages (parallel list), and the running per-paragraph token sum.
    buf: list[str] = []
    buf_pages: list[Optional[int]] = []
    buf_tokens: int = 0

    def make_chunk() -> Chunk:
        # ``tokens`` is re-estimated on the joined text rather than taken
        # from ``buf_tokens``, so the reported count includes the joiners.
        text = "\n\n".join(buf)
        real_pages = [p for p in buf_pages if p is not None]
        page_range: Optional[tuple[int, int]] = (
            (min(real_pages), max(real_pages)) if real_pages else None
        )
        return Chunk(text=text, tokens=estimate_tokens(text), page_range=page_range)

    for para, page_num in paragraphs:
        p_tokens = estimate_tokens(para)
        if buf and buf_tokens + p_tokens > max_tokens:
            yield make_chunk()
            # Back-fill overlap from the tail of the previous buffer.
            # Walks the previous chunk backwards, prepending whole paragraphs
            # until adding one more would exceed the overlap budget.
            carry: list[str] = []
            carry_pages: list[Optional[int]] = []
            carry_tokens = 0
            if overlap > 0:
                for prev_para, prev_page in zip(reversed(buf), reversed(buf_pages)):
                    t = estimate_tokens(prev_para)
                    if carry_tokens + t > overlap:
                        break
                    carry.insert(0, prev_para)
                    carry_pages.insert(0, prev_page)
                    carry_tokens += t
            buf, buf_pages, buf_tokens = carry, carry_pages, carry_tokens
        buf.append(para)
        buf_pages.append(page_num)
        buf_tokens += p_tokens

    # Flush the final, possibly-underfull chunk.
    if buf:
        yield make_chunk()
fyle/_core/document.py ADDED
@@ -0,0 +1,345 @@
1
+ """Data model — Document / Page / Table / Image / Meta / Chunk.
2
+
3
+ Naming rule: ``.text`` is always a Markdown string, on every level
4
+ (``doc.text``, ``page.text``, ``table.text``, ``image.text``).
5
+
6
+ The element types (``Meta`` / ``Image`` / ``Table`` / ``Page`` / ``Chunk``)
7
+ are ``pydantic.BaseModel`` subclasses, so they get runtime validation, a
8
+ proper ``.model_dump()`` / ``.model_dump_json()`` surface, and a consistent
9
+ construction contract. ``Document`` itself is intentionally a plain class
10
+ with ``__slots__`` because it caches derived views and exposes properties.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+ from typing import Iterator, Optional, Union
17
+
18
+ from pydantic import BaseModel, ConfigDict, Field
19
+
20
+
21
class _Element(BaseModel):
    """Shared base for every element type in the data model."""

    # Element instances are treated as value objects. ``extra="forbid"``
    # prevents silent typos at construction time (unknown keyword args raise
    # a ValidationError instead of being dropped); ``ser_json_bytes="base64"``
    # makes ``.model_dump_json()`` work on image payloads.
    model_config = ConfigDict(
        extra="forbid",
        arbitrary_types_allowed=False,
        ser_json_bytes="base64",
    )
31
+ )
32
+
33
+
34
class Meta(_Element):
    """Document-level metadata."""

    # Reader family name, e.g. ``pdf`` / ``image`` / ``text``; "" until set.
    format: str = ""
    # Fine-grained subtype. ``format`` is the *reader family* (e.g. ``image``,
    # ``text``, ``docx``) and intentionally coarse. ``ext`` records the concrete
    # subtype so callers can distinguish ``.png`` vs ``.jpeg``, ``.py`` vs
    # ``.json``, etc., without having to re-parse ``source_name`` or the
    # ``data_url`` MIME of each image.
    # Filled by the dispatcher from the source name's suffix (lower-cased, no
    # leading dot). ``None`` when the input has no name (e.g. raw bytes with no
    # ``source_name`` and a URL whose path has no suffix).
    ext: Optional[str] = None
    # Formats without native pagination always report ``pages=1``.
    pages: int = 1
    # Source payload size in bytes; 0 until backfilled by the dispatcher.
    size: int = 0
    # Filename stem (or embedded document title) — pairs with ``ext``.
    title: Optional[str] = None
    author: Optional[str] = None
    created_at: Optional[datetime] = None
    # Original source URL for inputs fetched from ``http(s)://``. Stays
    # ``None`` for local paths, bytes, and file-like inputs: we
    # deliberately never surface local filesystem paths here to avoid
    # leaking ``/Users/<user>/...`` (or equivalent) into the LLM-ready
    # payload when it gets shared, logged, or forwarded to a hosted
    # model API. The filename in ``title`` + ``ext`` is enough.
    source: Optional[str] = None
    # Name of the reader that produced the document; set by the dispatcher.
    reader: str = ""
    # Non-fatal parse problems collected by the reader.
    warnings: list[str] = Field(default_factory=list)

    def as_dict(self) -> dict:
        """Return a JSON-friendly dict (``created_at`` as ISO-8601 string).

        ``warnings`` is copied so mutating the result does not alias the
        model's own list.
        """
        return {
            "format": self.format,
            "ext": self.ext,
            "pages": self.pages,
            "size": self.size,
            "title": self.title,
            "author": self.author,
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "source": self.source,
            "reader": self.reader,
            "warnings": list(self.warnings),
        }
77
+
78
+
79
class Image(_Element):
    """Image element. fyle does not perform OCR."""

    # ``data:image/...;base64,...`` URL embedding the image for Markdown use.
    data_url: str
    # Raw image bytes. Named ``data`` rather than ``bytes`` to avoid shadowing
    # the builtin ``bytes`` type in annotations.
    # NOTE(review): defaults to b"" — presumably readers always populate it
    # alongside ``data_url``; ``save()`` writes an empty file otherwise.
    data: bytes = b""
    caption: Optional[str] = None
    # 1-based source page number; ``None`` for unpaginated formats.
    page: Optional[int] = None

    @property
    def text(self) -> str:
        """Return Markdown image syntax: ``![caption](data:image/...;base64,...)``.

        Keeps the ``.text`` contract consistent with ``doc.text`` /
        ``page.text`` / ``table.text``: ``.text`` is always Markdown.
        """
        alt = self.caption or ""
        return f"![{alt}]({self.data_url})"

    def save(self, path: Union[str, Path]) -> None:
        """Write the raw image bytes to ``path`` (overwrites silently)."""
        Path(path).write_bytes(self.data)
101
+
102
+
103
class Table(_Element):
    """Table element."""

    # Markdown table string; name aligned with ``doc.text`` / ``page.text``.
    text: str
    # Body rows as lists of cell strings (headers excluded).
    rows: list[list[str]] = Field(default_factory=list)
    # Header cell strings, parallel to each row's columns.
    headers: list[str] = Field(default_factory=list)
    # 1-based source page number; ``None`` for unpaginated formats.
    page: Optional[int] = None
111
+
112
+
113
class Page(_Element):
    """Page element.

    For formats without native pagination, ``pages`` contains a single ``Page``
    with ``number=1``.

    ``name`` is an optional human-meaningful label for this page. It is used
    by formats where the page has a natural identity beyond a page number:

    - XLSX: sheet name (``ws.title``).
    - PPTX (future): slide title.

    For PDF / DOCX / HTML / plain text / Markdown / CSV it stays ``None``.
    We deliberately keep this on ``Page`` rather than introducing separate
    ``Sheet`` / ``Slide`` models: the data shape is identical and the content
    surface (``doc.text`` / ``doc.pages`` / ``doc.tables`` / ``doc.images`` /
    ``doc.meta``) must stay at exactly five attributes.
    """

    # Markdown content of this page.
    text: str
    # 1-based page number within the document.
    number: int = 1
    # Optional human-meaningful label (sheet name, slide title); see above.
    name: Optional[str] = None
    tables: list[Table] = Field(default_factory=list)
    images: list[Image] = Field(default_factory=list)
138
+
139
+
140
class Chunk(_Element):
    """LLM-oriented chunk produced by ``Document.chunks()``."""

    # Markdown text of the chunk (joined paragraphs).
    text: str
    # Estimated token count of ``text`` (see ``chunking.estimate_tokens``).
    tokens: int
    # Inclusive (first, last) source page numbers covered by this chunk;
    # ``None`` for formats without native pagination.
    page_range: Optional[tuple[int, int]] = None
147
+
148
+
149
class Document:
    """Top-level document object returned by ``fyle.open``.

    Five content attributes (``text`` / ``pages`` / ``tables`` / ``images`` /
    ``meta``) plus three LLM helpers (``tokens`` / ``tokens_for`` / ``chunks``).
    Eight in total; the surface is frozen.

    NOTE(review): the derived views below are computed once and cached;
    mutating ``doc.pages`` afterwards will not refresh ``.text`` /
    ``.tables`` / ``.images`` — pages are treated as immutable after
    construction.
    """

    __slots__ = (
        "_pages",
        "meta",
        "_text_cache",
        "_tables_cache",
        "_images_cache",
    )

    def __init__(self, *, pages: list[Page], meta: Meta) -> None:
        self._pages = pages
        self.meta = meta
        # Lazily-filled caches for the derived views below.
        self._text_cache: Optional[str] = None
        self._tables_cache: Optional[list[Table]] = None
        self._images_cache: Optional[list[Image]] = None

    # ------------------------------------------------------------------
    # Content attributes (5)
    # ------------------------------------------------------------------
    @property
    def text(self) -> str:
        # Pages with empty text are skipped so the joiner never produces
        # runs of blank paragraphs.
        if self._text_cache is None:
            self._text_cache = "\n\n".join(p.text for p in self._pages if p.text)
        return self._text_cache

    @property
    def pages(self) -> list[Page]:
        return self._pages

    @property
    def tables(self) -> list[Table]:
        # Flattened, document-order view of every page's tables.
        if self._tables_cache is None:
            self._tables_cache = [t for p in self._pages for t in p.tables]
        return self._tables_cache

    @property
    def images(self) -> list[Image]:
        # Flattened, document-order view of every page's images.
        if self._images_cache is None:
            self._images_cache = [img for p in self._pages for img in p.images]
        return self._images_cache

    # ------------------------------------------------------------------
    # LLM helpers (3)
    # ------------------------------------------------------------------
    @property
    def tokens(self) -> int:
        """Estimated token count of ``doc.text`` (header excluded)."""
        from .chunking import estimate_tokens

        return estimate_tokens(self.text)

    def tokens_for(self, obj) -> int:
        """Estimate tokens for any object exposing ``.text`` (Page/Table/...).

        Raises:
            TypeError: if ``obj`` has no ``.text`` attribute.
        """
        from .chunking import estimate_tokens

        text = getattr(obj, "text", None)
        if text is None:
            raise TypeError(f"tokens_for expected an object with .text, got {type(obj).__name__}")
        return estimate_tokens(text)

    def chunks(self, max_tokens: int = 4000, overlap: int = 200) -> Iterator[Chunk]:
        """Yield paragraph-boundary ``Chunk`` objects (see ``chunking``)."""
        from .chunking import chunk_document

        yield from chunk_document(self, max_tokens=max_tokens, overlap=overlap)

    # ------------------------------------------------------------------
    # Optional context manager
    # ------------------------------------------------------------------
    def __enter__(self) -> "Document":
        return self

    def __exit__(self, exc_type, exc, tb) -> bool:
        # No resources to release; never suppress exceptions.
        return False

    def __repr__(self) -> str:
        return (
            f"Document(format={self.meta.format!r}, pages={len(self._pages)}, "
            f"reader={self.meta.reader!r})"
        )

    def __str__(self) -> str:
        """Return an LLM-ready payload: file-level header + ``doc.text``.

        Intended as the one-liner you hand to a model:
        ``llm.complete(str(doc))``. The header surfaces filename, format,
        and size so the model still knows what it is looking at when
        only the string is passed in (no ``doc.meta`` alongside). The
        filename in particular carries real semantic signal that
        ``doc.text`` alone would discard.

        For the raw content without the wrapper, use ``doc.text``.
        """
        header = self._file_level_header()
        if not header:
            return self.text
        return f"{header}\n\n---\n\n{self.text}"

    def _file_level_header(self) -> str:
        """Compose the outer Markdown header surfaced by ``__str__``.

        Surfaces the metadata fields that carry real semantic signal for
        an LLM reading the payload: filename, format, size, page count
        (when >1), author, creation time, and any parse warnings. The
        ``reader`` field is deliberately omitted — it's an internal
        implementation detail (which library parsed the file) with no
        value to the model; developers who need it can read
        ``doc.meta.reader`` directly.

        Content-specific metadata (audio duration, video keyframes,
        detected language, ...) belongs to the reader's own inline
        header inside ``doc.text`` and stays there.

        The core fields are rendered as a two-column ``field | value``
        Markdown table — one row per attribute. This shape stays
        compact regardless of how many fields are present and matches
        how LLMs naturally parse labeled key/value pairs. Warnings are
        rendered as a separate bullet list because they are a
        variable-length ``list[str]`` and don't fit the single-value
        row shape.

        Returns "" when no field is present, which makes ``__str__``
        skip the header entirely.
        """
        # Collect present (name, value) pairs. Every field is optional
        # so missing ones simply do not produce a row.
        fields: list[tuple[str, str]] = []
        if self.meta.title:
            # ``meta.title`` is the filename stem; ``meta.ext`` is the
            # dot-less suffix — together they reconstruct the source
            # filename. The filename is just another attribute here,
            # not a separate heading.
            filename = (
                f"{self.meta.title}.{self.meta.ext}"
                if self.meta.ext
                else self.meta.title
            )
            fields.append(("filename", filename))
        if self.meta.source:
            # Only populated for ``http(s)://`` inputs — see ``Meta.source``
            # for why local paths are excluded. The URL carries real
            # semantic signal for the LLM (arxiv.org / github.com /
            # a vendor's docs site all imply different content types).
            fields.append(("source", self.meta.source))
        if self.meta.format:
            fields.append(("format", self.meta.format))
        if self.meta.size:
            fields.append(("size", _human_size(self.meta.size)))
        if len(self._pages) > 1:
            fields.append(("pages", str(len(self._pages))))
        if self.meta.author:
            fields.append(("author", self.meta.author))
        if self.meta.created_at:
            fields.append(
                ("created", self.meta.created_at.isoformat(timespec="seconds"))
            )

        lines: list[str] = []
        if fields:
            lines.append("| field | value |")
            lines.append("| --- | --- |")
            for name, value in fields:
                lines.append(
                    f"| {_escape_table_cell(name)} | {_escape_table_cell(value)} |"
                )
        # Warnings are variable-length; give them their own bullet list
        # so they never distort the table's two-column shape.
        if self.meta.warnings:
            if lines:
                lines.append("")
            lines.append("**Warnings:**")
            for w in self.meta.warnings:
                lines.append(f"- {w}")
        return "\n".join(lines)
324
+
325
+
326
+ def _human_size(n: int) -> str:
327
+ """Render a byte count as ``1.2 MB`` / ``938.8 KB`` / ``420 B``."""
328
+ if n < 1024:
329
+ return f"{n} B"
330
+ size = float(n)
331
+ for unit in ("KB", "MB", "GB", "TB"):
332
+ size /= 1024
333
+ if size < 1024 or unit == "TB":
334
+ return f"{size:.1f} {unit}"
335
+ return f"{size:.1f} TB"
336
+
337
+
338
+ def _escape_table_cell(v: str) -> str:
339
+ """Escape a value for safe inclusion in a single Markdown table cell.
340
+
341
+ Replaces ``|`` with ``\\|`` (table column separator) and collapses
342
+ embedded newlines to spaces so a rogue multi-line ``author`` or
343
+ ``title`` value never breaks the table's single-row shape.
344
+ """
345
+ return v.replace("|", "\\|").replace("\n", " ").replace("\r", " ")