fylepy 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fyle/__init__.py +46 -0
- fyle/_core/__init__.py +5 -0
- fyle/_core/api.py +164 -0
- fyle/_core/chunking.py +107 -0
- fyle/_core/document.py +345 -0
- fyle/_core/fetcher.py +68 -0
- fyle/_core/registry.py +107 -0
- fyle/_core/sniffer.py +251 -0
- fyle/_readers/__init__.py +32 -0
- fyle/_readers/_md_structure.py +208 -0
- fyle/_readers/_whisper.py +126 -0
- fyle/_readers/archive/__init__.py +8 -0
- fyle/_readers/archive/stdlib.py +513 -0
- fyle/_readers/audio/__init__.py +9 -0
- fyle/_readers/audio/faster_whisper.py +162 -0
- fyle/_readers/base.py +70 -0
- fyle/_readers/csv/__init__.py +6 -0
- fyle/_readers/csv/stdlib.py +119 -0
- fyle/_readers/docx/__init__.py +6 -0
- fyle/_readers/docx/mammoth.py +130 -0
- fyle/_readers/html/__init__.py +6 -0
- fyle/_readers/html/markdownify.py +113 -0
- fyle/_readers/image/__init__.py +18 -0
- fyle/_readers/image/stdlib.py +136 -0
- fyle/_readers/markdown/__init__.py +6 -0
- fyle/_readers/markdown/stdlib.py +61 -0
- fyle/_readers/pdf/__init__.py +2 -0
- fyle/_readers/pdf/pymupdf4llm.py +202 -0
- fyle/_readers/pptx/__init__.py +7 -0
- fyle/_readers/pptx/python_pptx.py +306 -0
- fyle/_readers/sqlite/__init__.py +8 -0
- fyle/_readers/sqlite/stdlib.py +366 -0
- fyle/_readers/text/__init__.py +7 -0
- fyle/_readers/text/stdlib.py +76 -0
- fyle/_readers/video/__init__.py +10 -0
- fyle/_readers/video/scenedetect.py +330 -0
- fyle/_readers/xlsx/__init__.py +6 -0
- fyle/_readers/xlsx/openpyxl.py +158 -0
- fyle/errors.py +42 -0
- fyle/sqlite.py +175 -0
- fylepy-0.1.0.dist-info/METADATA +272 -0
- fylepy-0.1.0.dist-info/RECORD +44 -0
- fylepy-0.1.0.dist-info/WHEEL +4 -0
- fylepy-0.1.0.dist-info/licenses/LICENSE +21 -0
fyle/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""fyle — open anything, get clean Markdown for LLMs.
|
|
2
|
+
|
|
3
|
+
Public surface: three entry points (``open`` / ``read`` / ``readers``), the
|
|
4
|
+
data model (``Document`` / ``Page`` / ``Table`` / ``Image`` / ``Meta`` /
|
|
5
|
+
``Chunk``), and four exception types.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from importlib.metadata import PackageNotFoundError, version as _pkg_version
|
|
10
|
+
|
|
11
|
+
from ._core.api import open, read, readers
|
|
12
|
+
from ._core.document import Chunk, Document, Image, Meta, Page, Table
|
|
13
|
+
from .errors import (
|
|
14
|
+
DownloadError,
|
|
15
|
+
ParseError,
|
|
16
|
+
ReaderNotFoundError,
|
|
17
|
+
UnsupportedFormatError,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
# Entry points
|
|
22
|
+
"open",
|
|
23
|
+
"read",
|
|
24
|
+
"readers",
|
|
25
|
+
# Data model (exposed for type hints / isinstance checks)
|
|
26
|
+
"Document",
|
|
27
|
+
"Page",
|
|
28
|
+
"Table",
|
|
29
|
+
"Image",
|
|
30
|
+
"Meta",
|
|
31
|
+
"Chunk",
|
|
32
|
+
# Exceptions
|
|
33
|
+
"UnsupportedFormatError",
|
|
34
|
+
"ParseError",
|
|
35
|
+
"ReaderNotFoundError",
|
|
36
|
+
"DownloadError",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
# ``pyproject.toml`` is the single source of truth for the version string.
|
|
40
|
+
# Read it from the installed package metadata at runtime; fall back to a
|
|
41
|
+
# clearly-fake value when running from an uninstalled source tree (e.g.
|
|
42
|
+
# ``PYTHONPATH=src python -c 'import fyle'``).
|
|
43
|
+
try:
|
|
44
|
+
__version__ = _pkg_version("fyle")
|
|
45
|
+
except PackageNotFoundError: # pragma: no cover - only hit without install
|
|
46
|
+
__version__ = "0.0.0+unknown"
|
fyle/_core/__init__.py
ADDED
fyle/_core/api.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Top-level API — ``fyle.open`` / ``fyle.read`` / ``fyle.readers``."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import IO, Any, Optional, Union
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
from . import fetcher, registry, sniffer
|
|
9
|
+
from .document import Document
|
|
10
|
+
|
|
11
|
+
# Accepted input types for ``fyle.open`` / ``fyle.read``: local path
# (``str`` / ``Path``), raw bytes, a readable file-like object, or an
# ``http(s)://`` URL string.
Src = Union[str, Path, bytes, bytearray, IO[Any]]

# Module-level latch: readers are registered and validated exactly once,
# on first API use (see ``_ensure_readers``).
_readers_loaded: bool = False
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _ensure_readers() -> None:
    """Register all readers and run startup validation, exactly once."""
    global _readers_loaded
    if not _readers_loaded:
        # Lazy import: pulling in ``fyle._readers`` at module-import time
        # would create a circular import during ``fyle`` package init.
        from .. import _readers  # noqa: F401

        registry.validate()
        _readers_loaded = True
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _normalize(src: Src) -> tuple[bytes, Optional[str], Optional[str], Optional[str]]:
|
|
29
|
+
"""Normalise ``src`` to ``(bytes, source_name, content_type, source_path)``.
|
|
30
|
+
|
|
31
|
+
Dispatcher responsibility: every reader receives plain ``bytes``, so
|
|
32
|
+
individual readers never have to handle polymorphic inputs.
|
|
33
|
+
|
|
34
|
+
``source_path`` is the absolute filesystem path of a local file source,
|
|
35
|
+
or ``None`` for URL / bytes / file-like inputs. Most readers ignore it;
|
|
36
|
+
the archive reader needs it to decide where to extract.
|
|
37
|
+
"""
|
|
38
|
+
# URL.
|
|
39
|
+
if isinstance(src, str) and (src.startswith("http://") or src.startswith("https://")):
|
|
40
|
+
data, ct = fetcher.fetch(src)
|
|
41
|
+
parsed_path = urlparse(src).path
|
|
42
|
+
name = parsed_path.rsplit("/", 1)[-1] if parsed_path else None
|
|
43
|
+
return data, (name or None), ct, None
|
|
44
|
+
|
|
45
|
+
# Local filesystem path.
|
|
46
|
+
if isinstance(src, (str, Path)):
|
|
47
|
+
p = Path(src)
|
|
48
|
+
try:
|
|
49
|
+
resolved = str(p.resolve())
|
|
50
|
+
except OSError:
|
|
51
|
+
resolved = str(p)
|
|
52
|
+
return p.read_bytes(), p.name, None, resolved
|
|
53
|
+
|
|
54
|
+
# bytes / bytearray.
|
|
55
|
+
if isinstance(src, (bytes, bytearray)):
|
|
56
|
+
return bytes(src), None, None, None
|
|
57
|
+
|
|
58
|
+
# File-like object.
|
|
59
|
+
if hasattr(src, "read"):
|
|
60
|
+
try:
|
|
61
|
+
seekable = bool(src.seekable()) if hasattr(src, "seekable") else False
|
|
62
|
+
except Exception:
|
|
63
|
+
seekable = False
|
|
64
|
+
if seekable:
|
|
65
|
+
try:
|
|
66
|
+
src.seek(0)
|
|
67
|
+
except Exception:
|
|
68
|
+
pass
|
|
69
|
+
data = src.read()
|
|
70
|
+
if isinstance(data, str):
|
|
71
|
+
data = data.encode("utf-8")
|
|
72
|
+
raw_name = getattr(src, "name", None)
|
|
73
|
+
if isinstance(raw_name, bytes):
|
|
74
|
+
try:
|
|
75
|
+
raw_name = raw_name.decode("utf-8", errors="ignore")
|
|
76
|
+
except Exception:
|
|
77
|
+
raw_name = None
|
|
78
|
+
if isinstance(raw_name, str):
|
|
79
|
+
name = Path(raw_name).name
|
|
80
|
+
# A file-like object whose ``name`` is a real filesystem path
|
|
81
|
+
# lets us surface an absolute ``source_path`` too.
|
|
82
|
+
try:
|
|
83
|
+
candidate = Path(raw_name)
|
|
84
|
+
source_path = str(candidate.resolve()) if candidate.exists() else None
|
|
85
|
+
except OSError:
|
|
86
|
+
source_path = None
|
|
87
|
+
else:
|
|
88
|
+
name = None
|
|
89
|
+
source_path = None
|
|
90
|
+
return bytes(data), name, None, source_path
|
|
91
|
+
|
|
92
|
+
raise TypeError(f"Unsupported src type: {type(src).__name__}")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def open(src: Src, *, reader: Optional[str] = None) -> Document:
    """Open a document and return a ``Document``.

    ``src`` accepts: a local path (``str`` / ``Path``), ``bytes``, a file-like
    object, or an ``http(s)://`` URL. Pass ``reader=<name>`` to force a
    specific reader (see ``fyle.readers()`` for the list of available names).
    """
    _ensure_readers()
    data, source_name, content_type, source_path = _normalize(src)
    fmt = sniffer.detect(data, source_name=source_name, content_type=content_type)
    reader_cls = registry.resolve(fmt, reader)
    doc = reader_cls().read(data, source_name=source_name, source_path=source_path)

    # Meta fields only the dispatcher can know; readers may have left them
    # unset, in which case the sniffed/measured values fill in.
    doc.meta.reader = reader_cls.name
    doc.meta.format = doc.meta.format or fmt
    doc.meta.size = doc.meta.size or len(data)

    # Fine-grained subtype. ``format`` is the coarse reader family (``image``,
    # ``text``); ``ext`` pins down the concrete subtype (``png`` vs ``jpeg``,
    # ``py`` vs ``json``). Filled centrally so every reader gets it for free.
    if doc.meta.ext is None and source_name:
        suffix = Path(source_name).suffix.lower().lstrip(".")
        doc.meta.ext = suffix or None

    # Normalise ``title`` to a filename stem. A reader that falls back to the
    # full ``source_name``, or a producer that embedded ``name.ext`` as the
    # title, would otherwise double the suffix in the file-level header
    # (``report.pdf.pdf``). Strip the redundant suffix case-insensitively
    # while preserving the title's original casing.
    if doc.meta.title and doc.meta.ext:
        dotted = "." + doc.meta.ext.lower()
        if doc.meta.title.lower().endswith(dotted):
            stripped = doc.meta.title[: -len(dotted)]
            doc.meta.title = stripped if stripped else doc.meta.title

    # Surface the original URL for remote sources so the LLM-ready header can
    # tell the model *where* the file came from — the domain alone is a strong
    # semantic signal. Local filesystem paths are intentionally not surfaced
    # (privacy: avoids leaking ``/Users/<user>/...`` into any payload the user
    # later shares, logs, or forwards to a hosted model API). bytes /
    # file-like inputs have no URL to surface.
    if isinstance(src, str) and src.startswith(("http://", "https://")):
        doc.meta.source = src
    return doc
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def read(src: Src, *, reader: Optional[str] = None) -> str:
    """Sugar: equivalent to ``str(open(src, reader=reader))``.

    Returns the LLM-ready payload (file-level header + Markdown content),
    which is what most callers of a one-liner convenience actually want.
    For the raw content without the header, use ``open(src).text``.
    """
    doc = open(src, reader=reader)
    return str(doc)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def readers() -> dict[str, list[str]]:
    """Return the readers available in the current environment.

    The default reader for each format is suffixed with ``*``. Example:
    ``{"pdf": ["pymupdf4llm*", "pdfplumber", "pypdf"], ...}``.
    """
    # Registration happens lazily on first API use; force it here so the
    # registry is fully populated before we list it.
    _ensure_readers()
    return registry.list_all()
|
fyle/_core/chunking.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Token estimation and paragraph-boundary chunking.
|
|
2
|
+
|
|
3
|
+
- Token estimation: prefer ``tiktoken`` with the ``cl100k_base`` encoding;
|
|
4
|
+
fall back to ~4 chars/token when tiktoken is unavailable.
|
|
5
|
+
- Chunking: aggregate paragraphs (split on ``\n\n``) under a ``max_tokens``
|
|
6
|
+
soft limit; fill ``overlap`` by back-filling whole trailing paragraphs.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Iterator, Optional, TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from .document import Chunk, Document
|
|
14
|
+
|
|
15
|
+
_ENCODING: object = None # tiktoken Encoding instance or the string "fallback".
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_encoding():
|
|
19
|
+
global _ENCODING
|
|
20
|
+
if _ENCODING is None:
|
|
21
|
+
try:
|
|
22
|
+
import tiktoken
|
|
23
|
+
|
|
24
|
+
_ENCODING = tiktoken.get_encoding("cl100k_base")
|
|
25
|
+
except Exception:
|
|
26
|
+
_ENCODING = "fallback"
|
|
27
|
+
return _ENCODING
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def estimate_tokens(text: str) -> int:
    """Estimate the token count of ``text``."""
    if not text:
        return 0
    encoding = _get_encoding()
    if encoding != "fallback":
        return len(encoding.encode(text))
    # Heuristic: ~4 characters per token, floor of 1 for non-empty input.
    return max(1, len(text) // 4)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def chunk_document(
    doc: "Document", *, max_tokens: int = 4000, overlap: int = 200
) -> Iterator["Chunk"]:
    """Split a ``Document`` on paragraph boundaries.

    - No hard cuts: if adding the next paragraph would overflow ``max_tokens``,
      yield the current chunk first. A single paragraph larger than
      ``max_tokens`` is therefore still emitted whole, in its own chunk.
    - ``overlap``: back-fill trailing paragraphs of the just-yielded chunk
      until the accumulated overlap reaches roughly ``overlap`` tokens.
    - ``page_range``: derived from the source page numbers of the paragraphs
      in the chunk; ``None`` for formats without native pagination.

    Raises ``ValueError`` for non-positive ``max_tokens``, negative
    ``overlap``, or ``overlap >= max_tokens``.
    """
    from .document import Chunk

    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")

    # 1. Split every page.text into paragraphs, tagged with their source page number.
    paragraphs: list[tuple[str, Optional[int]]] = []
    for page in doc.pages:
        page_num = page.number
        for para in page.text.split("\n\n"):
            para = para.strip()
            if para:
                paragraphs.append((para, page_num))

    if not paragraphs:
        return

    # Accumulator for the chunk under construction: paragraph texts, their
    # page numbers (parallel list), and the running token total.
    buf: list[str] = []
    buf_pages: list[Optional[int]] = []
    buf_tokens: int = 0

    def make_chunk() -> Chunk:
        # Closes over ``buf`` / ``buf_pages``; reads whatever they hold at
        # call time, including after the rebinding in the overlap step.
        text = "\n\n".join(buf)
        real_pages = [p for p in buf_pages if p is not None]
        page_range: Optional[tuple[int, int]] = (
            (min(real_pages), max(real_pages)) if real_pages else None
        )
        return Chunk(text=text, tokens=estimate_tokens(text), page_range=page_range)

    for para, page_num in paragraphs:
        p_tokens = estimate_tokens(para)
        if buf and buf_tokens + p_tokens > max_tokens:
            yield make_chunk()
            # Back-fill overlap from the tail of the previous buffer.
            # Only whole paragraphs are carried; the walk stops as soon as
            # adding the next one would exceed the ``overlap`` budget.
            carry: list[str] = []
            carry_pages: list[Optional[int]] = []
            carry_tokens = 0
            if overlap > 0:
                for prev_para, prev_page in zip(reversed(buf), reversed(buf_pages)):
                    t = estimate_tokens(prev_para)
                    if carry_tokens + t > overlap:
                        break
                    carry.insert(0, prev_para)
                    carry_pages.insert(0, prev_page)
                    carry_tokens += t
            buf, buf_pages, buf_tokens = carry, carry_pages, carry_tokens
        buf.append(para)
        buf_pages.append(page_num)
        buf_tokens += p_tokens

    # Flush the final partial chunk.
    if buf:
        yield make_chunk()
|
fyle/_core/document.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""Data model — Document / Page / Table / Image / Meta / Chunk.
|
|
2
|
+
|
|
3
|
+
Naming rule: ``.text`` is always a Markdown string, on every level
|
|
4
|
+
(``doc.text``, ``page.text``, ``table.text``, ``image.text``).
|
|
5
|
+
|
|
6
|
+
The element types (``Meta`` / ``Image`` / ``Table`` / ``Page`` / ``Chunk``)
|
|
7
|
+
are ``pydantic.BaseModel`` subclasses, so they get runtime validation, a
|
|
8
|
+
proper ``.model_dump()`` / ``.model_dump_json()`` surface, and a consistent
|
|
9
|
+
construction contract. ``Document`` itself is intentionally a plain class
|
|
10
|
+
with ``__slots__`` because it caches derived views and exposes properties.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Iterator, Optional, Union
|
|
17
|
+
|
|
18
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class _Element(BaseModel):
    """Shared base for every element type in the data model."""

    # Element instances are treated as value objects. ``extra="forbid"``
    # prevents silent typos at construction time; ``ser_json_bytes="base64"``
    # makes ``.model_dump_json()`` work on image payloads (``bytes`` fields
    # are serialised as base64 strings).
    model_config = ConfigDict(
        extra="forbid",
        arbitrary_types_allowed=False,
        ser_json_bytes="base64",
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Meta(_Element):
    """Document-level metadata."""

    # Reader family (``pdf`` / ``image`` / ``text`` / ``docx`` / ...);
    # intentionally coarse — see ``ext`` for the concrete subtype.
    format: str = ""
    # Fine-grained subtype: the lower-cased, dot-less suffix of the source
    # name (``png`` vs ``jpeg``, ``py`` vs ``json``), filled by the
    # dispatcher. ``None`` when the input has no usable name (raw bytes, or
    # a URL whose path has no suffix).
    ext: Optional[str] = None
    # Formats without native pagination always report ``pages=1``.
    pages: int = 1
    # Input size in bytes.
    size: int = 0
    title: Optional[str] = None
    author: Optional[str] = None
    created_at: Optional[datetime] = None
    # Original URL for ``http(s)://`` inputs only. Stays ``None`` for local
    # paths, bytes, and file-like inputs: local filesystem paths are never
    # surfaced here, so a shared / logged / forwarded payload cannot leak
    # ``/Users/<user>/...``. The filename in ``title`` + ``ext`` is enough.
    source: Optional[str] = None
    # Name of the reader implementation that produced this document.
    reader: str = ""
    warnings: list[str] = Field(default_factory=list)

    def as_dict(self) -> dict:
        """Return a JSON-friendly dict (``created_at`` as ISO-8601 string)."""
        created = self.created_at
        return {
            "format": self.format,
            "ext": self.ext,
            "pages": self.pages,
            "size": self.size,
            "title": self.title,
            "author": self.author,
            "created_at": created.isoformat() if created is not None else None,
            "source": self.source,
            "reader": self.reader,
            # Copy so callers mutating the result cannot touch the model.
            "warnings": self.warnings[:],
        }
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class Image(_Element):
    """Image element. fyle does not perform OCR."""

    # Renderable reference to the image (``data:`` URL), used by ``.text``.
    data_url: str
    # Raw image bytes. Named ``data`` rather than ``bytes`` to avoid shadowing
    # the builtin ``bytes`` type in annotations.
    data: bytes = b""
    caption: Optional[str] = None
    page: Optional[int] = None

    @property
    def text(self) -> str:
        """Return Markdown image syntax: ``![caption](data_url)``.

        Keeps the ``.text`` contract consistent with ``doc.text`` /
        ``page.text`` / ``table.text``: ``.text`` is always Markdown.
        """
        # Fix: the previous body computed ``alt`` but returned an empty
        # f-string, discarding both the caption and ``data_url`` — ``.text``
        # must yield the Markdown image reference described above.
        alt = self.caption or ""
        return f"![{alt}]({self.data_url})"

    def save(self, path: Union[str, Path]) -> None:
        """Write the raw image bytes to ``path``."""
        Path(path).write_bytes(self.data)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class Table(_Element):
    """Table element."""

    # Markdown table string; name aligned with ``doc.text`` / ``page.text``.
    text: str
    # Raw cell values, row-major, as extracted by the reader.
    rows: list[list[str]] = Field(default_factory=list)
    # Header row, when the source format distinguishes one.
    headers: list[str] = Field(default_factory=list)
    # Source page number; ``None`` for formats without native pagination.
    page: Optional[int] = None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class Page(_Element):
    """Page element.

    For formats without native pagination, ``pages`` contains a single ``Page``
    with ``number=1``.

    ``name`` is an optional human-meaningful label for this page. It is used
    by formats where the page has a natural identity beyond a page number:

    - XLSX: sheet name (``ws.title``).
    - PPTX (future): slide title.

    For PDF / DOCX / HTML / plain text / Markdown / CSV it stays ``None``.
    We deliberately keep this on ``Page`` rather than introducing separate
    ``Sheet`` / ``Slide`` models: the data shape is identical and the content
    surface (``doc.text`` / ``doc.pages`` / ``doc.tables`` / ``doc.images`` /
    ``doc.meta``) must stay at exactly five attributes.
    """

    # Markdown content of this page.
    text: str
    # Page number; defaults to 1 for single-page / unpaginated formats.
    number: int = 1
    # Optional label (sheet name / slide title) — see class docstring.
    name: Optional[str] = None
    # Tables extracted from this page.
    tables: list[Table] = Field(default_factory=list)
    # Images extracted from this page.
    images: list[Image] = Field(default_factory=list)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class Chunk(_Element):
    """LLM-oriented chunk produced by ``Document.chunks()``."""

    # Markdown text of the chunk (paragraphs joined by blank lines).
    text: str
    # Estimated token count of ``text``.
    tokens: int
    # ``(first, last)`` source page numbers covered by this chunk;
    # ``None`` for formats without native pagination.
    page_range: Optional[tuple[int, int]] = None
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class Document:
    """Top-level document object returned by ``fyle.open``.

    Five content attributes (``text`` / ``pages`` / ``tables`` / ``images`` /
    ``meta``) plus three LLM helpers (``tokens`` / ``tokens_for`` / ``chunks``).
    Eight in total; the surface is frozen.
    """

    # ``__slots__`` keeps instances compact and rejects accidental new
    # attributes; the three ``_*_cache`` slots hold lazily-built views.
    __slots__ = (
        "_pages",
        "meta",
        "_text_cache",
        "_tables_cache",
        "_images_cache",
    )

    def __init__(self, *, pages: list[Page], meta: Meta) -> None:
        """Build a document from its pages and metadata (keyword-only)."""
        self._pages = pages
        self.meta = meta
        # Derived views are computed on first access and cached for reuse.
        self._text_cache: Optional[str] = None
        self._tables_cache: Optional[list[Table]] = None
        self._images_cache: Optional[list[Image]] = None

    # ------------------------------------------------------------------
    # Content attributes (5)
    # ------------------------------------------------------------------
    @property
    def text(self) -> str:
        """Full Markdown content: non-empty page texts joined by blank lines."""
        if self._text_cache is None:
            self._text_cache = "\n\n".join(p.text for p in self._pages if p.text)
        return self._text_cache

    @property
    def pages(self) -> list[Page]:
        """All pages, in document order."""
        return self._pages

    @property
    def tables(self) -> list[Table]:
        """All tables across every page, flattened in page order (cached)."""
        if self._tables_cache is None:
            self._tables_cache = [t for p in self._pages for t in p.tables]
        return self._tables_cache

    @property
    def images(self) -> list[Image]:
        """All images across every page, flattened in page order (cached)."""
        if self._images_cache is None:
            self._images_cache = [img for p in self._pages for img in p.images]
        return self._images_cache

    # ------------------------------------------------------------------
    # LLM helpers (3)
    # ------------------------------------------------------------------
    @property
    def tokens(self) -> int:
        """Estimated token count of ``doc.text``."""
        # Imported lazily to keep document/chunking modules decoupled.
        from .chunking import estimate_tokens

        return estimate_tokens(self.text)

    def tokens_for(self, obj: object) -> int:
        """Estimate tokens for any element exposing ``.text``.

        Raises ``TypeError`` when ``obj`` has no ``.text`` attribute.
        """
        from .chunking import estimate_tokens

        text = getattr(obj, "text", None)
        if text is None:
            raise TypeError(f"tokens_for expected an object with .text, got {type(obj).__name__}")
        return estimate_tokens(text)

    def chunks(self, max_tokens: int = 4000, overlap: int = 200) -> Iterator[Chunk]:
        """Yield paragraph-boundary ``Chunk``s — see ``chunking.chunk_document``."""
        from .chunking import chunk_document

        yield from chunk_document(self, max_tokens=max_tokens, overlap=overlap)

    # ------------------------------------------------------------------
    # Optional context manager
    # ------------------------------------------------------------------
    def __enter__(self) -> "Document":
        """Support ``with fyle.open(...) as doc:``; no resource to acquire."""
        return self

    def __exit__(self, exc_type, exc, tb) -> bool:
        # Nothing to release; never suppress exceptions.
        return False

    def __repr__(self) -> str:
        return (
            f"Document(format={self.meta.format!r}, pages={len(self._pages)}, "
            f"reader={self.meta.reader!r})"
        )

    def __str__(self) -> str:
        """Return an LLM-ready payload: file-level header + ``doc.text``.

        Intended as the one-liner you hand to a model:
        ``llm.complete(str(doc))``. The header surfaces filename, format,
        and size so the model still knows what it is looking at when
        only the string is passed in (no ``doc.meta`` alongside). The
        filename in particular carries real semantic signal that
        ``doc.text`` alone would discard.

        For the raw content without the wrapper, use ``doc.text``.
        """
        header = self._file_level_header()
        if not header:
            return self.text
        return f"{header}\n\n---\n\n{self.text}"

    def _file_level_header(self) -> str:
        """Compose the outer Markdown header surfaced by ``__str__``.

        Surfaces the metadata fields that carry real semantic signal for
        an LLM reading the payload: filename, format, size, page count
        (when >1), author, creation time, and any parse warnings. The
        ``reader`` field is deliberately omitted — it's an internal
        implementation detail (which library parsed the file) with no
        value to the model; developers who need it can read
        ``doc.meta.reader`` directly.

        Content-specific metadata (audio duration, video keyframes,
        detected language, ...) belongs to the reader's own inline
        header inside ``doc.text`` and stays there.

        The core fields are rendered as a two-column ``field | value``
        Markdown table — one row per attribute. This shape stays
        compact regardless of how many fields are present and matches
        how LLMs naturally parse labeled key/value pairs. Warnings are
        rendered as a separate bullet list because they are a
        variable-length ``list[str]`` and don't fit the single-value
        row shape.
        """
        # Collect present (name, value) pairs. Every field is optional
        # so missing ones simply do not produce a row.
        fields: list[tuple[str, str]] = []
        if self.meta.title:
            # ``meta.title`` is the filename stem; ``meta.ext`` is the
            # dot-less suffix — together they reconstruct the source
            # filename. The filename is just another attribute here,
            # not a separate heading.
            filename = (
                f"{self.meta.title}.{self.meta.ext}"
                if self.meta.ext
                else self.meta.title
            )
            fields.append(("filename", filename))
        if self.meta.source:
            # Only populated for ``http(s)://`` inputs — see ``Meta.source``
            # for why local paths are excluded. The URL carries real
            # semantic signal for the LLM (arxiv.org / github.com /
            # a vendor's docs site all imply different content types).
            fields.append(("source", self.meta.source))
        if self.meta.format:
            fields.append(("format", self.meta.format))
        if self.meta.size:
            fields.append(("size", _human_size(self.meta.size)))
        if len(self._pages) > 1:
            fields.append(("pages", str(len(self._pages))))
        if self.meta.author:
            fields.append(("author", self.meta.author))
        if self.meta.created_at:
            fields.append(
                ("created", self.meta.created_at.isoformat(timespec="seconds"))
            )

        lines: list[str] = []
        if fields:
            lines.append("| field | value |")
            lines.append("| --- | --- |")
            for name, value in fields:
                lines.append(
                    f"| {_escape_table_cell(name)} | {_escape_table_cell(value)} |"
                )
        # Warnings are variable-length; give them their own bullet list
        # so they never distort the table's two-column shape.
        if self.meta.warnings:
            if lines:
                lines.append("")
            lines.append("**Warnings:**")
            for w in self.meta.warnings:
                lines.append(f"- {w}")
        return "\n".join(lines)
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _human_size(n: int) -> str:
|
|
327
|
+
"""Render a byte count as ``1.2 MB`` / ``938.8 KB`` / ``420 B``."""
|
|
328
|
+
if n < 1024:
|
|
329
|
+
return f"{n} B"
|
|
330
|
+
size = float(n)
|
|
331
|
+
for unit in ("KB", "MB", "GB", "TB"):
|
|
332
|
+
size /= 1024
|
|
333
|
+
if size < 1024 or unit == "TB":
|
|
334
|
+
return f"{size:.1f} {unit}"
|
|
335
|
+
return f"{size:.1f} TB"
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _escape_table_cell(v: str) -> str:
|
|
339
|
+
"""Escape a value for safe inclusion in a single Markdown table cell.
|
|
340
|
+
|
|
341
|
+
Replaces ``|`` with ``\\|`` (table column separator) and collapses
|
|
342
|
+
embedded newlines to spaces so a rogue multi-line ``author`` or
|
|
343
|
+
``title`` value never breaks the table's single-row shape.
|
|
344
|
+
"""
|
|
345
|
+
return v.replace("|", "\\|").replace("\n", " ").replace("\r", " ")
|