fylepy 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fyle/__init__.py +46 -0
- fyle/_core/__init__.py +5 -0
- fyle/_core/api.py +164 -0
- fyle/_core/chunking.py +107 -0
- fyle/_core/document.py +345 -0
- fyle/_core/fetcher.py +68 -0
- fyle/_core/registry.py +107 -0
- fyle/_core/sniffer.py +251 -0
- fyle/_readers/__init__.py +32 -0
- fyle/_readers/_md_structure.py +208 -0
- fyle/_readers/_whisper.py +126 -0
- fyle/_readers/archive/__init__.py +8 -0
- fyle/_readers/archive/stdlib.py +513 -0
- fyle/_readers/audio/__init__.py +9 -0
- fyle/_readers/audio/faster_whisper.py +162 -0
- fyle/_readers/base.py +70 -0
- fyle/_readers/csv/__init__.py +6 -0
- fyle/_readers/csv/stdlib.py +119 -0
- fyle/_readers/docx/__init__.py +6 -0
- fyle/_readers/docx/mammoth.py +130 -0
- fyle/_readers/html/__init__.py +6 -0
- fyle/_readers/html/markdownify.py +113 -0
- fyle/_readers/image/__init__.py +18 -0
- fyle/_readers/image/stdlib.py +136 -0
- fyle/_readers/markdown/__init__.py +6 -0
- fyle/_readers/markdown/stdlib.py +61 -0
- fyle/_readers/pdf/__init__.py +2 -0
- fyle/_readers/pdf/pymupdf4llm.py +202 -0
- fyle/_readers/pptx/__init__.py +7 -0
- fyle/_readers/pptx/python_pptx.py +306 -0
- fyle/_readers/sqlite/__init__.py +8 -0
- fyle/_readers/sqlite/stdlib.py +366 -0
- fyle/_readers/text/__init__.py +7 -0
- fyle/_readers/text/stdlib.py +76 -0
- fyle/_readers/video/__init__.py +10 -0
- fyle/_readers/video/scenedetect.py +330 -0
- fyle/_readers/xlsx/__init__.py +6 -0
- fyle/_readers/xlsx/openpyxl.py +158 -0
- fyle/errors.py +42 -0
- fyle/sqlite.py +175 -0
- fylepy-0.1.0.dist-info/METADATA +272 -0
- fylepy-0.1.0.dist-info/RECORD +44 -0
- fylepy-0.1.0.dist-info/WHEEL +4 -0
- fylepy-0.1.0.dist-info/licenses/LICENSE +21 -0
fyle/_core/fetcher.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""URL fetcher — built on ``httpx`` with timeout and max_bytes safety limits.
|
|
2
|
+
|
|
3
|
+
Defaults: ``timeout=30s`` and ``max_bytes=100MB``. Override via environment
|
|
4
|
+
variables ``FYLE_HTTP_TIMEOUT`` and ``FYLE_HTTP_MAX_BYTES``.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from ..errors import DownloadError
|
|
12
|
+
|
|
13
|
+
DEFAULT_TIMEOUT: float = 30.0
|
|
14
|
+
DEFAULT_MAX_BYTES: int = 100 * 1024 * 1024 # 100 MB
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _env_float(name: str, default: float) -> float:
|
|
18
|
+
v = os.environ.get(name)
|
|
19
|
+
if not v:
|
|
20
|
+
return default
|
|
21
|
+
try:
|
|
22
|
+
return float(v)
|
|
23
|
+
except ValueError:
|
|
24
|
+
return default
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _env_int(name: str, default: int) -> int:
|
|
28
|
+
v = os.environ.get(name)
|
|
29
|
+
if not v:
|
|
30
|
+
return default
|
|
31
|
+
try:
|
|
32
|
+
return int(v)
|
|
33
|
+
except ValueError:
|
|
34
|
+
return default
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def fetch(url: str) -> tuple[bytes, Optional[str]]:
    """Fetch ``url`` and return ``(bytes, content_type)``.

    The request is a streamed ``GET`` that follows redirects. The timeout
    comes from ``FYLE_HTTP_TIMEOUT`` (default ``DEFAULT_TIMEOUT``) and the
    size cap from ``FYLE_HTTP_MAX_BYTES`` (default ``DEFAULT_MAX_BYTES``).

    Args:
        url: The URL to download.

    Returns:
        A tuple of the response body and the value of the ``content-type``
        response header (``None`` when the header is absent).

    Raises:
        DownloadError: If ``httpx`` is not installed, the URL is malformed,
            the request fails or returns an error status, or the body
            exceeds ``max_bytes``. ``httpx`` errors are chained as the cause.
    """
    try:
        import httpx
    except ImportError as e:  # pragma: no cover
        raise DownloadError("httpx is required for URL fetching: pip install httpx") from e

    timeout = _env_float("FYLE_HTTP_TIMEOUT", DEFAULT_TIMEOUT)
    max_bytes = _env_int("FYLE_HTTP_MAX_BYTES", DEFAULT_MAX_BYTES)

    try:
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            with client.stream("GET", url) as resp:
                resp.raise_for_status()
                content_type = resp.headers.get("content-type")
                buf = bytearray()
                for chunk in resp.iter_bytes():
                    # Check before appending so the buffer never grows past
                    # the limit.
                    if len(buf) + len(chunk) > max_bytes:
                        raise DownloadError(
                            f"Response exceeds max_bytes={max_bytes}. "
                            f"Override via FYLE_HTTP_MAX_BYTES env var."
                        )
                    buf.extend(chunk)
                return bytes(buf), content_type
    except DownloadError:
        raise
    # ``httpx.InvalidURL`` does not derive from ``httpx.HTTPError``; catch it
    # explicitly so malformed URLs honour the documented DownloadError contract.
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        raise DownloadError(f"Failed to fetch {url!r}: {e}") from e
|
fyle/_core/registry.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Reader registry — populated at import time, read-only afterwards.
|
|
2
|
+
|
|
3
|
+
Not a public extension point. ``_register`` is invoked only from
|
|
4
|
+
``_readers/base.py`` via ``__init_subclass__``; users must not register their own.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from ..errors import ReaderNotFoundError
|
|
11
|
+
|
|
12
|
+
# Reader name -> Reader class.
|
|
13
|
+
_BY_NAME: dict[str, type] = {}
|
|
14
|
+
# Format name -> list of Reader classes, preserving registration order.
|
|
15
|
+
_BY_FORMAT: dict[str, list[type]] = {}
|
|
16
|
+
# Format name -> default Reader class for that format.
|
|
17
|
+
_DEFAULTS: dict[str, type] = {}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _register(cls: type) -> None:
    """Record ``cls`` in the name, format and default-reader tables.

    Called by the Reader base class from ``__init_subclass__``.

    Raises:
        RuntimeError: If ``cls`` lacks a truthy ``name`` or ``formats``, if a
            different class already uses the same name, or if another class is
            already the default reader for one of the formats.
    """
    reader_name = getattr(cls, "name", None)
    reader_formats = getattr(cls, "formats", None)
    if not (reader_name and reader_formats):
        raise RuntimeError(
            f"Reader {cls.__name__} must define class attrs `name: str` and `formats: tuple[str, ...]`"
        )

    # Re-registering the very same class is allowed; a different class
    # claiming the same name is not.
    if _BY_NAME.get(reader_name, cls) is not cls:
        raise RuntimeError(f"Reader name conflict: {reader_name!r}")
    _BY_NAME[reader_name] = cls

    marks_default = bool(getattr(cls, "is_default", False))
    for fmt in reader_formats:
        readers = _BY_FORMAT.setdefault(fmt, [])
        if cls not in readers:
            readers.append(cls)

        if not marks_default:
            continue
        current = _DEFAULTS.get(fmt)
        if current is not None and current is not cls:
            raise RuntimeError(
                f"Multiple default readers for format {fmt!r}: "
                f"{current.name} and {cls.name}"
            )
        _DEFAULTS[fmt] = cls
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def validate() -> None:
    """Startup check: every format with readers must also have a default.

    Raises:
        RuntimeError: For the first format found in ``_BY_FORMAT`` that has
            no entry in ``_DEFAULTS``.
    """
    for fmt, readers in _BY_FORMAT.items():
        if fmt in _DEFAULTS:
            continue
        reader_names = [reader.name for reader in readers]
        raise RuntimeError(
            f"Format {fmt!r} has readers {reader_names} "
            f"but no default (is_default=True). Fix at startup."
        )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def resolve(fmt: str, name: Optional[str] = None) -> type:
    """Look up the Reader class for ``fmt``, optionally pinned to ``name``.

    Args:
        fmt: Format name to resolve.
        name: Reader name to use. ``None`` selects the default reader
            registered for ``fmt``.

    Returns:
        The matching Reader class.

    Raises:
        ReaderNotFoundError: If ``name`` is ``None`` and ``fmt`` has no
            default reader, if ``name`` is not registered, or if the named
            reader does not list ``fmt`` among its formats.
    """
    if name is not None:
        reader = _BY_NAME.get(name)
        if reader is None:
            raise ReaderNotFoundError(
                f"Reader {name!r} not found. Available: {sorted(_BY_NAME)}"
            )
        if fmt not in reader.formats:
            raise ReaderNotFoundError(
                f"Reader {name!r} does not support format {fmt!r} "
                f"(supports: {list(reader.formats)})"
            )
        return reader

    fallback = _DEFAULTS.get(fmt)
    if fallback is None:
        raise ReaderNotFoundError(
            f"No reader registered for format {fmt!r}. "
            f"Available: {sorted(_DEFAULTS)}"
        )
    return fallback
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def list_all() -> dict[str, list[str]]:
    """Return ``{fmt: [name, ...]}`` for every registered format.

    Within each list the default reader comes first with a ``*`` suffix; the
    remaining readers follow in registration order. Backs the public
    ``fyle.readers()`` helper.
    """
    listing: dict[str, list[str]] = {}
    for fmt, readers in _BY_FORMAT.items():
        preferred = _DEFAULTS.get(fmt)
        head = [] if preferred is None else [f"{preferred.name}*"]
        tail = [reader.name for reader in readers if reader is not preferred]
        listing[fmt] = head + tail
    return listing
|
fyle/_core/sniffer.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""Format sniffer — three-path detection: extension + magic bytes + HTTP Content-Type.
|
|
2
|
+
|
|
3
|
+
Used by ``fyle.open`` to pick the right reader for a given input.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional, Union
|
|
9
|
+
|
|
10
|
+
from ..errors import UnsupportedFormatError
|
|
11
|
+
|
|
12
|
+
# Plain-text-ish extensions. Everything here routes to the ``text`` format
|
|
13
|
+
# and is handled by the passthrough PlainTextReader. The list is intentionally
|
|
14
|
+
# broad: source code, structured data, config, logs and lightweight markup are
|
|
15
|
+
# all legitimate "feed this to an LLM" inputs for a file → LLM SDK.
|
|
16
|
+
#
|
|
17
|
+
# Excluded on purpose:
|
|
18
|
+
# - ``.md`` / ``.markdown`` / ``.html`` / ``.htm`` / ``.csv``: have dedicated
|
|
19
|
+
# readers with structural extraction.
|
|
20
|
+
# - Binary / office formats (``.pdf`` / ``.docx`` / ``.xlsx`` / images / audio):
|
|
21
|
+
# obviously not plain text.
|
|
22
|
+
_TEXT_EXTS: tuple[str, ...] = (
|
|
23
|
+
# Generic plaintext
|
|
24
|
+
".txt", ".text", ".readme",
|
|
25
|
+
# Python
|
|
26
|
+
".py", ".pyi", ".pyx", ".pyw",
|
|
27
|
+
# JavaScript / TypeScript / web frontend sources
|
|
28
|
+
".js", ".mjs", ".cjs", ".jsx", ".ts", ".tsx",
|
|
29
|
+
".vue", ".svelte", ".astro",
|
|
30
|
+
# Stylesheet sources (treated as plaintext — fyle is not a CSS parser)
|
|
31
|
+
".css", ".scss", ".sass", ".less", ".styl",
|
|
32
|
+
# JVM family
|
|
33
|
+
".java", ".kt", ".kts", ".scala", ".sc", ".groovy",
|
|
34
|
+
".clj", ".cljs", ".cljc",
|
|
35
|
+
# Systems / native
|
|
36
|
+
".c", ".h", ".cc", ".cpp", ".cxx", ".hpp", ".hh", ".hxx", ".inl",
|
|
37
|
+
".m", ".mm", ".rs", ".go", ".swift", ".zig", ".d", ".nim",
|
|
38
|
+
# .NET
|
|
39
|
+
".cs", ".fs", ".fsx", ".vb",
|
|
40
|
+
# Dynamic / scripting
|
|
41
|
+
".rb", ".php", ".pl", ".pm", ".lua", ".dart",
|
|
42
|
+
".r", ".jl", ".hs", ".ml", ".mli",
|
|
43
|
+
".ex", ".exs", ".erl", ".hrl",
|
|
44
|
+
".elm", ".purs", ".cr", ".rkt",
|
|
45
|
+
# Shell / batch
|
|
46
|
+
".sh", ".bash", ".zsh", ".fish", ".ksh",
|
|
47
|
+
".ps1", ".psm1", ".psd1", ".bat", ".cmd",
|
|
48
|
+
# Structured data
|
|
49
|
+
".json", ".jsonl", ".ndjson", ".json5",
|
|
50
|
+
".yaml", ".yml",
|
|
51
|
+
".toml",
|
|
52
|
+
".xml", ".plist", ".rss", ".atom", ".svg",
|
|
53
|
+
".tsv",
|
|
54
|
+
# Config / env
|
|
55
|
+
".ini", ".cfg", ".conf", ".properties", ".env",
|
|
56
|
+
".editorconfig", ".gitignore", ".gitattributes", ".dockerignore",
|
|
57
|
+
".npmrc", ".nvmrc", ".prettierrc", ".eslintrc", ".babelrc",
|
|
58
|
+
# Build / lock
|
|
59
|
+
".mk", ".cmake", ".gradle", ".sbt", ".bazel", ".bzl",
|
|
60
|
+
".lock",
|
|
61
|
+
# SQL / query
|
|
62
|
+
".sql", ".psql", ".cql", ".hql", ".sparql", ".graphql", ".gql",
|
|
63
|
+
# Lightweight markup (beyond Markdown / HTML which have their own readers)
|
|
64
|
+
".rst", ".adoc", ".asciidoc", ".tex", ".bib", ".org", ".textile",
|
|
65
|
+
# Templates / template-ish sources
|
|
66
|
+
".hbs", ".handlebars", ".mustache", ".njk", ".liquid",
|
|
67
|
+
".ejs", ".pug", ".jade", ".jinja", ".jinja2", ".j2", ".tmpl", ".tpl",
|
|
68
|
+
# IDLs / schemas
|
|
69
|
+
".proto", ".thrift", ".avsc", ".capnp", ".fbs", ".smithy",
|
|
70
|
+
# Diagrams / dev meta
|
|
71
|
+
".dot", ".mmd", ".puml", ".drawio",
|
|
72
|
+
# Logs / diffs / patches
|
|
73
|
+
".log", ".diff", ".patch",
|
|
74
|
+
# Misc
|
|
75
|
+
".resx",
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# File extension -> format name.
|
|
79
|
+
_EXT_MAP: dict[str, str] = {
|
|
80
|
+
".pdf": "pdf",
|
|
81
|
+
".docx": "docx",
|
|
82
|
+
".xlsx": "xlsx",
|
|
83
|
+
".pptx": "pptx",
|
|
84
|
+
".db": "sqlite",
|
|
85
|
+
".sqlite": "sqlite",
|
|
86
|
+
".sqlite3": "sqlite",
|
|
87
|
+
# Archive containers. The ``archive`` reader extracts to disk and
|
|
88
|
+
# reports a Markdown listing; it deliberately does not parse contents.
|
|
89
|
+
# Note: OOXML formats (.docx / .xlsx / .pptx) and SQLite databases are
|
|
90
|
+
# technically ZIP-based or have their own magic; they are handled by
|
|
91
|
+
# dedicated readers above and take precedence via extension.
|
|
92
|
+
".zip": "archive",
|
|
93
|
+
".tar": "archive",
|
|
94
|
+
".gz": "archive",
|
|
95
|
+
".tgz": "archive",
|
|
96
|
+
".bz2": "archive",
|
|
97
|
+
".tbz2": "archive",
|
|
98
|
+
".xz": "archive",
|
|
99
|
+
".txz": "archive",
|
|
100
|
+
".md": "markdown",
|
|
101
|
+
".markdown": "markdown",
|
|
102
|
+
".html": "html",
|
|
103
|
+
".htm": "html",
|
|
104
|
+
".csv": "csv",
|
|
105
|
+
".png": "image",
|
|
106
|
+
".jpg": "image",
|
|
107
|
+
".jpeg": "image",
|
|
108
|
+
".webp": "image",
|
|
109
|
+
".m4a": "audio",
|
|
110
|
+
".mp3": "audio",
|
|
111
|
+
".wav": "audio",
|
|
112
|
+
".mp4": "video",
|
|
113
|
+
".m4v": "video",
|
|
114
|
+
".mov": "video",
|
|
115
|
+
".avi": "video",
|
|
116
|
+
".mkv": "video",
|
|
117
|
+
".webm": "video",
|
|
118
|
+
**{ext: "text" for ext in _TEXT_EXTS},
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
# HTTP Content-Type -> format name.
|
|
122
|
+
_MIME_MAP: dict[str, str] = {
|
|
123
|
+
"application/pdf": "pdf",
|
|
124
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
|
125
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
|
126
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
|
|
127
|
+
"application/vnd.sqlite3": "sqlite",
|
|
128
|
+
"application/x-sqlite3": "sqlite",
|
|
129
|
+
# Archive MIME types → archive reader (extract + list).
|
|
130
|
+
"application/zip": "archive",
|
|
131
|
+
"application/x-zip-compressed": "archive",
|
|
132
|
+
"application/x-tar": "archive",
|
|
133
|
+
"application/gzip": "archive",
|
|
134
|
+
"application/x-gzip": "archive",
|
|
135
|
+
"application/x-bzip2": "archive",
|
|
136
|
+
"application/x-xz": "archive",
|
|
137
|
+
"text/markdown": "markdown",
|
|
138
|
+
"text/html": "html",
|
|
139
|
+
"application/xhtml+xml": "html",
|
|
140
|
+
"text/plain": "text",
|
|
141
|
+
"text/csv": "csv",
|
|
142
|
+
"application/csv": "csv",
|
|
143
|
+
# Structured text data — treat as plaintext for LLM consumption.
|
|
144
|
+
"application/json": "text",
|
|
145
|
+
"application/ld+json": "text",
|
|
146
|
+
"application/yaml": "text",
|
|
147
|
+
"application/x-yaml": "text",
|
|
148
|
+
"application/toml": "text",
|
|
149
|
+
"application/xml": "text",
|
|
150
|
+
"text/xml": "text",
|
|
151
|
+
"image/svg+xml": "text",
|
|
152
|
+
"application/javascript": "text",
|
|
153
|
+
"text/javascript": "text",
|
|
154
|
+
"application/typescript": "text",
|
|
155
|
+
"application/x-sh": "text",
|
|
156
|
+
"image/png": "image",
|
|
157
|
+
"image/jpeg": "image",
|
|
158
|
+
"image/webp": "image",
|
|
159
|
+
"audio/mp4": "audio",
|
|
160
|
+
"audio/mpeg": "audio",
|
|
161
|
+
"audio/wav": "audio",
|
|
162
|
+
"audio/x-wav": "audio",
|
|
163
|
+
"video/mp4": "video",
|
|
164
|
+
"video/quicktime": "video",
|
|
165
|
+
"video/x-msvideo": "video",
|
|
166
|
+
"video/x-matroska": "video",
|
|
167
|
+
"video/webm": "video",
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _sniff_magic(data: bytes) -> Optional[str]:
|
|
172
|
+
"""Detect format from magic bytes. Covers the main v1 formats."""
|
|
173
|
+
if len(data) == 0:
|
|
174
|
+
return None
|
|
175
|
+
if data.startswith(b"%PDF-"):
|
|
176
|
+
return "pdf"
|
|
177
|
+
# SQLite: the header is exactly "SQLite format 3\x00" (16 bytes).
|
|
178
|
+
# Extensions like ``.db`` are ambiguous in the wild, so magic-byte
|
|
179
|
+
# detection is the authoritative check.
|
|
180
|
+
if data.startswith(b"SQLite format 3\x00"):
|
|
181
|
+
return "sqlite"
|
|
182
|
+
# PNG
|
|
183
|
+
if data.startswith(b"\x89PNG\r\n\x1a\n"):
|
|
184
|
+
return "image"
|
|
185
|
+
# JPEG
|
|
186
|
+
if data.startswith(b"\xff\xd8\xff"):
|
|
187
|
+
return "image"
|
|
188
|
+
# WEBP: RIFF....WEBP
|
|
189
|
+
if data.startswith(b"RIFF") and len(data) >= 12 and data[8:12] == b"WEBP":
|
|
190
|
+
return "image"
|
|
191
|
+
# WAV: RIFF....WAVE
|
|
192
|
+
if data.startswith(b"RIFF") and len(data) >= 12 and data[8:12] == b"WAVE":
|
|
193
|
+
return "audio"
|
|
194
|
+
# MP3: ID3 tag or MPEG frame header.
|
|
195
|
+
if data.startswith(b"ID3"):
|
|
196
|
+
return "audio"
|
|
197
|
+
if len(data) >= 2 and data[0] == 0xFF and (data[1] & 0xE0) == 0xE0:
|
|
198
|
+
return "audio"
|
|
199
|
+
# HTML: common opening tags.
|
|
200
|
+
head = data[:256].lstrip().lower()
|
|
201
|
+
if head.startswith(b"<!doctype html") or head.startswith(b"<html"):
|
|
202
|
+
return "html"
|
|
203
|
+
# OOXML / ZIP containers need extension or Content-Type to disambiguate;
|
|
204
|
+
# return None so the caller falls back to the extension path.
|
|
205
|
+
return None
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _format_from_name(name: Union[str, Path]) -> Optional[str]:
    """Map a file name, path or URL to a format name via ``_EXT_MAP``."""
    text = str(name)
    if "://" in text:
        # URL: look only at the path so a query string or fragment
        # ("report.pdf?download=1") does not end up in the extension.
        from urllib.parse import urlsplit

        try:
            text = urlsplit(text).path
        except ValueError:
            pass  # malformed URL: fall back to the raw string
    filename = Path(text).name
    fmt = _EXT_MAP.get(Path(filename).suffix.lower())
    if fmt is None and filename.startswith("."):
        # pathlib reports an empty suffix for dotfiles (".gitignore",
        # ".env"), so look the whole file name up instead.
        fmt = _EXT_MAP.get(filename.lower())
    return fmt


def detect(
    src: Union[str, Path, bytes, bytearray],
    *,
    source_name: Optional[str] = None,
    content_type: Optional[str] = None,
) -> str:
    """Detect the format name.

    Detection priority:
      1. HTTP Content-Type (passed by the caller in URL mode). Parameters
         after ``;`` are ignored, and any unmapped ``text/*`` subtype is
         treated as ``text``.
      2. File extension from ``source_name`` or a string / ``Path`` ``src``.
         Dotfiles such as ``.gitignore`` are matched by their full name, and
         URL-shaped names are matched on their path component only.
      3. Magic bytes, when ``src`` is ``bytes`` or ``bytearray`` (only the
         first 512 bytes are inspected).

    Args:
        src: A path-like value or the raw payload.
        source_name: Optional name to take the extension from; takes
            precedence over ``src`` for the extension lookup.
        content_type: Optional HTTP ``Content-Type`` header value.

    Returns:
        The detected format name.

    Raises:
        UnsupportedFormatError: If all three paths fail.
    """
    fmt: Optional[str] = None

    # 1. Content-Type (preferred in URL mode).
    if content_type:
        mime = content_type.split(";", 1)[0].strip().lower()
        fmt = _MIME_MAP.get(mime)
        # Generic ``text/*`` fallback: any unrecognised ``text/*`` subtype
        # (e.g. ``text/x-python``, ``text/vnd.something``) routes to the
        # plaintext reader. Never downgrades a format we already mapped.
        if fmt is None and mime.startswith("text/"):
            fmt = "text"

    # 2. File extension.
    name = source_name
    if fmt is None and name is None and isinstance(src, (str, Path)):
        name = str(src)
    if fmt is None and name:
        fmt = _format_from_name(name)

    # 3. Magic bytes.
    if fmt is None and isinstance(src, (bytes, bytearray)):
        fmt = _sniff_magic(bytes(src[:512]))

    if fmt is None:
        raise UnsupportedFormatError(
            f"Cannot detect format (source_name={source_name!r}, content_type={content_type!r})"
        )
    return fmt
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Import every reader subpackage to trigger auto-registration.
|
|
2
|
+
|
|
3
|
+
Reader subclasses register themselves via ``__init_subclass__`` in
|
|
4
|
+
``base.py``, which only fires once the defining module is imported. This
|
|
5
|
+
file is therefore the single place that decides which readers are available
|
|
6
|
+
at runtime — add one ``from . import <subpkg>`` line per new reader subpackage.
|
|
7
|
+
|
|
8
|
+
File-name convention inside each subpackage: every reader implementation
|
|
9
|
+
file is named after its *core driver library* (for example ``mammoth.py``,
|
|
10
|
+
``markdownify.py``, ``openpyxl.py``, ``pymupdf4llm.py``, ``stdlib.py``).
|
|
11
|
+
Post-processors (e.g. ``tabulate``, ``beautifulsoup4``) do not determine
|
|
12
|
+
the file name. This keeps the door open for same-format alternative
|
|
13
|
+
implementations to co-exist under their own library names.
|
|
14
|
+
"""
|
|
15
|
+
# Batch 1 (v0.2): text family — text / markdown / csv.
|
|
16
|
+
# Batch 2 (v0.3): structured documents — docx / html / xlsx.
|
|
17
|
+
# Batch 3 (v0.4): pptx / image.
|
|
18
|
+
# Batch 4 (placeholder): audio / video — reserve the format slots; readers
|
|
19
|
+
# raise ``NotImplementedReaderError`` until concrete backends land.
|
|
20
|
+
from . import pdf # noqa: F401
|
|
21
|
+
from . import text # noqa: F401
|
|
22
|
+
from . import markdown # noqa: F401
|
|
23
|
+
from . import csv # noqa: F401
|
|
24
|
+
from . import docx # noqa: F401
|
|
25
|
+
from . import html # noqa: F401
|
|
26
|
+
from . import xlsx # noqa: F401
|
|
27
|
+
from . import pptx # noqa: F401
|
|
28
|
+
from . import image # noqa: F401
|
|
29
|
+
from . import sqlite # noqa: F401
|
|
30
|
+
from . import archive # noqa: F401
|
|
31
|
+
from . import audio # noqa: F401
|
|
32
|
+
from . import video # noqa: F401
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Markdown structural extraction shared by every reader whose ``Page.text``
|
|
2
|
+
is Markdown (currently: ``markdown``, ``docx``, ``html``).
|
|
3
|
+
|
|
4
|
+
The contract is deliberately narrow: given a Markdown string, return the
|
|
5
|
+
``Table`` and ``Image`` objects that appear in it. Page text itself is not
|
|
6
|
+
modified — this is *extraction only*, so the caller's passthrough / rendering
|
|
7
|
+
decisions remain untouched.
|
|
8
|
+
|
|
9
|
+
Design notes:
|
|
10
|
+
- Parsing is delegated to ``markdown-it-py`` (GFM-like, with the table
|
|
11
|
+
plugin enabled). We never regex-parse Markdown structure ourselves
|
|
12
|
+
(see design doc §12.0).
|
|
13
|
+
- HTML ``<img>`` tags embedded in the Markdown are picked up via
|
|
14
|
+
BeautifulSoup when ``include_html_img=True``. This matters in practice
|
|
15
|
+
because:
|
|
16
|
+
- README / docs frequently write logos and badges as ``<img>`` for
|
|
17
|
+
width / alignment control;
|
|
18
|
+
- ``markdownify`` (used by docx & html readers) preserves HTML fragments
|
|
19
|
+
it can't map to Markdown.
|
|
20
|
+
- Image ``data_url`` may be a ``data:image/...;base64,...`` URL (DOCX /
|
|
21
|
+
PDF / HTML inline images) or a plain ``http(s)://`` URL (Markdown
|
|
22
|
+
references). Both are valid per the ``Image`` contract.
|
|
23
|
+
- Every failure path is non-fatal: if markdown-it-py or bs4 fail we append
|
|
24
|
+
a warning and return whatever we managed to collect. The reader's main
|
|
25
|
+
job (producing ``Page.text``) should never be blocked by optional
|
|
26
|
+
structural extraction.
|
|
27
|
+
"""
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
from typing import Optional
|
|
31
|
+
|
|
32
|
+
from .._core.document import Image, Table
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def extract_tables(
    md_text: str,
    *,
    page: int = 1,
    warnings: Optional[list[str]] = None,
) -> list[Table]:
    """Extract GFM pipe tables from Markdown.

    Each ``Table.text`` is the verbatim source slice covered by the table
    token's ``map`` (empty when the token has no map), not a re-render;
    ``Table.rows`` and ``Table.headers`` hold the cell strings collected by
    ``_walk_table``.

    Args:
        md_text: Markdown source to scan.
        page: Page number stamped on every returned table.
        warnings: Optional list that receives a message when markdown-it-py
            is missing or parsing fails.

    Returns:
        The tables in document order; an empty list on any failure.
    """
    notes = warnings if warnings is not None else []

    try:
        from markdown_it import MarkdownIt
    except ImportError:
        notes.append("markdown-it-py not installed; skipping table extraction")
        return []

    try:
        tokens = MarkdownIt().enable("table").parse(md_text)
    except Exception as exc:
        notes.append(f"markdown parse failed; tables not extracted: {exc}")
        return []

    source_lines = md_text.splitlines(keepends=True)
    found: list[Table] = []
    resume_at = 0  # first token index past the most recently consumed table
    for idx, token in enumerate(tokens):
        if idx < resume_at or token.type != "table_open":
            continue
        headers, rows, resume_at = _walk_table(tokens, idx)
        verbatim = ""
        if token.map:
            first, stop = token.map  # half-open line range
            verbatim = "".join(source_lines[first:stop]).rstrip("\n")
        found.append(Table(text=verbatim, rows=rows, headers=headers, page=page))
    return found
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _walk_table(tokens, start_idx: int) -> tuple[list[str], list[list[str]], int]:
|
|
85
|
+
"""Collect ``(headers, body_rows, index_after_table_close)`` from tokens.
|
|
86
|
+
|
|
87
|
+
markdown-it-py table token shape::
|
|
88
|
+
|
|
89
|
+
table_open
|
|
90
|
+
thead_open
|
|
91
|
+
tr_open
|
|
92
|
+
th_open, inline (cell), th_close ...
|
|
93
|
+
tr_close
|
|
94
|
+
thead_close
|
|
95
|
+
tbody_open
|
|
96
|
+
tr_open
|
|
97
|
+
td_open, inline (cell), td_close ...
|
|
98
|
+
tr_close
|
|
99
|
+
...
|
|
100
|
+
tbody_close
|
|
101
|
+
table_close
|
|
102
|
+
"""
|
|
103
|
+
headers: list[str] = []
|
|
104
|
+
rows: list[list[str]] = []
|
|
105
|
+
current_row: list[str] = []
|
|
106
|
+
in_thead = False
|
|
107
|
+
|
|
108
|
+
i = start_idx + 1
|
|
109
|
+
while i < len(tokens):
|
|
110
|
+
tok = tokens[i]
|
|
111
|
+
t = tok.type
|
|
112
|
+
if t == "table_close":
|
|
113
|
+
return headers, rows, i + 1
|
|
114
|
+
if t == "thead_open":
|
|
115
|
+
in_thead = True
|
|
116
|
+
elif t == "thead_close":
|
|
117
|
+
in_thead = False
|
|
118
|
+
elif t == "tr_open":
|
|
119
|
+
current_row = []
|
|
120
|
+
elif t == "tr_close":
|
|
121
|
+
if in_thead:
|
|
122
|
+
headers = current_row
|
|
123
|
+
else:
|
|
124
|
+
rows.append(current_row)
|
|
125
|
+
elif t == "inline":
|
|
126
|
+
current_row.append(tok.content)
|
|
127
|
+
i += 1
|
|
128
|
+
return headers, rows, i
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def extract_images(
    md_text: str,
    *,
    page: int = 1,
    warnings: Optional[list[str]] = None,
    include_html_img: bool = True,
) -> list[Image]:
    """Extract image references from Markdown.

    Two sources are consulted and the results are concatenated:

    1. Native Markdown ``![alt](src)`` image tokens via markdown-it-py.
    2. HTML ``<img src="..." alt="...">`` tags via BeautifulSoup
       (only when ``include_html_img=True``).

    ``data_url`` carries whatever URL appeared in the source, and ``data`` is
    always empty bytes: nothing is fetched or decoded here. Images without a
    source URL are skipped; an empty alt text becomes ``caption=None``.

    Args:
        md_text: Markdown source to scan.
        page: Page number stamped on every returned image.
        warnings: Optional list that receives a message when a parser is
            missing or raises.
        include_html_img: Whether to also scan for HTML ``<img>`` tags.

    Returns:
        Markdown-native images first, then HTML ``<img>`` images. Whatever
        was collected before a failure is still returned.
    """
    notes = warnings if warnings is not None else []
    found: list[Image] = []

    # 1. Markdown native images.
    try:
        from markdown_it import MarkdownIt

        for token in MarkdownIt().enable("table").parse(md_text):
            for node in getattr(token, "children", None) or []:
                if node.type != "image":
                    continue
                attrs = getattr(node, "attrs", None)
                url = (attrs.get("src") or "") if attrs else ""
                caption = (node.content or "").strip()
                if not url:
                    continue
                found.append(
                    Image(data_url=url, data=b"", caption=caption or None, page=page)
                )
    except ImportError:
        notes.append(
            "markdown-it-py not installed; skipping Markdown image extraction"
        )
    except Exception as exc:
        notes.append(f"Markdown image extraction failed: {exc}")

    if not include_html_img:
        return found

    # 2. HTML <img> tags mixed into the Markdown (common for badges / logos).
    try:
        from bs4 import BeautifulSoup

        for tag in BeautifulSoup(md_text, "html.parser").find_all("img"):
            url = (tag.get("src") or "").strip()
            if url:
                caption = (tag.get("alt") or "").strip()
                found.append(
                    Image(data_url=url, data=b"", caption=caption or None, page=page)
                )
    except ImportError:
        notes.append(
            "beautifulsoup4 not installed; skipping HTML <img> extraction"
        )
    except Exception as exc:
        notes.append(f"HTML <img> extraction failed: {exc}")

    return found
|