fylepy 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. fyle/__init__.py +46 -0
  2. fyle/_core/__init__.py +5 -0
  3. fyle/_core/api.py +164 -0
  4. fyle/_core/chunking.py +107 -0
  5. fyle/_core/document.py +345 -0
  6. fyle/_core/fetcher.py +68 -0
  7. fyle/_core/registry.py +107 -0
  8. fyle/_core/sniffer.py +251 -0
  9. fyle/_readers/__init__.py +32 -0
  10. fyle/_readers/_md_structure.py +208 -0
  11. fyle/_readers/_whisper.py +126 -0
  12. fyle/_readers/archive/__init__.py +8 -0
  13. fyle/_readers/archive/stdlib.py +513 -0
  14. fyle/_readers/audio/__init__.py +9 -0
  15. fyle/_readers/audio/faster_whisper.py +162 -0
  16. fyle/_readers/base.py +70 -0
  17. fyle/_readers/csv/__init__.py +6 -0
  18. fyle/_readers/csv/stdlib.py +119 -0
  19. fyle/_readers/docx/__init__.py +6 -0
  20. fyle/_readers/docx/mammoth.py +130 -0
  21. fyle/_readers/html/__init__.py +6 -0
  22. fyle/_readers/html/markdownify.py +113 -0
  23. fyle/_readers/image/__init__.py +18 -0
  24. fyle/_readers/image/stdlib.py +136 -0
  25. fyle/_readers/markdown/__init__.py +6 -0
  26. fyle/_readers/markdown/stdlib.py +61 -0
  27. fyle/_readers/pdf/__init__.py +2 -0
  28. fyle/_readers/pdf/pymupdf4llm.py +202 -0
  29. fyle/_readers/pptx/__init__.py +7 -0
  30. fyle/_readers/pptx/python_pptx.py +306 -0
  31. fyle/_readers/sqlite/__init__.py +8 -0
  32. fyle/_readers/sqlite/stdlib.py +366 -0
  33. fyle/_readers/text/__init__.py +7 -0
  34. fyle/_readers/text/stdlib.py +76 -0
  35. fyle/_readers/video/__init__.py +10 -0
  36. fyle/_readers/video/scenedetect.py +330 -0
  37. fyle/_readers/xlsx/__init__.py +6 -0
  38. fyle/_readers/xlsx/openpyxl.py +158 -0
  39. fyle/errors.py +42 -0
  40. fyle/sqlite.py +175 -0
  41. fylepy-0.1.0.dist-info/METADATA +272 -0
  42. fylepy-0.1.0.dist-info/RECORD +44 -0
  43. fylepy-0.1.0.dist-info/WHEEL +4 -0
  44. fylepy-0.1.0.dist-info/licenses/LICENSE +21 -0
fyle/_core/fetcher.py ADDED
@@ -0,0 +1,68 @@
1
+ """URL fetcher — built on ``httpx`` with timeout and max_bytes safety limits.
2
+
3
+ Defaults: ``timeout=30s`` and ``max_bytes=100MB``. Override via environment
4
+ variables ``FYLE_HTTP_TIMEOUT`` and ``FYLE_HTTP_MAX_BYTES``.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import os
9
+ from typing import Optional
10
+
11
+ from ..errors import DownloadError
12
+
13
# Request timeout in seconds used by ``fetch`` when ``FYLE_HTTP_TIMEOUT`` is
# unset or cannot be parsed.
DEFAULT_TIMEOUT: float = 30.0
# Upper bound on the downloaded body size in bytes used by ``fetch`` when
# ``FYLE_HTTP_MAX_BYTES`` is unset or cannot be parsed.
DEFAULT_MAX_BYTES: int = 100 * 1024 * 1024  # 100 MB
15
+
16
+
17
+ def _env_float(name: str, default: float) -> float:
18
+ v = os.environ.get(name)
19
+ if not v:
20
+ return default
21
+ try:
22
+ return float(v)
23
+ except ValueError:
24
+ return default
25
+
26
+
27
+ def _env_int(name: str, default: int) -> int:
28
+ v = os.environ.get(name)
29
+ if not v:
30
+ return default
31
+ try:
32
+ return int(v)
33
+ except ValueError:
34
+ return default
35
+
36
+
37
def fetch(url: str) -> tuple[bytes, Optional[str]]:
    """Fetch ``url`` and return ``(bytes, content_type)``.

    The body is streamed into memory and the download is aborted as soon as
    the accumulated size exceeds the limit. The timeout and the size limit
    come from ``FYLE_HTTP_TIMEOUT`` / ``FYLE_HTTP_MAX_BYTES`` and fall back to
    ``DEFAULT_TIMEOUT`` / ``DEFAULT_MAX_BYTES``. Redirects are followed.

    Returns:
        The raw body and the ``Content-Type`` response header (``None`` when
        the server did not send one).

    Raises:
        DownloadError: on a malformed URL, a timeout or other transport
            error, a 4xx/5xx status, or a body larger than ``max_bytes``.
            The underlying ``httpx`` error, when there is one, is chained as
            ``__cause__``.
    """
    try:
        import httpx
    except ImportError as e:  # pragma: no cover
        raise DownloadError("httpx is required for URL fetching: pip install httpx") from e

    timeout = _env_float("FYLE_HTTP_TIMEOUT", DEFAULT_TIMEOUT)
    max_bytes = _env_int("FYLE_HTTP_MAX_BYTES", DEFAULT_MAX_BYTES)

    try:
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            with client.stream("GET", url) as resp:
                resp.raise_for_status()
                content_type = resp.headers.get("content-type")
                buf = bytearray()
                for chunk in resp.iter_bytes():
                    buf.extend(chunk)
                    # Checked per chunk so an oversized body is abandoned
                    # early instead of being downloaded in full.
                    if len(buf) > max_bytes:
                        raise DownloadError(
                            f"Response exceeds max_bytes={max_bytes}. "
                            f"Override via FYLE_HTTP_MAX_BYTES env var."
                        )
                return bytes(buf), content_type
    except DownloadError:
        # Our own size-limit error: propagate untouched.
        raise
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # ``InvalidURL`` is not part of the ``HTTPError`` hierarchy, so it has
        # to be listed explicitly for malformed URLs to be wrapped as well.
        raise DownloadError(f"Failed to fetch {url!r}: {e}") from e
fyle/_core/registry.py ADDED
@@ -0,0 +1,107 @@
1
+ """Reader registry — populated at build time, read-only at runtime.
2
+
3
+ Not a public extension point. ``_register`` is invoked only from
4
+ ``_readers/base.py`` via ``__init_subclass__``; users must not register their own.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from typing import Optional
9
+
10
+ from ..errors import ReaderNotFoundError
11
+
12
+ # Reader name -> Reader class.
13
+ _BY_NAME: dict[str, type] = {}
14
+ # Format name -> list of Reader classes, preserving registration order.
15
+ _BY_FORMAT: dict[str, list[type]] = {}
16
+ # Format name -> default Reader class for that format.
17
+ _DEFAULTS: dict[str, type] = {}
18
+
19
+
20
+ def _register(cls: type) -> None:
21
+ """Invoked by the Reader base class from ``__init_subclass__``."""
22
+ name = getattr(cls, "name", None)
23
+ formats = getattr(cls, "formats", None)
24
+ if not name or not formats:
25
+ raise RuntimeError(
26
+ f"Reader {cls.__name__} must define class attrs `name: str` and `formats: tuple[str, ...]`"
27
+ )
28
+ if name in _BY_NAME and _BY_NAME[name] is not cls:
29
+ raise RuntimeError(f"Reader name conflict: {name!r}")
30
+ _BY_NAME[name] = cls
31
+
32
+ is_default = bool(getattr(cls, "is_default", False))
33
+ for fmt in formats:
34
+ bucket = _BY_FORMAT.setdefault(fmt, [])
35
+ if cls not in bucket:
36
+ bucket.append(cls)
37
+ if is_default:
38
+ existing = _DEFAULTS.get(fmt)
39
+ if existing is not None and existing is not cls:
40
+ raise RuntimeError(
41
+ f"Multiple default readers for format {fmt!r}: "
42
+ f"{existing.name} and {cls.name}"
43
+ )
44
+ _DEFAULTS[fmt] = cls
45
+
46
+
47
def validate() -> None:
    """Startup check: every format that has readers must also have a default.

    Walks the registered formats in registration order and fails fast on the
    first one with no reader marked ``is_default=True``.

    Raises:
        RuntimeError: naming the offending format and the readers registered
            for it.
    """
    for fmt in _BY_FORMAT:
        if fmt in _DEFAULTS:
            continue
        reader_names = [reader.name for reader in _BY_FORMAT[fmt]]
        raise RuntimeError(
            f"Format {fmt!r} has readers {reader_names} "
            f"but no default (is_default=True). Fix at startup."
        )
58
+
59
+
60
def resolve(fmt: str, name: Optional[str] = None) -> type:
    """Look up the Reader class for ``fmt``, optionally pinned to a reader name.

    With ``name=None`` the default reader for ``fmt`` is returned. With an
    explicit ``name`` that reader is returned, provided it declares support
    for ``fmt``.

    Raises:
        ReaderNotFoundError: if ``fmt`` has no default reader, if ``name`` is
            not registered, or if the named reader does not list ``fmt`` in
            its ``formats``.
    """
    if name is not None:
        # Explicit reader requested: it must exist and must handle ``fmt``.
        chosen = _BY_NAME.get(name)
        if chosen is None:
            raise ReaderNotFoundError(
                f"Reader {name!r} not found. Available: {sorted(_BY_NAME)}"
            )
        if fmt not in chosen.formats:
            raise ReaderNotFoundError(
                f"Reader {name!r} does not support format {fmt!r} "
                f"(supports: {list(chosen.formats)})"
            )
        return chosen

    fallback = _DEFAULTS.get(fmt)
    if fallback is None:
        raise ReaderNotFoundError(
            f"No reader registered for format {fmt!r}. "
            f"Available: {sorted(_DEFAULTS)}"
        )
    return fallback
88
+
89
+
90
def list_all() -> dict[str, list[str]]:
    """Return ``{fmt: [name, ...]}`` for every registered format.

    Within each list the default reader comes first with a ``*`` suffix; the
    remaining readers follow in registration order. Backs the public
    ``fyle.readers()`` helper.
    """
    listing: dict[str, list[str]] = {}
    for fmt, readers in _BY_FORMAT.items():
        default_cls = _DEFAULTS.get(fmt)
        starred = [] if default_cls is None else [f"{default_cls.name}*"]
        others = [reader.name for reader in readers if reader is not default_cls]
        listing[fmt] = starred + others
    return listing
fyle/_core/sniffer.py ADDED
@@ -0,0 +1,251 @@
1
+ """Format sniffer — three-path detection: extension + magic bytes + HTTP Content-Type.
2
+
3
+ Used by ``fyle.open`` to pick the right reader for a given input.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from pathlib import Path
8
+ from typing import Optional, Union
9
+
10
+ from ..errors import UnsupportedFormatError
11
+
12
# Plain-text-ish extensions. Every entry is mapped to the ``text`` format by
# ``_EXT_MAP`` (handled by the passthrough PlainTextReader). The list is
# intentionally broad: source code, structured data, config, logs and
# lightweight markup are all legitimate "feed this to an LLM" inputs for a
# file → LLM SDK.
#
# Excluded on purpose:
# - ``.md`` / ``.markdown`` / ``.html`` / ``.htm`` / ``.csv``: have dedicated
#   readers with structural extraction.
# - Binary / office formats (``.pdf`` / ``.docx`` / ``.xlsx`` / images / audio):
#   obviously not plain text.
#
# All entries are lowercase and include the leading dot. Entries such as
# ``.gitignore`` or ``.env`` are whole file names rather than suffixes:
# ``pathlib`` reports an empty ``suffix`` for a name that consists only of a
# leading-dot component, so lookup code has to handle those names explicitly.
_TEXT_EXTS: tuple[str, ...] = (
    # Generic plaintext
    ".txt", ".text", ".readme",
    # Python
    ".py", ".pyi", ".pyx", ".pyw",
    # JavaScript / TypeScript / web frontend sources
    ".js", ".mjs", ".cjs", ".jsx", ".ts", ".tsx",
    ".vue", ".svelte", ".astro",
    # Stylesheet sources (treated as plaintext — fyle is not a CSS parser)
    ".css", ".scss", ".sass", ".less", ".styl",
    # JVM family
    ".java", ".kt", ".kts", ".scala", ".sc", ".groovy",
    ".clj", ".cljs", ".cljc",
    # Systems / native
    ".c", ".h", ".cc", ".cpp", ".cxx", ".hpp", ".hh", ".hxx", ".inl",
    ".m", ".mm", ".rs", ".go", ".swift", ".zig", ".d", ".nim",
    # .NET
    ".cs", ".fs", ".fsx", ".vb",
    # Dynamic / scripting
    ".rb", ".php", ".pl", ".pm", ".lua", ".dart",
    ".r", ".jl", ".hs", ".ml", ".mli",
    ".ex", ".exs", ".erl", ".hrl",
    ".elm", ".purs", ".cr", ".rkt",
    # Shell / batch
    ".sh", ".bash", ".zsh", ".fish", ".ksh",
    ".ps1", ".psm1", ".psd1", ".bat", ".cmd",
    # Structured data (``.tsv`` stays plaintext; only ``.csv`` has a reader)
    ".json", ".jsonl", ".ndjson", ".json5",
    ".yaml", ".yml",
    ".toml",
    ".xml", ".plist", ".rss", ".atom", ".svg",
    ".tsv",
    # Config / env
    ".ini", ".cfg", ".conf", ".properties", ".env",
    ".editorconfig", ".gitignore", ".gitattributes", ".dockerignore",
    ".npmrc", ".nvmrc", ".prettierrc", ".eslintrc", ".babelrc",
    # Build / lock
    ".mk", ".cmake", ".gradle", ".sbt", ".bazel", ".bzl",
    ".lock",
    # SQL / query
    ".sql", ".psql", ".cql", ".hql", ".sparql", ".graphql", ".gql",
    # Lightweight markup (beyond Markdown / HTML which have their own readers)
    ".rst", ".adoc", ".asciidoc", ".tex", ".bib", ".org", ".textile",
    # Templates / template-ish sources
    ".hbs", ".handlebars", ".mustache", ".njk", ".liquid",
    ".ejs", ".pug", ".jade", ".jinja", ".jinja2", ".j2", ".tmpl", ".tpl",
    # IDLs / schemas
    ".proto", ".thrift", ".avsc", ".capnp", ".fbs", ".smithy",
    # Diagrams / dev meta
    ".dot", ".mmd", ".puml", ".drawio",
    # Logs / diffs / patches
    ".log", ".diff", ".patch",
    # Misc
    ".resx",
)
77
+
78
# File extension -> format name. Keys are lowercase and include the leading
# dot; ``detect`` lowercases the suffix before looking it up here.
_EXT_MAP: dict[str, str] = {
    ".pdf": "pdf",
    ".docx": "docx",
    ".xlsx": "xlsx",
    ".pptx": "pptx",
    ".db": "sqlite",
    ".sqlite": "sqlite",
    ".sqlite3": "sqlite",
    # Archive containers. The ``archive`` reader extracts to disk and
    # reports a Markdown listing; it deliberately does not parse contents.
    # Note: OOXML files (.docx / .xlsx / .pptx) are themselves ZIP containers
    # and SQLite databases carry their own magic header; both have dedicated
    # entries above, so their extension routes them to the dedicated reader
    # rather than to ``archive``.
    ".zip": "archive",
    ".tar": "archive",
    ".gz": "archive",
    ".tgz": "archive",
    ".bz2": "archive",
    ".tbz2": "archive",
    ".xz": "archive",
    ".txz": "archive",
    ".md": "markdown",
    ".markdown": "markdown",
    ".html": "html",
    ".htm": "html",
    ".csv": "csv",
    ".png": "image",
    ".jpg": "image",
    ".jpeg": "image",
    ".webp": "image",
    ".m4a": "audio",
    ".mp3": "audio",
    ".wav": "audio",
    ".mp4": "video",
    ".m4v": "video",
    ".mov": "video",
    ".avi": "video",
    ".mkv": "video",
    ".webm": "video",
    # Expanded last: if an extension were ever listed both above and in
    # ``_TEXT_EXTS``, this ``text`` mapping would win (later keys override).
    **{ext: "text" for ext in _TEXT_EXTS},
}
120
+
121
# HTTP Content-Type -> format name. Keys are bare, lowercase media types:
# ``detect`` drops any ``;``-separated parameters (such as ``charset``) and
# lowercases the value before looking it up. ``text/*`` subtypes that are not
# listed here are routed to ``text`` by ``detect`` itself.
_MIME_MAP: dict[str, str] = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
    "application/vnd.sqlite3": "sqlite",
    "application/x-sqlite3": "sqlite",
    # Archive MIME types → archive reader (extract + list).
    "application/zip": "archive",
    "application/x-zip-compressed": "archive",
    "application/x-tar": "archive",
    "application/gzip": "archive",
    "application/x-gzip": "archive",
    "application/x-bzip2": "archive",
    "application/x-xz": "archive",
    "text/markdown": "markdown",
    "text/html": "html",
    "application/xhtml+xml": "html",
    "text/plain": "text",
    "text/csv": "csv",
    "application/csv": "csv",
    # Structured text data — treat as plaintext for LLM consumption.
    # (SVG is XML, so it is routed to ``text`` rather than ``image``.)
    "application/json": "text",
    "application/ld+json": "text",
    "application/yaml": "text",
    "application/x-yaml": "text",
    "application/toml": "text",
    "application/xml": "text",
    "text/xml": "text",
    "image/svg+xml": "text",
    "application/javascript": "text",
    "text/javascript": "text",
    "application/typescript": "text",
    "application/x-sh": "text",
    "image/png": "image",
    "image/jpeg": "image",
    "image/webp": "image",
    "audio/mp4": "audio",
    "audio/mpeg": "audio",
    "audio/wav": "audio",
    "audio/x-wav": "audio",
    "video/mp4": "video",
    "video/quicktime": "video",
    "video/x-msvideo": "video",
    "video/x-matroska": "video",
    "video/webm": "video",
}
169
+
170
+
171
+ def _sniff_magic(data: bytes) -> Optional[str]:
172
+ """Detect format from magic bytes. Covers the main v1 formats."""
173
+ if len(data) == 0:
174
+ return None
175
+ if data.startswith(b"%PDF-"):
176
+ return "pdf"
177
+ # SQLite: the header is exactly "SQLite format 3\x00" (16 bytes).
178
+ # Extensions like ``.db`` are ambiguous in the wild, so magic-byte
179
+ # detection is the authoritative check.
180
+ if data.startswith(b"SQLite format 3\x00"):
181
+ return "sqlite"
182
+ # PNG
183
+ if data.startswith(b"\x89PNG\r\n\x1a\n"):
184
+ return "image"
185
+ # JPEG
186
+ if data.startswith(b"\xff\xd8\xff"):
187
+ return "image"
188
+ # WEBP: RIFF....WEBP
189
+ if data.startswith(b"RIFF") and len(data) >= 12 and data[8:12] == b"WEBP":
190
+ return "image"
191
+ # WAV: RIFF....WAVE
192
+ if data.startswith(b"RIFF") and len(data) >= 12 and data[8:12] == b"WAVE":
193
+ return "audio"
194
+ # MP3: ID3 tag or MPEG frame header.
195
+ if data.startswith(b"ID3"):
196
+ return "audio"
197
+ if len(data) >= 2 and data[0] == 0xFF and (data[1] & 0xE0) == 0xE0:
198
+ return "audio"
199
+ # HTML: common opening tags.
200
+ head = data[:256].lstrip().lower()
201
+ if head.startswith(b"<!doctype html") or head.startswith(b"<html"):
202
+ return "html"
203
+ # OOXML / ZIP containers need extension or Content-Type to disambiguate;
204
+ # return None so the caller falls back to the extension path.
205
+ return None
206
+
207
+
208
def detect(
    src: Union[str, Path, bytes, bytearray],
    *,
    source_name: Optional[str] = None,
    content_type: Optional[str] = None,
) -> str:
    """Detect the format name.

    Detection priority (the first path that yields a format wins):
      1. HTTP Content-Type (passed by the caller in URL mode). Parameters
         such as ``charset`` are ignored; any unmapped ``text/*`` subtype is
         treated as ``text``.
      2. File extension from ``source_name`` or, when that is ``None``, from a
         string/``Path``-valued ``src``. Dotfiles such as ``.gitignore`` or
         ``.env`` are matched by their whole name.
      3. Magic bytes, when ``src`` is ``bytes`` / ``bytearray`` (only the
         first 512 bytes are inspected).

    Returns:
        The format name (for example ``"pdf"`` or ``"text"``).

    Raises:
        UnsupportedFormatError: if all three paths fail.
    """
    fmt: Optional[str] = None

    # 1. Content-Type (preferred in URL mode).
    if content_type:
        mime = content_type.split(";", 1)[0].strip().lower()
        fmt = _MIME_MAP.get(mime)
        # Generic ``text/*`` fallback: any unrecognised ``text/*`` subtype
        # (e.g. ``text/x-python``, ``text/vnd.something``) routes to the
        # plaintext reader. Never downgrades a format we already mapped.
        if fmt is None and mime.startswith("text/"):
            fmt = "text"

    # 2. File extension.
    name = source_name
    if fmt is None and name is None and isinstance(src, (str, Path)):
        name = str(src)
    if fmt is None and name:
        leaf = Path(name)
        ext = leaf.suffix.lower()
        # ``Path(".gitignore").suffix`` is ``""``: a name made of a single
        # leading-dot component has no suffix. Use the whole name instead so
        # the dotfile entries of the extension table can match.
        if not ext and leaf.name.startswith("."):
            ext = leaf.name.lower()
        fmt = _EXT_MAP.get(ext)

    # 3. Magic bytes.
    if fmt is None and isinstance(src, (bytes, bytearray)):
        fmt = _sniff_magic(bytes(src[:512]))

    if fmt is None:
        raise UnsupportedFormatError(
            f"Cannot detect format (source_name={source_name!r}, content_type={content_type!r})"
        )
    return fmt
@@ -0,0 +1,32 @@
1
+ """Import every reader subpackage to trigger auto-registration.
2
+
3
+ Reader subclasses register themselves via ``__init_subclass__`` in
4
+ ``base.py``, which only fires once the defining module is imported. This
5
+ file is therefore the single place that decides which readers are available
6
+ at runtime — add one ``from . import <subpkg>`` line per new reader subpackage.
7
+
8
+ File-name convention inside each subpackage: every reader implementation
9
+ file is named after its *core driver library* (for example ``mammoth.py``,
10
+ ``markdownify.py``, ``openpyxl.py``, ``pymupdf4llm.py``, ``stdlib.py``).
11
+ Post-processors (e.g. ``tabulate``, ``beautifulsoup4``) do not determine
12
+ the file name. This keeps the door open for same-format alternative
13
+ implementations to co-exist under their own library names.
14
+ """
15
+ # Batch 1 (v0.2): text family — text / markdown / csv.
16
+ # Batch 2 (v0.3): structured documents — docx / html / xlsx.
17
+ # Batch 3 (v0.4): pptx / image.
18
+ # Batch 4 (placeholder): audio / video — reserve the format slots; readers
19
+ # raise ``NotImplementedReaderError`` until concrete backends land.
20
+ from . import pdf # noqa: F401
21
+ from . import text # noqa: F401
22
+ from . import markdown # noqa: F401
23
+ from . import csv # noqa: F401
24
+ from . import docx # noqa: F401
25
+ from . import html # noqa: F401
26
+ from . import xlsx # noqa: F401
27
+ from . import pptx # noqa: F401
28
+ from . import image # noqa: F401
29
+ from . import sqlite # noqa: F401
30
+ from . import archive # noqa: F401
31
+ from . import audio # noqa: F401
32
+ from . import video # noqa: F401
@@ -0,0 +1,208 @@
1
+ """Markdown structural extraction shared by every reader whose ``Page.text``
2
+ is Markdown (currently: ``markdown``, ``docx``, ``html``).
3
+
4
+ The contract is deliberately narrow: given a Markdown string, return the
5
+ ``Table`` and ``Image`` objects that appear in it. Page text itself is not
6
+ modified — this is *extraction only*, so the caller's passthrough / rendering
7
+ decisions remain untouched.
8
+
9
+ Design notes:
10
+ - Parsing is delegated to ``markdown-it-py`` (GFM-like, with the table
11
+ plugin enabled). We never regex-parse Markdown structure ourselves
12
+ (see design doc §12.0).
13
+ - HTML ``<img>`` tags embedded in the Markdown are picked up via
14
+ BeautifulSoup when ``include_html_img=True``. This matters in practice
15
+ because:
16
+ - README / docs frequently write logos and badges as ``<img>`` for
17
+ width / alignment control;
18
+ - ``markdownify`` (used by docx & html readers) preserves HTML fragments
19
+ it can't map to Markdown.
20
+ - Image ``data_url`` may be a ``data:image/...;base64,...`` URL (DOCX /
21
+ PDF / HTML inline images) or a plain ``http(s)://`` URL (Markdown
22
+ references). Both are valid per the ``Image`` contract.
23
+ - Every failure path is non-fatal: if markdown-it-py or bs4 fail we append
24
+ a warning and return whatever we managed to collect. The reader's main
25
+ job (producing ``Page.text``) should never be blocked by optional
26
+ structural extraction.
27
+ """
28
+ from __future__ import annotations
29
+
30
+ from typing import Optional
31
+
32
+ from .._core.document import Image, Table
33
+
34
+
35
def _split_markdown_lines(md_text: str) -> list[str]:
    """Split ``md_text`` into lines the way markdown-it-py counts them.

    markdown-it-py treats only ``\\n``, ``\\r\\n`` and ``\\r`` as line breaks,
    whereas ``str.splitlines`` also breaks on form feed, ``\\x85``, U+2028 and
    similar characters. Line terminators are kept on each returned line.
    """
    import re

    # Zero-width split after every "\n", and after a "\r" not followed by "\n".
    return re.split(r"(?<=\n)|(?<=\r)(?!\n)", md_text)


def extract_tables(
    md_text: str,
    *,
    page: int = 1,
    warnings: Optional[list[str]] = None,
) -> list[Table]:
    """Extract GFM pipe tables from Markdown.

    ``table.text`` is a verbatim slice of the source (using ``token.map``),
    not a re-render, with the trailing line terminator removed.
    ``table.rows`` contains string cells.

    Args:
        md_text: Markdown source to scan.
        page: page number stamped on every returned ``Table``.
        warnings: optional list that receives a message when markdown-it-py is
            missing or parsing fails; extraction problems are never raised.

    Returns:
        The tables in document order; an empty list when nothing could be
        extracted.
    """
    warnings = warnings if warnings is not None else []
    try:
        from markdown_it import MarkdownIt
    except ImportError:
        warnings.append("markdown-it-py not installed; skipping table extraction")
        return []
    try:
        md_parser = MarkdownIt().enable("table")
        tokens = md_parser.parse(md_text)
    except Exception as e:
        # Deliberately broad: structural extraction is best-effort only.
        warnings.append(f"markdown parse failed; tables not extracted: {e}")
        return []

    # Must follow the parser's notion of a line, otherwise the ``token.map``
    # indices below would select the wrong slice of the source.
    lines = _split_markdown_lines(md_text)
    tables: list[Table] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if tok.type != "table_open":
            i += 1
            continue
        # ``_walk_table`` returns the index just past the matching table_close.
        headers, rows, i = _walk_table(tokens, i)
        table_md = ""
        if tok.map:
            start, end = tok.map  # half-open
            table_md = "".join(lines[start:end]).rstrip("\r\n")
        tables.append(
            Table(
                text=table_md,
                rows=rows,
                headers=headers,
                page=page,
            )
        )
    return tables
82
+
83
+
84
+ def _walk_table(tokens, start_idx: int) -> tuple[list[str], list[list[str]], int]:
85
+ """Collect ``(headers, body_rows, index_after_table_close)`` from tokens.
86
+
87
+ markdown-it-py table token shape::
88
+
89
+ table_open
90
+ thead_open
91
+ tr_open
92
+ th_open, inline (cell), th_close ...
93
+ tr_close
94
+ thead_close
95
+ tbody_open
96
+ tr_open
97
+ td_open, inline (cell), td_close ...
98
+ tr_close
99
+ ...
100
+ tbody_close
101
+ table_close
102
+ """
103
+ headers: list[str] = []
104
+ rows: list[list[str]] = []
105
+ current_row: list[str] = []
106
+ in_thead = False
107
+
108
+ i = start_idx + 1
109
+ while i < len(tokens):
110
+ tok = tokens[i]
111
+ t = tok.type
112
+ if t == "table_close":
113
+ return headers, rows, i + 1
114
+ if t == "thead_open":
115
+ in_thead = True
116
+ elif t == "thead_close":
117
+ in_thead = False
118
+ elif t == "tr_open":
119
+ current_row = []
120
+ elif t == "tr_close":
121
+ if in_thead:
122
+ headers = current_row
123
+ else:
124
+ rows.append(current_row)
125
+ elif t == "inline":
126
+ current_row.append(tok.content)
127
+ i += 1
128
+ return headers, rows, i
129
+
130
+
131
def extract_images(
    md_text: str,
    *,
    page: int = 1,
    warnings: Optional[list[str]] = None,
    include_html_img: bool = True,
) -> list[Image]:
    """Extract image references from Markdown.

    Results come from two passes, concatenated in this order:

    1. Native Markdown ``![alt](url)`` tokens via markdown-it-py.
    2. HTML ``<img src="..." alt="...">`` tags via BeautifulSoup
       (only when ``include_html_img=True``).

    Each ``Image`` carries the URL exactly as written in the source as
    ``data_url`` (a ``data:`` URL or a plain ``http(s)://`` URL), empty
    ``data``, the alt text as ``caption`` (``None`` when blank) and ``page``.
    Nothing is fetched. Images without a ``src`` are skipped.

    A missing library or a failure in either pass appends a message to
    ``warnings`` and keeps whatever was already collected; nothing is raised.
    """
    notes = [] if warnings is None else warnings
    found: list[Image] = []

    def _add(url: str, alt_text: str) -> None:
        found.append(
            Image(
                data_url=url,
                data=b"",
                caption=alt_text or None,
                page=page,
            )
        )

    # Pass 1: Markdown native images (children of inline tokens).
    try:
        from markdown_it import MarkdownIt

        parsed = MarkdownIt().enable("table").parse(md_text)
        for block_tok in parsed:
            for node in getattr(block_tok, "children", None) or []:
                if node.type != "image":
                    continue
                attrs = getattr(node, "attrs", None)
                url = (attrs.get("src") or "") if attrs else ""
                alt_text = (node.content or "").strip()
                if url:
                    _add(url, alt_text)
    except ImportError:
        notes.append(
            "markdown-it-py not installed; skipping Markdown image extraction"
        )
    except Exception as e:
        notes.append(f"Markdown image extraction failed: {e}")

    if not include_html_img:
        return found

    # Pass 2: HTML <img> tags mixed into the Markdown (common for badges / logos).
    try:
        from bs4 import BeautifulSoup

        for img in BeautifulSoup(md_text, "html.parser").find_all("img"):
            url = (img.get("src") or "").strip()
            if url:
                _add(url, (img.get("alt") or "").strip())
    except ImportError:
        notes.append(
            "beautifulsoup4 not installed; skipping HTML <img> extraction"
        )
    except Exception as e:
        notes.append(f"HTML <img> extraction failed: {e}")

    return found