aimd-book 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aimd_book/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ """Ebook package for aimd."""
2
+
3
+ from .cleaner import clean_markdown
4
+ from .processor import BookConversion, process_book_with_images
5
+ from ._plugin import (
6
+ AimdBookConverter,
7
+ __plugin_interface_version__,
8
+ register_converters,
9
+ )
10
+
11
+ __all__ = [
12
+ "AimdBookConverter",
13
+ "BookConversion",
14
+ "__plugin_interface_version__",
15
+ "clean_markdown",
16
+ "process_book_with_images",
17
+ "register_converters",
18
+ ]
aimd_book/_plugin.py ADDED
@@ -0,0 +1,61 @@
1
+ """MarkItDown plugin for ebook conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any, BinaryIO
7
+
8
+ from markitdown import (
9
+ DocumentConverter,
10
+ DocumentConverterResult,
11
+ FailedConversionAttempt,
12
+ MarkItDown,
13
+ StreamInfo,
14
+ )
15
+
16
+ from .processor import process_book_with_images
17
+
18
+ BOOK_EXTENSIONS = {".epub", ".mobi", ".azw3"}
19
+
20
+ __plugin_interface_version__ = 1
21
+
22
+
23
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
    """Install this package's ebook converter on a MarkItDown instance.

    A single :class:`AimdBookConverter` is registered with priority ``10.0``.
    Extra keyword arguments are accepted but not used.
    """
    converter = AimdBookConverter()
    markitdown.register_converter(converter, priority=10.0)
26
+
27
+
28
class AimdBookConverter(DocumentConverter):
    """Convert EPUB-like ebooks to markdown with image extraction.

    Streams are accepted purely by file extension (see ``BOOK_EXTENSIONS``).
    The conversion itself is delegated to ``process_book_with_images``, which
    works on a path on disk, so a stream without ``local_path`` is rejected.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return ``True`` when the stream's extension is a known book type."""
        extension = (stream_info.extension or "").lower()
        return extension in BOOK_EXTENSIONS

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert the book at ``stream_info.local_path`` to markdown.

        Optional ``output_dir`` and ``temp_dir`` keyword arguments are
        forwarded to ``process_book_with_images``.

        Raises:
            FileConversionException: if no local path is available or the
                underlying conversion fails; the original error is chained.
        """
        # ``FailedConversionAttempt`` is a plain record object in markitdown,
        # not an exception, so raising it would surface as a ``TypeError`` and
        # hide the real message. ``FileConversionException`` is the raisable
        # counterpart. Imported locally so this block needs no changes to the
        # module-level import list.
        from markitdown import FileConversionException

        if not stream_info.local_path:
            raise FileConversionException("aimd-book requires a local file path")

        try:
            result = process_book_with_images(
                Path(stream_info.local_path),
                output_dir=kwargs.get("output_dir"),
                temp_dir=kwargs.get("temp_dir"),
            )
        except Exception as exc:
            raise FileConversionException(f"Book conversion failed: {exc}") from exc

        return DocumentConverterResult(
            title=result.title,
            markdown=result.markdown,
        )
aimd_book/cleaner.py ADDED
@@ -0,0 +1,262 @@
1
+ """Post-processing cleanup for pandoc-generated markdown from EPUB chapters.
2
+
3
+ Ported from the standalone epub-to-markdown shell script. Handles image path
4
+ normalisation, EPUB-style footnote conversion, TOC hierarchy flattening,
5
+ heading normalisation / merging / demotion, and whitespace tidying.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from pathlib import Path
12
+
13
+ _IMAGE_EXTS = r"jpg|jpeg|png|gif|webp|svg"
14
+
15
+
16
def clean_markdown(file_path: Path) -> None:
    """Rewrite *file_path* in place after running every EPUB cleanup stage.

    The file is read as UTF-8 (undecodable bytes are dropped), passed through
    the stages below in order, and written back as UTF-8.
    """
    # Order matters: later stages work on the output of earlier ones.
    stages = (
        _clean_spans,
        _fix_image_refs,
        _normalize_separators,
        _convert_footnotes,
        _strip_remaining_html,
        _flatten_toc,
        _normalize_headings,
        _merge_consecutive_headings,
        _demote_headings,
        _dedup_headings,
        _ensure_heading_spacing,
        _final_whitespace,
    )
    content = file_path.read_text(encoding="utf-8", errors="ignore")
    for stage in stages:
        content = stage(content)
    file_path.write_text(content, encoding="utf-8")
32
+
33
+
34
+ def _clean_spans(text: str) -> str:
35
+ text = re.sub(
36
+ r'<span\b[^>]*class="[^"]*\bimage placeholder\b[^"]*"[^>]*>\s*</span>\n*',
37
+ "",
38
+ text,
39
+ flags=re.I,
40
+ )
41
+ text = re.sub(
42
+ r'<span\b[^>]*id="[^"]*"[^>]*>\s*</span>\n*',
43
+ "",
44
+ text,
45
+ flags=re.I,
46
+ )
47
+ return text
48
+
49
+
50
def _fix_image_refs(text: str) -> str:
    """Rewrite every image reference to the flat ``images/<file>`` form.

    HTML ``<img>`` tags (with ``src`` before ``alt``) become markdown images,
    using ``Image`` when the alt text is empty. Existing markdown image links
    then have any directory prefix replaced by ``images/``.
    """

    def _from_images_dir(match: re.Match[str]) -> str:
        alt_text = match.group(2) or "Image"
        return f"![{alt_text}](images/{match.group(1)})"

    def _from_any_dir(match: re.Match[str]) -> str:
        alt_text = match.group(3) or "Image"
        return f"![{alt_text}](images/{match.group(2)})"

    # Applied strictly in this order; each rule sees the previous rule's output.
    rules = (
        (
            rf'<img\b[^>]*src="[^"]*/images/([^"/]+\.(?:{_IMAGE_EXTS}))"[^>]*alt="([^"]*)"[^>]*/?>',
            _from_images_dir,
        ),
        (
            rf'<img\b[^>]*src="([^"]*?/)?([^"/]+\.(?:{_IMAGE_EXTS}))"[^>]*alt="([^"]*)"[^>]*/?>',
            _from_any_dir,
        ),
        (
            r"!\[([^\]]*)\]\((?:[^)\"]*/)?images/([^)\"/]+)\)",
            r"![\1](images/\2)",
        ),
        (
            rf"!\[([^\]]*)\]\((?:[^)\"]*/)?([^)/\"]+\.(?:{_IMAGE_EXTS}))\)",
            r"![\1](images/\2)",
        ),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=re.I)
    return text
76
+
77
+
78
+ def _normalize_separators(text: str) -> str:
79
+ return re.sub(r"\n[-]{5,}\n", "\n\n---\n\n", text)
80
+
81
+
82
+ def _convert_footnotes(text: str) -> str:
83
+ # ^[1](#ch1_fn1)^ Footnote text -> [^1]: Footnote text
84
+ text = re.sub(
85
+ r"^\s*\^\[([^\]]+)\]\(#.+?\)\^\s*",
86
+ lambda m: f"[^{m.group(1).strip()}]: ",
87
+ text,
88
+ flags=re.M,
89
+ )
90
+ # ^<a ...>1</a>^ Footnote text -> [^1]: Footnote text
91
+ text = re.sub(
92
+ r"^\s*\^<a\b[^>]*>(\d+)</a>\^\s*",
93
+ r"[^\1]: ",
94
+ text,
95
+ flags=re.M | re.I,
96
+ )
97
+ text = re.sub(
98
+ r"^\s*\^<a\b[^>]*>([^<]+)</a>\^\s*",
99
+ lambda m: f"[^{m.group(1).strip()}]: ",
100
+ text,
101
+ flags=re.M | re.I,
102
+ )
103
+ # Inline footnote refs: [^1](#id) -> [^1]
104
+ text = re.sub(r"\[\^([^\]]+)\]\(#.+?\)", r"[^\1]", text)
105
+ return text
106
+
107
+
108
+ def _strip_remaining_html(text: str) -> str:
109
+ text = re.sub(
110
+ r'<a\b[^>]*href="#[^"]+"[^>]*>([^<]+)</a>',
111
+ r"\1",
112
+ text,
113
+ flags=re.I,
114
+ )
115
+ text = re.sub(r"</?span\b[^>]*>", "", text, flags=re.I)
116
+ return text
117
+
118
+
119
+ def _flatten_toc(text: str) -> str:
120
+ """Convert .html/.xhtml links into TOC markers, then collapse them."""
121
+ text = re.sub(
122
+ r"\[([^\]]+)\]\([^)]*(?:\.html|\.xhtml)[^)]*\)",
123
+ r"@@TOC@@ \1",
124
+ text,
125
+ flags=re.I,
126
+ )
127
+ text = re.sub(r"\s*@@TOC@@\s*", r"\n@@TOC@@ ", text)
128
+ text = re.sub(r"\n{3,}", "\n\n", text)
129
+
130
+ lines = text.splitlines()
131
+ new_lines: list[str] = []
132
+ i = 0
133
+ while i < len(lines):
134
+ line = lines[i]
135
+ if line.strip().startswith("@@TOC@@ "):
136
+ block: list[str] = []
137
+ while i < len(lines) and lines[i].strip().startswith("@@TOC@@ "):
138
+ title = lines[i].strip()[len("@@TOC@@ ") :].strip()
139
+ if title:
140
+ block.append(title)
141
+ i += 1
142
+ if block:
143
+ new_lines.append(f"## {': '.join(block)}")
144
+ new_lines.append("")
145
+ continue
146
+ new_lines.append(line)
147
+ i += 1
148
+
149
+ text = "\n".join(new_lines)
150
+ return re.sub(r"@@TOC@@\s*", "", text)
151
+
152
+
153
+ def _normalize_heading_line(line: str) -> str:
154
+ s = line.strip()
155
+ if not s:
156
+ return ""
157
+ if re.fullmatch(r"#{1,6}", s):
158
+ return ""
159
+
160
+ while True:
161
+ new_s = re.sub(
162
+ r"^(#{1,6})\s+(#{1,6})(\s+.*)$",
163
+ lambda m: "#" * (len(m.group(1)) + len(m.group(2))) + m.group(3),
164
+ s,
165
+ )
166
+ if new_s == s:
167
+ break
168
+ s = new_s
169
+
170
+ s = re.sub(r"^(#{1,6})(\S)", r"\1 \2", s)
171
+ s = re.sub(r"^(#{1,6})\s+(#{1,6})\s+", r"\1 ", s)
172
+ s = re.sub(r"^(#{1,6})\s+", r"\1 ", s)
173
+ s = re.sub(r"[ \t]+", " ", s).strip()
174
+ return s
175
+
176
+
177
# Heading text that is only a structural label: one of the listed keywords
# followed by a single token of letters, digits or dots (e.g. "Chapter 3",
# "Part II", "Appendix A.1"). Case-insensitive.
_LABEL_RE = re.compile(
    r"^(Chapter|Part|Section|Book|Volume|Appendix)\s+[A-Za-z0-9\.]+$", re.I
)
# Heading text made of a single alphanumeric token with an optional trailing
# "." or ")" (e.g. "1.", "IV", "A)").
# NOTE(review): there is no length limit and matching is case-insensitive, so
# any one-word heading such as "Introduction" also counts as a label and will
# be merged with the heading that follows it - confirm this is intended.
_SHORT_LABEL_RE = re.compile(r"^[A-Z0-9]+[\.\)]?$", re.I)
181
+
182
+
183
def _normalize_headings(text: str) -> str:
    """Tidy every heading-like line and right-strip all other lines.

    A line counts as heading-like when, ignoring surrounding whitespace, it
    starts with one to six ``#`` followed by whitespace, another ``#`` or the
    end of the line. Heading lines that normalise to nothing are dropped.
    """
    heading_start = re.compile(r"^#{1,6}(\s|#|$)")
    result: list[str] = []
    for raw in text.splitlines():
        if heading_start.match(raw.strip()) is None:
            result.append(raw.rstrip())
            continue
        cleaned = _normalize_heading_line(raw)
        if cleaned:
            result.append(cleaned)
    return "\n".join(result)
195
+
196
+
197
def _merge_consecutive_headings(text: str) -> str:
    """Fold a label-only heading into the heading(s) that follow it.

    Starting from a heading, later headings are appended (joined with
    ``": "``) for as long as the most recently collected title matches
    ``_LABEL_RE`` or ``_SHORT_LABEL_RE``. The merged heading keeps the level
    of the first one. Blank lines directly after a heading are consumed while
    scanning ahead, whether or not a merge takes place.
    """
    heading = re.compile(r"^(#{1,6})\s+(.*)")
    rows = text.splitlines()
    total = len(rows)
    output: list[str] = []
    pos = 0
    while pos < total:
        row = rows[pos]
        head = heading.match(row)
        if head is None:
            output.append(row)
            pos += 1
            continue

        level = head.group(1)
        titles = [head.group(2).strip()]
        scan = pos + 1
        while scan < total:
            candidate = rows[scan]
            if candidate.strip():
                following = heading.match(candidate)
                if following is None:
                    break
                label = titles[-1]
                if not (_LABEL_RE.match(label) or _SHORT_LABEL_RE.match(label)):
                    break
                titles.append(following.group(2).strip())
            scan += 1

        output.append(f"{level} {': '.join(titles)}")
        pos = scan

    return "\n".join(output)
233
+
234
+
235
+ def _demote_headings(text: str) -> str:
236
+ """Shift # -> ## and ## -> ### so chapters start at ##."""
237
+ lines = text.splitlines()
238
+ adjusted: list[str] = []
239
+ for line in lines:
240
+ s = line.strip()
241
+ if re.match(r"^#\s+", s):
242
+ s = re.sub(r"^#\s+", "## ", s)
243
+ elif re.match(r"^##\s+", s):
244
+ s = re.sub(r"^##\s+", "### ", s)
245
+ adjusted.append(s if s else "")
246
+ return "\n".join(adjusted)
247
+
248
+
249
+ def _dedup_headings(text: str) -> str:
250
+ return re.sub(r"^(#{1,6}\s+.+)\n+\1$", r"\1", text, flags=re.M)
251
+
252
+
253
+ def _ensure_heading_spacing(text: str) -> str:
254
+ text = re.sub(r"([^\n])\n(#{1,6}\s)", r"\1\n\n\2", text)
255
+ text = re.sub(r"(#{1,6}\s[^\n]+)\n([^\n#])", r"\1\n\n\2", text)
256
+ return text
257
+
258
+
259
+ def _final_whitespace(text: str) -> str:
260
+ text = re.sub(r"\n{3,}", "\n\n", text)
261
+ text = re.sub(r"[ \t]+$", "", text, flags=re.M)
262
+ return text.strip() + "\n"
aimd_book/processor.py ADDED
@@ -0,0 +1,221 @@
1
+ """Ebook extraction and conversion pipeline.
2
+
3
+ The current implementation handles EPUB-compatible ZIP/spine ebooks and aligns
4
+ with the standalone epub-to-markdown shell script:
5
+ - Spine-based chapter ordering (container.xml -> OPF -> manifest + spine)
6
+ - Pandoc conversion via subprocess: ``-f html -t markdown_mmd-raw_html --wrap=none``
7
+ - Post-processing via :func:`aimd_book.cleaner.clean_markdown`
8
+ - Flat ``images/`` directory (no subdirectory nesting)
9
+ - Chapter files named after the original HTML stem
10
+ - Combined book file uses ``---`` separators between chapters
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ import shutil
17
+ import subprocess
18
+ import tempfile
19
+ import urllib.parse
20
+ import zipfile
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+
24
+ from logly import logger
25
+
26
+ from .cleaner import clean_markdown
27
+
28
+ _IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp"}
29
+
30
+
31
+ @dataclass(slots=True, frozen=True)
32
+ class BookConversion:
33
+ """Book conversion result used by the MarkItDown adapter."""
34
+
35
+ title: str
36
+ markdown: str
37
+ output_dir: Path
38
+
39
+
40
+ def _extract_title_from_markdown(content: str, fallback_title: str) -> str:
41
+ """Extract a simple title from generated markdown."""
42
+ for line in content.splitlines():
43
+ stripped = line.strip()
44
+ if stripped.startswith("# "):
45
+ return stripped[2:].strip() or fallback_title
46
+ if stripped and not stripped.startswith(("![", "<", ":::")):
47
+ return stripped[:100]
48
+ return fallback_title
49
+
50
+
51
+ def _find_oebps_dir(temp_path: Path) -> Path:
52
+ """Recursively search for OEBPS/OPS directory; fall back to *temp_path*."""
53
+ for dirpath in temp_path.rglob("*"):
54
+ if dirpath.is_dir() and dirpath.name in ("OEBPS", "OPS"):
55
+ return dirpath
56
+ return temp_path
57
+
58
+
59
+ def _read_spine_order(temp_path: Path) -> list[Path]:
60
+ """Parse EPUB spine for correct chapter reading order."""
61
+ container = temp_path / "META-INF" / "container.xml"
62
+ if not container.exists():
63
+ return []
64
+
65
+ c_text = container.read_text(encoding="utf-8", errors="ignore")
66
+ m = re.search(r'full-path="([^"]+)"', c_text)
67
+ if not m:
68
+ return []
69
+
70
+ opf_path = temp_path / m.group(1)
71
+ if not opf_path.exists():
72
+ return []
73
+
74
+ opf_dir = opf_path.parent
75
+ opf_text = opf_path.read_text(encoding="utf-8", errors="ignore")
76
+
77
+ manifest: dict[str, str] = {}
78
+ for match in re.finditer(
79
+ r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_text, re.I
80
+ ):
81
+ manifest[match.group(1)] = match.group(2)
82
+ for match in re.finditer(
83
+ r'<item\s+[^>]*href="([^"]+)"[^>]*id="([^"]+)"', opf_text, re.I
84
+ ):
85
+ manifest[match.group(2)] = match.group(1)
86
+
87
+ spine_match = re.search(r"<spine[^>]*>(.*?)</spine>", opf_text, re.I | re.S)
88
+ if not spine_match:
89
+ return []
90
+
91
+ spine_files: list[Path] = []
92
+ for itemref in re.finditer(
93
+ r'<itemref\s+[^>]*idref="([^"]+)"', spine_match.group(1), re.I
94
+ ):
95
+ idref = itemref.group(1)
96
+ if idref not in manifest:
97
+ continue
98
+ href = urllib.parse.unquote(manifest[idref]).split("#")[0]
99
+ file_path = (opf_dir / href).resolve()
100
+ if file_path.exists() and file_path.suffix.lower() in {
101
+ ".html",
102
+ ".xhtml",
103
+ ".htm",
104
+ }:
105
+ spine_files.append(file_path)
106
+
107
+ return spine_files
108
+
109
+
110
def _extract_images(oebps_dir: Path, images_dir: Path) -> None:
    """Copy every image file found below *oebps_dir* into *images_dir*.

    The directory structure is not kept: each file is copied under its bare
    name, so a later file replaces an earlier one with the same name.
    """
    files = (entry for entry in oebps_dir.rglob("*") if entry.is_file())
    for entry in files:
        if entry.suffix.lower() in _IMAGE_SUFFIXES:
            shutil.copy(entry, images_dir / entry.name)
115
+
116
+
117
def _convert_html_to_markdown(html_file: Path, output_file: Path) -> None:
    """Run the ``pandoc`` CLI to turn *html_file* into markdown at *output_file*.

    Raises:
        RuntimeError: if pandoc exits with a non-zero status; the message
            carries pandoc's stderr output.
    """
    command = [
        "pandoc",
        str(html_file),
        "-f",
        "html",
        "-t",
        "markdown_mmd-raw_html",
        "--wrap=none",
        "-o",
        str(output_file),
    ]
    completed = subprocess.run(command, capture_output=True, text=True, check=False)
    if completed.returncode == 0:
        return
    raise RuntimeError(
        f"Pandoc conversion failed for {html_file.name}: {completed.stderr.strip()}"
    )
139
+
140
+
141
def process_book_with_images(
    file_path: str | Path,
    output_dir: str | Path | None = None,
    temp_dir: str | Path | None = None,
) -> BookConversion:
    """Process an ebook file with image extraction and spine-ordered chapters.

    The book is unpacked as a ZIP archive into a temporary directory, its
    images are copied to ``<output_dir>/images``, every chapter is converted
    to ``<output_dir>/chapters/<stem>.md`` and the cleaned chapters are joined
    with ``---`` separators into ``<output_dir>/<book stem>.md``.

    Args:
        file_path: Path of the book file.
        output_dir: Destination directory; defaults to a directory named
            after the book next to it. Strings are accepted.
        temp_dir: Parent directory for the temporary extraction directory.

    Returns:
        A :class:`BookConversion` with the title, the combined markdown and
        the output directory.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
        RuntimeError: if the file is not a valid ZIP archive, contains no
            HTML/XHTML chapters, or no chapter could be converted.
    """
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"Book file not found: {file_path}")

    if output_dir is None:
        output_dir = file_path.parent / file_path.stem
    else:
        # Callers forwarding loosely typed keyword arguments may pass a plain
        # string, which does not support the ``/`` operator used below.
        output_dir = Path(output_dir)

    chapters_dir = output_dir / "chapters"
    images_dir = output_dir / "images"
    output_dir.mkdir(parents=True, exist_ok=True)
    chapters_dir.mkdir(exist_ok=True)
    images_dir.mkdir(exist_ok=True)

    with tempfile.TemporaryDirectory(dir=temp_dir) as tmp:
        temp_path = Path(tmp)

        try:
            with zipfile.ZipFile(file_path, "r") as zip_ref:
                zip_ref.extractall(temp_path)
        except zipfile.BadZipFile as e:
            raise RuntimeError(f"Invalid book file (not a valid ZIP): {e}") from e

        oebps_dir = _find_oebps_dir(temp_path)
        _extract_images(oebps_dir, images_dir)

        spine_files = _read_spine_order(temp_path)
        if not spine_files:
            # No usable spine: fall back to every HTML file in sorted order.
            html_files: list[Path] = []
            for ext in ("*.xhtml", "*.html", "*.htm"):
                html_files.extend(oebps_dir.rglob(ext))
            spine_files = sorted(html_files)

        if not spine_files:
            raise RuntimeError("No HTML/XHTML chapter files found in book")

        chapter_files: list[tuple[str, str]] = []
        for html_file in spine_files:
            if not html_file.exists():
                continue

            basename = html_file.stem
            out_md = chapters_dir / f"{basename}.md"

            # Best effort: a chapter that fails to convert is logged and
            # skipped so the remaining chapters are still produced.
            try:
                _convert_html_to_markdown(html_file, out_md)
                clean_markdown(out_md)
                content = out_md.read_text(encoding="utf-8")
                chapter_files.append((basename, content))
            except Exception as e:
                logger.warning(f"Failed to convert {html_file.name}: {e}")
                continue

        if not chapter_files:
            raise RuntimeError("Failed to convert any HTML files to markdown")

        combined_content = "\n\n---\n\n".join(
            content.strip() for _, content in chapter_files
        )

        full_md_path = output_dir / f"{file_path.stem}.md"
        full_md_path.write_text(combined_content.strip() + "\n", encoding="utf-8")

        title = _extract_title_from_markdown(chapter_files[0][1], file_path.stem)

        logger.info(
            f"Book extracted to {output_dir}: {len(chapter_files)} chapters, "
            f"{sum(1 for _ in images_dir.iterdir())} images"
        )

        return BookConversion(
            title=title,
            markdown=combined_content,
            output_dir=output_dir,
        )
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.3
2
+ Name: aimd-book
3
+ Version: 0.9.2
4
+ Summary: Ebook conversion package for aimd.
5
+ Author: Shu Li
6
+ Author-email: Shu Li <zetarylee@gmail.com>
7
+ Requires-Dist: logly>=0.1.6
8
+ Requires-Dist: markitdown>=0.1.1,<0.2.0
9
+ Requires-Python: >=3.10, <3.13
@@ -0,0 +1,8 @@
1
+ aimd_book/__init__.py,sha256=LuyQW-u0L32gtsfyY59TtvLtIv_LIVh98W1-N9lnkUY,417
2
+ aimd_book/_plugin.py,sha256=o9xf_DMHYM-vuSXW4TO33zrPu2z_PRrs4-tdd4b8hsg,1700
3
+ aimd_book/cleaner.py,sha256=i8ZrWjLTd3-anN7w6IRkhjwo6hLgInecJQPZdXZsZHk,7576
4
+ aimd_book/processor.py,sha256=snZ3kUgXf8q1J9vcUFfkvTG51EhXJpwjavV48gx8QNU,7199
5
+ aimd_book-0.9.2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
6
+ aimd_book-0.9.2.dist-info/entry_points.txt,sha256=Yol1vB3Votz-ZKKmal5w97fC7AKJR51As1E2xFPZFb8,43
7
+ aimd_book-0.9.2.dist-info/METADATA,sha256=19V8qMfsbgl42QGr0yOXnZEpMPvCCuyUiYqYCYNh9v0,254
8
+ aimd_book-0.9.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.8.24
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [markitdown.plugin]
2
+ aimd_book = aimd_book
3
+