PyPI - aimd-book - Versions diffs - 0.9.2__py3-none-any.whl - Mend

aimd-book 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

aimd_book/__init__.py +18 -0
aimd_book/_plugin.py +61 -0
aimd_book/cleaner.py +262 -0
aimd_book/processor.py +221 -0
aimd_book-0.9.2.dist-info/METADATA +9 -0
aimd_book-0.9.2.dist-info/RECORD +8 -0
aimd_book-0.9.2.dist-info/WHEEL +4 -0
aimd_book-0.9.2.dist-info/entry_points.txt +3 -0

aimd_book/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""Ebook package for aimd."""
+from .cleaner import clean_markdown
+from .processor import BookConversion, process_book_with_images
+from ._plugin import (
+    AimdBookConverter,
+    __plugin_interface_version__,
+    register_converters,
+)
+__all__ = [
+    "AimdBookConverter",
+    "BookConversion",
+    "__plugin_interface_version__",
+    "clean_markdown",
+    "process_book_with_images",
+    "register_converters",
+]

aimd_book/_plugin.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""MarkItDown plugin for ebook conversion."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any, BinaryIO
+from markitdown import (
+    DocumentConverter,
+    DocumentConverterResult,
+    FailedConversionAttempt,
+    FileConversionException,
+    MarkItDown,
+    StreamInfo,
+)
+from .processor import process_book_with_images
+BOOK_EXTENSIONS = {".epub", ".mobi", ".azw3"}
+__plugin_interface_version__ = 1
+def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
+    """Plugin entry point: attach an ``AimdBookConverter`` to *markitdown*.
+
+    Extra keyword arguments are accepted but not used by this function.
+    """
+    # NOTE(review): confirm how markitdown orders converters by priority so
+    # that 10.0 places this converter where intended relative to built-ins.
+    book_converter = AimdBookConverter()
+    markitdown.register_converter(book_converter, priority=10.0)
+class AimdBookConverter(DocumentConverter):
+    """Convert EPUB-like ebooks to markdown with image extraction.
+
+    Streams are claimed purely by file extension (``BOOK_EXTENSIONS``); the
+    conversion itself is delegated to ``process_book_with_images``, which
+    works on a file path rather than on the open stream.
+    """
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True when the stream's extension is a known ebook extension.
+
+        The comparison is case-insensitive and a missing extension is treated
+        as an empty string. *file_stream* is not inspected.
+        """
+        extension = (stream_info.extension or "").lower()
+        return extension in BOOK_EXTENSIONS
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """Convert the book at ``stream_info.local_path`` to markdown.
+
+        Optional ``output_dir`` and ``temp_dir`` keyword arguments are
+        forwarded to ``process_book_with_images``.
+
+        Raises:
+            FileConversionException: if no local path is available or the
+                underlying conversion fails for any reason (the original
+                error is chained as ``__cause__``).
+        """
+        # markitdown's FailedConversionAttempt is a bookkeeping record rather
+        # than an exception type, so it cannot be raised; use the library's
+        # conversion-failure exception instead.
+        if not stream_info.local_path:
+            raise FileConversionException("aimd-book requires a local file path")
+        try:
+            result = process_book_with_images(
+                Path(stream_info.local_path),
+                output_dir=kwargs.get("output_dir"),
+                temp_dir=kwargs.get("temp_dir"),
+            )
+        except Exception as exc:
+            raise FileConversionException(f"Book conversion failed: {exc}") from exc
+        return DocumentConverterResult(
+            title=result.title,
+            markdown=result.markdown,
+        )

aimd_book/cleaner.py ADDED Viewed

@@ -0,0 +1,262 @@
+"""Post-processing cleanup for pandoc-generated markdown from EPUB chapters.
+Ported from the standalone epub-to-markdown shell script.  Handles image path
+normalisation, EPUB-style footnote conversion, TOC hierarchy flattening,
+heading normalisation / merging / demotion, and whitespace tidying.
+"""
+from __future__ import annotations
+import re
+from pathlib import Path
+_IMAGE_EXTS = r"jpg|jpeg|png|gif|webp|svg"
+def clean_markdown(file_path: Path) -> None:
+    """Rewrite *file_path* in place after running every cleanup pass on it.
+
+    The file is read as UTF-8 (undecodable bytes are ignored), each pass is
+    applied in the fixed order below, and the result is written back as UTF-8.
+    """
+    # Order matters: later passes rely on the output shape of earlier ones.
+    passes = (
+        _clean_spans,
+        _fix_image_refs,
+        _normalize_separators,
+        _convert_footnotes,
+        _strip_remaining_html,
+        _flatten_toc,
+        _normalize_headings,
+        _merge_consecutive_headings,
+        _demote_headings,
+        _dedup_headings,
+        _ensure_heading_spacing,
+        _final_whitespace,
+    )
+    content = file_path.read_text(encoding="utf-8", errors="ignore")
+    for apply_pass in passes:
+        content = apply_pass(content)
+    file_path.write_text(content, encoding="utf-8")
+def _clean_spans(text: str) -> str:
+    """Remove empty ``<span>`` elements left behind by the HTML conversion.
+
+    Two kinds are dropped, together with any newlines that follow them:
+    image-placeholder spans and spans that only carry an ``id`` anchor.
+    Spans with non-whitespace content are left untouched.
+    """
+    empty_span_patterns = (
+        r'<span\b[^>]*class="[^"]*\bimage placeholder\b[^"]*"[^>]*>\s*</span>\n*',
+        r'<span\b[^>]*id="[^"]*"[^>]*>\s*</span>\n*',
+    )
+    for pattern in empty_span_patterns:
+        text = re.sub(pattern, "", text, flags=re.I)
+    return text
+def _fix_image_refs(text: str) -> str:
+    """Rewrite image references so they all point into a flat ``images/`` dir.
+
+    Handles both raw ``<img>`` tags and markdown ``![alt](path)`` images.
+    Only file names ending in one of ``_IMAGE_EXTS`` are recognised by the
+    extension-based patterns. The ``<img>`` patterns require ``src`` to
+    appear before ``alt`` inside the tag.
+    """
+    # <img src=".../images/NAME.ext" alt="ALT"> -> ![ALT](images/NAME.ext);
+    # an empty alt text falls back to "Image".
+    text = re.sub(
+        rf'<img\b[^>]*src="[^"]*/images/([^"/]+\.(?:{_IMAGE_EXTS}))"[^>]*alt="([^"]*)"[^>]*/?>',
+        lambda m: f"![{m.group(2) or 'Image'}](images/{m.group(1)})",
+        text,
+        flags=re.I,
+    )
+    # Remaining <img> tags with any (or no) directory prefix: keep only the
+    # file name and place it under images/.
+    text = re.sub(
+        rf'<img\b[^>]*src="([^"]*?/)?([^"/]+\.(?:{_IMAGE_EXTS}))"[^>]*alt="([^"]*)"[^>]*/?>',
+        lambda m: f"![{m.group(3) or 'Image'}](images/{m.group(2)})",
+        text,
+        flags=re.I,
+    )
+    # Markdown images whose target already contains "images/NAME": drop any
+    # leading directories in front of "images/".
+    text = re.sub(
+        r"!\[([^\]]*)\]\((?:[^)\"]*/)?" r"images/([^)\"/]+)\)",
+        r"![\1](images/\2)",
+        text,
+        flags=re.I,
+    )
+    # Markdown images pointing at any other directory: replace the directory
+    # part with images/ and keep the file name.
+    text = re.sub(
+        rf"!\[([^\]]*)\]\((?:[^)\"]*/)?" rf"([^)/\"]+\.(?:{_IMAGE_EXTS}))\)",
+        r"![\1](images/\2)",
+        text,
+        flags=re.I,
+    )
+    return text
+def _normalize_separators(text: str) -> str:
+    """Turn lines of five or more dashes into a blank-line-padded ``---`` rule."""
+    long_rule = re.compile(r"\n[-]{5,}\n")
+    return long_rule.sub("\n\n---\n\n", text)
+def _convert_footnotes(text: str) -> str:
+    """Convert EPUB-style footnotes into markdown ``[^id]`` footnotes.
+
+    Footnote definitions (a superscript link or anchor at the start of a
+    line) become ``[^id]: `` prefixes; inline references of the form
+    ``[^id](#target)`` lose their link target.
+
+    The leading indentation match is limited to spaces and tabs so that,
+    under ``re.M``, the pattern cannot start on a preceding blank line and
+    swallow it, which would glue the definition onto the paragraph above.
+    """
+    # ^[1](#ch1_fn1)^ Footnote text  ->  [^1]: Footnote text
+    text = re.sub(
+        r"^[ \t]*\^\[([^\]]+)\]\(#.+?\)\^\s*",
+        lambda m: f"[^{m.group(1).strip()}]: ",
+        text,
+        flags=re.M,
+    )
+    # ^<a ...>1</a>^ Footnote text  ->  [^1]: Footnote text
+    text = re.sub(
+        r"^[ \t]*\^<a\b[^>]*>(\d+)</a>\^\s*",
+        r"[^\1]: ",
+        text,
+        flags=re.M | re.I,
+    )
+    # Same as above for non-numeric labels; the label is whitespace-trimmed.
+    text = re.sub(
+        r"^[ \t]*\^<a\b[^>]*>([^<]+)</a>\^\s*",
+        lambda m: f"[^{m.group(1).strip()}]: ",
+        text,
+        flags=re.M | re.I,
+    )
+    # Inline footnote refs: [^1](#id) -> [^1]
+    text = re.sub(r"\[\^([^\]]+)\]\(#.+?\)", r"[^\1]", text)
+    return text
+def _strip_remaining_html(text: str) -> str:
+    """Unwrap in-document anchors and drop any leftover ``<span>`` tags.
+
+    ``<a href="#...">label</a>`` is replaced by ``label``; opening and
+    closing ``<span>`` tags are removed while their content is kept.
+    """
+    internal_anchor = r'<a\b[^>]*href="#[^"]+"[^>]*>([^<]+)</a>'
+    span_tag = r"</?span\b[^>]*>"
+    unwrapped = re.sub(internal_anchor, r"\1", text, flags=re.I)
+    return re.sub(span_tag, "", unwrapped, flags=re.I)
+def _flatten_toc(text: str) -> str:
+    """Convert .html/.xhtml links into TOC markers, then collapse them.
+
+    Each run of consecutive marker lines is replaced by a single ``## ``
+    heading whose text is the link titles joined with ``": "``.
+    """
+    # Replace every markdown link whose target contains ".html" or ".xhtml"
+    # with a temporary "@@TOC@@ <title>" marker.
+    # NOTE(review): the pattern does not distinguish internal chapter links
+    # from external URLs ending in .html — confirm that is intended.
+    text = re.sub(
+        r"\[([^\]]+)\]\([^)]*(?:\.html|\.xhtml)[^)]*\)",
+        r"@@TOC@@ \1",
+        text,
+        flags=re.I,
+    )
+    # Put each marker at the start of its own line, absorbing the whitespace
+    # (including blank lines) around it so adjacent entries become adjacent lines.
+    text = re.sub(r"\s*@@TOC@@\s*", r"\n@@TOC@@ ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    lines = text.splitlines()
+    new_lines: list[str] = []
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        if line.strip().startswith("@@TOC@@ "):
+            # Gather the titles of this run of marker lines.
+            block: list[str] = []
+            while i < len(lines) and lines[i].strip().startswith("@@TOC@@ "):
+                title = lines[i].strip()[len("@@TOC@@ ") :].strip()
+                if title:
+                    block.append(title)
+                i += 1
+            if block:
+                new_lines.append(f"## {': '.join(block)}")
+                new_lines.append("")
+            # i already points past the run, so skip the increment below.
+            continue
+        new_lines.append(line)
+        i += 1
+    text = "\n".join(new_lines)
+    # Remove any marker that was not consumed as part of a run above.
+    return re.sub(r"@@TOC@@\s*", "", text)
+def _normalize_heading_line(line: str) -> str:
+    """Return *line* as a tidy ``#... Title`` heading, or ``""`` to drop it.
+
+    Blank lines and lines consisting only of 1-6 hashes yield ``""``.
+    """
+    s = line.strip()
+    if not s:
+        return ""
+    if re.fullmatch(r"#{1,6}", s):
+        return ""
+    # Repeatedly fuse two leading hash runs separated by whitespace into one
+    # run whose length is the sum of both, e.g. "# ## Title" -> "### Title".
+    while True:
+        new_s = re.sub(
+            r"^(#{1,6})\s+(#{1,6})(\s+.*)$",
+            lambda m: "#" * (len(m.group(1)) + len(m.group(2))) + m.group(3),
+            s,
+        )
+        if new_s == s:
+            break
+        s = new_s
+    # Ensure a space separates the leading hashes from what follows.
+    s = re.sub(r"^(#{1,6})(\S)", r"\1 \2", s)
+    # Drop a second hash run that is still present after the first.
+    s = re.sub(r"^(#{1,6})\s+(#{1,6})\s+", r"\1 ", s)
+    # Exactly one space after the hashes, then collapse all space/tab runs.
+    s = re.sub(r"^(#{1,6})\s+", r"\1 ", s)
+    s = re.sub(r"[ \t]+", " ", s).strip()
+    return s
+# Heading text that is only a structural label such as "Chapter 3" or
+# "Part II": one keyword followed by a single alphanumeric/dotted token.
+_LABEL_RE = re.compile(
+    r"^(Chapter|Part|Section|Book|Volume|Appendix)\s+[A-Za-z0-9\.]+$", re.I
+)
+# Heading text that is a single alphanumeric token with an optional trailing
+# "." or ")", e.g. "1.", "IV" or "A)".
+# NOTE(review): there is no length bound, so any single word (for example
+# "Introduction") also matches — confirm this is intended.
+_SHORT_LABEL_RE = re.compile(r"^[A-Z0-9]+[\.\)]?$", re.I)
+def _normalize_headings(text: str) -> str:
+    """Normalise every heading-looking line and right-strip all other lines.
+
+    A line counts as heading-like when, after stripping, it starts with 1-6
+    hashes followed by whitespace, another hash, or end of line. Heading
+    lines that normalise to an empty string are dropped.
+    """
+    heading_start = re.compile(r"^#{1,6}(\s|#|$)")
+    result: list[str] = []
+    for raw in text.splitlines():
+        if heading_start.match(raw.strip()) is None:
+            result.append(raw.rstrip())
+            continue
+        cleaned = _normalize_heading_line(raw)
+        if cleaned:
+            result.append(cleaned)
+    return "\n".join(result)
+def _merge_consecutive_headings(text: str) -> str:
+    """Merge consecutive headings when the first looks like a structural label.
+
+    While the most recently collected heading text matches ``_LABEL_RE`` or
+    ``_SHORT_LABEL_RE``, the next heading (blank lines in between are
+    skipped) is appended to it. The merged heading keeps the level of the
+    first heading and joins the parts with ``": "``.
+    """
+    fixed = text.splitlines()
+    merged: list[str] = []
+    i = 0
+    while i < len(fixed):
+        line = fixed[i]
+        m_cur = re.match(r"^(#{1,6})\s+(.*)", line)
+        if not m_cur:
+            # Not a heading: copy through unchanged.
+            merged.append(line)
+            i += 1
+            continue
+        current_level = m_cur.group(1)
+        parts = [m_cur.group(2).strip()]
+        # Look ahead past blank lines for further headings to absorb.
+        j = i + 1
+        while j < len(fixed):
+            lookahead = fixed[j]
+            if not lookahead.strip():
+                j += 1
+                continue
+            m_next = re.match(r"^(#{1,6})\s+(.*)", lookahead)
+            if m_next:
+                # Only keep merging while the latest part is a bare label.
+                last = parts[-1]
+                if _LABEL_RE.match(last) or _SHORT_LABEL_RE.match(last):
+                    parts.append(m_next.group(2).strip())
+                    j += 1
+                else:
+                    break
+            else:
+                break
+        merged.append(f"{current_level} {': '.join(parts)}")
+        # Resume at the first line not consumed by the look-ahead; blank
+        # lines skipped after the heading are not re-emitted.
+        i = j
+    return "\n".join(merged)
+def _demote_headings(text: str) -> str:
+    """Shift # -> ## and ## -> ### so chapters start at ##.
+
+    Headings of level three and deeper are left at their level. Heading
+    lines are emitted without surrounding whitespace. All other lines keep
+    their leading indentation (only trailing whitespace is removed), so
+    nested lists and indented code are not flattened.
+    """
+    adjusted: list[str] = []
+    for line in text.splitlines():
+        stripped = line.strip()
+        # Matches exactly one or two hashes followed by whitespace; "###"
+        # and deeper cannot match because a hash follows the second hash.
+        shallow = re.match(r"^(#{1,2})\s+", stripped)
+        if shallow:
+            adjusted.append(f"#{shallow.group(1)} {stripped[shallow.end():]}")
+        elif re.match(r"^#{3,6}\s", stripped):
+            # Deeper headings: unchanged level, but still emitted stripped.
+            adjusted.append(stripped)
+        else:
+            adjusted.append(line.rstrip())
+    return "\n".join(adjusted)
+def _dedup_headings(text: str) -> str:
+    """Collapse a heading that is immediately repeated into a single copy.
+
+    Two identical heading lines separated only by newlines are reduced to
+    one occurrence.
+    """
+    repeated_heading = re.compile(r"^(#{1,6}\s+.+)\n+\1$", flags=re.M)
+    return repeated_heading.sub(r"\1", text)
+def _ensure_heading_spacing(text: str) -> str:
+    """Guarantee a blank line before and after every heading line.
+
+    The "after" pattern is anchored to the start of a line so that a ``#``
+    appearing in the middle of ordinary text (for example ``C# is great``)
+    is not mistaken for a heading.
+    """
+    # Blank line before a heading that directly follows a non-empty line.
+    text = re.sub(r"([^\n])\n(#{1,6}\s)", r"\1\n\n\2", text)
+    # Blank line after a heading that is directly followed by body text.
+    text = re.sub(
+        r"^(#{1,6}[ \t][^\n]+)\n([^\n#])", r"\1\n\n\2", text, flags=re.M
+    )
+    return text
+def _final_whitespace(text: str) -> str:
+    """Apply the last whitespace tidy-up and end the text with one newline.
+
+    Runs of three or more newlines become a single blank line, trailing
+    spaces/tabs are removed from every line, and the whole text is stripped
+    before a final newline is appended.
+    """
+    collapsed = re.sub(r"\n{3,}", "\n\n", text)
+    trimmed = re.sub(r"[ \t]+$", "", collapsed, flags=re.M)
+    return f"{trimmed.strip()}\n"

aimd_book/processor.py ADDED Viewed

@@ -0,0 +1,221 @@
+"""Ebook extraction and conversion pipeline.
+The current implementation handles EPUB-compatible ZIP/spine ebooks and aligns
+with the standalone epub-to-markdown shell script:
+  - Spine-based chapter ordering (container.xml -> OPF -> manifest + spine)
+  - Pandoc conversion via subprocess: ``-f html -t markdown_mmd-raw_html --wrap=none``
+  - Post-processing via :func:`aimd_book.cleaner.clean_markdown`
+  - Flat ``images/`` directory (no subdirectory nesting)
+  - Chapter files named after the original HTML stem
+  - Combined book file uses ``---`` separators between chapters
+"""
+from __future__ import annotations
+import re
+import shutil
+import subprocess
+import tempfile
+import urllib.parse
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+from logly import logger
+from .cleaner import clean_markdown
+_IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp"}
+@dataclass(frozen=True, slots=True)
+class BookConversion:
+    """Immutable result of one book conversion, consumed by the MarkItDown adapter."""
+    # Title reported for the converted book.
+    title: str
+    # Markdown text of the whole book.
+    markdown: str
+    # Directory the conversion wrote its output into.
+    output_dir: Path
+def _extract_title_from_markdown(content: str, fallback_title: str) -> str:
+    """Extract a simple title from generated markdown.
+
+    The first heading of any level (``#`` to ``######``) wins and is
+    returned without its hash marker; headings with no text are skipped.
+    Otherwise the first non-empty line that is not an image, raw HTML, or a
+    ``:::`` fence is returned, truncated to 100 characters. If nothing
+    qualifies, *fallback_title* is returned.
+    """
+    for line in content.splitlines():
+        stripped = line.strip()
+        # Accept every heading level: a line such as "## Chapter 1" must
+        # not be returned with its "## " marker still attached.
+        heading = re.match(r"#{1,6}(?:\s+(.*))?$", stripped)
+        if heading:
+            title = (heading.group(1) or "").strip()
+            if title:
+                return title
+            continue
+        if stripped and not stripped.startswith(("![", "<", ":::")):
+            return stripped[:100]
+    return fallback_title
+def _find_oebps_dir(temp_path: Path) -> Path:
+    """Return the first ``OEBPS``/``OPS`` directory found below *temp_path*.
+
+    The search is recursive; when no such directory exists, *temp_path*
+    itself is returned.
+    """
+    content_dirs = (
+        entry
+        for entry in temp_path.rglob("*")
+        if entry.is_dir() and entry.name in ("OEBPS", "OPS")
+    )
+    return next(content_dirs, temp_path)
+def _read_spine_order(temp_path: Path) -> list[Path]:
+    """Parse EPUB spine for correct chapter reading order.
+
+    Follows ``META-INF/container.xml`` to the OPF package file, builds the
+    manifest (item id -> href), and returns the resolved paths of the spine
+    items that exist and have an ``.html``/``.xhtml``/``.htm`` suffix, in
+    spine order. Returns an empty list when any of these pieces is missing.
+
+    Both the OPF path and every manifest href come from the archive and are
+    therefore untrusted: anything that resolves outside *temp_path* (for
+    example via ``../`` segments or an absolute path) is ignored so that no
+    file outside the extracted book is ever picked up as a chapter.
+    """
+    root = temp_path.resolve()
+    container = temp_path / "META-INF" / "container.xml"
+    if not container.exists():
+        return []
+    c_text = container.read_text(encoding="utf-8", errors="ignore")
+    m = re.search(r'full-path="([^"]+)"', c_text)
+    if not m:
+        return []
+    opf_path = (temp_path / m.group(1)).resolve()
+    if not opf_path.is_relative_to(root) or not opf_path.exists():
+        return []
+    opf_dir = opf_path.parent
+    opf_text = opf_path.read_text(encoding="utf-8", errors="ignore")
+    # Attribute order inside <item> is not fixed, so scan for both
+    # id-before-href and href-before-id.
+    manifest: dict[str, str] = {}
+    for match in re.finditer(
+        r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_text, re.I
+    ):
+        manifest[match.group(1)] = match.group(2)
+    for match in re.finditer(
+        r'<item\s+[^>]*href="([^"]+)"[^>]*id="([^"]+)"', opf_text, re.I
+    ):
+        manifest[match.group(2)] = match.group(1)
+    spine_match = re.search(r"<spine[^>]*>(.*?)</spine>", opf_text, re.I | re.S)
+    if not spine_match:
+        return []
+    chapter_suffixes = {".html", ".xhtml", ".htm"}
+    spine_files: list[Path] = []
+    for itemref in re.finditer(
+        r'<itemref\s+[^>]*idref="([^"]+)"', spine_match.group(1), re.I
+    ):
+        idref = itemref.group(1)
+        if idref not in manifest:
+            continue
+        # hrefs are URL-encoded and may carry a "#fragment".
+        href = urllib.parse.unquote(manifest[idref]).split("#")[0]
+        file_path = (opf_dir / href).resolve()
+        if not file_path.is_relative_to(root):
+            continue
+        if file_path.exists() and file_path.suffix.lower() in chapter_suffixes:
+            spine_files.append(file_path)
+    return spine_files
+def _extract_images(oebps_dir: Path, images_dir: Path) -> None:
+    """Copy every image file found under *oebps_dir* into *images_dir*.
+
+    The search is recursive and the destination is flat: each file is
+    copied under its own file name, so a later file with the same name
+    replaces an earlier one.
+    """
+    for source in oebps_dir.rglob("*"):
+        if not source.is_file():
+            continue
+        if source.suffix.lower() not in _IMAGE_SUFFIXES:
+            continue
+        shutil.copy(source, images_dir / source.name)
+def _convert_html_to_markdown(html_file: Path, output_file: Path) -> None:
+    """Run the pandoc CLI to turn *html_file* into markdown at *output_file*.
+
+    Raises:
+        RuntimeError: if pandoc exits with a non-zero status; the message
+            carries pandoc's stderr output.
+    """
+    command = [
+        "pandoc",
+        str(html_file),
+        "-f",
+        "html",
+        "-t",
+        "markdown_mmd-raw_html",
+        "--wrap=none",
+        "-o",
+        str(output_file),
+    ]
+    # check=False: the exit status is inspected below to build the message.
+    completed = subprocess.run(
+        command, capture_output=True, text=True, check=False
+    )
+    if completed.returncode == 0:
+        return
+    raise RuntimeError(
+        f"Pandoc conversion failed for {html_file.name}: {completed.stderr.strip()}"
+    )
+def process_book_with_images(
+    file_path: str | Path,
+    output_dir: str | Path | None = None,
+    temp_dir: str | Path | None = None,
+) -> BookConversion:
+    """Process an ebook file with image extraction and spine-ordered chapters.
+
+    The book is unpacked into a temporary directory, its images are copied
+    into ``<output_dir>/images``, every chapter is converted to
+    ``<output_dir>/chapters/<stem>.md``, and the chapters are joined with
+    ``---`` separators into ``<output_dir>/<book stem>.md``.
+
+    Args:
+        file_path: Path of the ebook (a ZIP-based container such as EPUB).
+        output_dir: Destination directory; defaults to a directory named
+            after the book next to it. Strings are accepted because callers
+            may forward this value from untyped keyword arguments.
+        temp_dir: Parent directory for the temporary extraction directory.
+
+    Returns:
+        A ``BookConversion`` with the title, the combined markdown, and the
+        output directory.
+
+    Raises:
+        FileNotFoundError: if *file_path* does not exist.
+        RuntimeError: if the file is not a valid ZIP, contains no chapter
+            files, or no chapter could be converted.
+    """
+    file_path = Path(file_path)
+    if not file_path.exists():
+        raise FileNotFoundError(f"Book file not found: {file_path}")
+    # Coerce so that a plain string does not break the "/" joins below.
+    output_dir = (
+        file_path.parent / file_path.stem if output_dir is None else Path(output_dir)
+    )
+    chapters_dir = output_dir / "chapters"
+    images_dir = output_dir / "images"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    chapters_dir.mkdir(exist_ok=True)
+    images_dir.mkdir(exist_ok=True)
+    with tempfile.TemporaryDirectory(dir=temp_dir) as tmp:
+        temp_path = Path(tmp)
+        try:
+            with zipfile.ZipFile(file_path, "r") as zip_ref:
+                zip_ref.extractall(temp_path)
+        except zipfile.BadZipFile as e:
+            raise RuntimeError(f"Invalid book file (not a valid ZIP): {e}") from e
+        oebps_dir = _find_oebps_dir(temp_path)
+        _extract_images(oebps_dir, images_dir)
+        spine_files = _read_spine_order(temp_path)
+        if not spine_files:
+            # No usable spine: fall back to every HTML file, sorted by path.
+            html_files: list[Path] = []
+            for ext in ("*.xhtml", "*.html", "*.htm"):
+                html_files.extend(oebps_dir.rglob(ext))
+            spine_files = sorted(html_files)
+        if not spine_files:
+            raise RuntimeError("No HTML/XHTML chapter files found in book")
+        chapter_files: list[tuple[str, str]] = []
+        used_names: set[str] = set()
+        for html_file in spine_files:
+            if not html_file.exists():
+                continue
+            # Chapters in different directories may share a stem; add a
+            # numeric suffix so one does not overwrite the other's file.
+            basename = html_file.stem
+            suffix = 2
+            while basename in used_names:
+                basename = f"{html_file.stem}_{suffix}"
+                suffix += 1
+            used_names.add(basename)
+            out_md = chapters_dir / f"{basename}.md"
+            try:
+                _convert_html_to_markdown(html_file, out_md)
+                clean_markdown(out_md)
+                content = out_md.read_text(encoding="utf-8")
+                chapter_files.append((basename, content))
+            except Exception as e:
+                # Best effort: one broken chapter must not abort the book.
+                logger.warning(f"Failed to convert {html_file.name}: {e}")
+                continue
+        if not chapter_files:
+            raise RuntimeError("Failed to convert any HTML files to markdown")
+        combined_content = "\n\n---\n\n".join(
+            content.strip() for _, content in chapter_files
+        )
+        full_md_path = output_dir / f"{file_path.stem}.md"
+        full_md_path.write_text(combined_content.strip() + "\n", encoding="utf-8")
+        title = _extract_title_from_markdown(chapter_files[0][1], file_path.stem)
+        logger.info(
+            f"Book extracted to {output_dir}: {len(chapter_files)} chapters, "
+            f"{sum(1 for _ in images_dir.iterdir())} images"
+        )
+        return BookConversion(
+            title=title,
+            markdown=combined_content,
+            output_dir=output_dir,
+        )

aimd_book-0.9.2.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,9 @@
+Metadata-Version: 2.3
+Name: aimd-book
+Version: 0.9.2
+Summary: Ebook conversion package for aimd.
+Author: Shu Li
+Author-email: Shu Li <zetarylee@gmail.com>
+Requires-Dist: logly>=0.1.6
+Requires-Dist: markitdown>=0.1.1,<0.2.0
+Requires-Python: >=3.10, <3.13

aimd_book-0.9.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+aimd_book/__init__.py,sha256=LuyQW-u0L32gtsfyY59TtvLtIv_LIVh98W1-N9lnkUY,417
+aimd_book/_plugin.py,sha256=o9xf_DMHYM-vuSXW4TO33zrPu2z_PRrs4-tdd4b8hsg,1700
+aimd_book/cleaner.py,sha256=i8ZrWjLTd3-anN7w6IRkhjwo6hLgInecJQPZdXZsZHk,7576
+aimd_book/processor.py,sha256=snZ3kUgXf8q1J9vcUFfkvTG51EhXJpwjavV48gx8QNU,7199
+aimd_book-0.9.2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+aimd_book-0.9.2.dist-info/entry_points.txt,sha256=Yol1vB3Votz-ZKKmal5w97fC7AKJR51As1E2xFPZFb8,43
+aimd_book-0.9.2.dist-info/METADATA,sha256=19V8qMfsbgl42QGr0yOXnZEpMPvCCuyUiYqYCYNh9v0,254
+aimd_book-0.9.2.dist-info/RECORD,,

aimd_book-0.9.2.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: uv 0.8.24
+Root-Is-Purelib: true
+Tag: py3-none-any

aimd_book-0.9.2.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,3 @@
+[markitdown.plugin]
+aimd_book = aimd_book