PyPI - markitai - Versions diffs - 0.3.0__py3-none-any.whl - Mend

markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

markitai/__init__.py +3 -0
markitai/batch.py +1316 -0
markitai/cli.py +3979 -0
markitai/config.py +602 -0
markitai/config.schema.json +748 -0
markitai/constants.py +222 -0
markitai/converter/__init__.py +49 -0
markitai/converter/_patches.py +98 -0
markitai/converter/base.py +164 -0
markitai/converter/image.py +181 -0
markitai/converter/legacy.py +606 -0
markitai/converter/office.py +526 -0
markitai/converter/pdf.py +679 -0
markitai/converter/text.py +63 -0
markitai/fetch.py +1725 -0
markitai/image.py +1335 -0
markitai/json_order.py +550 -0
markitai/llm.py +4339 -0
markitai/ocr.py +347 -0
markitai/prompts/__init__.py +159 -0
markitai/prompts/cleaner.md +93 -0
markitai/prompts/document_enhance.md +77 -0
markitai/prompts/document_enhance_complete.md +65 -0
markitai/prompts/document_process.md +60 -0
markitai/prompts/frontmatter.md +28 -0
markitai/prompts/image_analysis.md +21 -0
markitai/prompts/image_caption.md +8 -0
markitai/prompts/image_description.md +13 -0
markitai/prompts/page_content.md +17 -0
markitai/prompts/url_enhance.md +78 -0
markitai/security.py +286 -0
markitai/types.py +30 -0
markitai/urls.py +187 -0
markitai/utils/__init__.py +33 -0
markitai/utils/executor.py +69 -0
markitai/utils/mime.py +85 -0
markitai/utils/office.py +262 -0
markitai/utils/output.py +53 -0
markitai/utils/paths.py +81 -0
markitai/utils/text.py +359 -0
markitai/workflow/__init__.py +37 -0
markitai/workflow/core.py +760 -0
markitai/workflow/helpers.py +509 -0
markitai/workflow/single.py +369 -0
markitai-0.3.0.dist-info/METADATA +159 -0
markitai-0.3.0.dist-info/RECORD +48 -0
markitai-0.3.0.dist-info/WHEEL +4 -0
markitai-0.3.0.dist-info/entry_points.txt +2 -0

markitai/ocr.py ADDED Viewed

@@ -0,0 +1,347 @@
+"""OCR module using RapidOCR."""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+from loguru import logger
+from markitai.constants import DEFAULT_OCR_SAMPLE_PAGES, DEFAULT_RENDER_DPI
+if TYPE_CHECKING:
+    from markitai.config import OCRConfig
+@dataclass
+class OCRResult:
+    """Result of OCR processing."""
+    text: str
+    confidence: float
+    boxes: list[list[float]]
+class OCRProcessor:
+    """OCR processor using RapidOCR."""
+    def __init__(self, config: OCRConfig | None = None) -> None:
+        """
+        Initialize OCR processor.
+        Args:
+            config: Optional OCR configuration
+        """
+        self.config = config
+        self._engine = None
+    @property
+    def engine(self):
+        """Get or create the RapidOCR engine (lazy loading)."""
+        if self._engine is None:
+            self._engine = self._create_engine()
+        return self._engine
+    def _get_lang_enum(self, lang_code: str):
+        """Map language code to RapidOCR LangRec enum."""
+        from rapidocr import LangRec
+        lang_map = {
+            "zh": LangRec.CH,
+            "ch": LangRec.CH,
+            "en": LangRec.EN,
+            "ja": LangRec.JAPAN,
+            "japan": LangRec.JAPAN,
+            "ko": LangRec.KOREAN,
+            "korean": LangRec.KOREAN,
+            "ar": LangRec.ARABIC,
+            "arabic": LangRec.ARABIC,
+            "th": LangRec.TH,
+            "latin": LangRec.LATIN,
+        }
+        return lang_map.get(lang_code.lower(), LangRec.CH)
+    def _create_engine(self):
+        """Create RapidOCR engine with configuration."""
+        try:
+            from rapidocr import RapidOCR
+        except ImportError as e:
+            raise ImportError(
+                "RapidOCR is not installed. "
+                "Install with: pip install rapidocr onnxruntime"
+            ) from e
+        # Build params
+        params = {
+            "Global.log_level": "warning",  # Reduce logging noise
+        }
+        # Set language if configured (must use LangRec enum)
+        # Note: RapidOCR params dict expects specific types, type checker doesn't recognize LangRec enum
+        if self.config and self.config.lang:
+            params["Rec.lang_type"] = self._get_lang_enum(self.config.lang)  # type: ignore[assignment]
+        return RapidOCR(params=params)
+    def recognize(self, image_path: Path | str) -> OCRResult:
+        """
+        Perform OCR on an image file.
+        Args:
+            image_path: Path to the image file
+        Returns:
+            OCRResult with recognized text and metadata
+        """
+        image_path = Path(image_path)
+        if not image_path.exists():
+            raise FileNotFoundError(f"Image not found: {image_path}")
+        logger.debug(f"Running OCR on: {image_path.name}")
+        result: Any = self.engine(str(image_path))
+        # Extract text from result (RapidOCR returns union type with incomplete stubs)
+        # Use 'is not None' to avoid numpy array boolean ambiguity
+        texts = list(result.txts) if result.txts is not None else []
+        scores = list(result.scores) if result.scores is not None else []
+        boxes = list(result.boxes) if result.boxes is not None else []
+        # Join all recognized text
+        full_text = "\n".join(texts)
+        # Calculate average confidence
+        avg_confidence = sum(scores) / len(scores) if scores else 0.0
+        logger.debug(
+            f"OCR completed: {len(texts)} text blocks, "
+            f"avg confidence: {avg_confidence:.2f}"
+        )
+        return OCRResult(
+            text=full_text,
+            confidence=avg_confidence,
+            boxes=[
+                box.tolist() if hasattr(box, "tolist") else list(box) for box in boxes
+            ],
+        )
+    def recognize_numpy(self, image_array: Any) -> OCRResult:
+        """
+        Perform OCR on a numpy array (RGB image data).
+        This is more efficient than recognize_bytes as it avoids
+        intermediate file I/O when the image is already in memory.
+        Args:
+            image_array: numpy array of shape (H, W, 3) or (H, W, 4) in RGB(A) format
+        Returns:
+            OCRResult with recognized text and metadata
+        """
+        import numpy as np
+        # Ensure we have a proper numpy array
+        if not isinstance(image_array, np.ndarray):
+            raise TypeError(f"Expected numpy array, got {type(image_array)}")
+        logger.debug(f"Running OCR on numpy array: shape={image_array.shape}")
+        # RapidOCR can accept numpy arrays directly
+        result: Any = self.engine(image_array)
+        # Extract text from result
+        # Use 'is not None' to avoid numpy array boolean ambiguity
+        texts = list(result.txts) if result.txts is not None else []
+        scores = list(result.scores) if result.scores is not None else []
+        boxes = list(result.boxes) if result.boxes is not None else []
+        # Join all recognized text
+        full_text = "\n".join(texts)
+        # Calculate average confidence
+        avg_confidence = sum(scores) / len(scores) if scores else 0.0
+        logger.debug(
+            f"OCR completed: {len(texts)} text blocks, "
+            f"avg confidence: {avg_confidence:.2f}"
+        )
+        return OCRResult(
+            text=full_text,
+            confidence=avg_confidence,
+            boxes=[
+                box.tolist() if hasattr(box, "tolist") else list(box) for box in boxes
+            ],
+        )
+    def recognize_bytes(self, image_data: bytes) -> OCRResult:
+        """
+        Perform OCR on image bytes.
+        Args:
+            image_data: Raw image bytes
+        Returns:
+            OCRResult with recognized text and metadata
+        """
+        import io
+        import numpy as np
+        from PIL import Image
+        # Load image from bytes
+        image = Image.open(io.BytesIO(image_data))
+        # Convert to RGB if needed (RapidOCR works best with RGB)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        # Convert to numpy array and use recognize_numpy directly
+        # This avoids temporary file I/O
+        image_array = np.array(image)
+        return self.recognize_numpy(image_array)
+    def recognize_pdf_page(
+        self,
+        pdf_path: Path,
+        page_num: int,
+        dpi: int = DEFAULT_RENDER_DPI,
+    ) -> OCRResult:
+        """
+        Perform OCR on a specific PDF page.
+        Args:
+            pdf_path: Path to the PDF file
+            page_num: Page number (0-indexed)
+            dpi: Resolution for rendering
+        Returns:
+            OCRResult with recognized text
+        """
+        try:
+            import fitz  # pymupdf
+        except ImportError as e:
+            raise ImportError(
+                "PyMuPDF is not installed. Install with: pip install pymupdf"
+            ) from e
+        doc = fitz.open(pdf_path)
+        try:
+            if page_num >= len(doc):
+                raise ValueError(
+                    f"Page {page_num} out of range. PDF has {len(doc)} pages."
+                )
+            page = doc[page_num]
+            # Render page to image
+            mat = fitz.Matrix(dpi / 72, dpi / 72)
+            pix = page.get_pixmap(matrix=mat)
+            # Use recognize_pixmap for direct processing
+            return self.recognize_pixmap(pix.samples, pix.width, pix.height, pix.n)
+        finally:
+            doc.close()
+    def recognize_pixmap(
+        self,
+        samples: bytes,
+        width: int,
+        height: int,
+        n_channels: int,
+    ) -> OCRResult:
+        """
+        Perform OCR on raw pixel data (e.g., from pymupdf pixmap).
+        This method is optimized for use with pymupdf's pixmap.samples,
+        avoiding redundant image encoding/decoding.
+        Args:
+            samples: Raw pixel data bytes
+            width: Image width in pixels
+            height: Image height in pixels
+            n_channels: Number of color channels (3 for RGB, 4 for RGBA)
+        Returns:
+            OCRResult with recognized text
+        """
+        import numpy as np
+        # Convert raw bytes to numpy array
+        image_array = np.frombuffer(samples, dtype=np.uint8).reshape(
+            (height, width, n_channels)
+        )
+        # If RGBA, convert to RGB
+        if n_channels == 4:
+            image_array = image_array[:, :, :3]
+        return self.recognize_numpy(image_array)
+    def is_scanned_pdf(
+        self, pdf_path: Path, sample_pages: int = DEFAULT_OCR_SAMPLE_PAGES
+    ) -> bool:
+        """
+        Check if a PDF is likely scanned (image-based).
+        Args:
+            pdf_path: Path to the PDF file
+            sample_pages: Number of pages to sample
+        Returns:
+            True if PDF appears to be scanned
+        """
+        try:
+            import fitz
+        except ImportError:
+            return False
+        doc = fitz.open(pdf_path)
+        try:
+            total_text_length = 0
+            pages_to_check = min(sample_pages, len(doc))
+            for i in range(pages_to_check):
+                page = doc[i]
+                # Note: pymupdf get_text() returns str but type stubs say Any
+                text: str = page.get_text()  # type: ignore[assignment]
+                total_text_length += len(text.strip())
+            # If very little text extracted, likely scanned
+            avg_text_per_page = total_text_length / pages_to_check
+            return avg_text_per_page < 100  # Threshold: less than 100 chars per page
+        finally:
+            doc.close()
+    def recognize_to_markdown(self, image_path: Path | str) -> str:
+        """
+        Perform OCR and format result as markdown.
+        Uses RapidOCR's built-in to_markdown() method if available.
+        Args:
+            image_path: Path to the image file
+        Returns:
+            Markdown formatted text
+        """
+        image_path = Path(image_path)
+        result: Any = self.engine(
+            str(image_path),
+            return_word_box=True,
+            return_single_char_box=True,
+        )
+        # Try to use built-in markdown conversion
+        if hasattr(result, "to_markdown"):
+            return result.to_markdown()
+        # Fallback: simple text extraction
+        texts = result.txts if result.txts else []
+        return "\n\n".join(texts)

markitai/prompts/__init__.py ADDED Viewed

@@ -0,0 +1,159 @@
+"""Prompt management module."""
+from __future__ import annotations
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from markitai.config import PromptsConfig
+# Built-in prompts directory: the folder containing this module, where
+# PromptManager looks up the bundled "<name>.md" templates as a last resort.
+BUILTIN_PROMPTS_DIR = Path(__file__).parent
+class PromptManager:
+    """Manager for loading and rendering prompts."""
+    # Available prompt names
+    PROMPT_NAMES = (
+        "cleaner",
+        "frontmatter",
+        "image_caption",
+        "image_description",
+        "image_analysis",
+        "page_content",
+        "document_enhance",
+        "document_process",
+        "document_enhance_complete",
+        "url_enhance",
+    )
+    def __init__(self, config: PromptsConfig | None = None) -> None:
+        """
+        Initialize prompt manager.
+        Args:
+            config: Optional prompts configuration
+        """
+        self.config = config
+        self._cache: dict[str, str] = {}
+    def get_prompt(self, name: str, **variables: str) -> str:
+        """
+        Get a prompt by name with variables substituted.
+        Args:
+            name: Prompt name (cleaner, frontmatter, image_caption, image_description)
+            **variables: Variables to substitute in the template
+        Returns:
+            Rendered prompt string
+        Raises:
+            ValueError: If prompt name is not valid
+        """
+        if name not in self.PROMPT_NAMES:
+            raise ValueError(
+                f"Unknown prompt: {name}. Valid names: {', '.join(self.PROMPT_NAMES)}"
+            )
+        template = self._load_prompt(name)
+        return self._render(template, **variables)
+    def _load_prompt(self, name: str) -> str:
+        """
+        Load prompt template from file.
+        Priority:
+        1. Config-specified path
+        2. Custom directory
+        3. Built-in prompts
+        """
+        # Check cache first
+        if name in self._cache:
+            return self._cache[name]
+        template = None
+        # 1. Check config-specified path
+        if self.config:
+            config_path = getattr(self.config, name, None)
+            if config_path:
+                path = Path(config_path).expanduser()
+                if path.exists():
+                    template = path.read_text(encoding="utf-8")
+        # 2. Check custom directory
+        if template is None and self.config:
+            custom_dir = Path(self.config.dir).expanduser()
+            custom_path = custom_dir / f"{name}.md"
+            if custom_path.exists():
+                template = custom_path.read_text(encoding="utf-8")
+        # 3. Fall back to built-in
+        if template is None:
+            builtin_path = BUILTIN_PROMPTS_DIR / f"{name}.md"
+            if builtin_path.exists():
+                template = builtin_path.read_text(encoding="utf-8")
+            else:
+                raise FileNotFoundError(f"Built-in prompt not found: {name}")
+        # Cache the template
+        self._cache[name] = template
+        return template
+    def _render(self, template: str, **variables: str) -> str:
+        """
+        Render a template with variables.
+        Uses simple {variable} substitution.
+        """
+        result = template
+        # Add default variables
+        if "timestamp" not in variables:
+            variables["timestamp"] = datetime.now().astimezone().isoformat()
+        for key, value in variables.items():
+            result = result.replace(f"{{{key}}}", str(value))
+        return result
+    def clear_cache(self) -> None:
+        """Clear the prompt cache."""
+        self._cache.clear()
+    def list_prompts(self) -> dict[str, str]:
+        """
+        List all available prompts with their sources.
+        Returns:
+            Dict mapping prompt names to their source paths
+        """
+        result = {}
+        for name in self.PROMPT_NAMES:
+            source = "built-in"
+            # Check config-specified path
+            if self.config:
+                config_path = getattr(self.config, name, None)
+                if config_path:
+                    path = Path(config_path).expanduser()
+                    if path.exists():
+                        source = str(path)
+                        result[name] = source
+                        continue
+                # Check custom directory
+                custom_dir = Path(self.config.dir).expanduser()
+                custom_path = custom_dir / f"{name}.md"
+                if custom_path.exists():
+                    source = str(custom_path)
+            result[name] = source
+        return result

markitai/prompts/cleaner.md ADDED Viewed

@@ -0,0 +1,93 @@
+Markdown 格式优化任务。
+【输出要求】
+- 直接输出纯 Markdown，不要包裹在代码块中
+- 不要添加任何解释或说明
+【核心原则 - 必须严格遵守】
+- **禁止翻译（CRITICAL - DO NOT TRANSLATE）**：
+  - 英文输入 → 英文输出（English in → English out）
+  - 中文输入 → 中文输出（Chinese in → Chinese out）
+  - 绝对禁止将任何语言翻译成另一种语言
+  - 违反此规则将导致输出无效
+- **禁止改写**：保留原文的用词和表达方式，只做格式调整
+- 英文内容用英文标点，中文内容用中文标点
+- 混合语言文档 → 根据上下文判断，保持局部一致性
+【清理规范】
+- 保留幻灯片标记（如 `<!-- Slide number: X -->`），不要添加新的 slide 注释
+- 保留页面图片注释（如 `<!-- Page images for reference -->` 和 `<!-- ![Page X](...) -->`）
+- 删除其他 HTML 注释
+- **删除 PPT/PDF 页眉页脚**（IMPORTANT - 必须删除）：
+  - 特征：每页/slide 末尾重复出现的 2-4 行短文本（每行 < 30 字符）+ 页码
+  - 示例 1：`FTD\nFREE TEST DATA\n2`（品牌缩写 + 品牌名 + 页码）
+  - 示例 2：`Company Name\n© 2024\n5`
+  - 删除条件：相同模式在 ≥3 页重复出现
+  - 位置：通常在每个 `<!-- Slide number: X -->` 或 `<!-- Page number: X -->` 块的末尾
+- 删除图表残留的孤立数字行（如坐标轴标签被单独提取成行）
+- 删除无意义的重复标题（如每页都有相同的文档标题）
+- 删除标题前的孤立短文本（无格式的1-3个词，如重复的文档名）
+【空行规范】
+- 标题(#)前后各保留一个空行
+- 代码块(```)前后各保留一个空行
+- 列表块前后各保留一个空行
+- 表格前后各保留一个空行
+- HTML 注释（如 `<!-- Page number: X -->`、`<!-- Slide number: X -->`）前后各保留一个空行
+- 段落间保留一个空行，删除多余空行
+【标点与强调】
+- 根据文档语言使用对应标点
+- 闭合强调标记放在标点内侧：错误「内容。**」→ 正确「内容**。」
+- 合并连续强调标记：「**这是****一个****强调**」→「**这是一个强调**」
+- 粗体/斜体标记与中文之间不加空格
+【代码块规范】
+- 嵌套时外层反引号数 = 内层最大反引号数 + 1
+【列表规范】
+- 无序列表统一使用 - 符号
+- 有序列表使用 1. 2. 3. 格式
+- 嵌套列表缩进 2 空格
+【段落规范】
+- 合并不应断行的段落（同一句话被错误换行）
+- 保留有意义的换行（如诗歌、地址）
+【链接格式修复】
+- 修复换行的链接：将 `[标题\n\n描述](url)` 合并为 `[标题](url)`
+- 示例：
+  - 错误：`[Evergreen notes\n\nEvergreen notes allow you to...](/evergreen-notes)`
+  - 正确：`[Evergreen notes](/evergreen-notes)`
+【表格规范】
+- 若列头为空且内容语义清晰，可根据语义补充列头
+- 若第一列是纯数字行号且无列头，可补充行号列头（如 # 或 No.）
+- **CRITICAL: 列头语言必须严格与表格数据内容的语言一致**（非 frontmatter 的语言）
+  - 英文/拉丁文数据 → 英文列头（**绝对禁止使用中文列头**）
+  - 中文数据 → 中文列头（**绝对禁止使用英文列头**）
+  - Lorem ipsum 视为英文/拉丁文，必须用英文列头
+  - 示例：数据 "1, Dulce, Abril, Female, United States" → 列头 "# | First Name | Last Name | Gender | Country"
+  - 示例：数据 "1, Lorem ipsum, Dolor sit" → 列头 "# | Text | Category" (NOT 文本内容 | 分类)
+  - 示例：数据 "1, 张, 三, 男, 中国" → 列头 "# | 姓 | 名 | 性别 | 国家"
+【保持不变 - 严格保留，不得删除或修改】
+- 代码块内容
+- 表格行列结构（列头补充除外）
+- **所有图片链接 `![...](...)` 必须完整保留，URL 不得修改**
+- **所有超链接 `[...](...)` 必须完整保留，URL 不得修改**
+- HTML 注释中的图片链接（如 `<!-- ![Page X](...) -->`）
+- **所有 `__MARKITAI_*__` 占位符必须原样保留**（如 `__MARKITAI_IMG_0__`、`__MARKITAI_SLIDE_0__`）
+- **原文内容（禁止翻译或改写）**
+【URL 保护 - CRITICAL】
+- **禁止修改任何 URL** - 图片链接和超链接的 URL 必须与原文完全一致
+- **禁止编造 URL** - 绝对不能猜测、推断或生成原文中不存在的 URL
+- **禁止替换 URL** - 即使 URL 看起来"不正确"，也必须保留原样
+- 示例：原文 `![](https://example.com/a.jpg)` → 输出必须是 `![](https://example.com/a.jpg)`
+- 违反此规则将导致输出无效
+输入：
+{content}
+直接输出优化后的 Markdown：

markitai/prompts/document_enhance.md ADDED Viewed

@@ -0,0 +1,77 @@
+你是一个文档格式清理专家。你的任务是清理提取文本中的格式问题，同时保持内容完整性。
+你会收到：
+1. **提取的文本**：程序提取的 Markdown 内容
+2. **页面图片**：用于验证格式的视觉参考
+## 核心原则 - 必须严格遵守
+- **禁止翻译（CRITICAL - DO NOT TRANSLATE）**：
+  - 英文输入 → 英文输出（English in → English out）
+  - 中文输入 → 中文输出（Chinese in → Chinese out）
+  - 绝对禁止将任何语言翻译成另一种语言
+  - 违反此规则将导致输出无效
+- **禁止改写**：保留原文的用词和表达方式，只做格式调整
+## 清理任务
+【删除残留 - 仅删除明显垃圾，不删除正文】
+- 删除图表提取残留的孤立数字行（如单独一行的 "12", "10", "8" 等，通常是坐标轴标签）
+- 删除 PPT/PDF 页眉页脚：
+  - 特征：每页末尾重复出现的 2-4 行短文本（每行 < 30 字符）
+  - 示例：`FTD\nFREE TEST DATA\n2`（品牌名 + 页码）
+  - 示例：`Company Name\n© 2024\n5`
+  - **仅当相同文本在多页重复出现时才删除**
+- 删除无意义的重复标题（如每页都有相同的文档名）
+【格式修正】
+- 参考页面图片修正标题层级（##、###等）
+- 修正列表格式（缩进、符号）
+- 修正表格结构
+- 为 `![](assets/...)` 图片添加简短 alt text
+- 修复换行的链接格式：将 `[文本\n\n描述](url)` 合并为 `[文本](url)`
+【空行规范】
+- 标题(#)前后各保留一个空行
+- 列表块/表格前后各保留一个空行
+- 段落间保留一个空行，删除多余空行
+## 禁止事项 - CRITICAL
+- **禁止翻译任何内容** - 原文是什么语言就保留什么语言
+- **禁止删除任何正文段落**（CRITICAL - DO NOT DELETE CONTENT）：
+  - 每个 `<!-- Page number: X -->` 标记的页面内容必须完整保留
+  - 输入有多少页，输出就必须有多少页
+  - 只能删除明显的残留/垃圾（孤立数字、重复页眉页脚）
+  - 如果不确定是否应该删除，就保留
+- **页码注释必须与内容对齐**（CRITICAL - PAGE MARKER ALIGNMENT）：
+  - `<!-- Page number: X -->` 注释后面的内容必须是第 X 页的实际内容
+  - 禁止将一个页面的内容移动到另一个页码注释下
+  - 如果某页内容为空，保留页码注释即可，不要删除
+  - 输出的页码顺序必须与输入完全一致（1, 2, 3... 不能变成 1, 3, 2...）
+- **禁止移动内容位置** - 保持原有顺序
+- **禁止重写或改述内容** - 保留原文
+- **禁止添加新内容** - 只做清理
+- **禁止用代码块包裹输出** - 直接输出纯 Markdown，不要用 \`\`\`markdown 包裹
+- **必须保留所有链接** - `[文本](url)` 原样保留，URL 不得修改
+- **必须保留所有图片引用** - `![...](assets/...)` 位置不变，URL 不得修改
+- **禁止修改任何 URL** - 图片链接和超链接的 URL 必须与原文完全一致
+- **禁止编造 URL** - 绝对不能猜测、推断或生成原文中不存在的 URL
+- **必须保留所有 Slide 注释** - `<!-- Slide number: X -->` 原样保留在每个 slide 内容开头，位置不变，不要添加新的 slide 注释
+- **必须保留所有页码注释** - `<!-- Page number: X -->` 原样保留在每页内容开头，位置不变，不要添加新的页码注释
+- **必须保留所有占位符** - `__MARKITAI_*__` 格式的占位符原样保留，位置不变
+- **禁止输出页面截图引用** - 不要输出 `![Page X](screenshots/...)`
+- **禁止输出页面/图片标记** - 不要输出 `## Page X Image:`、`__MARKITAI_PAGE_LABEL_X__`、`__MARKITAI_IMG_LABEL_X__` 等系统内部标记
+## 图片语法规范
+图片引用必须严格遵循 Markdown 语法，**不要添加多余的括号**：
+- 正确: `![alt text](assets/image.jpg)`
+- 错误: `![alt text](assets/image.jpg))` (多余的右括号)
+- 错误: `![alt text](assets/image.jpg)))` (多余的右括号)
+## 输出要求
+- 仅输出清理后的 Markdown 内容
+- 输出语言与源文档保持一致
+- 不要添加任何说明文字