PyPI - markitai - Versions diffs - 0.3.0__py3-none-any.whl - Mend

markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

markitai/__init__.py +3 -0
markitai/batch.py +1316 -0
markitai/cli.py +3979 -0
markitai/config.py +602 -0
markitai/config.schema.json +748 -0
markitai/constants.py +222 -0
markitai/converter/__init__.py +49 -0
markitai/converter/_patches.py +98 -0
markitai/converter/base.py +164 -0
markitai/converter/image.py +181 -0
markitai/converter/legacy.py +606 -0
markitai/converter/office.py +526 -0
markitai/converter/pdf.py +679 -0
markitai/converter/text.py +63 -0
markitai/fetch.py +1725 -0
markitai/image.py +1335 -0
markitai/json_order.py +550 -0
markitai/llm.py +4339 -0
markitai/ocr.py +347 -0
markitai/prompts/__init__.py +159 -0
markitai/prompts/cleaner.md +93 -0
markitai/prompts/document_enhance.md +77 -0
markitai/prompts/document_enhance_complete.md +65 -0
markitai/prompts/document_process.md +60 -0
markitai/prompts/frontmatter.md +28 -0
markitai/prompts/image_analysis.md +21 -0
markitai/prompts/image_caption.md +8 -0
markitai/prompts/image_description.md +13 -0
markitai/prompts/page_content.md +17 -0
markitai/prompts/url_enhance.md +78 -0
markitai/security.py +286 -0
markitai/types.py +30 -0
markitai/urls.py +187 -0
markitai/utils/__init__.py +33 -0
markitai/utils/executor.py +69 -0
markitai/utils/mime.py +85 -0
markitai/utils/office.py +262 -0
markitai/utils/output.py +53 -0
markitai/utils/paths.py +81 -0
markitai/utils/text.py +359 -0
markitai/workflow/__init__.py +37 -0
markitai/workflow/core.py +760 -0
markitai/workflow/helpers.py +509 -0
markitai/workflow/single.py +369 -0
markitai-0.3.0.dist-info/METADATA +159 -0
markitai-0.3.0.dist-info/RECORD +48 -0
markitai-0.3.0.dist-info/WHEEL +4 -0
markitai-0.3.0.dist-info/entry_points.txt +2 -0

markitai/urls.py ADDED Viewed

@@ -0,0 +1,187 @@
+"""URL list parsing module for batch URL processing."""
+from __future__ import annotations
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from loguru import logger
+# URL pattern for validation.
+# Only checks that the string starts with an http:// or https:// scheme
+# (case-insensitive); nothing after the scheme is inspected.
+_URL_PATTERN = re.compile(r"^https?://", re.IGNORECASE)
+@dataclass
+class UrlEntry:
+    """One item read from a URL list file.
+    Attributes:
+        url: Address that should be processed.
+        output_name: Caller-chosen output filename without extension, or
+            None when the list did not supply one.
+    """
+    url: str
+    output_name: str | None = None
+class UrlListParseError(Exception):
+    """Signals that the contents of a URL list file could not be parsed."""
+def is_url_list_file(path: Path) -> bool:
+    """Tell whether ``path`` names a URL list file.
+    Only the filename is examined: the final suffix must be ``.urls``,
+    compared case-insensitively. The filesystem is not consulted.
+    Args:
+        path: Candidate path.
+    Returns:
+        True when the suffix is ``.urls`` in any letter case, else False.
+    """
+    extension = path.suffix.lower()
+    return extension == ".urls"
+def parse_url_list(file_path: Path) -> list[UrlEntry]:
+    """Parse a URL list file.
+    The file is decoded as UTF-8; a leading byte order mark, if present,
+    is discarded so it cannot hide the JSON ``[`` or the first URL scheme.
+    Supported formats:
+    1. Plain text: one URL per line
+       - Empty lines are ignored
+       - Lines starting with # are comments
+       - Lines can optionally have a custom output name after whitespace:
+         https://example.com custom_name
+    2. JSON array of strings:
+       ["https://example1.com", "https://example2.com"]
+    3. JSON array of objects:
+       [
+         {"url": "https://example1.com"},
+         {"url": "https://example2.com", "output_name": "custom"}
+       ]
+    Content whose first non-whitespace character is ``[`` is treated as
+    JSON; everything else is parsed as plain text.
+    Args:
+        file_path: Path to the URL list file
+    Returns:
+        List of UrlEntry objects (empty when the file has no content)
+    Raises:
+        UrlListParseError: If the file cannot be parsed
+        FileNotFoundError: If the file does not exist
+    """
+    if not file_path.exists():
+        raise FileNotFoundError(f"URL list file not found: {file_path}")
+    # "utf-8-sig" reads plain UTF-8 unchanged and additionally drops a
+    # leading BOM, which str.strip() would not remove.
+    content = file_path.read_text(encoding="utf-8-sig").strip()
+    if not content:
+        return []
+    # Try JSON first
+    if content.startswith("["):
+        return _parse_json_url_list(content, file_path)
+    # Fall back to plain text
+    return _parse_text_url_list(content, file_path)
+def _parse_json_url_list(content: str, file_path: Path) -> list[UrlEntry]:
+    """Parse JSON format URL list.
+    The top-level value must be an array. Each element is either a URL
+    string or an object with a ``url`` key and an optional ``output_name``
+    key. Elements that cannot be used are skipped with a logged warning
+    rather than aborting the whole file; blank strings are skipped silently.
+    Args:
+        content: Text of the file, already stripped.
+        file_path: Path of the file, used only in error messages.
+    Returns:
+        Entries for every element whose URL starts with http:// or https://.
+    Raises:
+        UrlListParseError: If the text is not valid JSON or the top-level
+            value is not an array.
+    """
+    try:
+        data = json.loads(content)
+    except json.JSONDecodeError as e:
+        # Chain the decoder error so its details stay in the traceback.
+        raise UrlListParseError(f"Invalid JSON in {file_path}: {e}") from e
+    if not isinstance(data, list):
+        raise UrlListParseError(
+            f"Expected JSON array in {file_path}, got {type(data).__name__}"
+        )
+    entries: list[UrlEntry] = []
+    for i, item in enumerate(data):
+        output_name: str | None = None
+        if isinstance(item, str):
+            # Simple string URL
+            url = item.strip()
+            if not url:
+                continue
+        elif isinstance(item, dict):
+            # Object with url and optional output_name
+            raw_url = item.get("url")
+            if raw_url is not None and not isinstance(raw_url, str):
+                # e.g. {"url": 42}; calling .strip() on it would raise.
+                logger.warning(
+                    f"Skipping entry at index {i}: 'url' must be a string, got {type(raw_url).__name__}"
+                )
+                continue
+            url = (raw_url or "").strip()
+            if not url:
+                logger.warning(f"Skipping entry at index {i}: missing 'url' field")
+                continue
+            raw_name = item.get("output_name")
+            # Falsy or whitespace-only names collapse to None so the field
+            # is always a non-empty str or None.
+            output_name = (str(raw_name).strip() or None) if raw_name else None
+        else:
+            logger.warning(
+                f"Skipping entry at index {i}: expected string or object, got {type(item).__name__}"
+            )
+            continue
+        # Scheme validation is shared by both accepted element shapes.
+        if not _URL_PATTERN.match(url):
+            logger.warning(f"Skipping invalid URL at index {i}: {url[:50]}...")
+            continue
+        entries.append(UrlEntry(url=url, output_name=output_name))
+    return entries
+def _parse_text_url_list(content: str, file_path: Path) -> list[UrlEntry]:
+    """Read a plain-text URL list, one entry per line.
+    Blank lines and lines whose first non-space character is ``#`` are
+    ignored. The first whitespace-separated token is the URL; whatever
+    follows is the output name, with one pair of matching surrounding
+    quotes removed. Lines whose URL lacks an http(s) scheme are skipped
+    with a warning.
+    Args:
+        content: Text of the file.
+        file_path: Path of the file; not used by this function.
+    Returns:
+        The entries in file order.
+    """
+    parsed = []
+    for line_num, raw_line in enumerate(content.splitlines(), start=1):
+        text = raw_line.strip()
+        if text == "" or text.startswith("#"):
+            continue
+        fields = text.split(maxsplit=1)
+        url = fields[0]
+        if _URL_PATTERN.match(url) is None:
+            logger.warning(f"Skipping invalid URL at line {line_num}: {url[:50]}...")
+            continue
+        output_name = None
+        if len(fields) == 2:
+            output_name = fields[1].strip()
+            # Strip a single layer of matching quotes, double or single.
+            for quote in ('"', "'"):
+                if output_name.startswith(quote) and output_name.endswith(quote):
+                    output_name = output_name[1:-1]
+                    break
+        # An empty name (e.g. "" in the file) is recorded as None.
+        parsed.append(UrlEntry(url=url, output_name=output_name or None))
+    return parsed
+def find_url_list_files(directory: Path) -> list[Path]:
+    """Collect every ``*.urls`` path below a directory, at any depth.
+    Args:
+        directory: Root of the search.
+    Returns:
+        Matching paths in sorted order; an empty list when ``directory``
+        is not a directory.
+    """
+    matches = list(directory.glob("**/*.urls")) if directory.is_dir() else []
+    matches.sort()
+    return matches

markitai/utils/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""Markitai utilities."""
+from markitai.utils.executor import (
+    get_converter_executor,
+    run_in_converter_thread,
+    shutdown_converter_executor,
+)
+from markitai.utils.mime import get_extension_from_mime, get_mime_type
+from markitai.utils.office import find_libreoffice, has_ms_office
+from markitai.utils.output import resolve_output_path
+from markitai.utils.paths import (
+    ensure_assets_dir,
+    ensure_dir,
+    ensure_screenshots_dir,
+    ensure_subdir,
+)
+from markitai.utils.text import normalize_markdown_whitespace
+# Names re-exported by this package; every entry corresponds to one of the
+# imports above, listed alphabetically.
+__all__ = [
+    "ensure_assets_dir",
+    "ensure_dir",
+    "ensure_screenshots_dir",
+    "ensure_subdir",
+    "find_libreoffice",
+    "get_converter_executor",
+    "get_extension_from_mime",
+    "get_mime_type",
+    "has_ms_office",
+    "normalize_markdown_whitespace",
+    "resolve_output_path",
+    "run_in_converter_thread",
+    "shutdown_converter_executor",
+]

markitai/utils/executor.py ADDED Viewed

@@ -0,0 +1,69 @@
+"""Shared ThreadPoolExecutor for CPU-bound converter operations."""
+from __future__ import annotations
+import asyncio
+import os
+import threading
+from collections.abc import Callable
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, TypeVar
+# Return type of the callable passed to run_in_converter_thread.
+T = TypeVar("T")
+# Global converter thread pool executor with thread-safe initialization.
+# Stays None until get_converter_executor() creates it, and is reset to None
+# by shutdown_converter_executor().
+_CONVERTER_EXECUTOR: ThreadPoolExecutor | None = None
+# Pool size: the CPU count (4 when os.cpu_count() returns None), capped at 8.
+_CONVERTER_MAX_WORKERS = min(os.cpu_count() or 4, 8)
+# Held while the shared executor is being created.
+_EXECUTOR_LOCK = threading.Lock()
+def get_converter_executor() -> ThreadPoolExecutor:
+    """Return the process-wide converter thread pool, creating it on demand.
+    The pool is built at most once: an unlocked check handles the common
+    case, and creation happens under ``_EXECUTOR_LOCK`` after re-checking.
+    Returns:
+        The shared ThreadPoolExecutor used for converter operations.
+    """
+    global _CONVERTER_EXECUTOR
+    existing = _CONVERTER_EXECUTOR
+    if existing is not None:
+        return existing
+    with _EXECUTOR_LOCK:
+        # Another thread may have created the pool while we waited.
+        if _CONVERTER_EXECUTOR is None:
+            _CONVERTER_EXECUTOR = ThreadPoolExecutor(
+                max_workers=_CONVERTER_MAX_WORKERS,
+                thread_name_prefix="markitai-converter",
+            )
+        return _CONVERTER_EXECUTOR
+async def run_in_converter_thread(
+    func: Callable[..., T], *args: Any, **kwargs: Any
+) -> T:
+    """Execute ``func(*args, **kwargs)`` on the shared converter pool.
+    Intended for converter work (PDF parsing, document conversion, etc.)
+    that would otherwise block the event loop.
+    Args:
+        func: Callable to execute in a pool thread
+        *args: Positional arguments forwarded to func
+        **kwargs: Keyword arguments forwarded to func
+    Returns:
+        Whatever func returns
+    """
+    def _invoke() -> T:
+        # run_in_executor forwards positional arguments only, so the
+        # keyword arguments are bound through this closure.
+        return func(*args, **kwargs)
+    return await asyncio.get_running_loop().run_in_executor(
+        get_converter_executor(), _invoke
+    )
+def shutdown_converter_executor() -> None:
+    """Shutdown the shared converter executor.
+    Call this during application cleanup to ensure clean shutdown. Calling
+    it when no executor exists is a no-op. A later call to
+    ``get_converter_executor()`` creates a fresh pool.
+    """
+    global _CONVERTER_EXECUTOR
+    # Detach the pool under the same lock the getter uses, so a concurrent
+    # get_converter_executor() creates a new pool instead of receiving the
+    # one being shut down.
+    with _EXECUTOR_LOCK:
+        executor, _CONVERTER_EXECUTOR = _CONVERTER_EXECUTOR, None
+    # Wait for running work outside the lock so a pool thread that calls
+    # get_converter_executor() cannot deadlock against this shutdown.
+    if executor is not None:
+        executor.shutdown(wait=True)

markitai/utils/mime.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""MIME type utilities for image handling.
+This module provides helper functions for MIME type operations,
+using the centralized mappings defined in constants.py.
+"""
+from __future__ import annotations
+from markitai.constants import EXTENSION_TO_MIME, MIME_TO_EXTENSION
+# MIME types supported by vision LLMs (Anthropic Claude, Google Gemini, OpenAI GPT-4V)
+# SVG, BMP, ICO etc. are NOT supported
+# Consulted by is_llm_supported_image(); a frozenset, so it cannot be
+# modified after import.
+LLM_SUPPORTED_MIME_TYPES = frozenset(
+    {"image/jpeg", "image/png", "image/gif", "image/webp"}
+)
+def get_mime_type(extension: str, default: str = "image/jpeg") -> str:
+    """Look up the MIME type registered for a file extension.
+    The extension is lower-cased and given a leading dot when it lacks
+    one before being looked up in ``EXTENSION_TO_MIME``.
+    Args:
+        extension: File extension, with or without the leading dot
+            (".jpg" and "jpg" are equivalent)
+        default: Value returned when the extension has no mapping
+    Returns:
+        The mapped MIME type, or ``default``
+    Examples:
+        >>> get_mime_type(".jpg")
+        'image/jpeg'
+        >>> get_mime_type("png")
+        'image/png'
+        >>> get_mime_type(".unknown")
+        'image/jpeg'
+    """
+    lowered = extension.lower()
+    key = lowered if lowered.startswith(".") else f".{lowered}"
+    return EXTENSION_TO_MIME.get(key, default)
+def get_extension_from_mime(mime_type: str, default: str = ".jpg") -> str:
+    """Look up the file extension registered for a MIME type.
+    Anything from the first ``;`` onward (content-type parameters such as
+    "; charset=utf-8") is ignored, and the remainder is lower-cased and
+    stripped before the lookup in ``MIME_TO_EXTENSION``.
+    Args:
+        mime_type: MIME type string, e.g. "image/jpeg"
+        default: Value returned when the MIME type has no mapping
+    Returns:
+        The mapped extension including its leading dot, or ``default``
+    Examples:
+        >>> get_extension_from_mime("image/jpeg")
+        '.jpg'
+        >>> get_extension_from_mime("image/png")
+        '.png'
+        >>> get_extension_from_mime("image/unknown")
+        '.jpg'
+    """
+    essence, _, _params = mime_type.lower().partition(";")
+    return MIME_TO_EXTENSION.get(essence.strip(), default)
+def is_llm_supported_image(extension: str) -> bool:
+    """Report whether an image extension maps to a vision-LLM-supported type.
+    The extension is resolved through ``get_mime_type`` with an empty
+    default, so unrecognised extensions are reported as unsupported. Only
+    the types in ``LLM_SUPPORTED_MIME_TYPES`` (jpeg, png, gif, webp) count
+    as supported; formats like SVG, BMP, ICO do not.
+    Args:
+        extension: File extension (with or without leading dot)
+    Returns:
+        True if the resolved MIME type is in the supported set
+    Examples:
+        >>> is_llm_supported_image(".jpg")
+        True
+        >>> is_llm_supported_image(".svg")
+        False
+    """
+    return get_mime_type(extension, default="") in LLM_SUPPORTED_MIME_TYPES

markitai/utils/office.py ADDED Viewed

@@ -0,0 +1,262 @@
+"""Office application detection utilities.
+Provides detection for MS Office (Windows) and LibreOffice (cross-platform).
+- MS Office COM: Used for legacy format conversion (.doc/.ppt) and PPTX slide rendering
+- LibreOffice: Used as fallback for legacy format conversion and PDF export
+"""
+from __future__ import annotations
+import platform
+import shutil
+from functools import lru_cache
+from pathlib import Path
+from loguru import logger
+# Common MS Office installation paths on Windows.
+# _check_office_exe_exists() probes these directories in the order listed.
+_MS_OFFICE_PATHS = [
+    # Microsoft 365 / Office 2019+ (Click-to-Run)
+    r"C:\Program Files\Microsoft Office\root\Office16",
+    r"C:\Program Files (x86)\Microsoft Office\root\Office16",
+    # Office 2016 (MSI)
+    r"C:\Program Files\Microsoft Office\Office16",
+    r"C:\Program Files (x86)\Microsoft Office\Office16",
+    # Office 2013
+    r"C:\Program Files\Microsoft Office\Office15",
+    r"C:\Program Files (x86)\Microsoft Office\Office15",
+    # Office 2010
+    r"C:\Program Files\Microsoft Office\Office14",
+    r"C:\Program Files (x86)\Microsoft Office\Office14",
+]
+def _is_windows() -> bool:
+    """Report whether ``platform.system()`` identifies the host as Windows."""
+    system_name = platform.system()
+    return system_name == "Windows"
+def _check_office_exe_exists(app_name: str) -> bool:
+    """Look for an Office executable in the known installation directories.
+    Args:
+        app_name: Executable name without extension (e.g., "POWERPNT",
+            "WINWORD", "EXCEL"); ".EXE" is appended.
+    Returns:
+        True as soon as the file exists in one of ``_MS_OFFICE_PATHS``,
+        False when none of the directories contains it.
+    """
+    exe_name = f"{app_name}.EXE"
+    candidates = (Path(root) / exe_name for root in _MS_OFFICE_PATHS)
+    # The generator is lazy, so probing stops at the first existing file.
+    located = next((c for c in candidates if c.exists()), None)
+    if located is None:
+        return False
+    logger.debug(f"Found {app_name} at: {located}")
+    return True
+def _detect_ms_office_app(prog_id: str, exe_stem: str, label: str) -> bool:
+    """Shared detection routine behind the ``check_ms_*_available`` functions.
+    Detection strategy:
+    1. Windows Registry lookup of ``prog_id`` under HKEY_CLASSES_ROOT
+    2. Direct file path check via ``_check_office_exe_exists`` (fallback
+       for Click-to-Run installations)
+    Args:
+        prog_id: COM ProgID to look up, e.g. "Word.Application"
+        exe_stem: Executable name without extension, e.g. "WINWORD"
+        label: Product name used in debug log messages, e.g. "MS Word"
+    Returns:
+        True if either method finds the application; always False when
+        not running on Windows.
+    """
+    if not _is_windows():
+        return False
+    # Method 1: Registry lookup
+    try:
+        import winreg  # type: ignore[import-not-found]
+        try:
+            # The key handle is a context manager, so it is closed on exit.
+            with winreg.OpenKey(winreg.HKEY_CLASSES_ROOT, prog_id):  # type: ignore[attr-defined]
+                pass
+            logger.debug(f"{label} detected via registry")
+            return True
+        except OSError:
+            pass  # Registry key not found, try file path
+    except ImportError:
+        pass  # winreg not available
+    # Method 2: Direct file path check (for Click-to-Run installations)
+    if _check_office_exe_exists(exe_stem):
+        logger.debug(f"{label} detected via file path")
+        return True
+    logger.debug(f"{label} not found")
+    return False
+@lru_cache(maxsize=1)
+def check_ms_powerpoint_available() -> bool:
+    """Check if MS Office PowerPoint is installed (Windows only).
+    Tries the registry first and falls back to a file path check. The
+    result is cached by ``lru_cache`` after the first call.
+    Returns:
+        True if PowerPoint is installed, False otherwise.
+    """
+    return _detect_ms_office_app("PowerPoint.Application", "POWERPNT", "MS PowerPoint")
+@lru_cache(maxsize=1)
+def check_ms_word_available() -> bool:
+    """Check if MS Office Word is installed (Windows only).
+    Tries the registry first and falls back to a file path check. The
+    result is cached by ``lru_cache`` after the first call.
+    Returns:
+        True if Word is installed, False otherwise.
+    """
+    return _detect_ms_office_app("Word.Application", "WINWORD", "MS Word")
+@lru_cache(maxsize=1)
+def check_ms_excel_available() -> bool:
+    """Check if MS Office Excel is installed (Windows only).
+    Tries the registry first and falls back to a file path check. The
+    result is cached by ``lru_cache`` after the first call.
+    Returns:
+        True if Excel is installed, False otherwise.
+    """
+    return _detect_ms_office_app("Excel.Application", "EXCEL", "MS Excel")
+import threading
+# Thread-safe cache for has_ms_office result
+# The lock guards the COM probe; _ms_office_checked records that a result
+# has been stored; _ms_office_available holds that result.
+_ms_office_check_lock = threading.Lock()
+_ms_office_checked = False
+_ms_office_available = False
+def has_ms_office() -> bool:
+    """Detect if MS Office PowerPoint is available via COM (Windows only).
+    Used for optional high-quality PPTX slide rendering.
+    Text extraction uses MarkItDown (cross-platform) and doesn't need COM.
+    The probe runs at most once per process; the outcome is stored in the
+    module-level ``_ms_office_checked`` / ``_ms_office_available`` flags and
+    returned directly on later calls. Any exception raised during the probe
+    (including a failed import of pythoncom/win32com) is treated as
+    "not available".
+    Note: For checking installation status, prefer `check_ms_powerpoint_available()`
+    which uses registry lookup and is faster.
+    Returns:
+        True if PowerPoint COM is available, False otherwise.
+    """
+    global _ms_office_checked, _ms_office_available
+    # Fast path: already checked (read without taking the lock)
+    if _ms_office_checked:
+        return _ms_office_available
+    # Non-Windows hosts are recorded as "checked, unavailable" without
+    # attempting any COM import.
+    if not _is_windows():
+        _ms_office_checked = True
+        _ms_office_available = False
+        return False
+    # Thread-safe check with proper COM initialization
+    with _ms_office_check_lock:
+        # Double-check after acquiring lock: another thread may have
+        # finished the probe while this one was waiting.
+        if _ms_office_checked:
+            return _ms_office_available
+        try:
+            import pythoncom  # type: ignore[import-not-found]
+            import win32com.client  # type: ignore[import-not-found]
+            # Initialize COM for this thread (required in worker threads)
+            pythoncom.CoInitialize()
+            try:
+                # Check PowerPoint availability (most relevant for PPTX)
+                ppt = win32com.client.Dispatch("PowerPoint.Application")
+                # NOTE(review): if Dispatch attaches to a PowerPoint instance
+                # the user already has open, Quit() may close it — confirm.
+                ppt.Quit()
+                logger.debug("MS Office (PowerPoint) detected via COM")
+                _ms_office_available = True
+            finally:
+                # Pair every CoInitialize with CoUninitialize, even on failure.
+                pythoncom.CoUninitialize()
+        except Exception:
+            logger.debug("MS Office not available via COM")
+            _ms_office_available = False
+        # Mark as checked only after the result has been stored, so the
+        # lock-free fast path never sees a stale default.
+        _ms_office_checked = True
+        return _ms_office_available
+@lru_cache(maxsize=1)
+def find_libreoffice() -> str | None:
+    """Locate the LibreOffice ``soffice`` executable (result is cached).
+    The ``soffice`` and ``libreoffice`` commands are looked up on PATH
+    first; if neither resolves, a fixed list of well-known install
+    locations for Linux, macOS and Windows is probed in order.
+    Returns:
+        The path resolved from PATH, or the first well-known location
+        that ``shutil.which`` accepts, or None if nothing is found.
+    """
+    for command in ("soffice", "libreoffice"):
+        resolved = shutil.which(command)
+        if resolved:
+            logger.debug(f"LibreOffice found in PATH: {resolved}")
+            return resolved
+    well_known = (
+        # Linux
+        "/usr/bin/soffice",
+        "/usr/local/bin/soffice",
+        "/opt/libreoffice/program/soffice",
+        # macOS
+        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
+        # Windows
+        r"C:\Program Files\LibreOffice\program\soffice.exe",
+        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
+    )
+    # The listed location itself is returned, not the value from which().
+    hit = next((location for location in well_known if shutil.which(location)), None)
+    if hit is None:
+        logger.debug("LibreOffice not found")
+        return None
+    logger.debug(f"LibreOffice found at: {hit}")
+    return hit

markitai/utils/output.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""Output path utilities for Markitai."""
+from __future__ import annotations
+from pathlib import Path
+def resolve_output_path(
+    base_path: Path,
+    on_conflict: str,
+) -> Path | None:
+    """Resolve output path based on conflict strategy.
+    Args:
+        base_path: The original output file path
+        on_conflict: Conflict resolution strategy ("skip", "overwrite",
+            "rename"). Any other value is handled like "rename".
+    Returns:
+        ``base_path`` itself when nothing exists there yet or the strategy
+        is "overwrite"; None when the strategy is "skip" and the file
+        exists; otherwise the first unused versioned sibling path.
+        For rename strategy: file.pdf.md -> file.pdf.v2.md -> file.pdf.v3.md
+        For rename with .llm.md: file.pdf.llm.md -> file.pdf.v2.llm.md
+        For any other name the file's own last suffix is kept:
+        notes.txt -> notes.v2.txt
+        This ensures files sort in natural order (A-Z).
+    """
+    if not base_path.exists():
+        return base_path
+    if on_conflict == "skip":
+        return None
+    if on_conflict == "overwrite":
+        return base_path
+    # rename: insert ".vN" before the trailing suffix.
+    name = base_path.name
+    if name.endswith(".llm.md"):
+        # Must be tested before ".md" so the compound suffix stays intact.
+        trailing_suffix = ".llm.md"
+    elif name.endswith(".md"):
+        trailing_suffix = ".md"
+    else:
+        # Not a markitai output name: keep its real suffix (possibly empty)
+        # instead of slicing off three arbitrary characters.
+        trailing_suffix = base_path.suffix
+    base_stem = name[: len(name) - len(trailing_suffix)]
+    # Find next available sequence number
+    seq = 2
+    while True:
+        new_path = base_path.parent / f"{base_stem}.v{seq}{trailing_suffix}"
+        if not new_path.exists():
+            return new_path
+        seq += 1