PyPI - infinity-parser2 - Versions diffs - 0.1.0__py3-none-any.whl - Mend

infinity-parser2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

infinity_parser2/__init__.py +28 -0
infinity_parser2/__main__.py +6 -0
infinity_parser2/backends/__init__.py +13 -0
infinity_parser2/backends/base.py +61 -0
infinity_parser2/backends/transformers.py +159 -0
infinity_parser2/backends/vllm_engine.py +117 -0
infinity_parser2/backends/vllm_server.py +148 -0
infinity_parser2/cli.py +207 -0
infinity_parser2/parser.py +278 -0
infinity_parser2/prompts.py +57 -0
infinity_parser2/utils/__init__.py +43 -0
infinity_parser2/utils/file.py +190 -0
infinity_parser2/utils/image.py +99 -0
infinity_parser2/utils/model.py +243 -0
infinity_parser2/utils/pdf.py +46 -0
infinity_parser2/utils/utils.py +159 -0
infinity_parser2-0.1.0.dist-info/METADATA +310 -0
infinity_parser2-0.1.0.dist-info/RECORD +25 -0
infinity_parser2-0.1.0.dist-info/WHEEL +5 -0
infinity_parser2-0.1.0.dist-info/entry_points.txt +2 -0
infinity_parser2-0.1.0.dist-info/top_level.txt +2 -0
tests/__init__.py +1 -0
tests/test_backends.py +490 -0
tests/test_parser.py +464 -0
tests/test_utils.py +689 -0

infinity_parser2/utils/file.py ADDED Viewed

@@ -0,0 +1,190 @@
+"""File system utilities for Infinity-Parser2."""
+import os
+import uuid
+from pathlib import Path
+from typing import List, Union
+from PIL import Image
+from .pdf import convert_pdf_to_images
+from .utils import convert_json_to_markdown
+# Image file extensions accepted as input (compared against the lowercased suffix, leading dot included).
+SUPPORTED_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp"}
+# Document file extensions accepted as input.
+SUPPORTED_DOC_EXTENSIONS = {".pdf"}
+# Names of the output formats ("md" and "json" are the options save_results documents).
+SUPPORTED_OUTPUT_FORMATS = ["md", "json"]
+def prepare_batch_entries(
+    inputs: List[Union[str, Image.Image]]
+) -> tuple[list[tuple[int, Union[str, Image.Image]]], list[int]]:
+    """Expand inputs into batch entries, splitting PDFs into individual pages.
+    Args:
+        inputs: List of file paths or PIL Images.
+    Returns:
+        batch_entries: List of (file_idx, item) tuples, where item is either
+          a file path (for non-PDF) or a PIL Image (for PDF pages or images).
+    """
+    batch_entries: list[tuple[int, Union[str, Image.Image]]] = []
+    for idx, item in enumerate(inputs):
+        if isinstance(item, str):
+            ext = Path(item).suffix.lower()
+            if ext == ".pdf":
+                page_images = convert_pdf_to_images(item)
+                for page_img in page_images:
+                    batch_entries.append((idx, page_img))
+            else:
+                batch_entries.append((idx, item))
+        else:
+            batch_entries.append((idx, item))
+    return batch_entries
+def normalize_input(input_data: Union[str, List[str], Image.Image]) -> List[Union[str, Image.Image]]:
+    """Normalize input to a list of file paths or images.
+    Args:
+        input_data: Input can be:
+            - str: Single file path or directory path
+            - List[str]: List of file paths
+            - PIL.Image.Image: Image object
+    Returns:
+        List of file paths or PIL Images.
+    Raises:
+        FileNotFoundError: If file or directory not found.
+        TypeError: If list contains non-string items.
+        ValueError: If directory is empty or file type is unsupported.
+    """
+    if isinstance(input_data, str):
+        if os.path.isdir(input_data):
+            file_paths = get_files_from_directory(input_data)
+            if not file_paths:
+                raise ValueError(f"No supported files found in directory: {input_data}")
+            return file_paths
+        elif os.path.isfile(input_data):
+            if not is_supported_file(input_data):
+                raise ValueError(f"Unsupported file type: {input_data}")
+            return [input_data]
+        else:
+            raise FileNotFoundError(f"File or directory not found: {input_data}")
+    elif isinstance(input_data, list):
+        file_paths = []
+        for item in input_data:
+            if not isinstance(item, str):
+                raise TypeError(f"Expected str in list, got {type(item)}")
+            if not os.path.isfile(item):
+                raise FileNotFoundError(f"File not found: {item}")
+            if not is_supported_file(item):
+                raise ValueError(f"Unsupported file type: {item}")
+            file_paths.append(item)
+        return file_paths
+    elif isinstance(input_data, Image.Image):
+        return [input_data]
+    else:
+        raise TypeError(
+            f"Unsupported input type: {type(input_data)}. "
+            "Expected str, List[str], or PIL.Image.Image."
+        )
+def is_supported_file(file_path: str) -> bool:
+    """Check if file is supported."""
+    ext = Path(file_path).suffix.lower()
+    return ext in SUPPORTED_IMAGE_EXTENSIONS or ext in SUPPORTED_DOC_EXTENSIONS
+def get_files_from_directory(directory: str) -> List[str]:
+    """Get all supported files from a directory."""
+    files = []
+    for root, _, filenames in os.walk(directory):
+        for filename in filenames:
+            file_path = os.path.join(root, filename)
+            if is_supported_file(file_path):
+                files.append(file_path)
+    return sorted(files)
+def save_results(
+    inputs: List[Union[str, Image.Image]],
+    results: List[str],
+    output_dir: str,
+    task_type: str = "doc2json",
+    output_format: str = "md",
+) -> None:
+    """Save parsing results to output directory.
+    Unified entry point that delegates to save_results_json or save_results_md
+    based on the task_type and output_format. Prints the output directory
+    path to console.
+    Args:
+        inputs: Original inputs (file paths or PIL Images).
+        results: Parsed results (same order as inputs).
+        output_dir: Base output directory.
+        task_type: Task type (e.g., "doc2json", "doc2md", "custom").
+        output_format: Output format to save. Options: "md" or "json".
+            - "md": Save only markdown result.
+            - "json": Save only JSON result (only valid for doc2json mode).
+    """
+    keys = [uuid.uuid4().hex[:8] if isinstance(inp, Image.Image) else inp for inp in inputs]
+    if output_format == "json":
+        assert task_type == "doc2json", "output_format='json' is only supported for doc2json tasks."
+        save_results_json(keys, results, output_dir)
+    else:
+        save_results_md(keys, results, output_dir)
+    print(f"[Infinity-Parser2] Results saved to: {os.path.abspath(output_dir)}")
+def save_results_md(keys: List[str], results: List[str], output_dir: str) -> None:
+    """Save markdown parsing results to output directory.
+    Creates a subdirectory for each entry and writes result.md inside it.
+    For file paths, the folder name is the filename (basename); for UUIDs,
+    the folder name is the UUID itself.
+    Args:
+        keys: Identifiers (file paths or UUIDs).
+        results: Parsed markdown text results (same order as keys).
+        output_dir: Base output directory.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    for key, result in zip(keys, results):
+        folder_name = Path(key).name
+        file_dir = os.path.join(output_dir, folder_name)
+        os.makedirs(file_dir, exist_ok=True)
+        result_path = os.path.join(file_dir, "result.md")
+        with open(result_path, "w", encoding="utf-8") as f:
+            f.write(result)
+def save_results_json(keys: List[str], results: List[str], output_dir: str) -> None:
+    """Save JSON parsing results to output directory.
+    Creates a subdirectory for each entry and writes result.json inside it.
+    For file paths, the folder name is the filename (basename); for UUIDs,
+    the folder name is the UUID itself.
+    Args:
+        keys: Identifiers (file paths or UUIDs).
+        results: Parsed JSON text results (same order as keys).
+        output_dir: Base output directory.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    for key, result in zip(keys, results):
+        folder_name = Path(key).name
+        file_dir = os.path.join(output_dir, folder_name)
+        os.makedirs(file_dir, exist_ok=True)
+        result_path = os.path.join(file_dir, "result.json")
+        with open(result_path, "w", encoding="utf-8") as f:
+            f.write(result)

infinity_parser2/utils/image.py ADDED Viewed

@@ -0,0 +1,99 @@
+"""Image encoding and loading utilities."""
+import base64
+from io import BytesIO
+from pathlib import Path
+from typing import Tuple, Union
+from PIL import Image
+from qwen_vl_utils.vision_process import smart_resize
+try:
+    from importlib import metadata
+    _qwen_vl_utils_version = metadata.version("qwen-vl-utils")
+    if _qwen_vl_utils_version < "0.0.14":
+        raise ImportError("qwen-vl-utils version 0.0.14 or higher is required")
+except metadata.PackageNotFoundError:
+    raise ImportError("qwen-vl-utils is not installed. Install it with: pip install qwen-vl-utils")
+# MIME type mapping for common image formats.
+# Keys are lowercase file extensions including the leading dot.
+IMAGE_MIME_TYPES = {
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".png": "image/png",
+    ".webp": "image/webp",
+    ".bmp": "image/bmp",
+    ".gif": "image/gif",
+    ".tiff": "image/tiff",
+    ".tif": "image/tiff",
+}
+def load_image(
+    input_data: Union[str, Image.Image],
+) -> Image.Image:
+    """Load image from file path or PIL Image and convert to RGB.
+    Args:
+        input_data: File path or PIL Image.
+    Returns:
+        PIL Image in RGB mode.
+    Raises:
+        TypeError: If input_data is an unsupported type.
+    """
+    if isinstance(input_data, str):
+        return Image.open(input_data).convert("RGB")
+    elif isinstance(input_data, Image.Image):
+        return input_data.convert("RGB")
+    else:
+        raise TypeError(f"Unsupported input type: {type(input_data)}")
+def encode_file_to_base64(
+    image_obj: Union[Image.Image, str],
+    min_pixels: int = 2048,
+    max_pixels: int = 16777216,
+) -> Tuple[str, str]:
+    """Encode image to base64 string and determine its MIME type.
+    Args:
+        image_obj: File path or PIL Image.
+        min_pixels: Minimum number of pixels for resizing.
+        max_pixels: Maximum number of pixels for resizing.
+    Returns:
+        Tuple of (base64 string, MIME type string).
+    """
+    if isinstance(image_obj, str):
+        image = Image.open(image_obj)
+        ext = Path(image_obj).suffix.lower()
+        mime_type = IMAGE_MIME_TYPES.get(ext, "image/jpeg")
+    else:
+        # Note: image.copy() loses the format attribute, so get it before copying
+        original_format = image_obj.format
+        image = image_obj.copy()
+        # Try to get format from original PIL Image, default to jpeg
+        mime_type = IMAGE_MIME_TYPES.get(f".{original_format}".lower(), "image/jpeg") if original_format else "image/jpeg"
+    resized_height, resized_width = smart_resize(
+        height=image.size[1],
+        width=image.size[0],
+        factor=32,
+        min_pixels=min_pixels,
+        max_pixels=max_pixels,
+    )
+    image = image.resize((resized_width, resized_height))
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    output_buffer = BytesIO()
+    image.save(output_buffer, format="PNG")
+    byte_data = output_buffer.getvalue()
+    base64_str = base64.b64encode(byte_data).decode("utf-8")
+    return base64_str, mime_type

infinity_parser2/utils/model.py ADDED Viewed

@@ -0,0 +1,243 @@
+"""Model cache management for Infinity-Parser2."""
+import json
+import os
+import socket
+import ssl
+import urllib.request
+import urllib.error
+from typing import Optional
+from huggingface_hub import snapshot_download
+# Default cache directory (used by ModelCache when no cache_dir is given)
+DEFAULT_CACHE_DIR = os.path.expanduser("~/.cache/infinity_parser2")
+# HuggingFace endpoints: the default is tried first, the mirror is the fallback
+HF_ENDPOINT_DEFAULT = "https://huggingface.co"
+HF_ENDPOINT_MIRROR = "https://hf-mirror.com"
+# Timeout for connectivity check (seconds)
+_HF_CONNECT_TIMEOUT = 5.0
+def _check_endpoint_reachable(url: str, timeout: float = _HF_CONNECT_TIMEOUT) -> bool:
+    """Check if an HTTP endpoint is reachable.
+    Args:
+        url: The URL to check.
+        timeout: Connection timeout in seconds.
+    Returns:
+        True if the endpoint responds within the timeout, False otherwise.
+    """
+    try:
+        req = urllib.request.Request(
+            url,
+            method="HEAD",
+            headers={"User-Agent": "Mozilla/5.0 (compatible; Infinity-Parser2)"},
+        )
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return resp.status == 200
+    except (
+        urllib.error.URLError,
+        socket.timeout,
+        ConnectionError,
+        ssl.SSLError,
+        OSError,
+    ):
+        return False
+def _resolve_hf_endpoint() -> str:
+    """Resolve the best HuggingFace endpoint based on connectivity.
+    Checks if the default HuggingFace endpoint (https://huggingface.co) is reachable.
+    If not, falls back to the mirror (https://hf-mirror.com/).
+    Returns:
+        The URL string of the reachable endpoint.
+    """
+    if _check_endpoint_reachable(HF_ENDPOINT_DEFAULT):
+        return HF_ENDPOINT_DEFAULT
+    print(
+        f"[Infinity-Parser2] Default HF endpoint ({HF_ENDPOINT_DEFAULT}) is not reachable. "
+        f"Falling back to mirror: {HF_ENDPOINT_MIRROR}"
+    )
+    return HF_ENDPOINT_MIRROR
+class ModelCache:
+    """Manages local model cache for Infinity-Parser2.
+    Automatically detects if a model is already downloaded and cached locally.
+    If not, prompts the user and downloads it from HuggingFace Hub.
+    Attributes:
+        cache_dir: Directory where model cache metadata is stored.
+        models_file: Path to the JSON file containing cached model information.
+    """
+    def __init__(self, cache_dir: Optional[str] = None):
+        """Initialize ModelCache.
+        Args:
+            cache_dir: Custom cache directory. Defaults to ~/.cache/infinity_parser2.
+        """
+        self.cache_dir = cache_dir or DEFAULT_CACHE_DIR
+        self.models_file = os.path.join(self.cache_dir, "models_cache.json")
+        self._ensure_cache_dir()
+        self._models_cache: dict = self._load_cache()
+    def _ensure_cache_dir(self) -> None:
+        """Create cache directory if it doesn't exist."""
+        os.makedirs(self.cache_dir, exist_ok=True)
+    def _load_cache(self) -> dict:
+        """Load cached model information from JSON file."""
+        if not os.path.exists(self.models_file):
+            return {}
+        try:
+            with open(self.models_file, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except (json.JSONDecodeError, IOError):
+            return {}
+    def _save_cache(self) -> None:
+        """Save cached model information to JSON file."""
+        with open(self.models_file, "w", encoding="utf-8") as f:
+            json.dump(self._models_cache, f, indent=2, ensure_ascii=False)
+    def is_cached(self, model_name: str) -> bool:
+        """Check if a model is already cached locally.
+        Args:
+            model_name: HuggingFace model name (e.g., "infly/Infinity-Parser2-Pro").
+        Returns:
+            True if model is cached and the local path exists.
+        """
+        if model_name not in self._models_cache:
+            return False
+        local_path = self._models_cache[model_name].get("local_path")
+        if not local_path or not os.path.exists(local_path):
+            return False
+        return True
+    def get_cached_path(self, model_name: str) -> Optional[str]:
+        """Get the cached local path for a model.
+        Args:
+            model_name: HuggingFace model name.
+        Returns:
+            Local path if cached, None otherwise.
+        """
+        if not self.is_cached(model_name):
+            return None
+        return self._models_cache[model_name].get("local_path")
+    def cache_model(self, model_name: str, local_path: str) -> None:
+        """Cache a model's local path.
+        Args:
+            model_name: HuggingFace model name.
+            local_path: Local directory where the model is stored.
+        """
+        self._models_cache[model_name] = {
+            "local_path": local_path,
+            "cached": True,
+        }
+        self._save_cache()
+    def download_and_cache(
+        self,
+        model_name: str,
+        target_dir: Optional[str] = None,
+        force_download: bool = False,
+    ) -> str:
+        """Download a model from HuggingFace Hub and cache its location.
+        Args:
+            model_name: HuggingFace model name (e.g., "infly/Infinity-Parser2-Pro").
+            target_dir: Custom download directory. If None, uses cache_dir/model_name.
+            force_download: If True, re-download even if cached.
+        Returns:
+            Local path where the model is stored.
+        """
+        if target_dir is None:
+            safe_name = model_name.replace("/", "_")
+            target_dir = os.path.join(self.cache_dir, safe_name)
+        # If already cached and not forcing download, return cached path
+        if self.is_cached(model_name) and not force_download:
+            cached_path = self.get_cached_path(model_name)
+            print(f"[Infinity-Parser2] Model already cached at: {cached_path}")
+            return cached_path
+        print(f"[Infinity-Parser2] Model '{model_name}' not found locally.")
+        print(f"[Infinity-Parser2] Starting download to: {target_dir}")
+        print("[Infinity-Parser2] This may take a few minutes depending on model size and network...")
+        # Resolve the best HF endpoint (cached per session)
+        resolved_endpoint = _resolve_hf_endpoint()
+        print(f"[Infinity-Parser2] Using endpoint: {resolved_endpoint}")
+        os.makedirs(target_dir, exist_ok=True)
+        snapshot_download(
+            repo_id=model_name,
+            local_dir=target_dir,
+            local_dir_use_symlinks=False,
+            endpoint=resolved_endpoint,
+        )
+        self.cache_model(model_name, target_dir)
+        print(f"[Infinity-Parser2] Model downloaded and cached successfully!")
+        print(f"[Infinity-Parser2] Cache location: {target_dir}")
+        return target_dir
+    def resolve_model_path(self, model_name: str) -> str:
+        """Resolve the model path for loading.
+        If model is not cached, downloads it automatically.
+        If model is a local path, returns it directly.
+        Args:
+            model_name: HuggingFace model name or local path.
+        Returns:
+            Resolved local path for model loading.
+        """
+        # If it's already a local path, return it directly
+        if os.path.exists(model_name):
+            return model_name
+        # If cached, return cached path
+        if self.is_cached(model_name):
+            cached_path = self.get_cached_path(model_name)
+            print(f"[Infinity-Parser2] Found cached model at: {cached_path}")
+            return cached_path
+        # Otherwise, download and cache
+        return self.download_and_cache(model_name)
+# Global model cache instance
+_model_cache: Optional[ModelCache] = None
+def get_model_cache(cache_dir: Optional[str] = None) -> ModelCache:
+    """Get or create the global ModelCache instance.
+    Args:
+        cache_dir: Custom cache directory for this session.
+    Returns:
+        The global ModelCache instance.
+    """
+    global _model_cache
+    if _model_cache is None:
+        _model_cache = ModelCache(cache_dir)
+    return _model_cache

infinity_parser2/utils/pdf.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""PDF to image conversion utility."""
+import io
+from typing import List, Union
+from PIL import Image
+try:
+    import fitz  # PyMuPDF
+except ImportError:
+    raise ImportError(
+        "PyMuPDF is required for PDF rendering. Install it with: pip install pymupdf"
+    )
+def convert_pdf_to_images(
+    pdf_path: Union[str, bytes],
+    dpi: int = 300,
+) -> List[Image.Image]:
+    """Convert a PDF file to a list of PIL Images (one per page).
+    Args:
+        pdf_path: Path to the PDF file or PDF bytes.
+        dpi: Resolution for rendering. Higher values give better quality
+             but use more memory. Defaults to 300.
+    Returns:
+        List of PIL Images, one per PDF page.
+    """
+    Image.MAX_IMAGE_PIXELS = None  # Disable decompression bomb check for large PDF pages
+    if isinstance(pdf_path, bytes):
+        doc = fitz.open(stream=pdf_path, filetype="pdf")
+    else:
+        doc = fitz.open(pdf_path)
+    images = []
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        mat = fitz.Matrix(dpi / 72, dpi / 72)
+        pix = page.get_pixmap(matrix=mat)
+        img_data = pix.tobytes("png")
+        images.append(Image.open(io.BytesIO(img_data)).convert("RGB"))
+    doc.close()
+    return images