PyPI - infinity-parser2 - Versions diffs - 0.1.0__py3-none-any.whl - Mend

infinity-parser2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

infinity_parser2/__init__.py +28 -0
infinity_parser2/__main__.py +6 -0
infinity_parser2/backends/__init__.py +13 -0
infinity_parser2/backends/base.py +61 -0
infinity_parser2/backends/transformers.py +159 -0
infinity_parser2/backends/vllm_engine.py +117 -0
infinity_parser2/backends/vllm_server.py +148 -0
infinity_parser2/cli.py +207 -0
infinity_parser2/parser.py +278 -0
infinity_parser2/prompts.py +57 -0
infinity_parser2/utils/__init__.py +43 -0
infinity_parser2/utils/file.py +190 -0
infinity_parser2/utils/image.py +99 -0
infinity_parser2/utils/model.py +243 -0
infinity_parser2/utils/pdf.py +46 -0
infinity_parser2/utils/utils.py +159 -0
infinity_parser2-0.1.0.dist-info/METADATA +310 -0
infinity_parser2-0.1.0.dist-info/RECORD +25 -0
infinity_parser2-0.1.0.dist-info/WHEEL +5 -0
infinity_parser2-0.1.0.dist-info/entry_points.txt +2 -0
infinity_parser2-0.1.0.dist-info/top_level.txt +2 -0
tests/__init__.py +1 -0
tests/test_backends.py +490 -0
tests/test_parser.py +464 -0
tests/test_utils.py +689 -0

infinity_parser2/utils/utils.py ADDED Viewed

@@ -0,0 +1,159 @@
+import json
+import re
+from pathlib import Path
+from typing import Union
+from PIL import Image
+# ---------------------------------------------------------------------------
+# JSON extraction & cleanup
+# ---------------------------------------------------------------------------
def extract_json_content(text: str) -> str:
    """Extract the JSON payload from a markdown-wrapped LLM response.

    The payload is everything after the first ```json fence line, up to the
    first closing ``` that starts a line. If the closing fence is missing
    (for example, the generation was cut off), everything after the opening
    fence is returned. Text without a ```json fence is returned unchanged.

    Args:
        text: Raw model output, possibly wrapped in a ```json code fence.

    Returns:
        The stripped payload, or ``text`` itself when no fence is found.
    """
    # Tolerate trailing blanks and Windows line endings after the fence tag;
    # a strict "```json" + LF match would silently return the whole response.
    opening = re.search(r"```json[ \t]*\r?\n", text)
    if opening is None:
        return text

    payload = text[opening.end():]
    closing = re.search(r"\r?\n```", payload)
    if closing is not None:
        return payload[: closing.start()].strip()

    # No closing fence on its own line: treat the rest as a partial payload,
    # but drop a closing fence glued to the last line (e.g. "[1]```").
    payload = payload.strip()
    if payload.endswith("```"):
        payload = payload[:-3].rstrip()
    return payload
def _is_parseable_json(candidate: str) -> bool:
    """Return True when *candidate* is syntactically valid JSON."""
    try:
        json.loads(candidate)
    except (ValueError, RecursionError):
        return False
    return True


def truncate_last_incomplete_element(text: str) -> tuple[str, bool]:
    """Drop the last (possibly incomplete) layout element so the JSON parses.

    Truncation is always attempted when the text is longer than 50,000
    characters or does not end with "]". It is also attempted when the text
    ends with "]" but is not valid JSON, which happens when generation stops
    right after a bbox array; in that case the truncation is kept only if it
    yields valid JSON. Text with at most one '{"bbox":' element is never
    truncated.

    Args:
        text: JSON list text as produced by the model.

    Returns:
        A ``(cleaned_text, was_truncated)`` tuple.
    """
    marker = '{"bbox":'
    # Heuristic kept from the original: very long output, or output without
    # a closing "]", is truncated unconditionally.
    forced = len(text) > 50_000 or not text.rstrip().endswith("]")
    if not forced and _is_parseable_json(text):
        return text, False

    # With zero or one element there is nothing that can be dropped safely.
    if text.count(marker) <= 1:
        return text, False

    truncated = text[: text.rfind(marker)].rstrip()
    if truncated.endswith(","):
        truncated = truncated[:-1] + "]"
    elif truncated.endswith("}"):
        # Separator comma was missing; still close the list.
        truncated += "]"

    # Text that merely looked complete is only replaced by something that
    # actually parses; otherwise keep it unchanged as before.
    if not forced and not _is_parseable_json(truncated):
        return text, False
    return truncated, True
+# ---------------------------------------------------------------------------
+# Coordinate normalisation
+# ---------------------------------------------------------------------------
def obtain_origin_hw(image: Union[str, Path, Image.Image]) -> tuple[int, int]:
    """Return the ``(height, width)`` of an image.

    Args:
        image: A PIL image, or a path (``str`` / ``Path``) to an image file.

    Returns:
        ``(height, width)`` in pixels. If the file cannot be opened, the
        fallback ``(1000, 1000)`` is returned instead of raising.
    """
    if isinstance(image, Image.Image):
        width, height = image.size  # PIL reports (width, height)
        return height, width

    try:
        # The size comes from the file header, so the pixel data does not
        # need decoding; the context manager releases the file handle.
        with Image.open(image) as opened:
            width, height = opened.size
    except Exception:
        # Deliberate best-effort fallback kept from the original behaviour.
        return 1000, 1000
    return height, width
def restore_abs_bbox_coordinates(ans: str, origin_h: float, origin_w: float) -> str:
    """Convert normalised [0-1000] bboxes back to pixel coordinates.

    Every key containing "bbox" in every dict of the top-level JSON list is
    rescaled: x values by ``origin_w / 1000`` and y values by
    ``origin_h / 1000``, truncated to ``int``.

    Args:
        ans: JSON text holding a list of layout elements.
        origin_h: Height of the source image in pixels.
        origin_w: Width of the source image in pixels.

    Returns:
        The re-serialised JSON with pixel coordinates. ``ans`` is returned
        unchanged when it is not valid JSON, when the top level is not a
        list, or when any bbox is not a list of four numbers.
    """
    try:
        data = json.loads(ans)
    except json.JSONDecodeError:
        return ans

    # Valid JSON is not necessarily a list of elements (e.g. `7` or `{...}`).
    if not isinstance(data, list):
        return ans

    all_valid = True
    for item in data:
        if not isinstance(item, dict):
            # Nothing to rescale in scalars or nested lists; leave them alone.
            continue
        # Only values of existing keys are reassigned, so the dict size does
        # not change while it is being iterated.
        for key, bbox in item.items():
            if "bbox" not in key:
                continue
            is_box = (
                isinstance(bbox, (list, tuple))
                and len(bbox) == 4
                and all(isinstance(c, (int, float)) for c in bbox)
            )
            if not is_box:
                all_valid = False
                continue
            x1, y1, x2, y2 = bbox
            item[key] = [
                int(x1 / 1000.0 * origin_w),
                int(y1 / 1000.0 * origin_h),
                int(x2 / 1000.0 * origin_w),
                int(y2 / 1000.0 * origin_h),
            ]

    # One malformed bbox invalidates the whole conversion so that callers
    # never receive a mix of normalised and pixel coordinates.
    return json.dumps(data, ensure_ascii=False) if all_valid else ans
+# ---------------------------------------------------------------------------
+# JSON → Markdown
+# ---------------------------------------------------------------------------
def convert_json_to_markdown(ans: str, keep_header_footer: bool = False) -> str:
    """Convert the layout JSON list into a markdown string.

    The non-empty "text" fields of the list elements are joined with blank
    lines. Elements that are not dicts, or whose "text" is not a string, are
    skipped instead of aborting the whole conversion.

    Args:
        ans: JSON text holding a list of layout elements.
        keep_header_footer: When False (default), elements whose "category"
            is "header", "footer" or "page_footnote" are left out.

    Returns:
        The markdown text. ``ans`` is returned unchanged when it is not
        valid JSON, is not a list, or yields no text at all.
    """
    try:
        items = json.loads(ans)
    except (TypeError, ValueError, RecursionError):
        return ans
    if not isinstance(items, list):
        return ans

    skipped = () if keep_header_footer else ("header", "footer", "page_footnote")
    blocks = [
        entry["text"]
        for entry in items
        if isinstance(entry, dict)
        and isinstance(entry.get("text"), str)
        and entry["text"]
        and entry.get("category") not in skipped
    ]
    return "\n\n".join(blocks) if blocks else ans
+# ---------------------------------------------------------------------------
+# DOC2JSON postprocess
+# ---------------------------------------------------------------------------
def postprocess_doc2json_result(
    raw_text: str,
    image: Union[str, Path, Image.Image],
    output_format: str = "json",
) -> str:
    """
    Turn the raw DOC2JSON model output into the final result string.

    Pipeline:
      1. Pull the JSON block out of the markdown-wrapped response
      2. Truncate the last incomplete element so the JSON parses
      3. Scale the normalised [0-1000] bboxes to pixel coordinates of `image`
      4. When `output_format` is "md", convert the layout JSON to markdown
    """
    json_text = extract_json_content(raw_text)
    cleaned, _was_truncated = truncate_last_incomplete_element(json_text)

    height, width = obtain_origin_hw(image)
    restored = restore_abs_bbox_coordinates(cleaned, height, width)

    # Any format other than "md" gets the JSON text as-is.
    if output_format != "md":
        return restored
    return convert_json_to_markdown(restored)
+# ---------------------------------------------------------------------------
+# Markdown cleanup
+# ---------------------------------------------------------------------------
def postprocess_doc2md_result(text: str) -> str:
    """Remove the markdown code fence that wraps a model response.

    Only the wrapper is removed. Fences that belong to code blocks inside
    the document are kept, so a document that starts with a language-tagged
    code block or ends with a code block stays balanced.

    Args:
        text: Input text that may be wrapped in a markdown code fence.

    Returns:
        Text with the wrapping fence removed and outer whitespace stripped.
    """
    fence = "```"
    body = text.strip()

    # An explicit "markdown" tag is always the wrapper.
    opening = re.match(r"```markdown\s*", body)
    if opening is None:
        # A bare fence line is the wrapper only when it is unmatched (odd
        # fence count) or paired with a fence at the very end. A tagged fence
        # such as "```python" opens a real code block and is never stripped.
        bare = re.match(r"```[ \t]*(?:\r?\n|$)", body)
        if bare is not None and (body.count(fence) % 2 == 1 or body.endswith(fence)):
            opening = bare
    if opening is not None:
        body = body[opening.end():]

    # A trailing fence is the wrapper's closing fence only when it has no
    # partner left in the text; otherwise it closes an inner code block.
    if body.endswith(fence) and body.count(fence) % 2 == 1:
        body = body[: -len(fence)]
    return body.strip()

infinity_parser2-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,310 @@
+Metadata-Version: 2.4
+Name: infinity_parser2
+Version: 0.1.0
+Summary: Document parsing Python package supporting PDF and image parsing using Infinity-Parser2-Pro model.
+Home-page: https://github.com/infly-ai/INF-MLLM
+Author: INF Tech
+Author-email: contact@inftech.ai
+Keywords: document parsing
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+Requires-Dist: transformers==5.3.0
+Requires-Dist: tokenizers>=0.22.2
+Requires-Dist: qwen-vl-utils>=0.0.14
+Requires-Dist: Pillow>=9.0.0
+Requires-Dist: pypdf>=3.0.0
+Requires-Dist: pymupdf>=1.20.0
+Requires-Dist: openai>=1.0.0
+Requires-Dist: msgspec>=0.19.0
+Requires-Dist: pybase64>=1.4.2
+Requires-Dist: gguf>=0.17.1
+Requires-Dist: cbor2>=5.7.0
+Requires-Dist: py-cpuinfo>=9.0.0
+Requires-Dist: distro>=1.9.0
+Requires-Dist: openai_harmony>=0.0.4
+Requires-Dist: fastapi>=0.135.1
+Requires-Dist: starlette>=0.50.0
+Requires-Dist: annotated_doc>=0.0.4
+Requires-Dist: typing_inspection>=0.4.2
+Requires-Dist: llguidance>=1.3.0
+Requires-Dist: diskcache>=5.6.3
+Requires-Dist: xgrammar>=0.1.29
+Requires-Dist: partial_json_parser>=0.2.1.1.post6
+Requires-Dist: huggingface-hub>=0.24.0
+Requires-Dist: scikit-learn>=1.8.0
+Requires-Dist: scipy>=1.17.1
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+# Infinity-Parser2
+Infinity-Parser2 is a document parsing tool powered by the Infinity-Parser2-Pro model. It converts **PDF files** and **images** (PNG, JPG, WEBP) into structured Markdown or JSON with layout information.
+## Quick Start
+### Installation
+#### Pre-requisites
+```bash
+# Install PyTorch (CUDA). Find the proper version on the [official site](https://pytorch.org/get-started/previous-versions) based on your CUDA version.
+pip install torch==2.10.0 torchvision==0.25.0 torchaudio==2.10.0 --index-url https://download.pytorch.org/whl/cu128
+# Install FlashAttention (required for NVIDIA GPUs).
+# This command builds flash-attn from source, which can take 10 to 30 minutes.
+pip install flash-attn==2.8.3 --no-build-isolation
+# For Hopper GPUs (e.g. H100, H800), we recommend FlashAttention-3 instead. See the [official guide](https://github.com/Dao-AILab/flash-attention).
+# Install vLLM
+# NOTE: you may need to run the command below to resolve triton and numpy conflicts before installing vllm.
+# pip uninstall -y pytorch-triton opencv-python opencv-python-headless numpy && rm -rf "$(python -c 'import site; print(site.getsitepackages()[0])')/cv2"
+pip install vllm==0.17.1
+```
+#### Install infinity_parser2
+```bash
+# From PyPI
+pip install infinity_parser2
+# From source
+git clone https://github.com/infly-ai/INF-MLLM.git
+cd INF-MLLM/Infinity-Parser2
+pip install -e .
+```
+### Usage
+#### Command Line
+The `parser` command is the fastest way to get started.
+```bash
+# Parse a PDF (outputs Markdown by default)
+parser demo_data/demo.pdf
+# Parse an image
+parser demo_data/demo.png
+# Batch parse multiple files
+parser demo_data/demo.pdf demo_data/demo.png -o ./output
+# Parse an entire directory
+parser demo_data -o ./output
+# Output raw JSON with layout bboxes
+parser demo_data/demo.pdf --output-format json
+# Convert to Markdown directly
+parser demo_data/demo.png --task doc2md
+```
+```bash
+# View all options
+parser --help
+```
+#### Python API
+```python
+from infinity_parser2 import InfinityParser2
+parser = InfinityParser2()
+# Parse a single file (returns Markdown)
+result = parser.parse("demo_data/demo.pdf")
+print(result)
+# Parse multiple files (returns list)
+results = parser.parse(["demo_data/demo.pdf", "demo_data/demo.png"])
+# Parse a directory (returns dict)
+results = parser.parse("demo_data")
+```
+**Output formats:**
+| task_type   | Description                                          | Default Output |
+|-------------|------------------------------------------------------|----------------|
+| `doc2json`  | Extract layout elements with bboxes (default)        | Markdown       |
+| `doc2md`    | Directly convert to Markdown                         | Markdown       |
+| `custom`    | Use your own prompt                                 | Raw model output |
+```python
+# doc2json: get raw JSON with bbox coordinates
+result = parser.parse("demo_data/demo.pdf", output_format="json")
+# doc2md: direct Markdown conversion
+result = parser.parse("demo_data/demo.pdf", task_type="doc2md")
+# Custom prompt
+result = parser.parse("demo_data/demo.pdf", task_type="custom",
+                      custom_prompt="Extract the title and authors only.")
+# Batch processing with custom batch size
+result = parser.parse("demo_data", batch_size=8)
+# Save results to directory
+parser.parse("demo_data/demo.pdf", output_dir="./output")
+```
+**Backends:**
+Infinity-Parser2 supports three inference backends. By default it uses the **vLLM Engine** (offline batch inference).
+```python
+# vLLM Engine (default) — offline batch inference
+parser = InfinityParser2(
+    model_name="infly/Infinity-Parser2-Pro",
+    backend="vllm-engine",        # default
+    tensor_parallel_size=2,
+)
+# Transformers — local single-GPU inference
+parser = InfinityParser2(
+    model_name="infly/Infinity-Parser2-Pro",
+    backend="transformers",
+    device="cuda",
+    torch_dtype="bfloat16",       # "float16" or "bfloat16"
+)
+# vLLM Server — online HTTP API (start server first)
+parser = InfinityParser2(
+    model_name="infly/Infinity-Parser2-Pro",
+    backend="vllm-server",
+    api_url="http://localhost:8000/v1/chat/completions",
+    api_key="EMPTY",
+)
+```
+To start a vLLM server:
+```bash
+vllm serve infly/Infinity-Parser2-Pro \
+    --trust-remote-code \
+    --reasoning-parser qwen3 \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --tensor-parallel-size 2 \
+    --gpu-memory-utilization 0.85 \
+    --max-model-len 65536 \
+    --mm-encoder-tp-mode data \
+    --mm-processor-cache-type shm \
+    --enable-prefix-caching
+```
+## API Reference
+### InfinityParser2
+```python
+parser = InfinityParser2(model_name="infly/Infinity-Parser2-Pro")
+```
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `model_name` | `str` | `"infly/Infinity-Parser2-Pro"` | HuggingFace model name or local path |
+| `backend` | `str` | `"vllm-engine"` | Inference backend: `"transformers"`, `"vllm-engine"`, or `"vllm-server"` |
+| `tensor_parallel_size` | `int` | `None` | Tensor parallel size for the vLLM Engine backend; defaults to the GPU count when `None` |
+| `device` | `str` | `"cuda"` | Only `"cuda"` is supported |
+| `api_url` | `str` | `"http://localhost:8000/v1/chat/completions"` | API URL for vLLM Server backend |
+| `api_key` | `str` | `"EMPTY"` | API key for vLLM Server backend |
+| `min_pixels` | `int` | `2048` | Minimum pixel count for image input (transformers backend only) |
+| `max_pixels` | `int` | `16777216` | Maximum pixel count (4096x4096) for image input (transformers backend only) |
+| `model_cache_dir` | `str` | `None` | Model cache directory (defaults to `~/.cache/infinity_parser2/`) |
+### parse()
+```python
+result = parser.parse("demo_data/demo.pdf")
+```
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `input_data` | `str \| List[str] \| PIL.Image` | **Required** | File path(s), directory path, or PIL Image object |
+| `task_type` | `str` | `"doc2json"` | `"doc2json"` (layout to JSON) \| `"doc2md"` (direct Markdown) \| `"custom"` |
+| `custom_prompt` | `str` | `None` | Custom prompt; required when `task_type="custom"` |
+| `batch_size` | `int` | `4` | Number of images to process per batch |
+| `output_dir` | `str` | `None` | If set, saves results to this directory instead of returning them |
+| `output_format` | `str` | `"md"` | `"md"` \| `"json"`. Only `"md"` is supported for `doc2md` / `custom` tasks |
+| `**kwargs` | — | — | Additional args passed to the model (e.g., `max_new_tokens`, `temperature`) |
+### Return Values
+| Input           | output_dir=None                  | output_dir set |
+|-----------------|----------------------------------|---------------|
+| Single file     | `str`                            | `None`        |
+| List of files   | `List[str]`                      | `None`        |
+| Directory       | `Dict[str, str]` (path→content) | `None`        |
+When `output_dir` is set, results are saved to `output_dir/{filename}/result.md` (or `result.json`).
+## Advanced Usage
+### Model Caching
+Models are downloaded automatically on first use and cached at `~/.cache/infinity_parser2/`. You can customize the cache location:
+```python
+parser = InfinityParser2(
+    model_name="infly/Infinity-Parser2-Pro",
+    model_cache_dir="/path/to/cache"
+)
+```
+### Generation Parameters
+```python
+result = parser.parse(
+    "demo_data/demo.pdf",
+    max_new_tokens=16384,
+    temperature=0.01,
+    top_p=0.95,
+)
+```
+### Utility Functions
+```python
+from infinity_parser2 import (
+    convert_pdf_to_images,
+    convert_json_to_markdown,
+    extract_json_content,
+    get_files_from_directory,
+    is_supported_file,
+    SUPPORTED_TASK_TYPES,
+    ModelCache,
+    get_model_cache,
+)
+# Convert PDF pages to PIL Images
+images = convert_pdf_to_images("demo_data/demo.pdf", dpi=300)
+# Convert layout JSON to Markdown
+markdown = convert_json_to_markdown(json_string)
+# Check model cache
+cache = get_model_cache()
+print(cache.resolve_model_path("infly/Infinity-Parser2-Pro"))
+```
+## Requirements
+- Python 3.12+
+- CUDA-compatible GPU
+- See `setup.py` for full dependency list.

infinity_parser2-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,25 @@
+infinity_parser2/__init__.py,sha256=mPnfmZGnRMZHSkkQU_An4F2caOw0gCeSVaxK2-Yzjio,650
+infinity_parser2/__main__.py,sha256=jq6TasYXniIXVrHNwTEIKz2RQfMGbrPnsf0hFp__KEI,140
+infinity_parser2/cli.py,sha256=oPoqzw0OK4KBP36VGt2Cp_i2vx-BVVL0ChRKI1u5Jcw,5953
+infinity_parser2/parser.py,sha256=O1ofwt4vEIK8jvpRzqb9qkFmLfYYv3h5gdKi92oyUU0,11363
+infinity_parser2/prompts.py,sha256=SL8BF-i7UxAcmzu7EqyRfFLfy8ZExD9m3Pw980t_KZs,2449
+infinity_parser2/backends/__init__.py,sha256=nLAnCAE8MhzekyvnbxSh2_gome48J5kfMwAGevKdoVE,321
+infinity_parser2/backends/base.py,sha256=Pd2eg2i6L-IH-d7CyAPQcHZGTsO8yskoPetNr461rRQ,1669
+infinity_parser2/backends/transformers.py,sha256=xK6K8jKINUxgtFw29QiV38lYrCmgZK8t5idd5kIZV8g,5164
+infinity_parser2/backends/vllm_engine.py,sha256=E6eutMOwjTMzoeupSrsVFdupRqDpigKlC3RXwPW1qkM,3884
+infinity_parser2/backends/vllm_server.py,sha256=klJtF-dk38cy20Ovj2OyvKiWqftv6gtqh4qdYhAur-I,5232
+infinity_parser2/utils/__init__.py,sha256=h_0QVqgdDFbAMQxyCyzZ7UpStpGc8XDUef1D5i_2nW8,1115
+infinity_parser2/utils/file.py,sha256=XI-FyzJo45c21-P8G5mErZGw2GXiJTopBqjl4_UKTs0,6921
+infinity_parser2/utils/image.py,sha256=OIPLqPJfYIWZ2lS-iyaetuK9xL1KkF7wPqQVkzX-D8g,2969
+infinity_parser2/utils/model.py,sha256=i6le1AIiHSk3qhm7Ffaq6wnVuqxASiu0rvssNPmcH0k,7993
+infinity_parser2/utils/pdf.py,sha256=_85akAxDFtkJqH186pxeHM8FPAB01b5FoEyUx48Bgeg,1243
+infinity_parser2/utils/utils.py,sha256=xorQdku2vSQUpFEplR_WSt8lYVZV9MY4GJDF8X_itVY,5242
+tests/__init__.py,sha256=7Rlm_cmZDLA9lgnW5glysBurhFGIw625JteUd2Cy_mw,47
+tests/test_backends.py,sha256=qJ_fVDyTLxjbSH_XkWtE6KbjF1hWJ71vjKz0uyg5AQA,22598
+tests/test_parser.py,sha256=G2U2s3HPDIi4SJmj4F22tP-H8I03bm_lC9Tog3r9yjg,21298
+tests/test_utils.py,sha256=KWQABAkdnnujuU5NTi8AFdlfR_FFbraWjL2fh1bazUQ,28672
+infinity_parser2-0.1.0.dist-info/METADATA,sha256=J29ekUlaXS4iRn-pYHGPel1hkem4aFur_DO7wk3s-Uo,9849
+infinity_parser2-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+infinity_parser2-0.1.0.dist-info/entry_points.txt,sha256=OnFPg-KtPNeV_J8g_NX_kTWJoQ1PxDR5KCktGzJ5TQo,53
+infinity_parser2-0.1.0.dist-info/top_level.txt,sha256=hvtIIcwbRgweH2KgktUwTMHcEIDL_4wK7Eees2ux8ws,23
+infinity_parser2-0.1.0.dist-info/RECORD,,

infinity_parser2-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

infinity_parser2-0.1.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ parser = infinity_parser2.cli:main

infinity_parser2-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ infinity_parser2
2	+ tests

tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Unit tests for Infinity-Parser2 package."""