infinity-parser2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ """Infinity-Parser2: Document parsing Python package."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from .parser import InfinityParser2
6
+ from .backends import (
7
+ BaseBackend,
8
+ TransformersBackend,
9
+ VLLMEngineBackend,
10
+ VLLMServerBackend,
11
+ )
12
+ from .prompts import PROMPT_DOC2JSON, PROMPT_DOC2MD, SUPPORTED_TASK_TYPES
13
+ from .utils import convert_pdf_to_images
14
+ from .cli import main as cli_main
15
+
16
# Names re-exported as the package's public API (checked by `from ... import *`
# and by linters; keep in sync with the imports above).
__all__ = [
    "InfinityParser2",
    "BaseBackend",
    "TransformersBackend",
    "VLLMEngineBackend",
    "VLLMServerBackend",
    "convert_pdf_to_images",
    "PROMPT_DOC2JSON",
    "PROMPT_DOC2MD",
    "SUPPORTED_TASK_TYPES",
    "__version__",
    "cli_main",
]
@@ -0,0 +1,6 @@
1
+ """Entry point for running `python -m infinity_parser2`."""
2
+
3
+ from .cli import main
4
+
5
# `python -m infinity_parser2` dispatches to the CLI entry point;
# SystemExit propagates main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,13 @@
1
+ """Inference backends for Infinity-Parser2."""
2
+
3
+ from .base import BaseBackend
4
+ from .transformers import TransformersBackend
5
+ from .vllm_engine import VLLMEngineBackend
6
+ from .vllm_server import VLLMServerBackend
7
+
8
+ __all__ = [
9
+ "BaseBackend",
10
+ "TransformersBackend",
11
+ "VLLMEngineBackend",
12
+ "VLLMServerBackend",
13
+ ]
@@ -0,0 +1,61 @@
1
+ """Base backend interface for Infinity-Parser2."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Dict, Union
5
+
6
+ from PIL import Image
7
+
8
+
9
class BaseBackend(ABC):
    """Interface contract shared by every inference backend.

    Concrete subclasses must provide both init() (model/resource setup)
    and parse_batch() (document parsing).
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        device: str = "cuda",
        **kwargs,
    ):
        """Store the common backend configuration.

        Args:
            model_name: Model name on HuggingFace Hub or local path to the model.
            device: Device type, "cuda" or "cpu".
            **kwargs: Additional backend-specific arguments, kept for subclasses.
        """
        self.kwargs = kwargs
        self.device = device
        self.model_name = model_name

    @abstractmethod
    def init(self) -> None:
        """Set up the model and processor.

        Must complete before any call to parse_batch().
        """

    @abstractmethod
    def parse_batch(
        self,
        input_data: list[Union[str, Image.Image]],
        prompt: str,
        batch_size: int = 1,
        **kwargs,
    ) -> list[str]:
        """Parse a collection of documents.

        Args:
            input_data: Inputs to parse; each entry is either a file path
                (image or PDF) given as str, or a PIL.Image.Image object.
            prompt: Prompt text sent to the model.
            batch_size: Upper bound on images processed in a single batch.
            **kwargs: Extra arguments forwarded to the model.

        Returns:
            Parsed text content for each input, preserving input order.
        """
@@ -0,0 +1,159 @@
1
+ """Transformers backend for Infinity-Parser2."""
2
+
3
+ import sys
4
+ from typing import Union
5
+
6
+ from PIL import Image
7
+ import torch
8
+ from tqdm import tqdm
9
+ from transformers import AutoModelForImageTextToText, AutoProcessor
10
+ from qwen_vl_utils import process_vision_info
11
+
12
+ from ..utils import load_image
13
+ from .base import BaseBackend
14
+
15
+
16
class TransformersBackend(BaseBackend):
    """Inference backend using HuggingFace transformers.

    Supports local model inference with automatic device mapping.
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        device: str = "cuda",
        torch_dtype: str = "bfloat16",
        min_pixels: int = 2048,
        max_pixels: int = 16777216,
        **kwargs,
    ):
        """Initialize Transformers backend.

        Args:
            model_name: Model name on HuggingFace Hub or local path.
            device: Device type, "cuda" or "cpu".
            torch_dtype: Data type for model weights, "float16" or "bfloat16".
            min_pixels: Minimum number of pixels for image input.
            max_pixels: Maximum number of pixels for image input.
            **kwargs: Additional arguments for AutoModelForImageTextToText.from_pretrained.
        """
        super().__init__(model_name, device, **kwargs)
        # Resolve the dtype name against torch; silently falls back to
        # bfloat16 when an unknown dtype string is supplied.
        self.torch_dtype = getattr(torch, torch_dtype, torch.bfloat16)
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        # Model is loaded eagerly at construction time.
        self.init()

    def init(self) -> None:
        """Initialize the model and processor."""
        # device_map="auto" lets accelerate place layers across available devices.
        self._model = AutoModelForImageTextToText.from_pretrained(
            self.model_name,
            torch_dtype=self.torch_dtype,
            device_map="auto",
            **self.kwargs,
        )
        self._processor = AutoProcessor.from_pretrained(self.model_name)

    def _process_inputs(
        self,
        inputs: list[Union[str, Image.Image]],
        prompt: str,
        **kwargs,
    ) -> dict:
        """Process inputs for generation.

        Returns:
            Dictionary with processed inputs for the model.
        """
        # Normalize every entry (path or PIL image) to a PIL image.
        images = [load_image(item) for item in inputs]

        # One single-turn conversation per image; min/max_pixels bound the
        # resolution the vision preprocessor will resize to.
        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": img, "min_pixels": self.min_pixels, "max_pixels": self.max_pixels},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            for img in images
        ]

        # Thinking mode is disabled for plain document parsing output.
        chat_template_kwargs = {"enable_thinking": False}

        text = self._processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs
        )
        # image_patch_size=16 — presumably matches the model's vision patch
        # size; TODO confirm against the model config.
        image_inputs, _ = process_vision_info(messages, image_patch_size=16)

        # do_resize=False: images were already sized via min/max_pixels above.
        inputs = self._processor(
            text=text,
            images=image_inputs,
            do_resize=False,
            padding=True,
            return_tensors="pt",
        )
        # NOTE(review): token_type_ids appears to be unsupported by
        # generate() for this model — dropped if present.
        inputs.pop("token_type_ids", None)
        return inputs

    def _generate(self, inputs: dict, **kwargs) -> list[str]:
        """Run model generation and decode outputs.

        Args:
            inputs: Processed inputs from _process_inputs.
            **kwargs: Generation arguments.

        Returns:
            List of generated text outputs.
        """
        # Move tensors to device
        inputs = {
            k: v.to(self._model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        generated_ids = self._model.generate(
            **inputs,
            max_new_tokens=kwargs.get("max_new_tokens", 32768),
            temperature=kwargs.get("temperature", 0.0),
            top_p=kwargs.get("top_p", 1.0),
        )

        # Outputs include the prompt tokens; strip each prompt prefix so only
        # newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
        ]
        output_text = self._processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text

    def parse_batch(
        self,
        input_data: list[Union[str, Image.Image]],
        prompt: str,
        batch_size: int = 1,
        **kwargs,
    ) -> list[str]:
        """Parse multiple documents with batched inference.

        Args:
            input_data: List of file paths or PIL Images.
            prompt: Prompt text for the model.
            batch_size: Maximum number of images to process in one batch.
            **kwargs: Additional arguments.

        Returns:
            List of parsed text content (one per input in the same order).
        """
        # Pre-size the result list so batch results land at their input index.
        results = [None] * len(input_data)

        for i in tqdm(range(0, len(input_data), batch_size), desc="Parsing", file=sys.stdout):
            batch = input_data[i : i + batch_size]
            inputs = self._process_inputs(batch, prompt, **kwargs)
            batch_results = self._generate(inputs, **kwargs)
            for j, result in enumerate(batch_results):
                results[i + j] = result

        return results
@@ -0,0 +1,117 @@
1
+ """vLLM Engine backend for Infinity-Parser2.
2
+
3
+ Uses vLLM's LLM class for offline batch inference.
4
+ """
5
+
6
+ import sys
7
+ from typing import Union
8
+
9
+ from PIL import Image
10
+ from tqdm import tqdm
11
+ from vllm import LLM, SamplingParams
12
+
13
+ from .base import BaseBackend
14
+ from ..utils import encode_file_to_base64
15
+
16
+
17
class VLLMEngineBackend(BaseBackend):
    """Offline inference backend using vLLM Engine.

    Runs local batch inference through vLLM's LLM class, optionally sharded
    over several GPUs via tensor parallelism.
    Reference: https://docs.vllm.ai/en/latest/serving/offline_inference.html
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        device: str = "cuda",
        tensor_parallel_size: int = 1,
        min_pixels: int = 2048,
        max_pixels: int = 16777216,
        **kwargs,
    ):
        """Configure and eagerly initialize the vLLM engine.

        Args:
            model_name: Model name on HuggingFace Hub or local path.
            device: Device type, "cuda" or "cpu".
            tensor_parallel_size: Number of GPUs for tensor parallelism.
            min_pixels: Lower bound on image pixels fed to the model.
            max_pixels: Upper bound on image pixels fed to the model.
            **kwargs: Additional arguments for vllm.LLM.
        """
        super().__init__(model_name, device, **kwargs)
        self.max_pixels = max_pixels
        self.min_pixels = min_pixels
        self.tensor_parallel_size = tensor_parallel_size
        self.init()

    def init(self) -> None:
        """Create the underlying vLLM LLM instance."""
        # model_name may be either a HuggingFace model ID or a local path.
        self._llm = LLM(
            model=self.model_name,
            trust_remote_code=True,
            tensor_parallel_size=self.tensor_parallel_size,
            gpu_memory_utilization=0.85,
            **self.kwargs,
        )

    def _build_messages(self, base64_data: str, mime_type: str, prompt: str) -> list:
        """Assemble a single-turn chat message embedding the image as a data URL.

        Returns:
            List of message dictionaries.
        """
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{base64_data}"},
        }
        text_part = {"type": "text", "text": prompt}
        return [{"role": "user", "content": [image_part, text_part]}]

    def parse_batch(
        self,
        input_data: list[Union[str, Image.Image]],
        prompt: str,
        batch_size: int = 1,
        **kwargs,
    ) -> list[str]:
        """Parse multiple documents with batched inference.

        Args:
            input_data: List of file paths or PIL Images.
            prompt: Prompt text for the model.
            batch_size: Maximum number of images to process in one batch.
            **kwargs: Additional arguments.

        Returns:
            List of parsed text content (one per input in the same order).
        """
        sampling_params = SamplingParams(
            max_tokens=kwargs.get("max_new_tokens", 32768),
            temperature=kwargs.get("temperature", 0.0),
            top_p=kwargs.get("top_p", 1.0),
        )
        chat_template_kwargs = {"enable_thinking": False}

        # Encode every input up front so batching below is a pure slice.
        all_messages = []
        for source in input_data:
            b64, mime = encode_file_to_base64(
                source, min_pixels=self.min_pixels, max_pixels=self.max_pixels
            )
            all_messages.append(self._build_messages(b64, mime, prompt))

        results = [None] * len(input_data)
        for start in tqdm(range(0, len(all_messages), batch_size), desc="Parsing", file=sys.stdout):
            chunk = all_messages[start : start + batch_size]
            outputs = self._llm.chat(
                chunk,
                sampling_params=sampling_params,
                use_tqdm=False,
                chat_template_kwargs=chat_template_kwargs,
            )
            for offset, output in enumerate(outputs):
                results[start + offset] = output.outputs[0].text

        return results
@@ -0,0 +1,148 @@
1
+ """vLLM Server backend for Infinity-Parser2.
2
+
3
+ Uses vLLM OpenAI-Compatible Server for online inference.
4
+ """
5
+
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ import sys
8
+ from typing import Union
9
+
10
+ from openai import OpenAI
11
+ from PIL import Image
12
+ from tqdm import tqdm
13
+
14
+ from .base import BaseBackend
15
+ from ..utils import encode_file_to_base64
16
+
17
+
18
class VLLMServerBackend(BaseBackend):
    """Online inference backend using vLLM OpenAI-Compatible Server.

    Sends requests to a running vLLM server via HTTP API.
    Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        api_url: str = "http://localhost:8000/v1/chat/completions",
        api_key: str = "EMPTY",
        timeout: int = 300,
        min_pixels: int = 2048,
        max_pixels: int = 16777216,
        **kwargs,
    ):
        """Initialize vLLM Server backend.

        Args:
            model_name: Model name (must match server).
            api_url: Full URL to the chat completions endpoint.
            api_key: API key for authentication.
            timeout: Request timeout in seconds.
            min_pixels: Minimum number of pixels for image input.
            max_pixels: Maximum number of pixels for image input.
            **kwargs: Additional arguments for requests.
        """
        # "device" is meaningless for a remote server but BaseBackend
        # requires one; pop it so it isn't forwarded in kwargs.
        device = kwargs.pop("device", "cuda")
        super().__init__(model_name, device, **kwargs)
        self.api_url = api_url
        self.api_key = api_key
        self.timeout = timeout
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        # The OpenAI client wants the API base (".../v1"); derive it by
        # stripping the trailing "/chat/completions" from the full URL.
        self.client = OpenAI(api_key=self.api_key, base_url=self.api_url.rsplit("/chat/completions", 1)[0])
        self.init()

    def init(self) -> None:
        """Validate server connection.

        Note: This is a no-op as the server is started separately.
        Call this to verify connectivity.
        """
        try:
            # Cheap 1-token "ping" request with a short timeout to fail fast
            # when the server is unreachable.
            self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": "ping"}],
                max_tokens=1,
                timeout=5,
            )
        except Exception as e:
            raise RuntimeError(
                f"Cannot connect to vLLM server at {self.api_url}. "
                f"Please ensure the server is running. Error: {e}"
            )

    def parse_batch(
        self,
        input_data: list[Union[str, Image.Image]],
        prompt: str,
        batch_size: int = 1,
        **kwargs,
    ) -> list[str]:
        """Parse multiple documents via HTTP API with batched requests.

        Args:
            input_data: List of file paths or PIL Images.
            prompt: Prompt text for the model.
            batch_size: Maximum number of images to process in one batch.
            **kwargs: Additional arguments.

        Returns:
            List of parsed text content (one per input in the same order).

        Raises:
            RuntimeError: If any single request fails.
        """
        if not input_data:
            return []

        # Accept either "max_new_tokens" (local-backend convention) or the
        # OpenAI-style "max_tokens" name.
        max_tokens = kwargs.get("max_new_tokens", kwargs.get("max_tokens", 32768))
        temperature = kwargs.get("temperature", 0.0)
        top_p = kwargs.get("top_p", 1.0)
        extra_body = {
            "chat_template_kwargs": {
                "enable_thinking": False
            }
        }

        def parse_one(item: Union[str, Image.Image]) -> str:
            # Send the image inline as a base64 data URL with the prompt.
            base64_data, mime_type = encode_file_to_base64(item, min_pixels=self.min_pixels, max_pixels=self.max_pixels)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_data}"}},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                timeout=self.timeout,
                extra_body=extra_body,
            )
            return response.choices[0].message.content

        # Pre-size the result list; futures complete out of order, so each
        # result is written back at its original input index.
        results: list[str] = [None] * len(input_data)
        num_batches = (len(input_data) + batch_size - 1) // batch_size

        for batch_idx in range(num_batches):
            start = batch_idx * batch_size
            end = min(start + batch_size, len(input_data))
            batch_items = input_data[start:end]

            # One worker per item in this batch; batch_size caps concurrency.
            with ThreadPoolExecutor(max_workers=len(batch_items)) as executor:
                future_to_index = {
                    executor.submit(parse_one, item): start + i
                    for i, item in enumerate(batch_items)
                }

                for future in tqdm(as_completed(future_to_index), total=len(batch_items), desc=f"Batch {batch_idx + 1}/{num_batches}", file=sys.stdout):
                    idx = future_to_index[future]
                    try:
                        results[idx] = future.result()
                    except Exception as e:
                        # Fail the whole batch on the first error, preserving
                        # the original exception as the cause.
                        raise RuntimeError(
                            f"Failed to parse input at index {idx}: {e}"
                        ) from e

        return results