infinity-parser2 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infinity_parser2/__init__.py +28 -0
- infinity_parser2/__main__.py +6 -0
- infinity_parser2/backends/__init__.py +13 -0
- infinity_parser2/backends/base.py +61 -0
- infinity_parser2/backends/transformers.py +159 -0
- infinity_parser2/backends/vllm_engine.py +117 -0
- infinity_parser2/backends/vllm_server.py +148 -0
- infinity_parser2/cli.py +207 -0
- infinity_parser2/parser.py +278 -0
- infinity_parser2/prompts.py +57 -0
- infinity_parser2/utils/__init__.py +43 -0
- infinity_parser2/utils/file.py +190 -0
- infinity_parser2/utils/image.py +99 -0
- infinity_parser2/utils/model.py +243 -0
- infinity_parser2/utils/pdf.py +46 -0
- infinity_parser2/utils/utils.py +159 -0
- infinity_parser2-0.1.0.dist-info/METADATA +310 -0
- infinity_parser2-0.1.0.dist-info/RECORD +25 -0
- infinity_parser2-0.1.0.dist-info/WHEEL +5 -0
- infinity_parser2-0.1.0.dist-info/entry_points.txt +2 -0
- infinity_parser2-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +1 -0
- tests/test_backends.py +490 -0
- tests/test_parser.py +464 -0
- tests/test_utils.py +689 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Infinity-Parser2: Document parsing Python package."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from .parser import InfinityParser2
|
|
6
|
+
from .backends import (
|
|
7
|
+
BaseBackend,
|
|
8
|
+
TransformersBackend,
|
|
9
|
+
VLLMEngineBackend,
|
|
10
|
+
VLLMServerBackend,
|
|
11
|
+
)
|
|
12
|
+
from .prompts import PROMPT_DOC2JSON, PROMPT_DOC2MD, SUPPORTED_TASK_TYPES
|
|
13
|
+
from .utils import convert_pdf_to_images
|
|
14
|
+
from .cli import main as cli_main
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"InfinityParser2",
|
|
18
|
+
"BaseBackend",
|
|
19
|
+
"TransformersBackend",
|
|
20
|
+
"VLLMEngineBackend",
|
|
21
|
+
"VLLMServerBackend",
|
|
22
|
+
"convert_pdf_to_images",
|
|
23
|
+
"PROMPT_DOC2JSON",
|
|
24
|
+
"PROMPT_DOC2MD",
|
|
25
|
+
"SUPPORTED_TASK_TYPES",
|
|
26
|
+
"__version__",
|
|
27
|
+
"cli_main",
|
|
28
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Inference backends for Infinity-Parser2."""
|
|
2
|
+
|
|
3
|
+
from .base import BaseBackend
|
|
4
|
+
from .transformers import TransformersBackend
|
|
5
|
+
from .vllm_engine import VLLMEngineBackend
|
|
6
|
+
from .vllm_server import VLLMServerBackend
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"BaseBackend",
|
|
10
|
+
"TransformersBackend",
|
|
11
|
+
"VLLMEngineBackend",
|
|
12
|
+
"VLLMServerBackend",
|
|
13
|
+
]
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Base backend interface for Infinity-Parser2."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, Dict, Union
|
|
5
|
+
|
|
6
|
+
from PIL import Image
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BaseBackend(ABC):
    """Common interface shared by every inference backend.

    Concrete subclasses provide two operations: :meth:`init`, which
    loads the model/processor, and :meth:`parse_batch`, which turns a
    list of documents into parsed text.
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        device: str = "cuda",
        **kwargs,
    ):
        """Store common backend configuration.

        Args:
            model_name: HuggingFace Hub model ID or a local model path.
            device: Target device type, "cuda" or "cpu".
            **kwargs: Extra backend-specific options, kept verbatim so
                subclasses can forward them to their underlying engine.
        """
        # Plain attribute storage; subclasses decide how these are used.
        self.kwargs = kwargs
        self.device = device
        self.model_name = model_name

    @abstractmethod
    def init(self) -> None:
        """Load the model and processor; must run before parse_batch()."""
        ...

    @abstractmethod
    def parse_batch(
        self,
        input_data: list[Union[str, Image.Image]],
        prompt: str,
        batch_size: int = 1,
        **kwargs,
    ) -> list[str]:
        """Parse a list of documents.

        Args:
            input_data: Inputs to parse; each entry is either a file
                path (image or PDF) or an in-memory ``PIL.Image.Image``.
            prompt: Instruction text sent to the model.
            batch_size: Upper bound on images processed in one batch.
            **kwargs: Extra arguments forwarded to the model.

        Returns:
            Parsed text for each input, in the same order as the inputs.
        """
        ...
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Transformers backend for Infinity-Parser2."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Union
|
|
5
|
+
|
|
6
|
+
from PIL import Image
|
|
7
|
+
import torch
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
from transformers import AutoModelForImageTextToText, AutoProcessor
|
|
10
|
+
from qwen_vl_utils import process_vision_info
|
|
11
|
+
|
|
12
|
+
from ..utils import load_image
|
|
13
|
+
from .base import BaseBackend
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TransformersBackend(BaseBackend):
    """Inference backend using HuggingFace transformers.

    Supports local model inference with automatic device mapping.
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        device: str = "cuda",
        torch_dtype: str = "bfloat16",
        min_pixels: int = 2048,
        max_pixels: int = 16777216,
        **kwargs,
    ):
        """Initialize Transformers backend.

        Args:
            model_name: Model name on HuggingFace Hub or local path.
            device: Device type, "cuda" or "cpu".
            torch_dtype: Data type for model weights, "float16" or "bfloat16".
            min_pixels: Minimum number of pixels for image input.
            max_pixels: Maximum number of pixels for image input.
            **kwargs: Additional arguments for AutoModelForImageTextToText.from_pretrained.
        """
        super().__init__(model_name, device, **kwargs)
        # Resolve the dtype name on the torch module; silently falls back to
        # bfloat16 when the name is unknown (e.g. a typo in torch_dtype).
        self.torch_dtype = getattr(torch, torch_dtype, torch.bfloat16)
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        # Model and processor are loaded eagerly at construction time.
        self.init()

    def init(self) -> None:
        """Initialize the model and processor."""
        # device_map="auto" lets accelerate place shards across available
        # devices; the `device` argument itself is not consumed here.
        self._model = AutoModelForImageTextToText.from_pretrained(
            self.model_name,
            torch_dtype=self.torch_dtype,
            device_map="auto",
            **self.kwargs,
        )
        self._processor = AutoProcessor.from_pretrained(self.model_name)

    def _process_inputs(
        self,
        inputs: list[Union[str, Image.Image]],
        prompt: str,
        **kwargs,
    ) -> dict:
        """Process inputs for generation.

        Builds one single-turn chat conversation per image, renders the chat
        template, extracts vision features, and tokenizes everything as one
        padded batch.

        Returns:
            Dictionary with processed inputs for the model.
        """
        # load_image accepts either a file path or a PIL image.
        images = [load_image(item) for item in inputs]

        # One conversation (list of messages) per image; pixel bounds are
        # attached per-image so qwen_vl_utils can resize accordingly.
        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": img, "min_pixels": self.min_pixels, "max_pixels": self.max_pixels},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            for img in images
        ]

        # Disable the model's "thinking" mode in the chat template.
        chat_template_kwargs = {"enable_thinking": False}

        text = self._processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs
        )
        # Video outputs are ignored; only image tensors are used.
        image_inputs, _ = process_vision_info(messages, image_patch_size=16)

        # NOTE(review): the `inputs` parameter is rebound here to the
        # processor's output batch; do_resize=False because resizing was
        # already handled by process_vision_info above — confirm intended.
        inputs = self._processor(
            text=text,
            images=image_inputs,
            do_resize=False,
            padding=True,
            return_tensors="pt",
        )
        # Some processors emit token_type_ids, which generate() rejects.
        inputs.pop("token_type_ids", None)
        return inputs

    def _generate(self, inputs: dict, **kwargs) -> list[str]:
        """Run model generation and decode outputs.

        Args:
            inputs: Processed inputs from _process_inputs.
            **kwargs: Generation arguments.

        Returns:
            List of generated text outputs.
        """
        # Move tensors to device
        inputs = {
            k: v.to(self._model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        # NOTE(review): temperature/top_p are passed without do_sample;
        # transformers typically ignores them (with a warning) when greedy
        # decoding is active — confirm the intended sampling behavior.
        generated_ids = self._model.generate(
            **inputs,
            max_new_tokens=kwargs.get("max_new_tokens", 32768),
            temperature=kwargs.get("temperature", 0.0),
            top_p=kwargs.get("top_p", 1.0),
        )

        # generate() returns prompt + completion; strip the prompt tokens by
        # slicing off each row's input length.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
        ]
        output_text = self._processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text

    def parse_batch(
        self,
        input_data: list[Union[str, Image.Image]],
        prompt: str,
        batch_size: int = 1,
        **kwargs,
    ) -> list[str]:
        """Parse multiple documents with batched inference.

        Args:
            input_data: List of file paths or PIL Images.
            prompt: Prompt text for the model.
            batch_size: Maximum number of images to process in one batch.
            **kwargs: Additional arguments.

        Returns:
            List of parsed text content (one per input in the same order).
        """
        # Pre-size the result list so batches can be written by index.
        results = [None] * len(input_data)

        for i in tqdm(range(0, len(input_data), batch_size), desc="Parsing", file=sys.stdout):
            batch = input_data[i : i + batch_size]
            inputs = self._process_inputs(batch, prompt, **kwargs)
            batch_results = self._generate(inputs, **kwargs)
            # Scatter batch outputs back to their global positions.
            for j, result in enumerate(batch_results):
                results[i + j] = result

        return results
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""vLLM Engine backend for Infinity-Parser2.
|
|
2
|
+
|
|
3
|
+
Uses vLLM's LLM class for offline batch inference.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import sys
|
|
7
|
+
from typing import Union
|
|
8
|
+
|
|
9
|
+
from PIL import Image
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
from vllm import LLM, SamplingParams
|
|
12
|
+
|
|
13
|
+
from .base import BaseBackend
|
|
14
|
+
from ..utils import encode_file_to_base64
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class VLLMEngineBackend(BaseBackend):
    """Offline batch-inference backend built on the vLLM engine.

    Wraps ``vllm.LLM`` for local inference, optionally sharded across
    GPUs via tensor parallelism.
    Reference: https://docs.vllm.ai/en/latest/serving/offline_inference.html
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        device: str = "cuda",
        tensor_parallel_size: int = 1,
        min_pixels: int = 2048,
        max_pixels: int = 16777216,
        **kwargs,
    ):
        """Create the backend and eagerly build the vLLM engine.

        Args:
            model_name: Model name on HuggingFace Hub or local path.
            device: Device type, "cuda" or "cpu".
            tensor_parallel_size: Number of GPUs for tensor parallelism.
            min_pixels: Lower bound on encoded image pixel count.
            max_pixels: Upper bound on encoded image pixel count.
            **kwargs: Extra keyword arguments forwarded to ``vllm.LLM``.
        """
        super().__init__(model_name, device, **kwargs)
        self.max_pixels = max_pixels
        self.min_pixels = min_pixels
        self.tensor_parallel_size = tensor_parallel_size
        self.init()

    def init(self) -> None:
        """Construct the underlying ``vllm.LLM`` instance."""
        # model_name may be either a HuggingFace model ID or a local path.
        self._llm = LLM(
            model=self.model_name,
            trust_remote_code=True,
            tensor_parallel_size=self.tensor_parallel_size,
            gpu_memory_utilization=0.85,
            **self.kwargs,
        )

    def _build_messages(self, encoded: str, mime: str, prompt: str) -> list:
        """Return a single-turn chat conversation embedding one image.

        The image travels inline as a ``data:`` URL with a base64 payload.
        """
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:{mime};base64,{encoded}"},
        }
        text_part = {"type": "text", "text": prompt}
        return [{"role": "user", "content": [image_part, text_part]}]

    def parse_batch(
        self,
        input_data: list[Union[str, Image.Image]],
        prompt: str,
        batch_size: int = 1,
        **kwargs,
    ) -> list[str]:
        """Parse documents in fixed-size batches through ``LLM.chat``.

        Args:
            input_data: List of file paths or PIL Images.
            prompt: Prompt text for the model.
            batch_size: Number of conversations submitted per chat call.
            **kwargs: May contain ``max_new_tokens``, ``temperature``, ``top_p``.

        Returns:
            List of parsed text content (one per input in the same order).
        """
        sampling_params = SamplingParams(
            max_tokens=kwargs.get("max_new_tokens", 32768),
            temperature=kwargs.get("temperature", 0.0),
            top_p=kwargs.get("top_p", 1.0),
        )
        # Disable the model's "thinking" mode in the chat template.
        chat_template_kwargs = {"enable_thinking": False}

        # Pre-encode every input up front so the batching loop stays trivial.
        conversations = []
        for item in input_data:
            encoded, mime = encode_file_to_base64(
                item, min_pixels=self.min_pixels, max_pixels=self.max_pixels
            )
            conversations.append(self._build_messages(encoded, mime, prompt))

        results: list[str] = []
        batch_starts = range(0, len(conversations), batch_size)
        for start in tqdm(batch_starts, desc="Parsing", file=sys.stdout):
            chunk = conversations[start : start + batch_size]
            outputs = self._llm.chat(
                chunk,
                sampling_params=sampling_params,
                use_tqdm=False,
                chat_template_kwargs=chat_template_kwargs,
            )
            # LLM.chat preserves submission order, so appending keeps
            # results aligned with input_data.
            results.extend(out.outputs[0].text for out in outputs)

        return results
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""vLLM Server backend for Infinity-Parser2.
|
|
2
|
+
|
|
3
|
+
Uses vLLM OpenAI-Compatible Server for online inference.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
+
import sys
|
|
8
|
+
from typing import Union
|
|
9
|
+
|
|
10
|
+
from openai import OpenAI
|
|
11
|
+
from PIL import Image
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
|
|
14
|
+
from .base import BaseBackend
|
|
15
|
+
from ..utils import encode_file_to_base64
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class VLLMServerBackend(BaseBackend):
    """Online inference backend for a vLLM OpenAI-compatible server.

    Each document becomes one HTTP chat-completion request against a
    server that must already be running.
    Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        api_url: str = "http://localhost:8000/v1/chat/completions",
        api_key: str = "EMPTY",
        timeout: int = 300,
        min_pixels: int = 2048,
        max_pixels: int = 16777216,
        **kwargs,
    ):
        """Configure the HTTP client and verify the server is reachable.

        Args:
            model_name: Model name; must match what the server serves.
            api_url: Full URL of the chat-completions endpoint.
            api_key: API key used for authentication.
            timeout: Per-request timeout in seconds.
            min_pixels: Lower bound on encoded image pixel count.
            max_pixels: Upper bound on encoded image pixel count.
            **kwargs: Extra options stored on the backend.
        """
        # No local GPU is used; "device" is accepted only for interface parity.
        device = kwargs.pop("device", "cuda")
        super().__init__(model_name, device, **kwargs)
        self.api_url = api_url
        self.api_key = api_key
        self.timeout = timeout
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        # The OpenAI client expects the API root, not the full endpoint URL.
        base_url = self.api_url.rsplit("/chat/completions", 1)[0]
        self.client = OpenAI(api_key=self.api_key, base_url=base_url)
        self.init()

    def init(self) -> None:
        """Check connectivity by sending a tiny one-token completion.

        The server itself is started separately; this only verifies that
        it answers requests.

        Raises:
            RuntimeError: If the server cannot be reached.
        """
        try:
            # Minimal 1-token "ping" request with a short 5s timeout.
            self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": "ping"}],
                max_tokens=1,
                timeout=5,
            )
        except Exception as e:
            raise RuntimeError(
                f"Cannot connect to vLLM server at {self.api_url}. "
                f"Please ensure the server is running. Error: {e}"
            )

    def parse_batch(
        self,
        input_data: list[Union[str, Image.Image]],
        prompt: str,
        batch_size: int = 1,
        **kwargs,
    ) -> list[str]:
        """Parse documents via concurrent HTTP requests, batch by batch.

        Args:
            input_data: List of file paths or PIL Images.
            prompt: Prompt text for the model.
            batch_size: Number of concurrent requests per wave.
            **kwargs: May contain ``max_new_tokens``/``max_tokens``,
                ``temperature`` and ``top_p``.

        Returns:
            List of parsed text content (one per input in the same order).

        Raises:
            RuntimeError: If any individual request fails.
        """
        if not input_data:
            return []

        max_tokens = kwargs.get("max_new_tokens", kwargs.get("max_tokens", 32768))
        temperature = kwargs.get("temperature", 0.0)
        top_p = kwargs.get("top_p", 1.0)
        # vLLM-specific knob: disable "thinking" in the chat template.
        extra_body = {
            "chat_template_kwargs": {
                "enable_thinking": False
            }
        }

        def request_one(doc: Union[str, Image.Image]) -> str:
            """Encode one document and run a single chat completion."""
            encoded, mime = encode_file_to_base64(
                doc, min_pixels=self.min_pixels, max_pixels=self.max_pixels
            )
            content = [
                {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{encoded}"}},
                {"type": "text", "text": prompt},
            ]
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": content}],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                timeout=self.timeout,
                extra_body=extra_body,
            )
            return response.choices[0].message.content

        total = len(input_data)
        results: list[str] = [None] * total
        num_batches = (total + batch_size - 1) // batch_size

        for batch_idx in range(num_batches):
            start = batch_idx * batch_size
            batch_items = input_data[start : min(start + batch_size, total)]

            # One worker per item in the wave; requests run concurrently.
            with ThreadPoolExecutor(max_workers=len(batch_items)) as pool:
                pending = {
                    pool.submit(request_one, doc): start + offset
                    for offset, doc in enumerate(batch_items)
                }

                progress = tqdm(
                    as_completed(pending),
                    total=len(batch_items),
                    desc=f"Batch {batch_idx + 1}/{num_batches}",
                    file=sys.stdout,
                )
                for done in progress:
                    idx = pending[done]
                    try:
                        # Completion order is arbitrary; idx restores order.
                        results[idx] = done.result()
                    except Exception as e:
                        raise RuntimeError(
                            f"Failed to parse input at index {idx}: {e}"
                        ) from e

        return results
|