infinity-parser2 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/PKG-INFO +110 -10
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/README.md +109 -9
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/__init__.py +1 -1
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/cli.py +1 -1
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/parser.py +5 -2
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2.egg-info/PKG-INFO +110 -10
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/setup.py +1 -1
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/__main__.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/backends/__init__.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/backends/base.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/backends/transformers.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/backends/vllm_engine.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/backends/vllm_server.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/prompts.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/utils/__init__.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/utils/file.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/utils/image.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/utils/model.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/utils/pdf.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2/utils/utils.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2.egg-info/SOURCES.txt +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2.egg-info/dependency_links.txt +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2.egg-info/entry_points.txt +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2.egg-info/requires.txt +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2.egg-info/top_level.txt +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/setup.cfg +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/tests/__init__.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/tests/test_backends.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/tests/test_parser.py +0 -0
- {infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: infinity_parser2
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Document parsing Python package supporting PDF and image parsing using Infinity-Parser2-Pro model.
|
|
5
5
|
Home-page: https://github.com/infly-ai/INF-MLLM
|
|
6
6
|
Author: INF Tech
|
|
@@ -83,7 +83,101 @@ We are excited to release Infinity-Parser2-Pro, our latest flagship document und
|
|
|
83
83
|
|
|
84
84
|
## Quick Start
|
|
85
85
|
|
|
86
|
-
###
|
|
86
|
+
### 1. Minimal "Hello World" (Native Transformers)
|
|
87
|
+
|
|
88
|
+
If you are looking for a minimal script to parse a single image to Markdown using the native `transformers` library, here is a simple snippet:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from PIL import Image
|
|
92
|
+
import torch
|
|
93
|
+
from transformers import AutoModelForImageTextToText, AutoProcessor
|
|
94
|
+
from qwen_vl_utils import process_vision_info
|
|
95
|
+
|
|
96
|
+
# Load the model and processor
|
|
97
|
+
model = AutoModelForImageTextToText.from_pretrained(
|
|
98
|
+
"infly/Infinity-Parser2-Pro",
|
|
99
|
+
torch_dtype="float16",
|
|
100
|
+
device_map="auto",
|
|
101
|
+
)
|
|
102
|
+
processor = AutoProcessor.from_pretrained("infly/Infinity-Parser2-Pro")
|
|
103
|
+
|
|
104
|
+
# Build the messages for the model
|
|
105
|
+
pil_image = Image.open("demo_data/demo.png").convert("RGB")
|
|
106
|
+
min_pixels = 2048 # 32 * 64
|
|
107
|
+
max_pixels = 16777216 # 4096 * 4096
|
|
108
|
+
prompt = """
|
|
109
|
+
Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
|
|
110
|
+
1. Bbox format: [x1, y1, x2, y2]
|
|
111
|
+
2. Layout Categories: The possible categories are ['header', 'title', 'text', 'figure', 'table', 'formula', 'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote', 'table_footnote', 'page_footnote', 'footer'].
|
|
112
|
+
3. Text Extraction & Formatting Rules:
|
|
113
|
+
- Figure: For the 'figure' category, the text field should be empty string.
|
|
114
|
+
- Formula: Format its text as LaTeX.
|
|
115
|
+
- Table: Format its text as HTML.
|
|
116
|
+
- All Others (Text, Title, etc.): Format their text as Markdown.
|
|
117
|
+
4. Constraints:
|
|
118
|
+
- The output text must be the original text from the image, with no translation.
|
|
119
|
+
- All layout elements must be sorted according to human reading order.
|
|
120
|
+
5. Final Output: The entire output must be a single JSON object.
|
|
121
|
+
"""
|
|
122
|
+
|
|
123
|
+
messages = [
|
|
124
|
+
{
|
|
125
|
+
"role": "user",
|
|
126
|
+
"content": [
|
|
127
|
+
{
|
|
128
|
+
"type": "image",
|
|
129
|
+
"image": pil_image,
|
|
130
|
+
"min_pixels": min_pixels,
|
|
131
|
+
"max_pixels": max_pixels,
|
|
132
|
+
},
|
|
133
|
+
{"type": "text", "text": prompt},
|
|
134
|
+
],
|
|
135
|
+
}
|
|
136
|
+
]
|
|
137
|
+
|
|
138
|
+
chat_template_kwargs = {"enable_thinking": False}
|
|
139
|
+
|
|
140
|
+
text = processor.apply_chat_template(
|
|
141
|
+
messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs
|
|
142
|
+
)
|
|
143
|
+
image_inputs, _ = process_vision_info(messages, image_patch_size=16)
|
|
144
|
+
|
|
145
|
+
inputs = processor(
|
|
146
|
+
text=text,
|
|
147
|
+
images=image_inputs,
|
|
148
|
+
do_resize=False,
|
|
149
|
+
padding=True,
|
|
150
|
+
return_tensors="pt",
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
# Move all tensors to the same device as the model
|
|
154
|
+
inputs = {
|
|
155
|
+
k: v.to(model.device) if isinstance(v, torch.Tensor) else v
|
|
156
|
+
for k, v in inputs.items()
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
# Generate the response
|
|
160
|
+
generated_ids = model.generate(
|
|
161
|
+
**inputs,
|
|
162
|
+
max_new_tokens=32768,
|
|
163
|
+
temperature=0.0,
|
|
164
|
+
top_p=1.0,
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
# Strip input tokens, keeping only the newly generated response
|
|
168
|
+
generated_ids_trimmed = [
|
|
169
|
+
out_ids[len(in_ids) :]
|
|
170
|
+
for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
|
|
171
|
+
]
|
|
172
|
+
output_text = processor.batch_decode(
|
|
173
|
+
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
|
174
|
+
)
|
|
175
|
+
print(output_text)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### 2. Advanced Pipeline (infinity_parser2)
|
|
179
|
+
|
|
180
|
+
For bulk processing, advanced features, or an end-to-end PDF parsing pipeline, we recommend using our infinity_parser2 wrapper.
|
|
87
181
|
|
|
88
182
|
#### Pre-requisites
|
|
89
183
|
|
|
@@ -95,10 +189,12 @@ conda activate infinity_parser2
|
|
|
95
189
|
# Install PyTorch (CUDA). Find the proper version at https://pytorch.org/get-started/previous-versions based on your CUDA version.
|
|
96
190
|
pip install torch==2.10.0 torchvision==0.25.0 torchaudio==2.10.0 --index-url https://download.pytorch.org/whl/cu128
|
|
97
191
|
|
|
98
|
-
# Install FlashAttention (
|
|
99
|
-
#
|
|
192
|
+
# Install FlashAttention (FlashAttention-2 is recommended by default)
|
|
193
|
+
# Standard install (compiles from source, ~10-30 min):
|
|
100
194
|
pip install flash-attn==2.8.3 --no-build-isolation
|
|
101
|
-
#
|
|
195
|
+
# Faster install: download wheel from https://github.com/Dao-AILab/flash-attention/releases. Then run: pip install /path/to/<wheel_filename>.whl
|
|
196
|
+
# For Hopper GPUs (e.g. H100, H800), we recommend FlashAttention-3 instead. See: https://github.com/Dao-AILab/flash-attention
|
|
197
|
+
# NOTE: The code will prioritize detecting FlashAttention-3. If not found, it falls back to FlashAttention-2.
|
|
102
198
|
|
|
103
199
|
# Install vLLM
|
|
104
200
|
# NOTE: you may need to run the command below to resolve triton and numpy conflicts before installing vllm.
|
|
@@ -108,19 +204,23 @@ pip install vllm==0.17.1
|
|
|
108
204
|
|
|
109
205
|
#### Install infinity_parser2
|
|
110
206
|
|
|
207
|
+
Install from PyPI
|
|
208
|
+
|
|
111
209
|
```bash
|
|
112
|
-
# From PyPI
|
|
113
210
|
pip install infinity_parser2
|
|
211
|
+
```
|
|
114
212
|
|
|
115
|
-
|
|
213
|
+
Install from source code
|
|
214
|
+
|
|
215
|
+
```bash
|
|
116
216
|
git clone https://github.com/infly-ai/INF-MLLM.git
|
|
117
217
|
cd INF-MLLM/Infinity-Parser2
|
|
118
218
|
pip install -e .
|
|
119
219
|
```
|
|
120
220
|
|
|
121
|
-
|
|
221
|
+
#### Usage
|
|
122
222
|
|
|
123
|
-
|
|
223
|
+
##### Command Line
|
|
124
224
|
|
|
125
225
|
The `parser` command is the fastest way to get started.
|
|
126
226
|
|
|
@@ -151,7 +251,7 @@ parser demo_data/demo.png --task doc2md
|
|
|
151
251
|
parser --help
|
|
152
252
|
```
|
|
153
253
|
|
|
154
|
-
|
|
254
|
+
##### Python API
|
|
155
255
|
|
|
156
256
|
```python
|
|
157
257
|
# NOTE: The Infinity-Parser2 model will be automatically downloaded on the first run.
|
|
@@ -30,7 +30,101 @@ We are excited to release Infinity-Parser2-Pro, our latest flagship document und
|
|
|
30
30
|
|
|
31
31
|
## Quick Start
|
|
32
32
|
|
|
33
|
-
###
|
|
33
|
+
### 1. Minimal "Hello World" (Native Transformers)
|
|
34
|
+
|
|
35
|
+
If you are looking for a minimal script to parse a single image to Markdown using the native `transformers` library, here is a simple snippet:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from PIL import Image
|
|
39
|
+
import torch
|
|
40
|
+
from transformers import AutoModelForImageTextToText, AutoProcessor
|
|
41
|
+
from qwen_vl_utils import process_vision_info
|
|
42
|
+
|
|
43
|
+
# Load the model and processor
|
|
44
|
+
model = AutoModelForImageTextToText.from_pretrained(
|
|
45
|
+
"infly/Infinity-Parser2-Pro",
|
|
46
|
+
torch_dtype="float16",
|
|
47
|
+
device_map="auto",
|
|
48
|
+
)
|
|
49
|
+
processor = AutoProcessor.from_pretrained("infly/Infinity-Parser2-Pro")
|
|
50
|
+
|
|
51
|
+
# Build the messages for the model
|
|
52
|
+
pil_image = Image.open("demo_data/demo.png").convert("RGB")
|
|
53
|
+
min_pixels = 2048 # 32 * 64
|
|
54
|
+
max_pixels = 16777216 # 4096 * 4096
|
|
55
|
+
prompt = """
|
|
56
|
+
Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
|
|
57
|
+
1. Bbox format: [x1, y1, x2, y2]
|
|
58
|
+
2. Layout Categories: The possible categories are ['header', 'title', 'text', 'figure', 'table', 'formula', 'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote', 'table_footnote', 'page_footnote', 'footer'].
|
|
59
|
+
3. Text Extraction & Formatting Rules:
|
|
60
|
+
- Figure: For the 'figure' category, the text field should be empty string.
|
|
61
|
+
- Formula: Format its text as LaTeX.
|
|
62
|
+
- Table: Format its text as HTML.
|
|
63
|
+
- All Others (Text, Title, etc.): Format their text as Markdown.
|
|
64
|
+
4. Constraints:
|
|
65
|
+
- The output text must be the original text from the image, with no translation.
|
|
66
|
+
- All layout elements must be sorted according to human reading order.
|
|
67
|
+
5. Final Output: The entire output must be a single JSON object.
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
messages = [
|
|
71
|
+
{
|
|
72
|
+
"role": "user",
|
|
73
|
+
"content": [
|
|
74
|
+
{
|
|
75
|
+
"type": "image",
|
|
76
|
+
"image": pil_image,
|
|
77
|
+
"min_pixels": min_pixels,
|
|
78
|
+
"max_pixels": max_pixels,
|
|
79
|
+
},
|
|
80
|
+
{"type": "text", "text": prompt},
|
|
81
|
+
],
|
|
82
|
+
}
|
|
83
|
+
]
|
|
84
|
+
|
|
85
|
+
chat_template_kwargs = {"enable_thinking": False}
|
|
86
|
+
|
|
87
|
+
text = processor.apply_chat_template(
|
|
88
|
+
messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs
|
|
89
|
+
)
|
|
90
|
+
image_inputs, _ = process_vision_info(messages, image_patch_size=16)
|
|
91
|
+
|
|
92
|
+
inputs = processor(
|
|
93
|
+
text=text,
|
|
94
|
+
images=image_inputs,
|
|
95
|
+
do_resize=False,
|
|
96
|
+
padding=True,
|
|
97
|
+
return_tensors="pt",
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
# Move all tensors to the same device as the model
|
|
101
|
+
inputs = {
|
|
102
|
+
k: v.to(model.device) if isinstance(v, torch.Tensor) else v
|
|
103
|
+
for k, v in inputs.items()
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
# Generate the response
|
|
107
|
+
generated_ids = model.generate(
|
|
108
|
+
**inputs,
|
|
109
|
+
max_new_tokens=32768,
|
|
110
|
+
temperature=0.0,
|
|
111
|
+
top_p=1.0,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Strip input tokens, keeping only the newly generated response
|
|
115
|
+
generated_ids_trimmed = [
|
|
116
|
+
out_ids[len(in_ids) :]
|
|
117
|
+
for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
|
|
118
|
+
]
|
|
119
|
+
output_text = processor.batch_decode(
|
|
120
|
+
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
|
121
|
+
)
|
|
122
|
+
print(output_text)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### 2. Advanced Pipeline (infinity_parser2)
|
|
126
|
+
|
|
127
|
+
For bulk processing, advanced features, or an end-to-end PDF parsing pipeline, we recommend using our infinity_parser2 wrapper.
|
|
34
128
|
|
|
35
129
|
#### Pre-requisites
|
|
36
130
|
|
|
@@ -42,10 +136,12 @@ conda activate infinity_parser2
|
|
|
42
136
|
# Install PyTorch (CUDA). Find the proper version at https://pytorch.org/get-started/previous-versions based on your CUDA version.
|
|
43
137
|
pip install torch==2.10.0 torchvision==0.25.0 torchaudio==2.10.0 --index-url https://download.pytorch.org/whl/cu128
|
|
44
138
|
|
|
45
|
-
# Install FlashAttention (
|
|
46
|
-
#
|
|
139
|
+
# Install FlashAttention (FlashAttention-2 is recommended by default)
|
|
140
|
+
# Standard install (compiles from source, ~10-30 min):
|
|
47
141
|
pip install flash-attn==2.8.3 --no-build-isolation
|
|
48
|
-
#
|
|
142
|
+
# Faster install: download wheel from https://github.com/Dao-AILab/flash-attention/releases. Then run: pip install /path/to/<wheel_filename>.whl
|
|
143
|
+
# For Hopper GPUs (e.g. H100, H800), we recommend FlashAttention-3 instead. See: https://github.com/Dao-AILab/flash-attention
|
|
144
|
+
# NOTE: The code will prioritize detecting FlashAttention-3. If not found, it falls back to FlashAttention-2.
|
|
49
145
|
|
|
50
146
|
# Install vLLM
|
|
51
147
|
# NOTE: you may need to run the command below to resolve triton and numpy conflicts before installing vllm.
|
|
@@ -55,19 +151,23 @@ pip install vllm==0.17.1
|
|
|
55
151
|
|
|
56
152
|
#### Install infinity_parser2
|
|
57
153
|
|
|
154
|
+
Install from PyPI
|
|
155
|
+
|
|
58
156
|
```bash
|
|
59
|
-
# From PyPI
|
|
60
157
|
pip install infinity_parser2
|
|
158
|
+
```
|
|
61
159
|
|
|
62
|
-
|
|
160
|
+
Install from source code
|
|
161
|
+
|
|
162
|
+
```bash
|
|
63
163
|
git clone https://github.com/infly-ai/INF-MLLM.git
|
|
64
164
|
cd INF-MLLM/Infinity-Parser2
|
|
65
165
|
pip install -e .
|
|
66
166
|
```
|
|
67
167
|
|
|
68
|
-
|
|
168
|
+
#### Usage
|
|
69
169
|
|
|
70
|
-
|
|
170
|
+
##### Command Line
|
|
71
171
|
|
|
72
172
|
The `parser` command is the fastest way to get started.
|
|
73
173
|
|
|
@@ -98,7 +198,7 @@ parser demo_data/demo.png --task doc2md
|
|
|
98
198
|
parser --help
|
|
99
199
|
```
|
|
100
200
|
|
|
101
|
-
|
|
201
|
+
##### Python API
|
|
102
202
|
|
|
103
203
|
```python
|
|
104
204
|
# NOTE: The Infinity-Parser2 model will be automatically downloaded on the first run.
|
|
@@ -86,8 +86,11 @@ class InfinityParser2:
|
|
|
86
86
|
self.kwargs = kwargs
|
|
87
87
|
|
|
88
88
|
# Initialize model cache and resolve model path (stored separately)
|
|
89
|
-
|
|
90
|
-
|
|
89
|
+
if self.backend_name == "vllm-server":
|
|
90
|
+
self._model_path = self.model_name
|
|
91
|
+
else:
|
|
92
|
+
cache = get_model_cache(model_cache_dir)
|
|
93
|
+
self._model_path = cache.resolve_model_path(self.model_name)
|
|
91
94
|
|
|
92
95
|
self._backend: BaseBackend = self._init_backend()
|
|
93
96
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: infinity_parser2
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Document parsing Python package supporting PDF and image parsing using Infinity-Parser2-Pro model.
|
|
5
5
|
Home-page: https://github.com/infly-ai/INF-MLLM
|
|
6
6
|
Author: INF Tech
|
|
@@ -83,7 +83,101 @@ We are excited to release Infinity-Parser2-Pro, our latest flagship document und
|
|
|
83
83
|
|
|
84
84
|
## Quick Start
|
|
85
85
|
|
|
86
|
-
###
|
|
86
|
+
### 1. Minimal "Hello World" (Native Transformers)
|
|
87
|
+
|
|
88
|
+
If you are looking for a minimal script to parse a single image to Markdown using the native `transformers` library, here is a simple snippet:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from PIL import Image
|
|
92
|
+
import torch
|
|
93
|
+
from transformers import AutoModelForImageTextToText, AutoProcessor
|
|
94
|
+
from qwen_vl_utils import process_vision_info
|
|
95
|
+
|
|
96
|
+
# Load the model and processor
|
|
97
|
+
model = AutoModelForImageTextToText.from_pretrained(
|
|
98
|
+
"infly/Infinity-Parser2-Pro",
|
|
99
|
+
torch_dtype="float16",
|
|
100
|
+
device_map="auto",
|
|
101
|
+
)
|
|
102
|
+
processor = AutoProcessor.from_pretrained("infly/Infinity-Parser2-Pro")
|
|
103
|
+
|
|
104
|
+
# Build the messages for the model
|
|
105
|
+
pil_image = Image.open("demo_data/demo.png").convert("RGB")
|
|
106
|
+
min_pixels = 2048 # 32 * 64
|
|
107
|
+
max_pixels = 16777216 # 4096 * 4096
|
|
108
|
+
prompt = """
|
|
109
|
+
Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
|
|
110
|
+
1. Bbox format: [x1, y1, x2, y2]
|
|
111
|
+
2. Layout Categories: The possible categories are ['header', 'title', 'text', 'figure', 'table', 'formula', 'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote', 'table_footnote', 'page_footnote', 'footer'].
|
|
112
|
+
3. Text Extraction & Formatting Rules:
|
|
113
|
+
- Figure: For the 'figure' category, the text field should be empty string.
|
|
114
|
+
- Formula: Format its text as LaTeX.
|
|
115
|
+
- Table: Format its text as HTML.
|
|
116
|
+
- All Others (Text, Title, etc.): Format their text as Markdown.
|
|
117
|
+
4. Constraints:
|
|
118
|
+
- The output text must be the original text from the image, with no translation.
|
|
119
|
+
- All layout elements must be sorted according to human reading order.
|
|
120
|
+
5. Final Output: The entire output must be a single JSON object.
|
|
121
|
+
"""
|
|
122
|
+
|
|
123
|
+
messages = [
|
|
124
|
+
{
|
|
125
|
+
"role": "user",
|
|
126
|
+
"content": [
|
|
127
|
+
{
|
|
128
|
+
"type": "image",
|
|
129
|
+
"image": pil_image,
|
|
130
|
+
"min_pixels": min_pixels,
|
|
131
|
+
"max_pixels": max_pixels,
|
|
132
|
+
},
|
|
133
|
+
{"type": "text", "text": prompt},
|
|
134
|
+
],
|
|
135
|
+
}
|
|
136
|
+
]
|
|
137
|
+
|
|
138
|
+
chat_template_kwargs = {"enable_thinking": False}
|
|
139
|
+
|
|
140
|
+
text = processor.apply_chat_template(
|
|
141
|
+
messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs
|
|
142
|
+
)
|
|
143
|
+
image_inputs, _ = process_vision_info(messages, image_patch_size=16)
|
|
144
|
+
|
|
145
|
+
inputs = processor(
|
|
146
|
+
text=text,
|
|
147
|
+
images=image_inputs,
|
|
148
|
+
do_resize=False,
|
|
149
|
+
padding=True,
|
|
150
|
+
return_tensors="pt",
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
# Move all tensors to the same device as the model
|
|
154
|
+
inputs = {
|
|
155
|
+
k: v.to(model.device) if isinstance(v, torch.Tensor) else v
|
|
156
|
+
for k, v in inputs.items()
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
# Generate the response
|
|
160
|
+
generated_ids = model.generate(
|
|
161
|
+
**inputs,
|
|
162
|
+
max_new_tokens=32768,
|
|
163
|
+
temperature=0.0,
|
|
164
|
+
top_p=1.0,
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
# Strip input tokens, keeping only the newly generated response
|
|
168
|
+
generated_ids_trimmed = [
|
|
169
|
+
out_ids[len(in_ids) :]
|
|
170
|
+
for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
|
|
171
|
+
]
|
|
172
|
+
output_text = processor.batch_decode(
|
|
173
|
+
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
|
174
|
+
)
|
|
175
|
+
print(output_text)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### 2. Advanced Pipeline (infinity_parser2)
|
|
179
|
+
|
|
180
|
+
For bulk processing, advanced features, or an end-to-end PDF parsing pipeline, we recommend using our infinity_parser2 wrapper.
|
|
87
181
|
|
|
88
182
|
#### Pre-requisites
|
|
89
183
|
|
|
@@ -95,10 +189,12 @@ conda activate infinity_parser2
|
|
|
95
189
|
# Install PyTorch (CUDA). Find the proper version at https://pytorch.org/get-started/previous-versions based on your CUDA version.
|
|
96
190
|
pip install torch==2.10.0 torchvision==0.25.0 torchaudio==2.10.0 --index-url https://download.pytorch.org/whl/cu128
|
|
97
191
|
|
|
98
|
-
# Install FlashAttention (
|
|
99
|
-
#
|
|
192
|
+
# Install FlashAttention (FlashAttention-2 is recommended by default)
|
|
193
|
+
# Standard install (compiles from source, ~10-30 min):
|
|
100
194
|
pip install flash-attn==2.8.3 --no-build-isolation
|
|
101
|
-
#
|
|
195
|
+
# Faster install: download wheel from https://github.com/Dao-AILab/flash-attention/releases. Then run: pip install /path/to/<wheel_filename>.whl
|
|
196
|
+
# For Hopper GPUs (e.g. H100, H800), we recommend FlashAttention-3 instead. See: https://github.com/Dao-AILab/flash-attention
|
|
197
|
+
# NOTE: The code will prioritize detecting FlashAttention-3. If not found, it falls back to FlashAttention-2.
|
|
102
198
|
|
|
103
199
|
# Install vLLM
|
|
104
200
|
# NOTE: you may need to run the command below to resolve triton and numpy conflicts before installing vllm.
|
|
@@ -108,19 +204,23 @@ pip install vllm==0.17.1
|
|
|
108
204
|
|
|
109
205
|
#### Install infinity_parser2
|
|
110
206
|
|
|
207
|
+
Install from PyPI
|
|
208
|
+
|
|
111
209
|
```bash
|
|
112
|
-
# From PyPI
|
|
113
210
|
pip install infinity_parser2
|
|
211
|
+
```
|
|
114
212
|
|
|
115
|
-
|
|
213
|
+
Install from source code
|
|
214
|
+
|
|
215
|
+
```bash
|
|
116
216
|
git clone https://github.com/infly-ai/INF-MLLM.git
|
|
117
217
|
cd INF-MLLM/Infinity-Parser2
|
|
118
218
|
pip install -e .
|
|
119
219
|
```
|
|
120
220
|
|
|
121
|
-
|
|
221
|
+
#### Usage
|
|
122
222
|
|
|
123
|
-
|
|
223
|
+
##### Command Line
|
|
124
224
|
|
|
125
225
|
The `parser` command is the fastest way to get started.
|
|
126
226
|
|
|
@@ -151,7 +251,7 @@ parser demo_data/demo.png --task doc2md
|
|
|
151
251
|
parser --help
|
|
152
252
|
```
|
|
153
253
|
|
|
154
|
-
|
|
254
|
+
##### Python API
|
|
155
255
|
|
|
156
256
|
```python
|
|
157
257
|
# NOTE: The Infinity-Parser2 model will be automatically downloaded on the first run.
|
|
@@ -32,7 +32,7 @@ install_requires = [
|
|
|
32
32
|
|
|
33
33
|
setup(
|
|
34
34
|
name="infinity_parser2",
|
|
35
|
-
version="0.2.0",
|
|
35
|
+
version="0.3.0",
|
|
36
36
|
description="Document parsing Python package supporting PDF and image parsing using Infinity-Parser2-Pro model.",
|
|
37
37
|
long_description=open("README.md", "r", encoding="utf-8").read(),
|
|
38
38
|
long_description_content_type="text/markdown",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{infinity_parser2-0.2.0 → infinity_parser2-0.3.0}/infinity_parser2.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|