infinity-parser2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,159 @@
1
+ import json
2
+ import re
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ from PIL import Image
7
+
8
+
9
+ # ---------------------------------------------------------------------------
10
+ # JSON extraction & cleanup
11
+ # ---------------------------------------------------------------------------
12
+
13
def extract_json_content(text: str) -> str:
    """Extract the JSON block from a markdown-wrapped LLM response.

    Three cases are handled, in order:

    1. A complete fenced block (opening json fence, newline, payload, newline,
       closing fence): the payload is returned, stripped.
    2. An opening json fence with no well-formed closing fence (for example a
       response cut off mid-generation, or a closing fence glued directly to
       the payload): everything after the opening fence is returned, stripped,
       with a dangling closing fence removed if one is present.
    3. No json fence at all: ``text`` is returned unchanged.

    Args:
        text: Raw model output that may wrap JSON in a markdown code fence.

    Returns:
        The extracted JSON text, or ``text`` itself when no fence is found.
    """
    match = re.search(r"```json\n(.*?)\n```", text, re.DOTALL)
    if match:
        return match.group(1).strip()

    partial = re.search(r"```json\n(.*)", text, re.DOTALL)
    if partial:
        body = partial.group(1).strip()
        # The strict pattern above requires a newline before the closing
        # fence. When the model emits the fence right after the payload we
        # land here, and must not leak the fence into the JSON text.
        return body.removesuffix("```").rstrip()

    return text
22
+
23
+
24
def truncate_last_incomplete_element(text: str) -> tuple[str, bool]:
    """Drop the trailing layout element so the remaining JSON list can parse.

    The text is left alone when it is at most 50,000 characters long and
    (ignoring trailing whitespace) already ends with ``]``, or when it holds
    at most one ``{"bbox":`` element. Otherwise everything from the last
    ``{"bbox":`` onwards is cut off; if the remainder ends with a comma, that
    comma is replaced by ``]`` to close the list.

    Args:
        text: JSON-like text produced by the model.

    Returns:
        A ``(cleaned_text, was_truncated)`` tuple.
    """
    marker = '{"bbox":'

    ends_with_bracket = text.rstrip().endswith("]")
    if ends_with_bracket and len(text) <= 50_000:
        return text, False

    # With zero or one element there is nothing safe to cut back to.
    if text.count(marker) <= 1:
        return text, False

    head = text[: text.rfind(marker)].rstrip()
    if head.endswith(","):
        head = head[:-1] + "]"
    return head, True
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Coordinate normalisation
46
+ # ---------------------------------------------------------------------------
47
+
48
def obtain_origin_hw(image: Union[str, Path, Image.Image]) -> tuple[int, int]:
    """Return ``(height, width)`` of the image.

    Args:
        image: A PIL ``Image`` object, or a file path (``str`` / ``Path``)
            to open with PIL.

    Returns:
        ``(height, width)`` in pixels. If the path cannot be opened as an
        image, the best-effort fallback ``(1000, 1000)`` is returned instead
        of raising.
    """
    if isinstance(image, Image.Image):
        w, h = image.size
        return h, w  # (height, width)
    try:
        # The size is available as soon as the file is opened, so there is
        # no need to decode or convert the pixel data; the context manager
        # closes the underlying file handle deterministically.
        with Image.open(image) as img:
            w, h = img.size
    except Exception:
        # Deliberate best-effort: any failure to open the image falls back
        # to a 1000x1000 canvas rather than propagating.
        return 1000, 1000
    return h, w  # (height, width)
62
+
63
+
64
def restore_abs_bbox_coordinates(ans: str, origin_h: float, origin_w: float) -> str:
    """Convert normalised [0-1000] bboxes back to pixel coordinates.

    Every key containing ``"bbox"`` in each dict of the JSON list is expected
    to hold ``[x1, y1, x2, y2]``; x values are scaled by ``origin_w / 1000``
    and y values by ``origin_h / 1000``, then truncated to ``int``.

    Args:
        ans: JSON text, expected to be a list of layout-element dicts.
        origin_h: Height of the original image.
        origin_w: Width of the original image.

    Returns:
        The re-serialised JSON with rescaled bboxes. ``ans`` is returned
        unchanged when it is not valid JSON, when the top-level value is not
        a list, or when any bbox value is not a sequence of four numbers.
    """
    try:
        data = json.loads(ans)
    except json.JSONDecodeError:
        return ans

    # Anything other than a list of elements has no bboxes to restore.
    if not isinstance(data, list):
        return ans

    valid = True
    for item in data:
        # Tolerate stray non-dict entries instead of crashing on them.
        if not isinstance(item, dict):
            continue
        for key in item:
            if "bbox" not in key:
                continue
            bbox = item[key]
            if (
                isinstance(bbox, (list, tuple))
                and len(bbox) == 4
                and all(isinstance(c, (int, float)) for c in bbox)
            ):
                x1, y1, x2, y2 = bbox
                # Rebinding an existing key is safe while iterating the dict.
                item[key] = [
                    int(x1 / 1000.0 * origin_w),
                    int(y1 / 1000.0 * origin_h),
                    int(x2 / 1000.0 * origin_w),
                    int(y2 / 1000.0 * origin_h),
                ]
            else:
                # One malformed bbox invalidates the whole conversion so the
                # caller never sees a mix of normalised and pixel coordinates.
                valid = False

    return json.dumps(data, ensure_ascii=False) if valid else ans
89
+
90
+
91
+ # ---------------------------------------------------------------------------
92
+ # JSON → Markdown
93
+ # ---------------------------------------------------------------------------
94
+
95
def convert_json_to_markdown(ans: str, keep_header_footer: bool = False) -> str:
    """Render the layout JSON list as markdown text.

    The non-empty ``"text"`` fields of the list elements are joined with
    blank lines. Unless ``keep_header_footer`` is true, elements whose
    ``"category"`` is ``header``, ``footer`` or ``page_footnote`` are skipped.

    Args:
        ans: JSON text holding a list of layout-element dicts.
        keep_header_footer: Keep header/footer/footnote elements when true.

    Returns:
        The joined markdown. ``ans`` is returned unchanged when it cannot be
        processed, is not a JSON list, or yields no text at all.
    """
    skipped_categories = ("header", "footer", "page_footnote")
    try:
        parsed = json.loads(ans)
        if not isinstance(parsed, list):
            return ans
        chunks = [
            entry["text"]
            for entry in parsed
            if "text" in entry
            and entry["text"]
            and (
                keep_header_footer
                or entry.get("category") not in skipped_categories
            )
        ]
        return "\n\n".join(chunks) if chunks else ans
    except Exception:
        return ans
113
+
114
+
115
+ # ---------------------------------------------------------------------------
116
+ # DOC2JSON postprocess
117
+ # ---------------------------------------------------------------------------
118
+
119
def postprocess_doc2json_result(
    raw_text: str,
    image: Union[str, Path, Image.Image],
    output_format: str = "json",
) -> str:
    """Postprocess raw LLM output for DOC2JSON mode.

    Steps, in order:
      1. Extract the JSON block from the markdown-wrapped response.
      2. Truncate the last incomplete element so the JSON is parseable.
      3. Restore normalised [0-1000] bboxes to pixel coordinates of ``image``.
      4. When ``output_format`` is ``"md"``, convert the JSON to markdown.

    Args:
        raw_text: Raw model output.
        image: The source image (path or PIL Image) the bboxes refer to.
        output_format: ``"md"`` to get markdown; any other value returns the
            JSON text.

    Returns:
        The postprocessed JSON text, or markdown when requested.
    """
    json_text = extract_json_content(raw_text)
    # The "was truncated" flag is not needed here; keep only the text.
    json_text = truncate_last_incomplete_element(json_text)[0]

    height, width = obtain_origin_hw(image)
    restored = restore_abs_bbox_coordinates(json_text, height, width)

    if output_format != "md":
        return restored
    return convert_json_to_markdown(restored)
137
+
138
+
139
+ # ---------------------------------------------------------------------------
140
+ # Markdown cleanup
141
+ # ---------------------------------------------------------------------------
142
+
143
def postprocess_doc2md_result(text: str) -> str:
    """Remove the markdown code block fence wrapping the text.

    Strips one opening fence (a triple backtick, optionally tagged
    ``markdown``) from the beginning and one closing triple-backtick fence
    from the end of the text, if present.

    Only a single opening fence is removed: when the wrapped content itself
    starts with a fenced code block, that inner fence is preserved.

    Args:
        text: Input text that may contain markdown code block fences.

    Returns:
        Text with the wrapping code block fences removed.
    """
    text = text.strip()
    # One pattern, applied once: stripping the tagged and untagged fences in
    # two successive passes would also eat a code fence that opens the real
    # content right after the wrapper.
    text = re.sub(r"^```(?:markdown)?\s*\n?", "", text, count=1)
    text = re.sub(r"\n?```$", "", text)
    return text.strip()
@@ -0,0 +1,310 @@
1
+ Metadata-Version: 2.4
2
+ Name: infinity_parser2
3
+ Version: 0.1.0
4
+ Summary: Document parsing Python package supporting PDF and image parsing using Infinity-Parser2-Pro model.
5
+ Home-page: https://github.com/infly-ai/INF-MLLM
6
+ Author: INF Tech
7
+ Author-email: contact@inftech.ai
8
+ Keywords: document parsing
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.12
17
+ Description-Content-Type: text/markdown
18
+ Requires-Dist: transformers==5.3.0
19
+ Requires-Dist: tokenizers>=0.22.2
20
+ Requires-Dist: qwen-vl-utils>=0.0.14
21
+ Requires-Dist: Pillow>=9.0.0
22
+ Requires-Dist: pypdf>=3.0.0
23
+ Requires-Dist: pymupdf>=1.20.0
24
+ Requires-Dist: openai>=1.0.0
25
+ Requires-Dist: msgspec>=0.19.0
26
+ Requires-Dist: pybase64>=1.4.2
27
+ Requires-Dist: gguf>=0.17.1
28
+ Requires-Dist: cbor2>=5.7.0
29
+ Requires-Dist: py-cpuinfo>=9.0.0
30
+ Requires-Dist: distro>=1.9.0
31
+ Requires-Dist: openai_harmony>=0.0.4
32
+ Requires-Dist: fastapi>=0.135.1
33
+ Requires-Dist: starlette>=0.50.0
34
+ Requires-Dist: annotated_doc>=0.0.4
35
+ Requires-Dist: typing_inspection>=0.4.2
36
+ Requires-Dist: llguidance>=1.3.0
37
+ Requires-Dist: diskcache>=5.6.3
38
+ Requires-Dist: xgrammar>=0.1.29
39
+ Requires-Dist: partial_json_parser>=0.2.1.1.post6
40
+ Requires-Dist: huggingface-hub>=0.24.0
41
+ Requires-Dist: scikit-learn>=1.8.0
42
+ Requires-Dist: scipy>=1.17.1
43
+ Dynamic: author
44
+ Dynamic: author-email
45
+ Dynamic: classifier
46
+ Dynamic: description
47
+ Dynamic: description-content-type
48
+ Dynamic: home-page
49
+ Dynamic: keywords
50
+ Dynamic: requires-dist
51
+ Dynamic: requires-python
52
+ Dynamic: summary
53
+
54
+ # Infinity-Parser2
55
+
56
+ Infinity-Parser2 is a document parsing tool powered by the Infinity-Parser2-Pro model. It converts **PDF files** and **images** (PNG, JPG, WEBP) into structured Markdown or JSON with layout information.
57
+
58
+ ## Quick Start
59
+
60
+ ### Installation
61
+
62
+ #### Pre-requisites
63
+
64
+ ```bash
65
+ # Install PyTorch (CUDA). Find the proper version on the [official site](https://pytorch.org/get-started/previous-versions) based on your CUDA version.
66
+ pip install torch==2.10.0 torchvision==0.25.0 torchaudio==2.10.0 --index-url https://download.pytorch.org/whl/cu128
67
+
68
+ # Install FlashAttention (required for NVIDIA GPUs).
69
+ # This command builds flash-attn from source, which can take 10 to 30 minutes.
70
+ pip install flash-attn==2.8.3 --no-build-isolation
71
+ # For Hopper GPUs (e.g. H100, H800), we recommend FlashAttention-3 instead. See the [official guide](https://github.com/Dao-AILab/flash-attention).
72
+
73
+ # Install vLLM
74
+ # NOTE: you may need to run the command below to resolve triton and numpy conflicts before installing vllm.
75
+ # pip uninstall -y pytorch-triton opencv-python opencv-python-headless numpy && rm -rf "$(python -c 'import site; print(site.getsitepackages()[0])')/cv2"
76
+ pip install vllm==0.17.1
77
+ ```
78
+
79
+ #### Install infinity_parser2
80
+
81
+ ```bash
82
+ # From PyPI
83
+ pip install infinity_parser2
84
+
85
+ # From source
86
+ git clone https://github.com/infly-ai/INF-MLLM.git
87
+ cd INF-MLLM/Infinity-Parser2
88
+ pip install -e .
89
+ ```
90
+
91
+ ### Usage
92
+
93
+ #### Command Line
94
+
95
+ The `parser` command is the fastest way to get started.
96
+
97
+ ```bash
98
+ # Parse a PDF (outputs Markdown by default)
99
+ parser demo_data/demo.pdf
100
+
101
+ # Parse an image
102
+ parser demo_data/demo.png
103
+
104
+ # Batch parse multiple files
105
+ parser demo_data/demo.pdf demo_data/demo.png -o ./output
106
+
107
+ # Parse an entire directory
108
+ parser demo_data -o ./output
109
+
110
+ # Output raw JSON with layout bboxes
111
+ parser demo_data/demo.pdf --output-format json
112
+
113
+ # Convert to Markdown directly
114
+ parser demo_data/demo.png --task doc2md
115
+ ```
116
+
117
+ ```bash
118
+ # View all options
119
+ parser --help
120
+ ```
121
+
122
+ #### Python API
123
+
124
+ ```python
125
+ from infinity_parser2 import InfinityParser2
126
+
127
+ parser = InfinityParser2()
128
+
129
+ # Parse a single file (returns Markdown)
130
+ result = parser.parse("demo_data/demo.pdf")
131
+ print(result)
132
+
133
+ # Parse multiple files (returns list)
134
+ results = parser.parse(["demo_data/demo.pdf", "demo_data/demo.png"])
135
+
136
+ # Parse a directory (returns dict)
137
+ results = parser.parse("demo_data")
138
+ ```
139
+
140
+ **Output formats:**
141
+
142
+ | task_type | Description | Default Output |
143
+ |-------------|------------------------------------------------------|----------------|
144
+ | `doc2json` | Extract layout elements with bboxes (default) | Markdown |
145
+ | `doc2md` | Directly convert to Markdown | Markdown |
146
+ | `custom` | Use your own prompt | Raw model output |
147
+
148
+ ```python
149
+ # doc2json: get raw JSON with bbox coordinates
150
+ result = parser.parse("demo_data/demo.pdf", output_format="json")
151
+
152
+ # doc2md: direct Markdown conversion
153
+ result = parser.parse("demo_data/demo.pdf", task_type="doc2md")
154
+
155
+ # Custom prompt
156
+ result = parser.parse("demo_data/demo.pdf", task_type="custom",
157
+ custom_prompt="Extract the title and authors only.")
158
+
159
+ # Batch processing with custom batch size
160
+ result = parser.parse("demo_data", batch_size=8)
161
+
162
+ # Save results to directory
163
+ parser.parse("demo_data/demo.pdf", output_dir="./output")
164
+ ```
165
+
166
+ **Backends:**
167
+
168
+ Infinity-Parser2 supports three inference backends. By default it uses the **vLLM Engine** (offline batch inference).
169
+
170
+ ```python
171
+ # vLLM Engine (default) — offline batch inference
172
+ parser = InfinityParser2(
173
+ model_name="infly/Infinity-Parser2-Pro",
174
+ backend="vllm-engine", # default
175
+ tensor_parallel_size=2,
176
+ )
177
+
178
+ # Transformers — local single-GPU inference
179
+ parser = InfinityParser2(
180
+ model_name="infly/Infinity-Parser2-Pro",
181
+ backend="transformers",
182
+ device="cuda",
183
+ torch_dtype="bfloat16", # "float16" or "bfloat16"
184
+ )
185
+
186
+ # vLLM Server — online HTTP API (start server first)
187
+ parser = InfinityParser2(
188
+ model_name="infly/Infinity-Parser2-Pro",
189
+ backend="vllm-server",
190
+ api_url="http://localhost:8000/v1/chat/completions",
191
+ api_key="EMPTY",
192
+ )
193
+ ```
194
+
195
+ To start a vLLM server:
196
+
197
+ ```bash
198
+ vllm serve infly/Infinity-Parser2-Pro \
199
+ --trust-remote-code \
200
+ --reasoning-parser qwen3 \
201
+ --host 0.0.0.0 \
202
+ --port 8000 \
203
+ --tensor-parallel-size 2 \
204
+ --gpu-memory-utilization 0.85 \
205
+ --max-model-len 65536 \
206
+ --mm-encoder-tp-mode data \
207
+ --mm-processor-cache-type shm \
208
+ --enable-prefix-caching
209
+ ```
210
+
211
+ ## API Reference
212
+
213
+ ### InfinityParser2
214
+
215
+ ```python
216
+ parser = InfinityParser2(model_name="infly/Infinity-Parser2-Pro")
217
+ ```
218
+
219
+ | Parameter | Type | Default | Description |
220
+ |-----------|------|---------|-------------|
221
+ | `model_name` | `str` | `"infly/Infinity-Parser2-Pro"` | HuggingFace model name or local path |
222
+ | `backend` | `str` | `"vllm-engine"` | Inference backend: `"transformers"`, `"vllm-engine"`, or `"vllm-server"` |
223
+ | `tensor_parallel_size` | `int` | `None` | Tensor parallel size for the vLLM Engine backend; defaults to the number of available GPUs |
224
+ | `device` | `str` | `"cuda"` | Only `"cuda"` is supported |
225
+ | `api_url` | `str` | `"http://localhost:8000/v1/chat/completions"` | API URL for vLLM Server backend |
226
+ | `api_key` | `str` | `"EMPTY"` | API key for vLLM Server backend |
227
+ | `min_pixels` | `int` | `2048` | Minimum pixel count for image input (transformers backend only) |
228
+ | `max_pixels` | `int` | `16777216` | Maximum pixel count (4096x4096) for image input (transformers backend only) |
229
+ | `model_cache_dir` | `str` | `None` | Model cache directory (defaults to `~/.cache/infinity_parser2/`) |
230
+
231
+ ### parse()
232
+
233
+ ```python
234
+ result = parser.parse("demo_data/demo.pdf")
235
+ ```
236
+
237
+ | Parameter | Type | Default | Description |
238
+ |-----------|------|---------|-------------|
239
+ | `input_data` | `str \| List[str] \| PIL.Image` | **Required** | File path(s), directory path, or PIL Image object |
240
+ | `task_type` | `str` | `"doc2json"` | `"doc2json"` (layout to JSON) \| `"doc2md"` (direct Markdown) \| `"custom"` |
241
+ | `custom_prompt` | `str` | `None` | Custom prompt; required when `task_type="custom"` |
242
+ | `batch_size` | `int` | `4` | Number of images to process per batch |
243
+ | `output_dir` | `str` | `None` | If set, saves results to this directory instead of returning them |
244
+ | `output_format` | `str` | `"md"` | `"md"` \| `"json"`. Only `"md"` is supported for `doc2md` / `custom` tasks |
245
+ | `**kwargs` | — | — | Additional args passed to the model (e.g., `max_new_tokens`, `temperature`) |
246
+
247
+ ### Return Values
248
+
249
+ | Input | output_dir=None | output_dir set |
250
+ |-----------------|----------------------------------|---------------|
251
+ | Single file | `str` | `None` |
252
+ | List of files | `List[str]` | `None` |
253
+ | Directory | `Dict[str, str]` (path→content) | `None` |
254
+
255
+ When `output_dir` is set, results are saved to `output_dir/{filename}/result.md` (or `result.json`).
256
+
257
+ ## Advanced Usage
258
+
259
+ ### Model Caching
260
+
261
+ Models are downloaded automatically on first use and cached at `~/.cache/infinity_parser2/`. You can customize the cache location:
262
+
263
+ ```python
264
+ parser = InfinityParser2(
265
+ model_name="infly/Infinity-Parser2-Pro",
266
+ model_cache_dir="/path/to/cache"
267
+ )
268
+ ```
269
+
270
+ ### Generation Parameters
271
+
272
+ ```python
273
+ result = parser.parse(
274
+ "demo_data/demo.pdf",
275
+ max_new_tokens=16384,
276
+ temperature=0.01,
277
+ top_p=0.95,
278
+ )
279
+ ```
280
+
281
+ ### Utility Functions
282
+
283
+ ```python
284
+ from infinity_parser2 import (
285
+ convert_pdf_to_images,
286
+ convert_json_to_markdown,
287
+ extract_json_content,
288
+ get_files_from_directory,
289
+ is_supported_file,
290
+ SUPPORTED_TASK_TYPES,
291
+ ModelCache,
292
+ get_model_cache,
293
+ )
294
+
295
+ # Convert PDF pages to PIL Images
296
+ images = convert_pdf_to_images("demo_data/demo.pdf", dpi=300)
297
+
298
+ # Convert layout JSON to Markdown
299
+ markdown = convert_json_to_markdown(json_string)
300
+
301
+ # Check model cache
302
+ cache = get_model_cache()
303
+ print(cache.resolve_model_path("infly/Infinity-Parser2-Pro"))
304
+ ```
305
+
306
+ ## Requirements
307
+
308
+ - Python 3.12+
309
+ - CUDA-compatible GPU
310
+ - See `setup.py` for full dependency list.
@@ -0,0 +1,25 @@
1
+ infinity_parser2/__init__.py,sha256=mPnfmZGnRMZHSkkQU_An4F2caOw0gCeSVaxK2-Yzjio,650
2
+ infinity_parser2/__main__.py,sha256=jq6TasYXniIXVrHNwTEIKz2RQfMGbrPnsf0hFp__KEI,140
3
+ infinity_parser2/cli.py,sha256=oPoqzw0OK4KBP36VGt2Cp_i2vx-BVVL0ChRKI1u5Jcw,5953
4
+ infinity_parser2/parser.py,sha256=O1ofwt4vEIK8jvpRzqb9qkFmLfYYv3h5gdKi92oyUU0,11363
5
+ infinity_parser2/prompts.py,sha256=SL8BF-i7UxAcmzu7EqyRfFLfy8ZExD9m3Pw980t_KZs,2449
6
+ infinity_parser2/backends/__init__.py,sha256=nLAnCAE8MhzekyvnbxSh2_gome48J5kfMwAGevKdoVE,321
7
+ infinity_parser2/backends/base.py,sha256=Pd2eg2i6L-IH-d7CyAPQcHZGTsO8yskoPetNr461rRQ,1669
8
+ infinity_parser2/backends/transformers.py,sha256=xK6K8jKINUxgtFw29QiV38lYrCmgZK8t5idd5kIZV8g,5164
9
+ infinity_parser2/backends/vllm_engine.py,sha256=E6eutMOwjTMzoeupSrsVFdupRqDpigKlC3RXwPW1qkM,3884
10
+ infinity_parser2/backends/vllm_server.py,sha256=klJtF-dk38cy20Ovj2OyvKiWqftv6gtqh4qdYhAur-I,5232
11
+ infinity_parser2/utils/__init__.py,sha256=h_0QVqgdDFbAMQxyCyzZ7UpStpGc8XDUef1D5i_2nW8,1115
12
+ infinity_parser2/utils/file.py,sha256=XI-FyzJo45c21-P8G5mErZGw2GXiJTopBqjl4_UKTs0,6921
13
+ infinity_parser2/utils/image.py,sha256=OIPLqPJfYIWZ2lS-iyaetuK9xL1KkF7wPqQVkzX-D8g,2969
14
+ infinity_parser2/utils/model.py,sha256=i6le1AIiHSk3qhm7Ffaq6wnVuqxASiu0rvssNPmcH0k,7993
15
+ infinity_parser2/utils/pdf.py,sha256=_85akAxDFtkJqH186pxeHM8FPAB01b5FoEyUx48Bgeg,1243
16
+ infinity_parser2/utils/utils.py,sha256=xorQdku2vSQUpFEplR_WSt8lYVZV9MY4GJDF8X_itVY,5242
17
+ tests/__init__.py,sha256=7Rlm_cmZDLA9lgnW5glysBurhFGIw625JteUd2Cy_mw,47
18
+ tests/test_backends.py,sha256=qJ_fVDyTLxjbSH_XkWtE6KbjF1hWJ71vjKz0uyg5AQA,22598
19
+ tests/test_parser.py,sha256=G2U2s3HPDIi4SJmj4F22tP-H8I03bm_lC9Tog3r9yjg,21298
20
+ tests/test_utils.py,sha256=KWQABAkdnnujuU5NTi8AFdlfR_FFbraWjL2fh1bazUQ,28672
21
+ infinity_parser2-0.1.0.dist-info/METADATA,sha256=J29ekUlaXS4iRn-pYHGPel1hkem4aFur_DO7wk3s-Uo,9849
22
+ infinity_parser2-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
23
+ infinity_parser2-0.1.0.dist-info/entry_points.txt,sha256=OnFPg-KtPNeV_J8g_NX_kTWJoQ1PxDR5KCktGzJ5TQo,53
24
+ infinity_parser2-0.1.0.dist-info/top_level.txt,sha256=hvtIIcwbRgweH2KgktUwTMHcEIDL_4wK7Eees2ux8ws,23
25
+ infinity_parser2-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ parser = infinity_parser2.cli:main
@@ -0,0 +1,2 @@
1
+ infinity_parser2
2
+ tests
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Unit tests for Infinity-Parser2 package."""