infinity-parser2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,207 @@
1
+ """Command-line interface for Infinity-Parser2."""
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+ from typing import List, Optional
7
+
8
+ from . import InfinityParser2
9
+
10
+
11
def parse_bool(value: str) -> bool:
    """Interpret a textual flag as a boolean.

    Matching is case-insensitive: "true", "1" and "yes" map to True, while
    "false", "0" and "no" map to False.

    Args:
        value: The text to interpret.

    Returns:
        The boolean the text stands for.

    Raises:
        argparse.ArgumentTypeError: If the text is none of the accepted spellings.
    """
    lowered = value.lower()
    spellings_by_outcome = (
        (True, ("true", "1", "yes")),
        (False, ("false", "0", "no")),
    )
    for outcome, spellings in spellings_by_outcome:
        if lowered in spellings:
            return outcome
    raise argparse.ArgumentTypeError(f"Invalid boolean value: {value}")
18
+
19
+
20
def _positive_int(value: str) -> int:
    """Argparse ``type`` callable: parse *value* as an integer that is at least 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the command-line interface.

    Returns:
        A parser that accepts one or more positional ``input`` paths plus the
        options controlling output, task type, backend and model settings.
        ``--batch-size`` and ``--tensor-parallel-size`` must be positive
        integers; other values are rejected as usage errors.
    """
    # The epilog is shown verbatim (RawDescriptionHelpFormatter), so the
    # examples are indented here to render as a block under the heading.
    parser = argparse.ArgumentParser(
        prog="parser",
        description="Infinity-Parser2: Document parsing tool using Infinity-Parser2-Pro model.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Parse a PDF file (default: doc2json -> markdown output)
  parser document.pdf

  # Parse with doc2md task type
  parser document.pdf --task doc2md

  # Parse with custom prompt
  parser document.pdf --task custom --prompt "Extract the title and authors"

  # Parse multiple files
  parser doc1.pdf doc2.png --output-dir ./results

  # Parse a directory
  parser ./docs --output-dir ./results

  # Output raw JSON
  parser document.pdf --output-format json

  # Use transformers backend
  parser document.pdf --backend transformers

  # Use vllm-server backend
  parser document.pdf --backend vllm-server --api-url http://localhost:8000/v1/chat/completions
""",
    )

    parser.add_argument(
        "input",
        nargs="+",
        help="Input file(s) or directory path. Supports PDF, PNG, JPG, JPEG, WEBP.",
    )
    parser.add_argument(
        "-o", "--output-dir",
        default=None,
        help="Output directory. If not provided, result is printed to stdout.",
    )
    parser.add_argument(
        "--task",
        default="doc2json",
        choices=["doc2json", "doc2md", "custom"],
        help="Parsing task type. Defaults to 'doc2json'.",
    )
    parser.add_argument(
        "--prompt",
        default=None,
        help="Custom prompt used only when --task custom.",
    )
    parser.add_argument(
        "--output-format",
        default="md",
        choices=["md", "json"],
        help="Output format. Defaults to 'md'.",
    )
    parser.add_argument(
        "--batch-size",
        type=_positive_int,  # zero or negative batch sizes are rejected up front
        default=4,
        help="Batch size for inference. Defaults to 4.",
    )
    parser.add_argument(
        "--backend",
        default="vllm-engine",
        choices=["transformers", "vllm-engine", "vllm-server"],
        help="Inference backend. Defaults to 'vllm-engine'.",
    )
    parser.add_argument(
        "--model-name",
        default="infly/Infinity-Parser2-Pro",
        help="Model name on HuggingFace Hub or local path.",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        type=_positive_int,  # the None default bypasses the type callable
        default=None,
        help="Tensor parallel size for vllm-engine backend.",
    )
    parser.add_argument(
        "--api-url",
        default="http://localhost:8000/v1/chat/completions",
        help="API URL for vllm-server backend.",
    )
    parser.add_argument(
        "--api-key",
        default="EMPTY",
        help="API key for vllm-server backend.",
    )
    parser.add_argument(
        "--model-cache-dir",
        default=None,
        help="Model cache directory.",
    )
    parser.add_argument(
        "--min-pixels",
        type=int,
        default=2048,
        help="Minimum number of pixels for image input (transformers backend only).",
    )
    parser.add_argument(
        "--max-pixels",
        type=int,
        default=16777216,
        help="Maximum number of pixels for image input (transformers backend only).",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print verbose output.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Infinity-Parser2 0.1.0",
    )

    return parser
143
+
144
+
145
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point.

    Parses the command line, builds an ``InfinityParser2`` client and runs
    ``parse`` on the given input. Results returned by ``parse`` are printed
    to stdout; when ``parse`` returns ``None`` nothing is printed unless
    ``--verbose`` is set.

    Args:
        argv: Argument list to parse. ``None`` makes argparse read the
            process arguments.

    Returns:
        0 on success, 1 if constructing the client or parsing raised an
        exception (the message is written to stderr).

    Invalid command lines, including the option combinations checked below,
    end the process through ``parser.error`` (``SystemExit`` with status 2).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Fail fast on combinations that InfinityParser2.parse rejects anyway, so
    # the user gets a usage error before the client is constructed.
    if args.task == "custom" and args.prompt is None:
        parser.error("--prompt is required when --task custom is used")
    if args.output_format == "json" and args.task != "doc2json":
        parser.error("--output-format json is only supported with --task doc2json")

    # A single directory argument is passed as a plain string; anything else
    # is passed as the list of paths.
    input_paths = args.input
    if len(input_paths) == 1 and os.path.isdir(input_paths[0]):
        input_data = input_paths[0]
    else:
        input_data = input_paths

    if args.verbose:
        print(f"[Infinity-Parser2] Backend: {args.backend}")
        print(f"[Infinity-Parser2] Model: {args.model_name}")
        print(f"[Infinity-Parser2] Task: {args.task}")
        print(f"[Infinity-Parser2] Input: {input_data}")

    try:
        parser_client = InfinityParser2(
            model_name=args.model_name,
            backend=args.backend,
            tensor_parallel_size=args.tensor_parallel_size,
            api_url=args.api_url,
            api_key=args.api_key,
            min_pixels=args.min_pixels,
            max_pixels=args.max_pixels,
            model_cache_dir=args.model_cache_dir,
        )

        result = parser_client.parse(
            input_data=input_data,
            task_type=args.task,
            custom_prompt=args.prompt,
            batch_size=args.batch_size,
            output_dir=args.output_dir,
            output_format=args.output_format,
        )

        if result is not None:
            if isinstance(result, dict):
                # Directory input: one section per file path.
                for path, content in result.items():
                    print(f"=== {path} ===")
                    print(content)
            elif isinstance(result, list):
                for item in result:
                    print(item)
            else:
                print(result)
        elif args.verbose:
            print("[Infinity-Parser2] Results saved to output directory.")

        return 0

    except Exception as e:
        # Top-level boundary: report the failure and map it to exit status 1.
        print(f"[Infinity-Parser2] Error: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 1
204
+
205
+
206
if __name__ == "__main__":
    # Run the CLI and hand its return value to the interpreter as exit status.
    exit_status = main()
    sys.exit(exit_status)
@@ -0,0 +1,278 @@
1
+ """Infinity-Parser2 main interface."""
2
+
3
+ import os
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional, Union
7
+
8
+ import torch
9
+ from PIL import Image
10
+
11
+ from .backends import (
12
+ BaseBackend,
13
+ TransformersBackend,
14
+ VLLMEngineBackend,
15
+ VLLMServerBackend,
16
+ )
17
+ from .prompts import PROMPT_DOC2JSON, PROMPT_DOC2MD, SUPPORTED_TASK_TYPES
18
+ from .utils import *
19
+
20
+
21
+ BACKEND_REGISTRY = {
22
+ "transformers": TransformersBackend,
23
+ "vllm-engine": VLLMEngineBackend,
24
+ "vllm-server": VLLMServerBackend,
25
+ }
26
+
27
+
28
class InfinityParser2:
    """Document parser using Infinity-Parser2-Pro model.

    Supports parsing of PDF files and images (PNG, JPG, etc.) into structured text.

    Args:
        model_name: Model name on HuggingFace Hub (e.g., "infly/Infinity-Parser2-Pro")
            or local path to a downloaded model. Defaults to "infly/Infinity-Parser2-Pro".
        backend: Inference backend. Options:
            - "transformers": HuggingFace transformers (local inference)
            - "vllm-engine": vLLM Engine (local batch inference via LLM class)
            - "vllm-server": vLLM OpenAI-Compatible Server (HTTP API)
            Defaults to "vllm-engine".
        tensor_parallel_size: Tensor parallel size for vllm-engine.
            Defaults to the number of available GPUs (via torch.cuda.device_count()).
        device: Device type, must be "cuda". Raises ValueError if set to anything else.
        api_url: API URL for vllm-server backend.
        api_key: API key for vllm-server backend.
        min_pixels: Minimum number of pixels for image input (transformers backend only).
            Defaults to 2048.
        max_pixels: Maximum number of pixels for image input (transformers backend only).
            Defaults to 16777216 (~4096x4096).
        model_cache_dir: Directory handed to get_model_cache() when resolving the
            model path. Defaults to None.
        **kwargs: Additional arguments passed to the backend.

    Example:
        >>> from infinity_parser2 import InfinityParser2
        >>> parser = InfinityParser2(model_name="infly/Infinity-Parser2-Pro")
        >>> result = parser.parse("document.pdf")
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        backend: str = "vllm-engine",
        tensor_parallel_size: Optional[int] = None,
        device: str = "cuda",
        api_url: str = "http://localhost:8000/v1/chat/completions",
        api_key: str = "EMPTY",
        min_pixels: int = 2048,
        max_pixels: int = 16777216,
        model_cache_dir: Optional[str] = None,
        **kwargs,
    ):
        if device != "cuda":
            raise ValueError("device must be 'cuda' for Infinity-Parser2-Pro.")

        self.model_name = model_name
        # Backend names are matched case-insensitively against BACKEND_REGISTRY.
        self.backend_name = backend.lower()
        self.tensor_parallel_size = (
            tensor_parallel_size
            if tensor_parallel_size is not None
            else torch.cuda.device_count()
        )
        self.device = device
        self.api_url = api_url
        self.api_key = api_key
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.kwargs = kwargs

        # Initialize model cache and resolve model path (stored separately
        # from model_name, which keeps the value the caller passed in).
        cache = get_model_cache(model_cache_dir)
        self._model_path = cache.resolve_model_path(self.model_name)

        self._backend: BaseBackend = self._init_backend()

    def _init_backend(self) -> BaseBackend:
        """Initialize and return the backend instance.

        Raises:
            ValueError: If the configured backend name is not in BACKEND_REGISTRY.
        """
        if self.backend_name not in BACKEND_REGISTRY:
            raise ValueError(
                f"Unsupported backend: {self.backend_name}. "
                f"Supported backends: {list(BACKEND_REGISTRY.keys())}"
            )
        backend_cls = BACKEND_REGISTRY[self.backend_name]
        # Extra constructor kwargs come last, so they override the common values.
        common_kwargs = {
            "model_name": self._model_path,
            "device": self.device,
            "min_pixels": self.min_pixels,
            "max_pixels": self.max_pixels,
            **self.kwargs,
        }
        if self.backend_name == "vllm-server":
            backend_kwargs = {**common_kwargs, "api_url": self.api_url, "api_key": self.api_key}
        elif self.backend_name == "vllm-engine":
            backend_kwargs = {**common_kwargs, "tensor_parallel_size": self.tensor_parallel_size}
        else:  # transformers
            backend_kwargs = common_kwargs
        return backend_cls(**backend_kwargs)

    def _resolve_prompt(self, task_type: str, custom_prompt: Optional[str]) -> str:
        """Resolve the prompt to use based on task_type and custom_prompt.

        Args:
            task_type: The task type (e.g., "doc2json", "doc2md", "custom").
            custom_prompt: Custom prompt, only used when task_type is "custom".

        Returns:
            The resolved prompt string.

        Raises:
            ValueError: If task_type is "custom" and custom_prompt is None.
        """
        if task_type == "custom":
            # Raised explicitly rather than asserted: assert statements are
            # removed under `python -O`, which would let None through.
            if custom_prompt is None:
                raise ValueError("custom_prompt must be provided when task_type='custom'")
            return custom_prompt
        if task_type == "doc2json":
            return PROMPT_DOC2JSON
        if task_type == "doc2md":
            return PROMPT_DOC2MD
        # Fallback for unknown task types (should not happen with proper validation)
        return "Please transform the document's contents into Markdown format."

    def parse(
        self,
        input_data: Union[str, List[str], Image.Image],
        task_type: str = "doc2json",
        custom_prompt: Optional[str] = None,
        batch_size: int = 4,
        output_dir: Optional[str] = None,
        output_format: str = "md",
        **kwargs,
    ) -> Optional[Union[str, List[str], Dict[str, str]]]:
        """Parse document(s) and extract text content.

        Args:
            input_data: Input can be:
                - str: Single file path or directory path
                - List[str]: List of file paths
                - PIL.Image.Image: Image object
            task_type: Parsing task type. Options:
                - "doc2json": Extract layout to JSON, return JSON string.
                - "doc2md": Directly convert to Markdown, return Markdown.
                - "custom": Use custom_prompt for parsing.
                Defaults to "doc2json".
            custom_prompt: Custom prompt text for the model. Used only when
                task_type is "custom". Defaults to None.
            batch_size: Number of images to process in one batch. Defaults to 4.
            output_dir: If provided, results are saved to output_dir and this function
                returns None. If None, results are returned directly.
            output_format: Output format for results. Options: "md" or "json".
                Defaults to "md".
                - For doc2json tasks:
                    - output_format="md": Returns markdown (converts JSON to markdown
                      via convert_json_to_markdown). If output_dir is set, saves only
                      the markdown result.
                    - output_format="json": Returns raw JSON result. If output_dir is
                      set, saves only the JSON result.
                - For doc2md tasks or custom prompts: Only "md" is supported.
                  If "json" is passed, a ValueError will be raised.
            **kwargs: Additional arguments passed to the model.

        Returns:
            When output_dir is None:
                - str: Parsed result for a single file or image.
                - List[str]: Parsed results for a list of files.
                - Dict[str, str]: Mapping from file path to parsed result for a directory.
            When output_dir is set, returns None.

        Raises:
            ValueError: If task_type or output_format is unsupported, if
                output_format is "json" for a task other than "doc2json", or if
                task_type is "custom" and custom_prompt is None.

        Example:
            >>> parser = InfinityParser2()
            >>> # Single file, returns str
            >>> result = parser.parse("document.pdf")
            >>> # Multiple files, returns List[str]
            >>> result = parser.parse(["doc1.pdf", "doc2.pdf"])
            >>> # Directory, returns Dict[str, str]
            >>> result = parser.parse("/path/to/docs")
            >>> # Save results to output_dir, returns None
            >>> parser.parse("document.pdf", output_dir="./output")
        """
        if task_type not in SUPPORTED_TASK_TYPES:
            raise ValueError(f"task_type must be one of {SUPPORTED_TASK_TYPES}, got '{task_type}'")

        if output_format not in SUPPORTED_OUTPUT_FORMATS:
            raise ValueError(f"output_format must be one of {SUPPORTED_OUTPUT_FORMATS}, got '{output_format}'")

        if output_format == "json" and task_type != "doc2json":
            raise ValueError(
                "output_format='json' is only supported for doc2json tasks. "
                "For other task types, output_format must be 'md'."
            )

        prompt = self._resolve_prompt(task_type, custom_prompt)

        # Remember whether the caller passed a directory before the input is
        # normalized, because that decides the shape of the return value.
        is_directory = isinstance(input_data, str) and os.path.isdir(input_data)
        file_paths = normalize_input(input_data)
        file_results = self._parse_files(
            file_paths, prompt, task_type, batch_size, output_format, **kwargs
        )

        if output_dir is not None:
            save_results(
                file_paths, file_results, output_dir,
                task_type=task_type, output_format=output_format
            )
            return None
        if is_directory:
            return dict(zip(file_paths, file_results))
        if len(file_results) == 1:
            return file_results[0]
        return file_results

    def _parse_files(
        self,
        inputs: List[Union[str, Image.Image]],
        prompt: Optional[str],
        task_type: str,
        batch_size: int = 4,
        output_format: str = "md",
        **kwargs,
    ) -> List[str]:
        """Parse multiple files with batched inference.

        All images (including PDF pages) are collected and batched together for
        efficient inference. Results are then aggregated back to the original
        file-level granularity.

        Args:
            inputs: Files or images to parse.
            prompt: Prompt forwarded to the backend.
            task_type: Selects the post-processing applied to each raw result.
            batch_size: Batch size forwarded to the backend.
            output_format: "md" or "json"; controls doc2json post-processing
                and how multi-page results are joined.
            **kwargs: Additional arguments forwarded to the backend.

        Returns:
            One result string per input. An input with several pages is joined
            into a JSON array for "json", or with blank lines otherwise.
        """

        # prepare batch entries
        batch_entries = prepare_batch_entries(inputs)
        if not batch_entries:
            # NOTE(review): kept as-is for compatibility; this yields "" (not a
            # list) when there is at most one input.
            return [] if len(inputs) > 1 else ""

        # parse batch
        raw_inputs = [entry[1] for entry in batch_entries]
        batch_results = self._backend.parse_batch(raw_inputs, prompt, batch_size=batch_size, **kwargs)

        # aggregate batch results
        # Size the buckets from the inputs (never smaller than the highest file
        # index seen) so an input that produced no entries keeps its slot and
        # the results stay aligned with `inputs`.
        # Assumes entry[0] is the position of the file in `inputs` - TODO confirm.
        highest_file_idx = max(entry[0] for entry in batch_entries)
        num_files = max(len(inputs), highest_file_idx + 1)
        page_results: List[List[str]] = [[] for _ in range(num_files)]
        file_results: List[str] = [""] * num_files

        for (file_idx, image_input), raw_result in zip(batch_entries, batch_results):
            # postprocess result
            if task_type == "doc2json":
                text = postprocess_doc2json_result(raw_result, image_input, output_format)
            elif task_type == "doc2md":
                text = postprocess_doc2md_result(raw_result)
            else:
                text = raw_result

            page_results[file_idx].append(text)

        # Join results based on length of page_results and output_format
        for idx, pages in enumerate(page_results):
            if len(pages) == 1:
                file_results[idx] = pages[0]
            elif output_format == "json":
                file_results[idx] = "[" + ",".join(pages) + "]"
            else:
                file_results[idx] = "\n\n".join(pages)

        return file_results
@@ -0,0 +1,57 @@
1
+ """Prompts for Infinity-Parser2."""
2
+
3
+ __all__ = [
4
+ "PROMPT_DOC2JSON",
5
+ "PROMPT_DOC2MD",
6
+ "SUPPORTED_TASK_TYPES",
7
+ ]
8
+
9
+ SUPPORTED_TASK_TYPES = ["doc2json", "doc2md", "custom"]
10
+
11
+
12
+ # doc2json prompt (outputs JSON format)
13
+ PROMPT_DOC2JSON = """
14
+ - Extract layout information from the provided PDF image.
15
+ - For each layout element, output its bbox, category, and the text content within the bbox.
16
+ - Bbox format: [x1, y1, x2, y2].
17
+ - Allowed layout categories: ['header', 'title', 'text', 'figure', 'table', 'formula',
18
+ 'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote',
19
+ 'table_footnote', 'page_footnote', 'footer'].
20
+ - Text extraction and formatting:
21
+ 1) For 'figure', the text field must be an empty string.
22
+ 2) For 'formula', format text as LaTeX.
23
+ 3) For 'table', format text as HTML.
24
+ 4) For all other categories (e.g., text, title), format text as Markdown.
25
+ - The output text must be exactly the original text from the image,
26
+ with no translation or rewriting.
27
+ - Sort all layout elements in human reading order.
28
+ - Final output must be a single JSON object.
29
+ """
30
+
31
+ # doc2md prompt (outputs Markdown format directly)
32
+ PROMPT_DOC2MD = """
33
+ You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:
34
+
35
+ 1. Text Processing:
36
+ - Accurately recognize all text content in the PDF image without guessing or inferring.
37
+ - Convert the recognized text into Markdown format.
38
+ - Maintain the original document structure, including headings, paragraphs, lists, etc.
39
+
40
+ 2. Mathematical Formula Processing:
41
+ - Convert all mathematical formulas to LaTeX format.
42
+ - Enclose inline formulas with $ $. For example: This is an inline formula $E = mc^2$
43
+ - Enclose block formulas with $$ $$. For example: $$\\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$$
44
+
45
+ 3. Table Processing:
46
+ - Convert tables to HTML format.
47
+
48
+ 4. Figure Handling:
49
+ - Ignore figures content in the PDF image. Do not attempt to describe or convert images.
50
+
51
+ 5. Output Format:
52
+ - Ensure the output Markdown document has a clear structure with appropriate line breaks between elements.
53
+ - For complex layouts, try to maintain the original document's structure and format as closely as possible.
54
+
55
+ Please strictly follow these guidelines to ensure accuracy and consistency in the conversion. Your task is to accurately convert the content of the PDF image into Markdown format without adding any extra explanations or comments.
56
+ """
57
+
@@ -0,0 +1,43 @@
1
+ """Utility functions for Infinity-Parser2."""
2
+
3
+ from .file import (
4
+ get_files_from_directory,
5
+ is_supported_file,
6
+ normalize_input,
7
+ prepare_batch_entries,
8
+ save_results,
9
+ SUPPORTED_OUTPUT_FORMATS,
10
+ )
11
+ from .image import encode_file_to_base64, load_image
12
+ from .model import ModelCache, get_model_cache
13
+ from .pdf import convert_pdf_to_images
14
+ from .utils import (
15
+ convert_json_to_markdown,
16
+ extract_json_content,
17
+ obtain_origin_hw,
18
+ postprocess_doc2json_result,
19
+ restore_abs_bbox_coordinates,
20
+ postprocess_doc2md_result,
21
+ truncate_last_incomplete_element,
22
+ )
23
+
24
+ __all__ = [
25
+ "convert_pdf_to_images",
26
+ "convert_json_to_markdown",
27
+ "extract_json_content",
28
+ "encode_file_to_base64",
29
+ "get_files_from_directory",
30
+ "get_model_cache",
31
+ "is_supported_file",
32
+ "load_image",
33
+ "ModelCache",
34
+ "normalize_input",
35
+ "obtain_origin_hw",
36
+ "postprocess_doc2json_result",
37
+ "postprocess_doc2md_result",
38
+ "prepare_batch_entries",
39
+ "restore_abs_bbox_coordinates",
40
+ "save_results",
41
+ "SUPPORTED_OUTPUT_FORMATS",
42
+ "truncate_last_incomplete_element",
43
+ ]