infinity-parser2 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infinity_parser2/__init__.py +28 -0
- infinity_parser2/__main__.py +6 -0
- infinity_parser2/backends/__init__.py +13 -0
- infinity_parser2/backends/base.py +61 -0
- infinity_parser2/backends/transformers.py +159 -0
- infinity_parser2/backends/vllm_engine.py +117 -0
- infinity_parser2/backends/vllm_server.py +148 -0
- infinity_parser2/cli.py +207 -0
- infinity_parser2/parser.py +278 -0
- infinity_parser2/prompts.py +57 -0
- infinity_parser2/utils/__init__.py +43 -0
- infinity_parser2/utils/file.py +190 -0
- infinity_parser2/utils/image.py +99 -0
- infinity_parser2/utils/model.py +243 -0
- infinity_parser2/utils/pdf.py +46 -0
- infinity_parser2/utils/utils.py +159 -0
- infinity_parser2-0.1.0.dist-info/METADATA +310 -0
- infinity_parser2-0.1.0.dist-info/RECORD +25 -0
- infinity_parser2-0.1.0.dist-info/WHEEL +5 -0
- infinity_parser2-0.1.0.dist-info/entry_points.txt +2 -0
- infinity_parser2-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +1 -0
- tests/test_backends.py +490 -0
- tests/test_parser.py +464 -0
- tests/test_utils.py +689 -0
infinity_parser2/cli.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""Command-line interface for Infinity-Parser2."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
from . import InfinityParser2
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def parse_bool(value: str) -> bool:
    """Interpret a command-line token as a boolean.

    Matching is case-insensitive: "true", "1" and "yes" yield True, while
    "false", "0" and "no" yield False.

    Args:
        value: The raw string taken from the command line.

    Returns:
        The boolean the token stands for.

    Raises:
        argparse.ArgumentTypeError: If the token is not one of the
            recognised spellings.
    """
    recognised = {
        "true": True,
        "1": True,
        "yes": True,
        "false": False,
        "0": False,
        "no": False,
    }
    verdict = recognised.get(value.lower())
    if verdict is None:
        raise argparse.ArgumentTypeError(f"Invalid boolean value: {value}")
    return verdict
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the Infinity-Parser2 command line.

    Returns:
        An ``argparse.ArgumentParser`` that accepts one or more positional
        ``input`` paths plus the output, task, backend, model and image-size
        options, together with ``--verbose`` and ``--version``.
    """
    # Shown verbatim after the option list (RawDescriptionHelpFormatter keeps
    # the line breaks as written).
    usage_examples = """
Examples:
# Parse a PDF file (default: doc2json -> markdown output)
parser document.pdf

# Parse with doc2md task type
parser document.pdf --task doc2md

# Parse with custom prompt
parser document.pdf --task custom --prompt "Extract the title and authors"

# Parse multiple files
parser doc1.pdf doc2.png --output-dir ./results

# Parse a directory
parser ./docs --output-dir ./results

# Output raw JSON
parser document.pdf --output-format json

# Use transformers backend
parser document.pdf --backend transformers

# Use vllm-server backend
parser document.pdf --backend vllm-server --api-url http://localhost:8000/v1/chat/completions
"""

    cli = argparse.ArgumentParser(
        prog="parser",
        description="Infinity-Parser2: Document parsing tool using Infinity-Parser2-Pro model.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, add_argument keyword settings), registered in this order so the
    # generated help lists the options in the same sequence.
    argument_table = (
        (
            ("input",),
            dict(
                nargs="+",
                help="Input file(s) or directory path. Supports PDF, PNG, JPG, JPEG, WEBP.",
            ),
        ),
        (
            ("-o", "--output-dir"),
            dict(
                default=None,
                help="Output directory. If not provided, result is printed to stdout.",
            ),
        ),
        (
            ("--task",),
            dict(
                default="doc2json",
                choices=["doc2json", "doc2md", "custom"],
                help="Parsing task type. Defaults to 'doc2json'.",
            ),
        ),
        (
            ("--prompt",),
            dict(
                default=None,
                help="Custom prompt used only when --task custom.",
            ),
        ),
        (
            ("--output-format",),
            dict(
                default="md",
                choices=["md", "json"],
                help="Output format. Defaults to 'md'.",
            ),
        ),
        (
            ("--batch-size",),
            dict(
                type=int,
                default=4,
                help="Batch size for inference. Defaults to 4.",
            ),
        ),
        (
            ("--backend",),
            dict(
                default="vllm-engine",
                choices=["transformers", "vllm-engine", "vllm-server"],
                help="Inference backend. Defaults to 'vllm-engine'.",
            ),
        ),
        (
            ("--model-name",),
            dict(
                default="infly/Infinity-Parser2-Pro",
                help="Model name on HuggingFace Hub or local path.",
            ),
        ),
        (
            ("--tensor-parallel-size",),
            dict(
                type=int,
                default=None,
                help="Tensor parallel size for vllm-engine backend.",
            ),
        ),
        (
            ("--api-url",),
            dict(
                default="http://localhost:8000/v1/chat/completions",
                help="API URL for vllm-server backend.",
            ),
        ),
        (
            ("--api-key",),
            dict(
                default="EMPTY",
                help="API key for vllm-server backend.",
            ),
        ),
        (
            ("--model-cache-dir",),
            dict(
                default=None,
                help="Model cache directory.",
            ),
        ),
        (
            ("--min-pixels",),
            dict(
                type=int,
                default=2048,
                help="Minimum number of pixels for image input (transformers backend only).",
            ),
        ),
        (
            ("--max-pixels",),
            dict(
                type=int,
                default=16777216,
                help="Maximum number of pixels for image input (transformers backend only).",
            ),
        ),
        (
            ("--verbose", "-v"),
            dict(
                action="store_true",
                help="Print verbose output.",
            ),
        ),
        (
            ("--version",),
            dict(
                action="version",
                version="Infinity-Parser2 0.1.0",
            ),
        ),
    )
    for flags, settings in argument_table:
        cli.add_argument(*flags, **settings)

    return cli
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Run the Infinity-Parser2 command line.

    Args:
        argv: Argument list handed to ``parse_args``. Defaults to None.

    Returns:
        0 when parsing completes, 1 when building the parser client, parsing,
        or printing the result raises an ``Exception``. In the failure case
        the message is written to stderr, followed by a traceback when
        ``--verbose`` is set.
    """
    options = build_parser().parse_args(argv)

    # A lone directory argument is forwarded as a plain string; every other
    # combination is forwarded as the list argparse collected.
    requested = options.input
    input_data = requested
    if len(requested) == 1 and os.path.isdir(requested[0]):
        input_data = requested[0]

    if options.verbose:
        run_summary = (
            ("Backend", options.backend),
            ("Model", options.model_name),
            ("Task", options.task),
            ("Input", input_data),
        )
        for label, shown in run_summary:
            print(f"[Infinity-Parser2] {label}: {shown}")

    try:
        client = InfinityParser2(
            model_name=options.model_name,
            backend=options.backend,
            tensor_parallel_size=options.tensor_parallel_size,
            api_url=options.api_url,
            api_key=options.api_key,
            min_pixels=options.min_pixels,
            max_pixels=options.max_pixels,
            model_cache_dir=options.model_cache_dir,
        )
        outcome = client.parse(
            input_data=input_data,
            task_type=options.task,
            custom_prompt=options.prompt,
            batch_size=options.batch_size,
            output_dir=options.output_dir,
            output_format=options.output_format,
        )

        if outcome is None:
            # Nothing was returned, so there is nothing to print except an
            # optional confirmation.
            if options.verbose:
                print("[Infinity-Parser2] Results saved to output directory.")
        elif isinstance(outcome, dict):
            for source_path, text in outcome.items():
                print(f"=== {source_path} ===")
                print(text)
        elif isinstance(outcome, list):
            for text in outcome:
                print(text)
        else:
            print(outcome)
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        print(f"[Infinity-Parser2] Error: {e}", file=sys.stderr)
        if options.verbose:
            import traceback

            traceback.print_exc()
        return 1

    return 0
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# Executed as a script: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""Infinity-Parser2 main interface."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
from PIL import Image
|
|
10
|
+
|
|
11
|
+
from .backends import (
|
|
12
|
+
BaseBackend,
|
|
13
|
+
TransformersBackend,
|
|
14
|
+
VLLMEngineBackend,
|
|
15
|
+
VLLMServerBackend,
|
|
16
|
+
)
|
|
17
|
+
from .prompts import PROMPT_DOC2JSON, PROMPT_DOC2MD, SUPPORTED_TASK_TYPES
|
|
18
|
+
from .utils import *
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Maps each supported backend name to the backend class that implements it.
BACKEND_REGISTRY = {
    "transformers": TransformersBackend,
    "vllm-engine": VLLMEngineBackend,
    "vllm-server": VLLMServerBackend,
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class InfinityParser2:
    """Document parser using Infinity-Parser2-Pro model.

    Supports parsing of PDF files and images (PNG, JPG, etc.) into structured text.

    Args:
        model_name: Model name on HuggingFace Hub (e.g., "infly/Infinity-Parser2-Pro")
            or local path to a downloaded model. Defaults to "infly/Infinity-Parser2-Pro".
        backend: Inference backend. Options:
            - "transformers": HuggingFace transformers (local inference)
            - "vllm-engine": vLLM Engine (local batch inference via LLM class)
            - "vllm-server": vLLM OpenAI-Compatible Server (HTTP API)
            Defaults to "vllm-engine".
        tensor_parallel_size: Tensor parallel size for vllm-engine.
            Defaults to the number of available GPUs (via torch.cuda.device_count()).
        device: Device type, must be "cuda". Raises ValueError if set to anything else.
        api_url: API URL for vllm-server backend.
        api_key: API key for vllm-server backend.
        min_pixels: Minimum number of pixels for image input (transformers backend only).
            Defaults to 2048.
        max_pixels: Maximum number of pixels for image input (transformers backend only).
            Defaults to 16777216 (~4096x4096).
        model_cache_dir: Directory handed to ``get_model_cache`` before the model
            path is resolved. Defaults to None.
        **kwargs: Additional arguments passed to the backend.

    Example:
        >>> from infinity_parser2 import InfinityParser2
        >>> parser = InfinityParser2(model_name="infly/Infinity-Parser2-Pro")
        >>> result = parser.parse("document.pdf")
    """

    def __init__(
        self,
        model_name: str = "infly/Infinity-Parser2-Pro",
        backend: str = "vllm-engine",
        tensor_parallel_size: Optional[int] = None,
        device: str = "cuda",
        api_url: str = "http://localhost:8000/v1/chat/completions",
        api_key: str = "EMPTY",
        min_pixels: int = 2048,
        max_pixels: int = 16777216,
        model_cache_dir: Optional[str] = None,
        **kwargs,
    ):
        if device != "cuda":
            raise ValueError("device must be 'cuda' for Infinity-Parser2-Pro.")

        self.model_name = model_name
        # Backend names are matched case-insensitively against BACKEND_REGISTRY.
        self.backend_name = backend.lower()
        self.tensor_parallel_size = (
            tensor_parallel_size
            if tensor_parallel_size is not None
            else torch.cuda.device_count()
        )
        self.device = device
        self.api_url = api_url
        self.api_key = api_key
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.kwargs = kwargs

        # Initialize model cache and resolve model path (stored separately
        # from the user-supplied model_name).
        cache = get_model_cache(model_cache_dir)
        self._model_path = cache.resolve_model_path(self.model_name)

        self._backend: BaseBackend = self._init_backend()

    def _init_backend(self) -> BaseBackend:
        """Initialize and return the backend instance.

        Returns:
            An instance of the class registered for ``self.backend_name``.

        Raises:
            ValueError: If ``self.backend_name`` is not a key of
                ``BACKEND_REGISTRY``.
        """
        if self.backend_name not in BACKEND_REGISTRY:
            raise ValueError(
                f"Unsupported backend: {self.backend_name}. "
                f"Supported backends: {list(BACKEND_REGISTRY.keys())}"
            )
        backend_cls = BACKEND_REGISTRY[self.backend_name]
        # Settings every backend receives; the resolved model path is passed
        # under the "model_name" keyword.
        common_kwargs = {
            "model_name": self._model_path,
            "device": self.device,
            "min_pixels": self.min_pixels,
            "max_pixels": self.max_pixels,
            **self.kwargs,
        }
        if self.backend_name == "vllm-server":
            backend_kwargs = {**common_kwargs, "api_url": self.api_url, "api_key": self.api_key}
        elif self.backend_name == "vllm-engine":
            backend_kwargs = {**common_kwargs, "tensor_parallel_size": self.tensor_parallel_size}
        else:  # transformers
            backend_kwargs = common_kwargs
        return backend_cls(**backend_kwargs)

    def _resolve_prompt(self, task_type: str, custom_prompt: Optional[str]) -> str:
        """Resolve the prompt to use based on task_type and custom_prompt.

        Args:
            task_type: The task type (e.g., "doc2json", "doc2md", "custom").
            custom_prompt: Custom prompt, only used when task_type is "custom".

        Returns:
            The resolved prompt string.

        Raises:
            ValueError: If task_type is "custom" and custom_prompt is None.
        """
        if task_type == "custom":
            # Raised explicitly rather than asserted: assert statements are
            # removed under ``python -O``, which would let a None prompt
            # through to the backend.
            if custom_prompt is None:
                raise ValueError("custom_prompt must be provided when task_type='custom'")
            return custom_prompt
        if task_type == "doc2json":
            return PROMPT_DOC2JSON
        if task_type == "doc2md":
            return PROMPT_DOC2MD
        # Fallback for unknown task types (should not happen with proper validation)
        return "Please transform the document's contents into Markdown format."

    def parse(
        self,
        input_data: Union[str, List[str], Image.Image],
        task_type: str = "doc2json",
        custom_prompt: Optional[str] = None,
        batch_size: int = 4,
        output_dir: Optional[str] = None,
        output_format: str = "md",
        **kwargs,
    ) -> Optional[Union[str, List[str], Dict[str, str]]]:
        """Parse document(s) and extract text content.

        Args:
            input_data: Input can be:
                - str: Single file path or directory path
                - List[str]: List of file paths
                - PIL.Image.Image: Image object
            task_type: Parsing task type. Options:
                - "doc2json": Extract layout to JSON, return JSON string.
                - "doc2md": Directly convert to Markdown, return Markdown.
                - "custom": Use custom_prompt for parsing.
                Defaults to "doc2json".
            custom_prompt: Custom prompt text for the model. Used only when
                task_type is "custom". Defaults to None.
            batch_size: Number of images to process in one batch. Defaults to 4.
            output_dir: If provided, results are saved to output_dir and this function
                returns None. If None, results are returned directly.
            output_format: Output format for results. Options: "md" or "json".
                Defaults to "md".
                - For doc2json tasks:
                    - output_format="md": Returns markdown (converts JSON to markdown
                      via convert_json_to_markdown). If output_dir is set, saves only
                      the markdown result.
                    - output_format="json": Returns raw JSON result. If output_dir is
                      set, saves only the JSON result.
                - For doc2md tasks or custom prompts: Only "md" is supported.
                  If "json" is passed, a ValueError will be raised.
            **kwargs: Additional arguments passed to the model.

        Returns:
            When output_dir is None:
                - str: Parsed result for a single file or image.
                - List[str]: Parsed results for a list of files.
                - Dict[str, str]: Mapping from file path to parsed result for a directory.
            When output_dir is set, returns None.

        Raises:
            ValueError: If task_type or output_format is unsupported, if
                output_format is "json" for a task other than "doc2json", or if
                task_type is "custom" and custom_prompt is None.

        Example:
            >>> parser = InfinityParser2()
            >>> # Single file, returns str
            >>> result = parser.parse("document.pdf")
            >>> # Multiple files, returns List[str]
            >>> result = parser.parse(["doc1.pdf", "doc2.pdf"])
            >>> # Directory, returns Dict[str, str]
            >>> result = parser.parse("/path/to/docs")
            >>> # Save results to output_dir, returns None
            >>> parser.parse("document.pdf", output_dir="./output")
        """
        if task_type not in SUPPORTED_TASK_TYPES:
            raise ValueError(f"task_type must be one of {SUPPORTED_TASK_TYPES}, got '{task_type}'")

        if output_format not in SUPPORTED_OUTPUT_FORMATS:
            raise ValueError(f"output_format must be one of {SUPPORTED_OUTPUT_FORMATS}, got '{output_format}'")

        if output_format == "json" and task_type != "doc2json":
            raise ValueError(
                "output_format='json' is only supported for doc2json tasks. "
                "For other task types, output_format must be 'md'."
            )

        prompt = self._resolve_prompt(task_type, custom_prompt)

        # Remember whether the caller passed a directory before the input is
        # normalised, because that decides the shape of the return value.
        is_directory = isinstance(input_data, str) and os.path.isdir(input_data)
        file_paths = normalize_input(input_data)
        file_results = self._parse_files(
            file_paths, prompt, task_type, batch_size, output_format, **kwargs
        )

        if output_dir is not None:
            save_results(
                file_paths, file_results, output_dir,
                task_type=task_type, output_format=output_format
            )
            return None
        if is_directory:
            return dict(zip(file_paths, file_results))
        if len(file_results) == 1:
            return file_results[0]
        return file_results

    def _parse_files(
        self,
        inputs: List[Union[str, Image.Image]],
        prompt: Optional[str],
        task_type: str,
        batch_size: int = 4,
        output_format: str = "md",
        **kwargs,
    ) -> List[str]:
        """Parse multiple files with batched inference.

        All images (including PDF pages) are collected and batched together for
        efficient inference. Results are then aggregated back to the original
        file-level granularity.

        Args:
            inputs: The inputs to parse, as handed over by ``parse``.
            prompt: Prompt passed to the backend for every batch entry.
            task_type: Selects the post-processing applied to each raw result
                ("doc2json", "doc2md", anything else leaves it untouched).
            batch_size: Forwarded to the backend's ``parse_batch``.
            output_format: "md" or "json"; controls doc2json post-processing
                and how the pages of one file are joined.
            **kwargs: Forwarded to the backend's ``parse_batch``.

        Returns:
            One string per file index. A file with several pages is joined
            with blank lines ("md") or wrapped as a JSON array ("json").
            When there are no batch entries at all, ``[]`` is returned for
            more than one input and ``""`` otherwise.
        """
        # prepare batch entries: (file index, image input) pairs
        batch_entries = prepare_batch_entries(inputs)
        if not batch_entries:
            # NOTE(review): the "" return does not match the List[str]
            # annotation, but parse() passes it through unchanged, so it is
            # kept as-is.
            return [] if len(inputs) > 1 else ""

        # parse batch
        raw_inputs = [image_input for _, image_input in batch_entries]
        batch_results = self._backend.parse_batch(raw_inputs, prompt, batch_size=batch_size, **kwargs)

        # aggregate batch results
        # Size the buckets from the highest file index rather than from the
        # number of distinct indices: if some file contributes no entries the
        # indices are sparse and a distinct-count would be too small, making
        # page_results[file_idx] raise IndexError.
        num_files = max(file_idx for file_idx, _ in batch_entries) + 1
        page_results: List[List[str]] = [[] for _ in range(num_files)]

        for (file_idx, image_input), raw_result in zip(batch_entries, batch_results):
            # postprocess result
            if task_type == "doc2json":
                text = postprocess_doc2json_result(raw_result, image_input, output_format)
            elif task_type == "doc2md":
                text = postprocess_doc2md_result(raw_result)
            else:
                text = raw_result

            page_results[file_idx].append(text)

        # Join results based on length of page_results and output_format
        file_results: List[str] = []
        for pages in page_results:
            if len(pages) == 1:
                file_results.append(pages[0])
            elif output_format == "json":
                file_results.append("[" + ",".join(pages) + "]")
            else:
                file_results.append("\n\n".join(pages))

        return file_results
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Prompts for Infinity-Parser2."""
|
|
2
|
+
|
|
3
|
+
__all__ = [
|
|
4
|
+
"PROMPT_DOC2JSON",
|
|
5
|
+
"PROMPT_DOC2MD",
|
|
6
|
+
"SUPPORTED_TASK_TYPES",
|
|
7
|
+
]
|
|
8
|
+
|
|
9
|
+
SUPPORTED_TASK_TYPES = ["doc2json", "doc2md", "custom"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# doc2json prompt (outputs JSON format)
|
|
13
|
+
PROMPT_DOC2JSON = """
|
|
14
|
+
- Extract layout information from the provided PDF image.
|
|
15
|
+
- For each layout element, output its bbox, category, and the text content within the bbox.
|
|
16
|
+
- Bbox format: [x1, y1, x2, y2].
|
|
17
|
+
- Allowed layout categories: ['header', 'title', 'text', 'figure', 'table', 'formula',
|
|
18
|
+
'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote',
|
|
19
|
+
'table_footnote', 'page_footnote', 'footer'].
|
|
20
|
+
- Text extraction and formatting:
|
|
21
|
+
1) For 'figure', the text field must be an empty string.
|
|
22
|
+
2) For 'formula', format text as LaTeX.
|
|
23
|
+
3) For 'table', format text as HTML.
|
|
24
|
+
4) For all other categories (e.g., text, title), format text as Markdown.
|
|
25
|
+
- The output text must be exactly the original text from the image,
|
|
26
|
+
with no translation or rewriting.
|
|
27
|
+
- Sort all layout elements in human reading order.
|
|
28
|
+
- Final output must be a single JSON object.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
# doc2md prompt (outputs Markdown format directly)
|
|
32
|
+
PROMPT_DOC2MD = """
|
|
33
|
+
You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:
|
|
34
|
+
|
|
35
|
+
1. Text Processing:
|
|
36
|
+
- Accurately recognize all text content in the PDF image without guessing or inferring.
|
|
37
|
+
- Convert the recognized text into Markdown format.
|
|
38
|
+
- Maintain the original document structure, including headings, paragraphs, lists, etc.
|
|
39
|
+
|
|
40
|
+
2. Mathematical Formula Processing:
|
|
41
|
+
- Convert all mathematical formulas to LaTeX format.
|
|
42
|
+
- Enclose inline formulas with $ $. For example: This is an inline formula $E = mc^2$
|
|
43
|
+
- Enclose block formulas with $$ $$. For example: $$\\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$$
|
|
44
|
+
|
|
45
|
+
3. Table Processing:
|
|
46
|
+
- Convert tables to HTML format.
|
|
47
|
+
|
|
48
|
+
4. Figure Handling:
|
|
49
|
+
- Ignore figures content in the PDF image. Do not attempt to describe or convert images.
|
|
50
|
+
|
|
51
|
+
5. Output Format:
|
|
52
|
+
- Ensure the output Markdown document has a clear structure with appropriate line breaks between elements.
|
|
53
|
+
- For complex layouts, try to maintain the original document's structure and format as closely as possible.
|
|
54
|
+
|
|
55
|
+
Please strictly follow these guidelines to ensure accuracy and consistency in the conversion. Your task is to accurately convert the content of the PDF image into Markdown format without adding any extra explanations or comments.
|
|
56
|
+
"""
|
|
57
|
+
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Utility functions for Infinity-Parser2."""
|
|
2
|
+
|
|
3
|
+
from .file import (
|
|
4
|
+
get_files_from_directory,
|
|
5
|
+
is_supported_file,
|
|
6
|
+
normalize_input,
|
|
7
|
+
prepare_batch_entries,
|
|
8
|
+
save_results,
|
|
9
|
+
SUPPORTED_OUTPUT_FORMATS,
|
|
10
|
+
)
|
|
11
|
+
from .image import encode_file_to_base64, load_image
|
|
12
|
+
from .model import ModelCache, get_model_cache
|
|
13
|
+
from .pdf import convert_pdf_to_images
|
|
14
|
+
from .utils import (
|
|
15
|
+
convert_json_to_markdown,
|
|
16
|
+
extract_json_content,
|
|
17
|
+
obtain_origin_hw,
|
|
18
|
+
postprocess_doc2json_result,
|
|
19
|
+
restore_abs_bbox_coordinates,
|
|
20
|
+
postprocess_doc2md_result,
|
|
21
|
+
truncate_last_incomplete_element,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"convert_pdf_to_images",
|
|
26
|
+
"convert_json_to_markdown",
|
|
27
|
+
"extract_json_content",
|
|
28
|
+
"encode_file_to_base64",
|
|
29
|
+
"get_files_from_directory",
|
|
30
|
+
"get_model_cache",
|
|
31
|
+
"is_supported_file",
|
|
32
|
+
"load_image",
|
|
33
|
+
"ModelCache",
|
|
34
|
+
"normalize_input",
|
|
35
|
+
"obtain_origin_hw",
|
|
36
|
+
"postprocess_doc2json_result",
|
|
37
|
+
"postprocess_doc2md_result",
|
|
38
|
+
"prepare_batch_entries",
|
|
39
|
+
"restore_abs_bbox_coordinates",
|
|
40
|
+
"save_results",
|
|
41
|
+
"SUPPORTED_OUTPUT_FORMATS",
|
|
42
|
+
"truncate_last_incomplete_element",
|
|
43
|
+
]
|