infinity-parser2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,190 @@
1
+ """File system utilities for Infinity-Parser2."""
2
+
3
+ import os
4
+ import uuid
5
+ from pathlib import Path
6
+ from typing import List, Union
7
+
8
+ from PIL import Image
9
+
10
+ from .pdf import convert_pdf_to_images
11
+ from .utils import convert_json_to_markdown
12
+
13
+
14
# Lower-case file extensions (with leading dot) accepted as image inputs.
# ".tif" is listed next to ".tiff" because both spellings name the same format.
SUPPORTED_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".webp"}
# Lower-case file extensions (with leading dot) accepted as document inputs.
SUPPORTED_DOC_EXTENSIONS = {".pdf"}
# Identifiers of the output formats that results can be saved in.
SUPPORTED_OUTPUT_FORMATS = ["md", "json"]
17
+
18
+
19
def prepare_batch_entries(
    inputs: List[Union[str, Image.Image]]
) -> list[tuple[int, Union[str, Image.Image]]]:
    """Expand inputs into batch entries, splitting PDFs into individual pages.

    Args:
        inputs: List of file paths or PIL Images.

    Returns:
        List of ``(file_idx, item)`` tuples in input order. ``file_idx`` is the
        position of the originating element in ``inputs``. ``item`` is one
        rendered page image for a ``.pdf`` path, and the original element
        (path or image object) for everything else.
    """
    batch_entries: list[tuple[int, Union[str, Image.Image]]] = []

    for idx, item in enumerate(inputs):
        if isinstance(item, str) and Path(item).suffix.lower() == ".pdf":
            # One entry per page, each tagged with the index of its source PDF
            # so callers can regroup pages by file.
            batch_entries.extend(
                (idx, page_img) for page_img in convert_pdf_to_images(item)
            )
        else:
            batch_entries.append((idx, item))

    return batch_entries
46
+
47
+
48
def normalize_input(input_data: Union[str, List[str], Image.Image]) -> List[Union[str, Image.Image]]:
    """Turn a single input into the list form used by the rest of the pipeline.

    Args:
        input_data: One of:
            - str: path to a single file, or to a directory that is searched
              for supported files
            - List[str]: file paths, each of which must exist and be supported
            - PIL.Image.Image: an in-memory image

    Returns:
        List of file paths, or a one-element list holding the given image.

    Raises:
        FileNotFoundError: If a path is neither an existing file nor directory.
        TypeError: If the input, or an element of a list input, has the wrong type.
        ValueError: If a directory holds no supported files or a file has an
            unsupported extension.
    """
    if isinstance(input_data, str):
        if os.path.isdir(input_data):
            discovered = get_files_from_directory(input_data)
            if discovered:
                return discovered
            raise ValueError(f"No supported files found in directory: {input_data}")
        if not os.path.isfile(input_data):
            raise FileNotFoundError(f"File or directory not found: {input_data}")
        if is_supported_file(input_data):
            return [input_data]
        raise ValueError(f"Unsupported file type: {input_data}")

    if isinstance(input_data, list):
        validated = []
        for entry in input_data:
            # Checks run in this order so the first failing entry decides the error.
            if not isinstance(entry, str):
                raise TypeError(f"Expected str in list, got {type(entry)}")
            if not os.path.isfile(entry):
                raise FileNotFoundError(f"File not found: {entry}")
            if not is_supported_file(entry):
                raise ValueError(f"Unsupported file type: {entry}")
            validated.append(entry)
        return validated

    if isinstance(input_data, Image.Image):
        return [input_data]

    raise TypeError(
        f"Unsupported input type: {type(input_data)}. "
        "Expected str, List[str], or PIL.Image.Image."
    )
95
+
96
+
97
def is_supported_file(file_path: str) -> bool:
    """Return True if the path's extension is a supported image or document type.

    The extension is compared case-insensitively. Only the name is inspected;
    the file itself is not accessed.
    """
    suffix = Path(file_path).suffix.lower()
    known_groups = (SUPPORTED_IMAGE_EXTENSIONS, SUPPORTED_DOC_EXTENSIONS)
    return any(suffix in group for group in known_groups)
101
+
102
+
103
def get_files_from_directory(directory: str) -> List[str]:
    """Recursively collect the supported files below ``directory``.

    Returns:
        Paths (joined onto ``directory``) of every file accepted by
        ``is_supported_file``, in sorted order.
    """
    candidates = (
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    )
    found = [path for path in candidates if is_supported_file(path)]
    found.sort()
    return found
112
+
113
+
114
def save_results(
    inputs: List[Union[str, Image.Image]],
    results: List[str],
    output_dir: str,
    task_type: str = "doc2json",
    output_format: str = "md",
) -> None:
    """Save parsing results to output directory.

    Unified entry point that delegates to save_results_json or save_results_md
    based on the task_type and output_format. Prints the output directory
    path to console.

    Args:
        inputs: Original inputs (file paths or PIL Images). Each PIL Image is
            given a random 8-character hex identifier as its key; file paths
            are used as keys unchanged.
        results: Parsed results (same order as inputs).
        output_dir: Base output directory.
        task_type: Task type (e.g., "doc2json", "doc2md", "custom").
        output_format: Output format to save.
            - "json": Save JSON result (only valid for doc2json mode).
            - any other value: Save markdown result.

    Raises:
        AssertionError: If output_format is "json" and task_type is not "doc2json".
    """
    # Validate before doing any work. This is an explicit raise rather than an
    # ``assert`` statement so the check still runs under ``python -O``.
    if output_format == "json" and task_type != "doc2json":
        raise AssertionError("output_format='json' is only supported for doc2json tasks.")

    keys = [uuid.uuid4().hex[:8] if isinstance(inp, Image.Image) else inp for inp in inputs]

    if output_format == "json":
        save_results_json(keys, results, output_dir)
    else:
        save_results_md(keys, results, output_dir)

    print(f"[Infinity-Parser2] Results saved to: {os.path.abspath(output_dir)}")
145
+
146
+
147
def save_results_md(keys: List[str], results: List[str], output_dir: str) -> None:
    """Save markdown parsing results to output directory.

    Creates a subdirectory for each entry and writes result.md inside it.
    For file paths, the folder name is the filename (basename); for UUIDs,
    the folder name is the UUID itself. When several keys in one call share
    a folder name, later ones get a numeric suffix ("name_1", "name_2", ...)
    so no result overwrites another.

    Args:
        keys: Identifiers (file paths or UUIDs).
        results: Parsed markdown text results (same order as keys).
        output_dir: Base output directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    used_names = set()
    for key, result in zip(keys, results):
        base_name = Path(key).name
        folder_name = base_name
        attempt = 0
        # Different inputs can share a basename (e.g. files discovered in
        # separate sub-directories); pick the first unused variant.
        while folder_name in used_names:
            attempt += 1
            folder_name = f"{base_name}_{attempt}"
        used_names.add(folder_name)

        file_dir = os.path.join(output_dir, folder_name)
        os.makedirs(file_dir, exist_ok=True)
        result_path = os.path.join(file_dir, "result.md")
        with open(result_path, "w", encoding="utf-8") as f:
            f.write(result)
168
+
169
+
170
def save_results_json(keys: List[str], results: List[str], output_dir: str) -> None:
    """Save JSON parsing results to output directory.

    Creates a subdirectory for each entry and writes result.json inside it.
    For file paths, the folder name is the filename (basename); for UUIDs,
    the folder name is the UUID itself. When several keys in one call share
    a folder name, later ones get a numeric suffix ("name_1", "name_2", ...)
    so no result overwrites another.

    Args:
        keys: Identifiers (file paths or UUIDs).
        results: Parsed JSON text results (same order as keys). The text is
            written as given; it is not parsed or validated here.
        output_dir: Base output directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    used_names = set()
    for key, result in zip(keys, results):
        base_name = Path(key).name
        folder_name = base_name
        attempt = 0
        # Different inputs can share a basename (e.g. files discovered in
        # separate sub-directories); pick the first unused variant.
        while folder_name in used_names:
            attempt += 1
            folder_name = f"{base_name}_{attempt}"
        used_names.add(folder_name)

        file_dir = os.path.join(output_dir, folder_name)
        os.makedirs(file_dir, exist_ok=True)
        result_path = os.path.join(file_dir, "result.json")
        with open(result_path, "w", encoding="utf-8") as f:
            f.write(result)
@@ -0,0 +1,99 @@
1
+ """Image encoding and loading utilities."""
2
+
3
+ import base64
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import Tuple, Union
7
+
8
+ from PIL import Image
9
+
10
+ from qwen_vl_utils.vision_process import smart_resize
11
+
12
def _release_tuple(version: str) -> tuple:
    """Return the leading numeric components of a version string as ints.

    Parsing stops at the first component that is not purely numeric, keeping
    its leading digits if it has any: "0.0.14" -> (0, 0, 14),
    "1.2.0rc1" -> (1, 2, 0), "dev" -> ().
    """
    release = []
    for part in version.split("."):
        digits = ""
        for char in part:
            if char not in "0123456789":
                break
            digits += char
        if not digits:
            break
        release.append(int(digits))
        if len(digits) != len(part):
            break
    return tuple(release)


try:
    from importlib import metadata

    _qwen_vl_utils_version = metadata.version("qwen-vl-utils")
except metadata.PackageNotFoundError as err:
    raise ImportError(
        "qwen-vl-utils is not installed. Install it with: pip install qwen-vl-utils"
    ) from err

# Compare numerically: as plain strings, "0.0.9" would sort after "0.0.14".
# A version that cannot be parsed at all is not rejected.
_qwen_vl_utils_release = _release_tuple(_qwen_vl_utils_version)
if _qwen_vl_utils_release and _qwen_vl_utils_release < (0, 0, 14):
    raise ImportError("qwen-vl-utils version 0.0.14 or higher is required")
19
+
20
+
21
# Lookup table from lower-case file extension (with leading dot) to MIME type.
IMAGE_MIME_TYPES = {
    extension: mime_type
    for mime_type, extensions in (
        ("image/jpeg", (".jpg", ".jpeg")),
        ("image/png", (".png",)),
        ("image/webp", (".webp",)),
        ("image/bmp", (".bmp",)),
        ("image/gif", (".gif",)),
        ("image/tiff", (".tiff", ".tif")),
    )
    for extension in extensions
}
32
+
33
+
34
def load_image(
    input_data: Union[str, Image.Image],
) -> Image.Image:
    """Load image from file path or PIL Image and convert to RGB.

    Args:
        input_data: File path or PIL Image.

    Returns:
        PIL Image in RGB mode. For a PIL Image input this is the result of
        ``convert("RGB")``; the input object is left untouched.

    Raises:
        TypeError: If input_data is an unsupported type.
    """
    if isinstance(input_data, str):
        # Image.open is lazy and keeps the file open. convert() reads the pixel
        # data into a new image, so the handle can be released on exit.
        with Image.open(input_data) as opened:
            return opened.convert("RGB")
    if isinstance(input_data, Image.Image):
        return input_data.convert("RGB")
    raise TypeError(f"Unsupported input type: {type(input_data)}")
54
+
55
+
56
def encode_file_to_base64(
    image_obj: Union[Image.Image, str],
    min_pixels: int = 2048,
    max_pixels: int = 16777216,
) -> Tuple[str, str]:
    """Resize an image, encode it as PNG and return it as a base64 string.

    Args:
        image_obj: File path or PIL Image.
        min_pixels: Minimum number of pixels for resizing.
        max_pixels: Maximum number of pixels for resizing.

    Returns:
        Tuple of (base64 string, MIME type string). The payload is always
        PNG-encoded, so the MIME type is always "image/png" regardless of the
        format of the source image.
    """
    if isinstance(image_obj, str):
        # resize() reads the pixel data into a new image, so the file handle
        # can be released as soon as the resized image exists.
        with Image.open(image_obj) as opened:
            image = _resize_for_model(opened, min_pixels, max_pixels)
    else:
        image = _resize_for_model(image_obj, min_pixels, max_pixels)

    if image.mode != "RGB":
        image = image.convert("RGB")

    output_buffer = BytesIO()
    image.save(output_buffer, format="PNG")
    base64_str = base64.b64encode(output_buffer.getvalue()).decode("utf-8")

    # The MIME type must describe the bytes actually produced above (PNG),
    # not the format the image was originally stored in.
    return base64_str, "image/png"


def _resize_for_model(image: Image.Image, min_pixels: int, max_pixels: int) -> Image.Image:
    """Return a resized copy of ``image`` with the dimensions chosen by ``smart_resize``."""
    resized_height, resized_width = smart_resize(
        height=image.size[1],
        width=image.size[0],
        factor=32,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )
    return image.resize((resized_width, resized_height))
@@ -0,0 +1,243 @@
1
+ """Model cache management for Infinity-Parser2."""
2
+
3
+ import json
4
+ import os
5
+ import socket
6
+ import ssl
7
+ import urllib.request
8
+ import urllib.error
9
+ from typing import Optional
10
+
11
+ from huggingface_hub import snapshot_download
12
+
13
# Directory used for the cache when the caller does not supply one.
DEFAULT_CACHE_DIR = os.path.expanduser("~/.cache/infinity_parser2")

# HuggingFace Hub endpoints: the primary site and the mirror used as fallback.
HF_ENDPOINT_DEFAULT = "https://huggingface.co"
HF_ENDPOINT_MIRROR = "https://hf-mirror.com"
19
+ # Timeout for connectivity check (seconds)
20
+ _HF_CONNECT_TIMEOUT = 5.0
21
+
22
+
23
+ def _check_endpoint_reachable(url: str, timeout: float = _HF_CONNECT_TIMEOUT) -> bool:
24
+ """Check if an HTTP endpoint is reachable.
25
+
26
+ Args:
27
+ url: The URL to check.
28
+ timeout: Connection timeout in seconds.
29
+
30
+ Returns:
31
+ True if the endpoint responds within the timeout, False otherwise.
32
+ """
33
+ try:
34
+ req = urllib.request.Request(
35
+ url,
36
+ method="HEAD",
37
+ headers={"User-Agent": "Mozilla/5.0 (compatible; Infinity-Parser2)"},
38
+ )
39
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
40
+ return resp.status == 200
41
+ except (
42
+ urllib.error.URLError,
43
+ socket.timeout,
44
+ ConnectionError,
45
+ ssl.SSLError,
46
+ OSError,
47
+ ):
48
+ return False
49
+
50
+
51
def _resolve_hf_endpoint() -> str:
    """Pick the HuggingFace endpoint to download from.

    Probes the default endpoint (https://huggingface.co). If the probe
    fails, a notice is printed and the mirror (https://hf-mirror.com) is
    returned instead. The mirror itself is not probed.

    Returns:
        The URL string of the chosen endpoint.
    """
    if not _check_endpoint_reachable(HF_ENDPOINT_DEFAULT):
        print(
            f"[Infinity-Parser2] Default HF endpoint ({HF_ENDPOINT_DEFAULT}) is not reachable. "
            f"Falling back to mirror: {HF_ENDPOINT_MIRROR}"
        )
        return HF_ENDPOINT_MIRROR
    return HF_ENDPOINT_DEFAULT
67
+
68
+
69
class ModelCache:
    """Manages local model cache for Infinity-Parser2.

    Keeps a JSON index that maps model names to the local directories they
    were downloaded to. If a requested model is not in the index (or its
    directory no longer exists), it is downloaded from HuggingFace Hub and
    progress messages are printed.

    Attributes:
        cache_dir: Directory where model cache metadata is stored.
        models_file: Path to the JSON file containing cached model information.
    """

    def __init__(self, cache_dir: Optional[str] = None):
        """Initialize ModelCache.

        Creates the cache directory if needed and loads the existing index.

        Args:
            cache_dir: Custom cache directory. Defaults to ~/.cache/infinity_parser2.
        """
        self.cache_dir = cache_dir or DEFAULT_CACHE_DIR
        self.models_file = os.path.join(self.cache_dir, "models_cache.json")
        self._ensure_cache_dir()
        self._models_cache: dict = self._load_cache()

    def _ensure_cache_dir(self) -> None:
        """Create cache directory if it doesn't exist."""
        os.makedirs(self.cache_dir, exist_ok=True)

    def _load_cache(self) -> dict:
        """Load cached model information from JSON file.

        Returns:
            The stored index, or an empty dict when the file is missing,
            unreadable, not valid JSON, or does not contain a JSON object.
        """
        if not os.path.exists(self.models_file):
            return {}
        try:
            with open(self.models_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and text decoding errors.
            return {}
        # A corrupted file may hold valid JSON of the wrong shape (e.g. a list);
        # treat that like an empty index instead of failing on later lookups.
        return loaded if isinstance(loaded, dict) else {}

    def _save_cache(self) -> None:
        """Save cached model information to JSON file."""
        with open(self.models_file, "w", encoding="utf-8") as f:
            json.dump(self._models_cache, f, indent=2, ensure_ascii=False)

    def is_cached(self, model_name: str) -> bool:
        """Check if a model is already cached locally.

        Args:
            model_name: HuggingFace model name (e.g., "infly/Infinity-Parser2-Pro").

        Returns:
            True if model is cached and the local path exists.
        """
        entry = self._models_cache.get(model_name)
        if not isinstance(entry, dict):
            # Unknown model, or a malformed entry in a hand-edited index.
            return False
        local_path = entry.get("local_path")
        if not isinstance(local_path, str) or not local_path:
            return False
        return os.path.exists(local_path)

    def get_cached_path(self, model_name: str) -> Optional[str]:
        """Get the cached local path for a model.

        Args:
            model_name: HuggingFace model name.

        Returns:
            Local path if cached, None otherwise.
        """
        if not self.is_cached(model_name):
            return None
        return self._models_cache[model_name].get("local_path")

    def cache_model(self, model_name: str, local_path: str) -> None:
        """Record a model's local path in the index and persist it to disk.

        Args:
            model_name: HuggingFace model name.
            local_path: Local directory where the model is stored.
        """
        self._models_cache[model_name] = {
            "local_path": local_path,
            "cached": True,
        }
        self._save_cache()

    def download_and_cache(
        self,
        model_name: str,
        target_dir: Optional[str] = None,
        force_download: bool = False,
    ) -> str:
        """Download a model from HuggingFace Hub and cache its location.

        Args:
            model_name: HuggingFace model name (e.g., "infly/Infinity-Parser2-Pro").
            target_dir: Custom download directory. If None, uses
                cache_dir/<model_name with "/" replaced by "_">.
            force_download: If True, re-download even if cached.

        Returns:
            Local path where the model is stored.
        """
        # If already cached and not forcing download, return cached path
        if not force_download and self.is_cached(model_name):
            cached_path = self.get_cached_path(model_name)
            print(f"[Infinity-Parser2] Model already cached at: {cached_path}")
            return cached_path

        if target_dir is None:
            safe_name = model_name.replace("/", "_")
            target_dir = os.path.join(self.cache_dir, safe_name)

        print(f"[Infinity-Parser2] Model '{model_name}' not found locally.")
        print(f"[Infinity-Parser2] Starting download to: {target_dir}")
        print("[Infinity-Parser2] This may take a few minutes depending on model size and network...")

        # Probe connectivity to choose the endpoint; this runs on every download.
        resolved_endpoint = _resolve_hf_endpoint()
        print(f"[Infinity-Parser2] Using endpoint: {resolved_endpoint}")

        os.makedirs(target_dir, exist_ok=True)

        snapshot_download(
            repo_id=model_name,
            local_dir=target_dir,
            local_dir_use_symlinks=False,
            endpoint=resolved_endpoint,
            # Forward the flag so a forced download really re-fetches the files.
            force_download=force_download,
        )

        # Only record the model after the download call has returned.
        self.cache_model(model_name, target_dir)
        print("[Infinity-Parser2] Model downloaded and cached successfully!")
        print(f"[Infinity-Parser2] Cache location: {target_dir}")

        return target_dir

    def resolve_model_path(self, model_name: str) -> str:
        """Resolve the model path for loading.

        If model is a local path, returns it directly.
        If model is cached, returns the cached path.
        Otherwise downloads it automatically.

        Args:
            model_name: HuggingFace model name or local path.

        Returns:
            Resolved local path for model loading.
        """
        # If it's already a local path, return it directly
        if os.path.exists(model_name):
            return model_name

        # If cached, return cached path
        if self.is_cached(model_name):
            cached_path = self.get_cached_path(model_name)
            print(f"[Infinity-Parser2] Found cached model at: {cached_path}")
            return cached_path

        # Otherwise, download and cache
        return self.download_and_cache(model_name)
225
+
226
+
227
# Process-wide ModelCache instance, created on first use by get_model_cache().
_model_cache: Optional[ModelCache] = None


def get_model_cache(cache_dir: Optional[str] = None) -> ModelCache:
    """Return the shared ModelCache, creating it on the first call.

    Args:
        cache_dir: Custom cache directory. It is only used when the shared
            instance is created; once an instance exists this argument is
            ignored.

    Returns:
        The global ModelCache instance.
    """
    global _model_cache
    instance = _model_cache
    if instance is None:
        instance = _model_cache = ModelCache(cache_dir)
    return instance
@@ -0,0 +1,46 @@
1
+ """PDF to image conversion utility."""
2
+
3
+ import io
4
+ from typing import List, Union
5
+
6
+ from PIL import Image
7
+
8
+ try:
9
+ import fitz # PyMuPDF
10
+ except ImportError:
11
+ raise ImportError(
12
+ "PyMuPDF is required for PDF rendering. Install it with: pip install pymupdf"
13
+ )
14
+
15
+
16
def convert_pdf_to_images(
    pdf_path: Union[str, bytes],
    dpi: int = 300,
) -> List[Image.Image]:
    """Convert a PDF file to a list of PIL Images (one per page).

    Args:
        pdf_path: Path to the PDF file or PDF bytes.
        dpi: Resolution for rendering. Higher values give better quality
            but use more memory. Defaults to 300.

    Returns:
        List of PIL Images in RGB mode, one per PDF page, in page order.
    """
    # Disable decompression bomb check for large PDF pages.
    # NOTE(review): this changes Pillow's limit for the whole process, not
    # just this call, and it is never restored. Confirm that is acceptable
    # when other images in the same process come from untrusted sources.
    Image.MAX_IMAGE_PIXELS = None

    if isinstance(pdf_path, bytes):
        doc = fitz.open(stream=pdf_path, filetype="pdf")
    else:
        doc = fitz.open(pdf_path)

    try:
        # PyMuPDF renders at 72 dpi by default, so dpi / 72 is the zoom factor.
        # The matrix is the same for every page, so build it once.
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)

        images = []
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")
            images.append(Image.open(io.BytesIO(img_data)).convert("RGB"))
        return images
    finally:
        # Release the document even if rendering a page fails.
        doc.close()
+ return images