vlmparse 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vlmparse/build_doc.py CHANGED
@@ -1,10 +1,13 @@
1
1
  import re
2
+ from pathlib import Path
2
3
 
3
4
  import numpy as np
4
5
  import PIL
5
6
  import pypdfium2 as pdfium
6
7
  from loguru import logger
7
8
 
9
+ from .constants import PDF_EXTENSION
10
+
8
11
 
9
12
  def convert_pdfium(file_path, dpi):
10
13
  pdf = pdfium.PdfDocument(file_path)
@@ -64,7 +67,10 @@ def resize_image(image, max_image_size):
64
67
 
65
68
 
66
69
def get_page_count(file_path):
    """Return the number of pages in the document at *file_path*.

    Files whose suffix (compared case-insensitively) equals
    ``PDF_EXTENSION`` are opened with pdfium and their pages are counted.
    Every other file is reported as a single page.

    Args:
        file_path: Path to the input file (string or path-like).

    Returns:
        The PDF page count, or ``1`` for any non-PDF input.
    """
    if Path(file_path).suffix.lower() != PDF_EXTENSION:
        return 1

    pdf = pdfium.PdfDocument(file_path)
    try:
        return len(pdf)
    finally:
        # Release the pdfium handle even when counting the pages fails.
        pdf.close()
vlmparse/clients/deepseekocr.py CHANGED
@@ -1,7 +1,155 @@
1
+ import re
2
+ from typing import ClassVar, Literal
3
+
4
+ from loguru import logger
5
+ from PIL import Image
1
6
  from pydantic import Field
2
7
 
3
- from vlmparse.clients.openai_converter import OpenAIConverterConfig
8
+ from vlmparse.clients.openai_converter import (
9
+ OpenAIConverterClient,
10
+ OpenAIConverterConfig,
11
+ )
12
+ from vlmparse.data_model.box import BoundingBox
13
+ from vlmparse.data_model.document import Item, Page
4
14
  from vlmparse.servers.docker_server import VLLMDockerServerConfig
15
+ from vlmparse.utils import to_base64
16
+
17
+
18
def re_match(text):
    """Locate grounding references in *text* and split them by kind.

    A reference has the form ``<|ref|>LABEL<|/ref|><|det|>COORDS<|/det|>``.

    Args:
        text: Raw model output to scan.

    Returns:
        A 3-tuple ``(matches, matches_image, matches_other)``:
        ``matches`` is the list of ``(full_str, label, coords_str)`` tuples
        produced by ``re.findall``; ``matches_image`` holds the full strings
        of references containing ``<|ref|>image<|/ref|>``; ``matches_other``
        holds the full strings of all remaining references.
    """
    found = re.findall(
        r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)",
        text,
        re.DOTALL,
    )

    image_marker = "<|ref|>image<|/ref|>"
    full_spans = [groups[0] for groups in found]
    matches_image = [span for span in full_spans if image_marker in span]
    matches_other = [span for span in full_spans if image_marker not in span]
    return found, matches_image, matches_other
30
+
31
+
32
def extract_coordinates_and_label(ref_text):
    """Parse the label and integer boxes out of one grounding reference.

    Args:
        ref_text: Indexable reference whose element ``1`` is the label and
            whose element ``2`` is the coordinate string; every
            ``[a, b, c, d]`` group of integers in that string becomes one box.

    Returns:
        ``(label, boxes)`` where ``boxes`` is a list of four-integer lists
        (possibly empty), or ``None`` when parsing raises (a warning is
        logged in that case).
    """
    try:
        label_type = ref_text[1]
        cor_list = [
            list(map(int, quad))
            for quad in re.findall(
                r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]", ref_text[2]
            )
        ]
    except Exception as e:
        logger.warning(f"Error parsing coordinates: {e}")
        return None
    else:
        return (label_type, cor_list)
42
+
43
+
44
class DeepSeekOCRConverterClient(OpenAIConverterClient):
    """Client for DeepSeekOCR with specific post-processing.

    In ``"layout"`` prompt mode the raw response carries grounding
    references (``<|ref|>…<|/ref|><|det|>…<|/det|>``). They are converted
    into page items with bounding boxes; in the page text, image references
    become ``![image]`` and all other references are removed. In any other
    mode the response is used as the page text. The text is always stripped
    of surrounding whitespace.
    """

    # Prompt text sent alongside the page image, keyed by ``config.prompt_mode``.
    PROMPTS: ClassVar[dict] = {
        "layout": "<|grounding|>Convert the document to markdown.",
        "ocr": "Free OCR.",
        "image_description": "Describe this image in detail.",
    }

    def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
        """Build page items with pixel-space boxes from grounding matches.

        Args:
            image: Page image; only its ``size`` is used, to scale coordinates.
            matches: Tuples ``(full_str, label, coords_str)``, one per
                grounding reference.

        Returns:
            One ``Item`` per reference that yields at least one box.
            References that cannot be parsed, carry no coordinates, or whose
            boxes cannot be merged are skipped.
        """
        items = []
        width, height = image.size

        for match in matches:
            # match is tuple: (full_str, label, coords_str)
            result = extract_coordinates_and_label(match)
            if not result:
                continue

            category, coords = result
            if not coords:
                continue

            boxes = []
            for point in coords:
                if len(point) != 4:
                    continue
                x1, y1, x2, y2 = point
                # Scale to image size (0-999 -> pixel)
                x1 = (x1 / 999) * width
                y1 = (y1 / 999) * height
                x2 = (x2 / 999) * width
                y2 = (y2 / 999) * height

                # min/max normalises the corners so that l <= r and t <= b.
                boxes.append(
                    BoundingBox(
                        l=min(x1, x2), t=min(y1, y2), r=max(x1, x2), b=max(y1, y2)
                    )
                )

            if not boxes:
                continue

            # Merge if multiple boxes for one item
            try:
                final_box = (
                    BoundingBox.merge_boxes(boxes) if len(boxes) > 1 else boxes[0]
                )
            except Exception as e:
                logger.warning(f"Error merging boxes: {e}")
                continue

            # NOTE(review): ``text`` receives the reference label (match[1]),
            # not the recognised content of the region — confirm this is intended.
            items.append(Item(category=category, text=match[1], box=final_box))

        return items

    async def async_call_inside_page(self, page: Page) -> Page:
        """Run DeepSeekOCR on *page* and store the post-processed result.

        The page image is embedded as a ``data:image/png;base64`` URL and
        sent with the prompt selected by ``config.prompt_mode``. The raw
        response is stored in ``page.raw_response``; ``page.text`` receives
        the cleaned text and, in ``"layout"`` mode, ``page.items`` receives
        the extracted items.

        Args:
            page: Page to process; it is updated in place.

        Returns:
            The same ``page`` object.
        """
        image = page.image

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{to_base64(image)}"
                        },
                    },
                    {"type": "text", "text": self.PROMPTS[self.config.prompt_mode]},
                ],
            },
        ]

        # Get raw response using parent's method
        response = await self._get_chat_completion(messages)
        logger.info("Response length: " + str(len(response)))
        page.raw_response = response

        if self.config.prompt_mode == "layout":
            matches, matches_image, matches_other = re_match(response)

            # Extract items (bounding boxes)
            page.items = self.extract_items(page.image, matches)

            outputs = response

            # Replace image references with a placeholder
            for a_match_image in matches_image:
                outputs = outputs.replace(a_match_image, "![image]")

            # Drop the remaining (text grounding) references
            for a_match_other in matches_other:
                outputs = outputs.replace(a_match_other, "")

            # Normalise LaTeX macros once, after every reference is handled,
            # so the cleanup also runs when there are no text references and
            # cannot alter text a later reference still has to match.
            outputs = outputs.replace("\\coloneqq", ":=").replace(
                "\\eqqcolon", "=:"
            )
        else:
            outputs = response

        page.text = outputs.strip()
        logger.debug(page.text)

        return page
5
153
 
6
154
 
7
155
  class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
@@ -32,10 +180,11 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
32
180
 
33
181
  model_name: str = "deepseek-ai/DeepSeek-OCR"
34
182
  aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
35
- preprompt: str | None = None
36
- postprompt: str | None = "<|grounding|>Convert the document to markdown."
183
+
184
+ prompt_mode: Literal["layout", "ocr"] = "ocr"
37
185
  completion_kwargs: dict | None = {
38
186
  "temperature": 0.0,
187
+ "max_tokens": 8181,
39
188
  "extra_body": {
40
189
  "skip_special_tokens": False,
41
190
  # args used to control custom logits processor
@@ -47,6 +196,8 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
47
196
  },
48
197
  },
49
198
  }
50
- max_image_size: int | None = 1540
51
199
  dpi: int = 200
52
200
  aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
201
+
202
+ def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
203
+ return DeepSeekOCRConverterClient(config=self, **kwargs)
vlmparse/constants.py ADDED
@@ -0,0 +1,2 @@
1
+ IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"]
2
+ PDF_EXTENSION = ".pdf"
vlmparse/converter.py CHANGED
@@ -6,10 +6,12 @@ from pathlib import Path
6
6
  from typing import Literal
7
7
 
8
8
  from loguru import logger
9
+ from PIL import Image
9
10
  from pydantic import Field
10
11
 
11
12
  from .base_model import VLMParseBaseModel
12
13
  from .build_doc import convert_specific_page_to_image, get_page_count, resize_image
14
+ from .constants import IMAGE_EXTENSIONS, PDF_EXTENSION
13
15
  from .data_model.document import Document, Page, ProcessingError
14
16
 
15
17
  # Add a lock to ensure PDFium is accessed by only one thread/task at a time
@@ -50,12 +52,24 @@ class BaseConverter:
50
52
  raise NotImplementedError
51
53
 
52
54
  def add_page_image(self, page: Page, file_path, page_idx):
53
- with PDFIUM_LOCK:
54
- image = convert_specific_page_to_image(
55
- file_path,
56
- page_idx,
57
- dpi=self.config.dpi,
55
+ if Path(file_path).suffix.lower() in IMAGE_EXTENSIONS:
56
+ image = Image.open(file_path)
57
+ if image.mode != "RGB":
58
+ image = image.convert("L").convert("RGB")
59
+
60
+ elif Path(file_path).suffix.lower() == PDF_EXTENSION:
61
+ with PDFIUM_LOCK:
62
+ image = convert_specific_page_to_image(
63
+ file_path,
64
+ page_idx,
65
+ dpi=self.config.dpi,
66
+ )
67
+
68
+ else:
69
+ raise ValueError(
70
+ f"Unsupported file extension: {Path(file_path).suffix.lower()}"
58
71
  )
72
+
59
73
  image = resize_image(image, self.config.max_image_size)
60
74
  page.buffer_image = image
61
75
  return page
vlmparse/utils.py CHANGED
@@ -28,12 +28,12 @@ def get_file_paths(inputs: str | list[str]):
28
28
  if "*" in pattern or "?" in pattern:
29
29
  file_paths.extend(glob(pattern, recursive=True))
30
30
  elif os.path.isdir(pattern):
31
- file_paths.extend(glob(os.path.join(pattern, "*.pdf"), recursive=True))
31
+ file_paths.extend(glob(os.path.join(pattern, "*.*"), recursive=True))
32
32
  elif os.path.isfile(pattern):
33
33
  file_paths.append(pattern)
34
34
  else:
35
35
  logger.error(f"Invalid input: {pattern}")
36
- file_paths = [f for f in file_paths if os.path.exists(f) and f.endswith(".pdf")]
36
+ file_paths = [f for f in file_paths if os.path.exists(f) and os.path.isfile(f)]
37
37
 
38
38
  if not file_paths:
39
39
  logger.error("No PDF files found matching the inputs patterns")
vlmparse-0.1.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vlmparse
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Requires-Python: >=3.12.0
5
5
  Description-Content-Type: text/markdown
6
6
  License-File: LICENSE
vlmparse-0.1.4.dist-info/RECORD CHANGED
@@ -1,10 +1,11 @@
1
1
  vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
2
- vlmparse/build_doc.py,sha256=9evdU6GwVAZU15dZ1Qght6hNo_QxBQN8X3gmYdU2ltg,1965
2
+ vlmparse/build_doc.py,sha256=LAWrnFrqamN5PwJo57AUtQOPrMFGnCGw4gBjEKZ6pYo,2127
3
3
  vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
4
- vlmparse/converter.py,sha256=5wTA_cFyDMDSY8YgLzZV9SVBKmHjEbJCW8KPoJjmVVA,6880
4
+ vlmparse/constants.py,sha256=7-47S01n4MI2ebR09bpdOo3_P16d-z-NVGsm6KJP8ls,110
5
+ vlmparse/converter.py,sha256=F0JSY9sFYUggCvaUCb27kKGJJpnZKW2FStMDVJoIOeQ,7383
5
6
  vlmparse/converter_with_server.py,sha256=9yoqfv8akB0xZZ7Snjq3aHW5NPNam2AgbK7_rfFqNkk,3909
6
7
  vlmparse/registries.py,sha256=TdSR1fx1Tz3roGk4Tk5ckIK6Iz-e4UD4erWUk96fFpQ,5846
7
- vlmparse/utils.py,sha256=jZWbNMwpZSZL--ZzvL8wPG_7mwpw9Pi36qTO9TjvHZU,1239
8
+ vlmparse/utils.py,sha256=rcVrtPiQVj_8HAmFQOu___72uYIapp_X89yxrMNCBow,1236
8
9
  vlmparse/benchpdf2md/create_dataset.py,sha256=0o4I0O3pHm1W7NYOTnW1JvPmgxJM8KLElKFvAbPAIic,1855
9
10
  vlmparse/benchpdf2md/run_benchmark.py,sha256=LMHElWyWIgB4ppBL0s-qjfMz5FZQnZOEm5mXxd0p0C8,9800
10
11
  vlmparse/benchpdf2md/utils.py,sha256=Q62vtvLIzxOEzSi-w210d7qnaRz-q_5ykmLNTkmbs-8,1732
@@ -21,7 +22,7 @@ vlmparse/benchpdf2md/st_visu_benchmark/test_form.py,sha256=qNmFZoSdbWcw1EJKesgO7
21
22
  vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py,sha256=WkKncexShO3SU-DO7dPT4DOe-8UNjsCaHlj9L1B2mkI,572
22
23
  vlmparse/benchpdf2md/st_visu_benchmark/utils.py,sha256=JSmOJQY1DDETtWmjWv07SlQlORE6yBewiMcE5qRZI_Q,1109
23
24
  vlmparse/clients/chandra.py,sha256=zfu-A6Slh-fIAyrtrlVoCb6QHLBimnimefap_K9YwYw,9775
24
- vlmparse/clients/deepseekocr.py,sha256=iCG5wI5yPv98hIPgVJX4gkkkH1OekblZjFhh5ORVWAk,1813
25
+ vlmparse/clients/deepseekocr.py,sha256=rQvaOaPPoDiZ0MzXqfqqH9BgUBfjmlfHu3NlMjSDgiQ,6501
25
26
  vlmparse/clients/docling.py,sha256=K-Grl_nZiSdooEdEaflevprE56l3Keby9xSMBtFwdis,5355
26
27
  vlmparse/clients/dotsocr.py,sha256=9ygvIVVOi9UhTUJwmrI-h6AjMV9vL9J2vMaBfUyTorY,9895
27
28
  vlmparse/clients/granite_docling.py,sha256=EQpsv5qSJG0HtMSacmJStER2sq4TGf1EMU5_NmJsl4g,4634
@@ -42,9 +43,9 @@ vlmparse/servers/docker_server.py,sha256=nI7K8CEzJwSZxLY7Jg9IuYHHLR5YQpOSgY8Ln71
42
43
  vlmparse/servers/utils.py,sha256=gMk5Y8FA1nlSxi7JzKxZu7XyljkYUZ5AnsTb3YFqu28,8821
43
44
  vlmparse/st_viewer/fs_nav.py,sha256=7GNH68h2Loh5pQ64Pe72-D2cs2BLhqRXevEmKdFmPX0,1616
44
45
  vlmparse/st_viewer/st_viewer.py,sha256=m2rQTtk5rlwErNmivNAg-4rkHkvNkvLhoJZxFQi7Dwk,2105
45
- vlmparse-0.1.3.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
46
- vlmparse-0.1.3.dist-info/METADATA,sha256=JkSI4uFnnF59WReyhfRFZZVoe6KLk0ZJrjG0FQkUIPI,5112
47
- vlmparse-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
48
- vlmparse-0.1.3.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
49
- vlmparse-0.1.3.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
50
- vlmparse-0.1.3.dist-info/RECORD,,
46
+ vlmparse-0.1.4.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
47
+ vlmparse-0.1.4.dist-info/METADATA,sha256=72_47P1ER-J8tzlEvE91Xf58u35p5eZZD1VvPbXzrqA,5112
48
+ vlmparse-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
+ vlmparse-0.1.4.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
50
+ vlmparse-0.1.4.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
51
+ vlmparse-0.1.4.dist-info/RECORD,,