vlmparse 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {vlmparse-0.1.3/vlmparse.egg-info → vlmparse-0.1.4}/PKG-INFO +1 -1
  2. {vlmparse-0.1.3 → vlmparse-0.1.4}/pyproject.toml +1 -1
  3. {vlmparse-0.1.3 → vlmparse-0.1.4}/tests/test_all_converters_mocked.py +25 -0
  4. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/build_doc.py +10 -4
  5. vlmparse-0.1.4/vlmparse/clients/deepseekocr.py +203 -0
  6. vlmparse-0.1.4/vlmparse/constants.py +2 -0
  7. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/converter.py +19 -5
  8. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/utils.py +2 -2
  9. {vlmparse-0.1.3 → vlmparse-0.1.4/vlmparse.egg-info}/PKG-INFO +1 -1
  10. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse.egg-info/SOURCES.txt +1 -0
  11. vlmparse-0.1.3/vlmparse/clients/deepseekocr.py +0 -52
  12. {vlmparse-0.1.3 → vlmparse-0.1.4}/LICENSE +0 -0
  13. {vlmparse-0.1.3 → vlmparse-0.1.4}/README.md +0 -0
  14. {vlmparse-0.1.3 → vlmparse-0.1.4}/setup.cfg +0 -0
  15. {vlmparse-0.1.3 → vlmparse-0.1.4}/tests/test_batch_parser.py +0 -0
  16. {vlmparse-0.1.3 → vlmparse-0.1.4}/tests/test_benchmark_tests.py +0 -0
  17. {vlmparse-0.1.3 → vlmparse-0.1.4}/tests/test_cli.py +0 -0
  18. {vlmparse-0.1.3 → vlmparse-0.1.4}/tests/test_end2end.py +0 -0
  19. {vlmparse-0.1.3 → vlmparse-0.1.4}/tests/test_process_and_run_benchmark.py +0 -0
  20. {vlmparse-0.1.3 → vlmparse-0.1.4}/tests/test_table_tests.py +0 -0
  21. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/base_model.py +0 -0
  22. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -0
  23. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  24. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/create_dataset.py +0 -0
  25. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -0
  26. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -0
  27. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -0
  28. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -0
  29. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/olmocrbench/tests.py +0 -0
  30. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/run_benchmark.py +0 -0
  31. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -0
  32. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -0
  33. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -0
  34. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -0
  35. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -0
  36. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/benchpdf2md/utils.py +0 -0
  37. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/cli.py +0 -0
  38. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/chandra.py +0 -0
  39. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/docling.py +0 -0
  40. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/dotsocr.py +0 -0
  41. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/granite_docling.py +0 -0
  42. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/hunyuanocr.py +0 -0
  43. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/lightonocr.py +0 -0
  44. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/mineru.py +0 -0
  45. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/nanonetocr.py +0 -0
  46. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/olmocr.py +0 -0
  47. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/openai_converter.py +0 -0
  48. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/paddleocrvl.py +0 -0
  49. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
  50. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
  51. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/pipe_utils/utils.py +0 -0
  52. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/clients/prompts.py +0 -0
  53. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/converter_with_server.py +0 -0
  54. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/data_model/box.py +0 -0
  55. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/data_model/document.py +0 -0
  56. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/registries.py +0 -0
  57. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/servers/docker_server.py +0 -0
  58. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/servers/utils.py +0 -0
  59. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/st_viewer/fs_nav.py +0 -0
  60. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse/st_viewer/st_viewer.py +0 -0
  61. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse.egg-info/dependency_links.txt +0 -0
  62. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse.egg-info/entry_points.txt +0 -0
  63. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse.egg-info/requires.txt +0 -0
  64. {vlmparse-0.1.3 → vlmparse-0.1.4}/vlmparse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vlmparse
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Requires-Python: >=3.12.0
5
5
  Description-Content-Type: text/markdown
6
6
  License-File: LICENSE
@@ -8,7 +8,7 @@ version = "0.1.0"
8
8
 
9
9
  [project]
10
10
  name = "vlmparse"
11
- version = "0.1.3"
11
+ version = "0.1.4"
12
12
  authors = []
13
13
  description = ""
14
14
  readme = "README.md"
@@ -106,6 +106,31 @@ class TestConverterConfigs:
106
106
  # Verify API was called
107
107
  assert mock_openai_client.chat.completions.create.call_count == 2
108
108
 
109
    def test_converter_image_processing(self, datadir, mock_openai_client):
        """Test processing of a single image file.

        Uses the mocked OpenAI client, so only document assembly and the
        single API call are exercised — no network access is performed.
        """
        model_name = "gemini-2.5-flash-lite"
        image_path = datadir / "page_with_formula.png"

        config = converter_config_registry.get(model_name)
        converter = config.get_client()

        # Process image
        document = converter(image_path)

        # Verify document structure
        assert isinstance(document, Document)
        assert document.file_path == str(image_path)
        assert len(document.pages) == 1, f"Expected 1 page, got {len(document.pages)}"

        # Verify page
        page = document.pages[0]
        assert isinstance(page, Page)
        assert page.text is not None
        assert len(page.text) > 0

        # Verify API was called once
        assert mock_openai_client.chat.completions.create.call_count == 1
133
+
109
134
  def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
110
135
  """Test DotsOCR converter in OCR mode."""
111
136
  config = converter_config_registry.get("dotsocr")
@@ -1,10 +1,13 @@
1
1
  import re
2
+ from pathlib import Path
2
3
 
3
4
  import numpy as np
4
5
  import PIL
5
6
  import pypdfium2 as pdfium
6
7
  from loguru import logger
7
8
 
9
+ from .constants import PDF_EXTENSION
10
+
8
11
 
9
12
  def convert_pdfium(file_path, dpi):
10
13
  pdf = pdfium.PdfDocument(file_path)
@@ -64,7 +67,10 @@ def resize_image(image, max_image_size):
64
67
 
65
68
 
66
69
def get_page_count(file_path):
    """Return the number of pages contained in *file_path*.

    PDFs are opened with pdfium and their page count returned; every
    other file type is assumed to be a single-page image and counts as 1.

    Args:
        file_path: Path (str or PathLike) to a PDF or image file.

    Returns:
        int: number of pages.
    """
    if Path(file_path).suffix.lower() == PDF_EXTENSION:
        pdf = pdfium.PdfDocument(file_path)
        try:
            # Close even if len() raises so the pdfium handle is not leaked.
            return len(pdf)
        finally:
            pdf.close()
    # Non-PDF inputs (images) are treated as a single page.
    return 1
@@ -0,0 +1,203 @@
1
+ import re
2
+ from typing import ClassVar, Literal
3
+
4
+ from loguru import logger
5
+ from PIL import Image
6
+ from pydantic import Field
7
+
8
+ from vlmparse.clients.openai_converter import (
9
+ OpenAIConverterClient,
10
+ OpenAIConverterConfig,
11
+ )
12
+ from vlmparse.data_model.box import BoundingBox
13
+ from vlmparse.data_model.document import Item, Page
14
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
15
+ from vlmparse.utils import to_base64
16
+
17
+
18
def re_match(text):
    """Locate DeepSeek-OCR grounding tags in *text*.

    Finds every ``<|ref|>label<|/ref|><|det|>coords<|/det|>`` span and
    splits the full tag strings into image references and everything else.

    Args:
        text: Raw model response possibly containing grounding tags.

    Returns:
        Tuple of (all regex match tuples, full image-tag strings,
        full non-image tag strings).
    """
    tag_pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
    matches = re.findall(tag_pattern, text, re.DOTALL)

    # Partition the full tag strings by whether they reference an image.
    matches_image = [m[0] for m in matches if "<|ref|>image<|/ref|>" in m[0]]
    matches_other = [m[0] for m in matches if "<|ref|>image<|/ref|>" not in m[0]]
    return matches, matches_image, matches_other
30
+
31
+
32
def extract_coordinates_and_label(ref_text):
    """Parse one grounding match tuple into (label, coordinate lists).

    Args:
        ref_text: Match tuple of (full_str, label, coords_str) where
            coords_str contains ``[x1, y1, x2, y2]`` groups.

    Returns:
        ``(label, [[x1, y1, x2, y2], ...])`` on success, or ``None`` if
        the tuple cannot be parsed (a warning is logged instead of raising).
    """
    coord_pattern = r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
    try:
        label_type = ref_text[1]
        cor_list = [
            [int(value) for value in group]
            for group in re.findall(coord_pattern, ref_text[2])
        ]
    except Exception as e:
        logger.warning(f"Error parsing coordinates: {e}")
        return None

    return (label_type, cor_list)
42
+
43
+
44
class DeepSeekOCRConverterClient(OpenAIConverterClient):
    """Client for DeepSeekOCR with specific post-processing.

    Sends one page image per request to an OpenAI-compatible endpoint and,
    in "layout" mode, parses the model's ``<|ref|>/<|det|>`` grounding tags
    into bounding-box Items and strips them from the page text.
    """

    # Prompt text sent alongside the page image, keyed by config.prompt_mode.
    # NOTE(review): "image_description" is not reachable through
    # DeepSeekOCRConverterConfig.prompt_mode (Literal["layout", "ocr"]) —
    # confirm whether it is used by another caller.
    PROMPTS: ClassVar[dict] = {
        "layout": "<|grounding|>Convert the document to markdown.",
        "ocr": "Free OCR.",
        "image_description": "Describe this image in detail.",
    }

    def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
        """Convert regex grounding matches into Items with pixel-space boxes.

        Args:
            image: Page image, used only for its size when scaling boxes.
            matches: Tuples of (full_str, label, coords_str) from re_match.

        Returns:
            Items whose coordinates parsed successfully; malformed matches
            are skipped with a warning rather than raising.
        """
        items = []
        width, height = image.size

        for match in matches:
            # match is tuple: (full_str, label, coords_str)
            result = extract_coordinates_and_label(match)
            if not result:
                continue

            category, coords = result
            if not coords:
                continue

            # Create one box per 4-value coordinate group.
            boxes = []
            for point in coords:
                if len(point) != 4:
                    continue
                x1, y1, x2, y2 = point
                # Scale to image size (model emits coordinates in 0-999 space)
                x1 = (x1 / 999) * width
                y1 = (y1 / 999) * height
                x2 = (x2 / 999) * width
                y2 = (y2 / 999) * height

                # min/max guards against inverted corner ordering.
                boxes.append(
                    BoundingBox(
                        l=min(x1, x2), t=min(y1, y2), r=max(x1, x2), b=max(y1, y2)
                    )
                )

            if not boxes:
                continue

            # Merge if multiple boxes for one item
            try:
                final_box = (
                    BoundingBox.merge_boxes(boxes) if len(boxes) > 1 else boxes[0]
                )
            except Exception as e:
                logger.warning(f"Error merging boxes: {e}")
                continue

            items.append(Item(category=category, text=match[1], box=final_box))

        return items

    async def async_call_inside_page(self, page: Page) -> Page:
        """Run OCR on a single page and post-process the model response.

        Stores the raw model output on ``page.raw_response``; in "layout"
        mode also fills ``page.items`` and removes grounding tags from the
        text before assigning ``page.text``.
        """
        # Prepare messages as in parent class
        image = page.image

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{to_base64(image)}"
                        },
                    },
                    {"type": "text", "text": self.PROMPTS[self.config.prompt_mode]},
                ],
            },
        ]

        # Get raw response using parent's method
        response = await self._get_chat_completion(messages)
        logger.info("Response length: " + str(len(response)))
        page.raw_response = response

        if self.config.prompt_mode == "layout":
            # Post-processing
            matches, matches_image, matches_other = re_match(response)

            # Extract items (bounding boxes)
            page.items = self.extract_items(page.image, matches)

            # Clean text
            outputs = response

            # Replace image references with a placeholder
            for a_match_image in matches_image:
                outputs = outputs.replace(a_match_image, "![image]")

            # Replace other references (text grounding) and cleanup
            for a_match_other in matches_other:
                outputs = (
                    outputs.replace(a_match_other, "")
                    .replace("\\coloneqq", ":=")
                    .replace("\\eqqcolon", "=:")
                )
        else:
            outputs = response

        page.text = outputs.strip()
        logger.debug(page.text)

        return page
153
+
154
+
155
class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
    """Configuration for DeepSeekOCR model.

    Extends the vLLM docker server config with the extra CLI flags that
    DeepSeek-OCR requires (one image per prompt, the custom n-gram logits
    processor, and prefix/processor caching disabled).
    """

    model_name: str = "deepseek-ai/DeepSeek-OCR"
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--limit-mm-per-prompt",
            '{"image": 1}',
            "--async-scheduling",
            "--logits_processors",
            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
            "--no-enable-prefix-caching",
            "--mm-processor-cache-gb",
            "0",
        ]
    )
    # Registry aliases under which this server config can be looked up.
    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])

    @property
    def client_config(self):
        # Client-side config paired with this server, sharing the LLM params.
        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
176
+
177
+
178
class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
    """Converter configuration for DeepSeek-OCR behind an OpenAI-compatible API."""

    model_name: str = "deepseek-ai/DeepSeek-OCR"
    # Registry aliases under which this converter can be looked up.
    # (Previously this field was declared twice in the class body; the
    # redundant second declaration has been removed.)
    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])

    # Selects which prompt from DeepSeekOCRConverterClient.PROMPTS is used.
    prompt_mode: Literal["layout", "ocr"] = "ocr"
    completion_kwargs: dict | None = {
        "temperature": 0.0,
        # NOTE(review): 8181 looks like a typo for 8192 — confirm intended limit.
        "max_tokens": 8181,
        "extra_body": {
            "skip_special_tokens": False,
            # args used to control custom logits processor
            "vllm_xargs": {
                "ngram_size": 30,
                "window_size": 90,
                # whitelist: <td>, </td>
                "whitelist_token_ids": [128821, 128822],
            },
        },
    }
    # Rendering resolution used when rasterizing PDF pages for the model.
    dpi: int = 200

    def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
        """Instantiate the DeepSeek-OCR client bound to this configuration."""
        return DeepSeekOCRConverterClient(config=self, **kwargs)
@@ -0,0 +1,2 @@
1
# Raster-image extensions accepted as single-page document inputs.
IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"]
# Extension handled via pdfium page rendering.
PDF_EXTENSION = ".pdf"
@@ -6,10 +6,12 @@ from pathlib import Path
6
6
  from typing import Literal
7
7
 
8
8
  from loguru import logger
9
+ from PIL import Image
9
10
  from pydantic import Field
10
11
 
11
12
  from .base_model import VLMParseBaseModel
12
13
  from .build_doc import convert_specific_page_to_image, get_page_count, resize_image
14
+ from .constants import IMAGE_EXTENSIONS, PDF_EXTENSION
13
15
  from .data_model.document import Document, Page, ProcessingError
14
16
 
15
17
  # Add a lock to ensure PDFium is accessed by only one thread/task at a time
@@ -50,12 +52,24 @@ class BaseConverter:
50
52
  raise NotImplementedError
51
53
 
52
54
    def add_page_image(self, page: Page, file_path, page_idx):
        """Attach the rendered/loaded page image to *page*.

        Images are opened directly with PIL; PDFs have the page at
        *page_idx* rasterized at ``self.config.dpi``. The result is resized
        to ``self.config.max_image_size`` and stored on
        ``page.buffer_image``.

        Raises:
            ValueError: if the file extension is neither a known image
                extension nor ``.pdf``.
        """
        if Path(file_path).suffix.lower() in IMAGE_EXTENSIONS:
            image = Image.open(file_path)
            if image.mode != "RGB":
                # NOTE(review): converting via "L" collapses any color data
                # (RGBA/palette inputs become grayscale) — confirm this is
                # intentional rather than a direct image.convert("RGB").
                image = image.convert("L").convert("RGB")

        elif Path(file_path).suffix.lower() == PDF_EXTENSION:
            # PDFium must be accessed by only one thread/task at a time.
            with PDFIUM_LOCK:
                image = convert_specific_page_to_image(
                    file_path,
                    page_idx,
                    dpi=self.config.dpi,
                )

        else:
            raise ValueError(
                f"Unsupported file extension: {Path(file_path).suffix.lower()}"
            )

        image = resize_image(image, self.config.max_image_size)
        page.buffer_image = image
        return page
@@ -28,12 +28,12 @@ def get_file_paths(inputs: str | list[str]):
28
28
  if "*" in pattern or "?" in pattern:
29
29
  file_paths.extend(glob(pattern, recursive=True))
30
30
  elif os.path.isdir(pattern):
31
- file_paths.extend(glob(os.path.join(pattern, "*.pdf"), recursive=True))
31
+ file_paths.extend(glob(os.path.join(pattern, "*.*"), recursive=True))
32
32
  elif os.path.isfile(pattern):
33
33
  file_paths.append(pattern)
34
34
  else:
35
35
  logger.error(f"Invalid input: {pattern}")
36
- file_paths = [f for f in file_paths if os.path.exists(f) and f.endswith(".pdf")]
36
+ file_paths = [f for f in file_paths if os.path.exists(f) and os.path.isfile(f)]
37
37
 
38
38
  if not file_paths:
39
39
  logger.error("No PDF files found matching the inputs patterns")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vlmparse
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Requires-Python: >=3.12.0
5
5
  Description-Content-Type: text/markdown
6
6
  License-File: LICENSE
@@ -11,6 +11,7 @@ tests/test_table_tests.py
11
11
  vlmparse/base_model.py
12
12
  vlmparse/build_doc.py
13
13
  vlmparse/cli.py
14
+ vlmparse/constants.py
14
15
  vlmparse/converter.py
15
16
  vlmparse/converter_with_server.py
16
17
  vlmparse/registries.py
@@ -1,52 +0,0 @@
1
- from pydantic import Field
2
-
3
- from vlmparse.clients.openai_converter import OpenAIConverterConfig
4
- from vlmparse.servers.docker_server import VLLMDockerServerConfig
5
-
6
-
7
- class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
8
- """Configuration for DeepSeekOCR model."""
9
-
10
- model_name: str = "deepseek-ai/DeepSeek-OCR"
11
- command_args: list[str] = Field(
12
- default_factory=lambda: [
13
- "--limit-mm-per-prompt",
14
- '{"image": 1}',
15
- "--async-scheduling",
16
- "--logits_processors",
17
- "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
18
- "--no-enable-prefix-caching",
19
- "--mm-processor-cache-gb",
20
- "0",
21
- ]
22
- )
23
- aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
24
-
25
- @property
26
- def client_config(self):
27
- return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
28
-
29
-
30
- class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
31
- """DeepSeekOCR converter - backward compatibility alias."""
32
-
33
- model_name: str = "deepseek-ai/DeepSeek-OCR"
34
- aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
35
- preprompt: str | None = None
36
- postprompt: str | None = "<|grounding|>Convert the document to markdown."
37
- completion_kwargs: dict | None = {
38
- "temperature": 0.0,
39
- "extra_body": {
40
- "skip_special_tokens": False,
41
- # args used to control custom logits processor
42
- "vllm_xargs": {
43
- "ngram_size": 30,
44
- "window_size": 90,
45
- # whitelist: <td>, </td>
46
- "whitelist_token_ids": [128821, 128822],
47
- },
48
- },
49
- }
50
- max_image_size: int | None = 1540
51
- dpi: int = 200
52
- aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes