vlmparse 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {vlmparse-0.1.3/vlmparse.egg-info → vlmparse-0.1.5}/PKG-INFO +17 -3
  2. {vlmparse-0.1.3 → vlmparse-0.1.5}/README.md +15 -1
  3. {vlmparse-0.1.3 → vlmparse-0.1.5}/pyproject.toml +2 -6
  4. {vlmparse-0.1.3 → vlmparse-0.1.5}/tests/test_all_converters_mocked.py +25 -0
  5. {vlmparse-0.1.3 → vlmparse-0.1.5}/tests/test_end2end.py +45 -0
  6. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/build_doc.py +10 -4
  7. vlmparse-0.1.5/vlmparse/clients/deepseekocr.py +203 -0
  8. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/docling.py +2 -2
  9. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/dotsocr.py +11 -2
  10. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/mineru.py +8 -7
  11. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/openai_converter.py +1 -0
  12. vlmparse-0.1.5/vlmparse/constants.py +2 -0
  13. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/converter.py +19 -5
  14. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/converter_with_server.py +5 -4
  15. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/registries.py +2 -4
  16. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/servers/docker_server.py +1 -1
  17. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/servers/utils.py +3 -2
  18. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/utils.py +2 -2
  19. {vlmparse-0.1.3 → vlmparse-0.1.5/vlmparse.egg-info}/PKG-INFO +17 -3
  20. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/SOURCES.txt +1 -18
  21. vlmparse-0.1.3/tests/test_benchmark_tests.py +0 -731
  22. vlmparse-0.1.3/tests/test_process_and_run_benchmark.py +0 -144
  23. vlmparse-0.1.3/tests/test_table_tests.py +0 -1516
  24. vlmparse-0.1.3/vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
  25. vlmparse-0.1.3/vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  26. vlmparse-0.1.3/vlmparse/benchpdf2md/create_dataset.py +0 -60
  27. vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
  28. vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
  29. vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
  30. vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
  31. vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
  32. vlmparse-0.1.3/vlmparse/benchpdf2md/run_benchmark.py +0 -296
  33. vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
  34. vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
  35. vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
  36. vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
  37. vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
  38. vlmparse-0.1.3/vlmparse/benchpdf2md/utils.py +0 -56
  39. vlmparse-0.1.3/vlmparse/clients/deepseekocr.py +0 -52
  40. {vlmparse-0.1.3 → vlmparse-0.1.5}/LICENSE +0 -0
  41. {vlmparse-0.1.3 → vlmparse-0.1.5}/setup.cfg +0 -0
  42. {vlmparse-0.1.3 → vlmparse-0.1.5}/tests/test_batch_parser.py +0 -0
  43. {vlmparse-0.1.3 → vlmparse-0.1.5}/tests/test_cli.py +0 -0
  44. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/base_model.py +0 -0
  45. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/cli.py +0 -0
  46. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/chandra.py +0 -0
  47. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/granite_docling.py +0 -0
  48. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/hunyuanocr.py +0 -0
  49. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/lightonocr.py +0 -0
  50. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/nanonetocr.py +0 -0
  51. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/olmocr.py +0 -0
  52. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/paddleocrvl.py +0 -0
  53. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
  54. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
  55. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/pipe_utils/utils.py +0 -0
  56. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/prompts.py +0 -0
  57. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/data_model/box.py +0 -0
  58. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/data_model/document.py +0 -0
  59. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/st_viewer/fs_nav.py +0 -0
  60. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/st_viewer/st_viewer.py +0 -0
  61. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/dependency_links.txt +0 -0
  62. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/entry_points.txt +0 -0
  63. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/requires.txt +0 -0
  64. {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vlmparse
3
- Version: 0.1.3
4
- Requires-Python: >=3.12.0
3
+ Version: 0.1.5
4
+ Requires-Python: >=3.11.0
5
5
  Description-Content-Type: text/markdown
6
6
  License-File: LICENSE
7
7
  Requires-Dist: devtools>=0.12.2
@@ -72,6 +72,19 @@ Supported Converters:
72
72
 
73
73
  ## Installation
74
74
 
75
+ Simplest solution with only the cli:
76
+
77
+ ```bash
78
+ uv tool install vlmparse
79
+ ```
80
+
81
+ If you want to run the granite-docling model or use the streamlit viewing app:
82
+
83
+ ```bash
84
+ uv tool install vlmparse[docling_core,st_app]
85
+ ```
86
+
87
+ If you prefer cloning the repository and using the local version:
75
88
  ```bash
76
89
  uv sync
77
90
  ```
@@ -86,10 +99,11 @@ Activate the virtual environment:
86
99
  ```bash
87
100
  source .venv/bin/activate
88
101
  ```
89
- Other solution: append uv run to all the commands below.
90
102
 
91
103
  ## CLI Usage
92
104
 
105
+ Note that you can bypass the previous installation step and just add uvx before each of the commands below.
106
+
93
107
  ### Convert PDFs
94
108
 
95
109
  With a general VLM (requires setting your api key as an environment variable):
@@ -18,6 +18,19 @@ Supported Converters:
18
18
 
19
19
  ## Installation
20
20
 
21
+ Simplest solution with only the cli:
22
+
23
+ ```bash
24
+ uv tool install vlmparse
25
+ ```
26
+
27
+ If you want to run the granite-docling model or use the streamlit viewing app:
28
+
29
+ ```bash
30
+ uv tool install vlmparse[docling_core,st_app]
31
+ ```
32
+
33
+ If you prefer cloning the repository and using the local version:
21
34
  ```bash
22
35
  uv sync
23
36
  ```
@@ -32,10 +45,11 @@ Activate the virtual environment:
32
45
  ```bash
33
46
  source .venv/bin/activate
34
47
  ```
35
- Other solution: append uv run to all the commands below.
36
48
 
37
49
  ## CLI Usage
38
50
 
51
+ Note that you can bypass the previous installation step and just add uvx before each of the commands below.
52
+
39
53
  ### Convert PDFs
40
54
 
41
55
  With a general VLM (requires setting your api key as an environment variable):
@@ -2,17 +2,13 @@
2
2
  requires = ["setuptools", "wheel"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
- [metadata]
6
- name = "vlmparse"
7
- version = "0.1.0"
8
-
9
5
  [project]
10
6
  name = "vlmparse"
11
- version = "0.1.3"
7
+ version = "0.1.5"
12
8
  authors = []
13
9
  description = ""
14
10
  readme = "README.md"
15
- requires-python = ">=3.12.0"
11
+ requires-python = ">=3.11.0"
16
12
  dependencies = [
17
13
  "devtools>=0.12.2",
18
14
  "docker>=7.1.0",
@@ -106,6 +106,31 @@ class TestConverterConfigs:
106
106
  # Verify API was called
107
107
  assert mock_openai_client.chat.completions.create.call_count == 2
108
108
 
109
+ def test_converter_image_processing(self, datadir, mock_openai_client):
110
+ """Test processing of a single image file."""
111
+ model_name = "gemini-2.5-flash-lite"
112
+ image_path = datadir / "page_with_formula.png"
113
+
114
+ config = converter_config_registry.get(model_name)
115
+ converter = config.get_client()
116
+
117
+ # Process image
118
+ document = converter(image_path)
119
+
120
+ # Verify document structure
121
+ assert isinstance(document, Document)
122
+ assert document.file_path == str(image_path)
123
+ assert len(document.pages) == 1, f"Expected 1 page, got {len(document.pages)}"
124
+
125
+ # Verify page
126
+ page = document.pages[0]
127
+ assert isinstance(page, Page)
128
+ assert page.text is not None
129
+ assert len(page.text) > 0
130
+
131
+ # Verify API was called once
132
+ assert mock_openai_client.chat.completions.create.call_count == 1
133
+
109
134
  def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
110
135
  """Test DotsOCR converter in OCR mode."""
111
136
  config = converter_config_registry.get("dotsocr")
@@ -64,3 +64,48 @@ def test_convert_with_docker(file_path, model):
64
64
  assert doc.pages[1].text is not None
65
65
 
66
66
  # Server will be automatically stopped due to auto_stop=True
67
+ server.stop()
68
+
69
+
70
+ # @pytest.mark.skipif(
71
+ # "RUN_DEPLOYMENT_VLLM" not in os.environ
72
+ # or os.environ["RUN_DEPLOYMENT_VLLM"] == "false"
73
+ # or "GPU_TEST_VLMPARSE" not in os.environ,
74
+ # reason="Skipping because RUN_DEPLOYMENT_VLLM is not set or is false or GPU_TEST is not set",
75
+ # )
76
+ # @pytest.mark.parametrize(
77
+ # "model",
78
+ # [
79
+ # "docling",
80
+ # "lightonocr",
81
+ # "dotsocr",
82
+ # "nanonets/Nanonets-OCR2-3B",
83
+ # "hunyuanocr",
84
+ # "olmocr-2-fp8",
85
+ # "paddleocrvl",
86
+ # "mineru25",
87
+ # "chandra",
88
+ # "deepseekocr",
89
+ # "granite-docling",
90
+ # ],
91
+ # )
92
+ # def test_converter_with_server_with_docker(file_path, model):
93
+ # """Test conversion with automatic Docker deployment (requires GPU due to vllm limitations)."""
94
+
95
+ # from vlmparse.converter_with_server import ConverterWithServer
96
+ # converter_with_server = ConverterWithServer(
97
+ # model=model,
98
+ # uri=None,
99
+ # gpus=os.environ["GPU_TEST_VLMPARSE"],
100
+ # with_vllm_server=True,
101
+ # concurrency=10,
102
+ # )
103
+
104
+ # docs = converter_with_server.parse([file_path])
105
+
106
+ # # Assertions
107
+ # assert len(docs) == 1
108
+ # doc = docs[0]
109
+ # assert len(doc.pages) == 2
110
+ # assert doc.pages[0].text is not None
111
+ # assert doc.pages[1].text is not None
@@ -1,10 +1,13 @@
1
1
  import re
2
+ from pathlib import Path
2
3
 
3
4
  import numpy as np
4
5
  import PIL
5
6
  import pypdfium2 as pdfium
6
7
  from loguru import logger
7
8
 
9
+ from .constants import PDF_EXTENSION
10
+
8
11
 
9
12
  def convert_pdfium(file_path, dpi):
10
13
  pdf = pdfium.PdfDocument(file_path)
@@ -64,7 +67,10 @@ def resize_image(image, max_image_size):
64
67
 
65
68
 
66
69
  def get_page_count(file_path):
67
- pdf = pdfium.PdfDocument(file_path)
68
- count = len(pdf)
69
- pdf.close()
70
- return count
70
+ if Path(file_path).suffix.lower() == PDF_EXTENSION:
71
+ pdf = pdfium.PdfDocument(file_path)
72
+ count = len(pdf)
73
+ pdf.close()
74
+ return count
75
+ else:
76
+ return 1
@@ -0,0 +1,203 @@
1
+ import re
2
+ from typing import ClassVar, Literal
3
+
4
+ from loguru import logger
5
+ from PIL import Image
6
+ from pydantic import Field
7
+
8
+ from vlmparse.clients.openai_converter import (
9
+ OpenAIConverterClient,
10
+ OpenAIConverterConfig,
11
+ )
12
+ from vlmparse.data_model.box import BoundingBox
13
+ from vlmparse.data_model.document import Item, Page
14
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
15
+ from vlmparse.utils import to_base64
16
+
17
+
18
+ def re_match(text):
19
+ pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
20
+ matches = re.findall(pattern, text, re.DOTALL)
21
+
22
+ matches_image = []
23
+ matches_other = []
24
+ for a_match in matches:
25
+ if "<|ref|>image<|/ref|>" in a_match[0]:
26
+ matches_image.append(a_match[0])
27
+ else:
28
+ matches_other.append(a_match[0])
29
+ return matches, matches_image, matches_other
30
+
31
+
32
+ def extract_coordinates_and_label(ref_text):
33
+ try:
34
+ label_type = ref_text[1]
35
+ matches = re.findall(r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]", ref_text[2])
36
+ cor_list = [[int(x) for x in m] for m in matches]
37
+ except Exception as e:
38
+ logger.warning(f"Error parsing coordinates: {e}")
39
+ return None
40
+
41
+ return (label_type, cor_list)
42
+
43
+
44
+ class DeepSeekOCRConverterClient(OpenAIConverterClient):
45
+ """Client for DeepSeekOCR with specific post-processing."""
46
+
47
+ PROMPTS: ClassVar[dict] = {
48
+ "layout": "<|grounding|>Convert the document to markdown.",
49
+ "ocr": "Free OCR.",
50
+ "image_description": "Describe this image in detail.",
51
+ }
52
+
53
+ def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
54
+ items = []
55
+ width, height = image.size
56
+
57
+ for match in matches:
58
+ # match is tuple: (full_str, label, coords_str)
59
+ result = extract_coordinates_and_label(match)
60
+ if not result:
61
+ continue
62
+
63
+ category, coords = result
64
+ if not coords:
65
+ continue
66
+
67
+ # Create boxes
68
+ boxes = []
69
+ for point in coords:
70
+ if len(point) != 4:
71
+ continue
72
+ x1, y1, x2, y2 = point
73
+ # Scale to image size (0-999 -> pixel)
74
+ x1 = (x1 / 999) * width
75
+ y1 = (y1 / 999) * height
76
+ x2 = (x2 / 999) * width
77
+ y2 = (y2 / 999) * height
78
+
79
+ boxes.append(
80
+ BoundingBox(
81
+ l=min(x1, x2), t=min(y1, y2), r=max(x1, x2), b=max(y1, y2)
82
+ )
83
+ )
84
+
85
+ if not boxes:
86
+ continue
87
+
88
+ # Merge if multiple boxes for one item
89
+ try:
90
+ final_box = (
91
+ BoundingBox.merge_boxes(boxes) if len(boxes) > 1 else boxes[0]
92
+ )
93
+ except Exception as e:
94
+ logger.warning(f"Error merging boxes: {e}")
95
+ continue
96
+
97
+ items.append(Item(category=category, text=match[1], box=final_box))
98
+
99
+ return items
100
+
101
+ async def async_call_inside_page(self, page: Page) -> Page:
102
+ # Prepare messages as in parent class
103
+ image = page.image
104
+
105
+ messages = [
106
+ {
107
+ "role": "user",
108
+ "content": [
109
+ {
110
+ "type": "image_url",
111
+ "image_url": {
112
+ "url": f"data:image/png;base64,{to_base64(image)}"
113
+ },
114
+ },
115
+ {"type": "text", "text": self.PROMPTS[self.config.prompt_mode]},
116
+ ],
117
+ },
118
+ ]
119
+
120
+ # Get raw response using parent's method
121
+ response = await self._get_chat_completion(messages)
122
+ logger.info("Response length: " + str(len(response)))
123
+ page.raw_response = response
124
+
125
+ if self.config.prompt_mode == "layout":
126
+ # Post-processing
127
+ matches, matches_image, matches_other = re_match(response)
128
+
129
+ # Extract items (bounding boxes)
130
+ page.items = self.extract_items(page.image, matches)
131
+
132
+ # Clean text
133
+ outputs = response
134
+
135
+ # Replace image references with a placeholder
136
+ for a_match_image in matches_image:
137
+ outputs = outputs.replace(a_match_image, "![image]")
138
+
139
+ # Replace other references (text grounding) and cleanup
140
+ for a_match_other in matches_other:
141
+ outputs = (
142
+ outputs.replace(a_match_other, "")
143
+ .replace("\\coloneqq", ":=")
144
+ .replace("\\eqqcolon", "=:")
145
+ )
146
+ else:
147
+ outputs = response
148
+
149
+ page.text = outputs.strip()
150
+ logger.debug(page.text)
151
+
152
+ return page
153
+
154
+
155
+ class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
156
+ """Configuration for DeepSeekOCR model."""
157
+
158
+ model_name: str = "deepseek-ai/DeepSeek-OCR"
159
+ command_args: list[str] = Field(
160
+ default_factory=lambda: [
161
+ "--limit-mm-per-prompt",
162
+ '{"image": 1}',
163
+ "--async-scheduling",
164
+ "--logits_processors",
165
+ "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
166
+ "--no-enable-prefix-caching",
167
+ "--mm-processor-cache-gb",
168
+ "0",
169
+ ]
170
+ )
171
+ aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
172
+
173
+ @property
174
+ def client_config(self):
175
+ return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
176
+
177
+
178
+ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
179
+ """DeepSeekOCR converter - backward compatibility alias."""
180
+
181
+ model_name: str = "deepseek-ai/DeepSeek-OCR"
182
+ aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
183
+
184
+ prompt_mode: Literal["layout", "ocr"] = "ocr"
185
+ completion_kwargs: dict | None = {
186
+ "temperature": 0.0,
187
+ "max_tokens": 8181,
188
+ "extra_body": {
189
+ "skip_special_tokens": False,
190
+ # args used to control custom logits processor
191
+ "vllm_xargs": {
192
+ "ngram_size": 30,
193
+ "window_size": 90,
194
+ # whitelist: <td>, </td>
195
+ "whitelist_token_ids": [128821, 128822],
196
+ },
197
+ },
198
+ }
199
+ dpi: int = 200
200
+ aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
201
+
202
+ def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
203
+ return DeepSeekOCRConverterClient(config=self, **kwargs)
@@ -34,7 +34,7 @@ class DoclingDockerServerConfig(DockerServerConfig):
34
34
  "LOG_LEVEL": "DEBUG", # Enable verbose logging
35
35
  # Performance Tuning
36
36
  # "UVICORN_WORKERS": "4", # Increase web server workers (Default: 1)
37
- # "DOCLING_SERVE_ENG_LOC_NUM_WORKERS": "4", # Increase processing workers (Default: 2)
37
+ "DOCLING_SERVE_ENG_LOC_NUM_WORKERS": "16", # Increase processing workers (Default: 2)
38
38
  "DOCLING_NUM_THREADS": "32", # Increase torch threads (Default: 4)
39
39
  }
40
40
  )
@@ -62,8 +62,8 @@ class DoclingDockerServerConfig(DockerServerConfig):
62
62
  class DoclingConverterConfig(ConverterConfig):
63
63
  """Configuration for Docling converter client."""
64
64
 
65
+ base_url: str
65
66
  model_name: str = "docling"
66
- base_url: str = "http://localhost:5001"
67
67
  timeout: int = 300
68
68
  api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
69
69
 
@@ -8,6 +8,7 @@ from PIL import Image
8
8
  from pydantic import Field
9
9
 
10
10
  from vlmparse.clients.openai_converter import (
11
+ LLMParams,
11
12
  OpenAIConverterClient,
12
13
  OpenAIConverterConfig,
13
14
  )
@@ -28,6 +29,7 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
28
29
  dockerfile_dir: str = str(DOCKERFILE_DIR / "dotsocr")
29
30
  command_args: list[str] = Field(
30
31
  default_factory=lambda: [
32
+ "/workspace/weights/DotsOCR",
31
33
  "--tensor-parallel-size",
32
34
  "1",
33
35
  "--gpu-memory-utilization",
@@ -44,12 +46,19 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
44
46
  # "16384",
45
47
  ]
46
48
  )
47
- add_model_key_to_server: bool = False
49
+ add_model_key_to_server: bool = True
48
50
  aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
49
51
 
50
52
  @property
51
53
  def client_config(self):
52
- return DotsOCRConverterConfig(llm_params=self.llm_params)
54
+ return DotsOCRConverterConfig(
55
+ llm_params=LLMParams(
56
+ base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
57
+ )
58
+ )
59
+
60
+ def get_base_url_suffix(self) -> str:
61
+ return "/v1"
53
62
 
54
63
 
55
64
  class DotsOCRConverterConfig(OpenAIConverterConfig):
@@ -1,6 +1,5 @@
1
1
  import asyncio
2
2
  import io
3
- import os
4
3
 
5
4
  import orjson
6
5
  from loguru import logger
@@ -20,18 +19,21 @@ class MinerUDockerServerConfig(DockerServerConfig):
20
19
  docker_image: str = "pulsia/mineru25apipulsia:latest"
21
20
  docker_port: int = 4299
22
21
  container_port: int = 8000
22
+ server_ready_indicators: list[str] = Field(
23
+ default_factory=lambda: ["Uvicorn running"]
24
+ )
23
25
 
24
26
  @property
25
27
  def client_config(self):
26
- return MinerUConverterConfig(api_url=f"http://localhost:{self.docker_port}")
28
+ return MinerUConverterConfig(base_url=f"http://localhost:{self.docker_port}")
27
29
 
28
30
 
29
31
  class MinerUConverterConfig(ConverterConfig):
30
32
  """Configuration for MinerU API converter."""
31
33
 
32
- base_url: str = Field(
33
- default_factory=lambda: os.getenv("MINERU_API_URL", "http://localhost:4299")
34
- )
34
+ base_url: str
35
+ model_name: str = "opendatalab/MinerU2.5-2509-1.2B"
36
+ aliases: list[str] = Field(default_factory=lambda: ["mineru25"])
35
37
  timeout: int = 600
36
38
 
37
39
  def get_client(self, **kwargs) -> "MinerUConverter":
@@ -54,13 +56,12 @@ class MinerUConverter(BaseConverter):
54
56
  super().__init__(config=config, **kwargs)
55
57
  from httpx import AsyncClient
56
58
 
57
- self.client = AsyncClient(base_url=config.api_url, timeout=config.timeout)
59
+ self.client = AsyncClient(base_url=config.base_url, timeout=config.timeout)
58
60
 
59
61
  async def _async_inference_with_api(self, image) -> list:
60
62
  """Run async inference with MinerU API."""
61
63
 
62
64
  img_byte_arr = await asyncio.to_thread(to_bytes_io, image)
63
-
64
65
  response = await self.client.post(
65
66
  "process-image",
66
67
  files={"image": ("image.png", img_byte_arr, "image/png")},
@@ -92,6 +92,7 @@ class OpenAIConverterClient(BaseConverter):
92
92
  base_url=self.config.llm_params.base_url,
93
93
  api_key=self.config.llm_params.api_key,
94
94
  timeout=self.config.llm_params.timeout,
95
+ max_retries=self.config.llm_params.max_retries,
95
96
  )
96
97
 
97
98
  async def _get_chat_completion(
@@ -0,0 +1,2 @@
1
+ IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"]
2
+ PDF_EXTENSION = ".pdf"
@@ -6,10 +6,12 @@ from pathlib import Path
6
6
  from typing import Literal
7
7
 
8
8
  from loguru import logger
9
+ from PIL import Image
9
10
  from pydantic import Field
10
11
 
11
12
  from .base_model import VLMParseBaseModel
12
13
  from .build_doc import convert_specific_page_to_image, get_page_count, resize_image
14
+ from .constants import IMAGE_EXTENSIONS, PDF_EXTENSION
13
15
  from .data_model.document import Document, Page, ProcessingError
14
16
 
15
17
  # Add a lock to ensure PDFium is accessed by only one thread/task at a time
@@ -50,12 +52,24 @@ class BaseConverter:
50
52
  raise NotImplementedError
51
53
 
52
54
  def add_page_image(self, page: Page, file_path, page_idx):
53
- with PDFIUM_LOCK:
54
- image = convert_specific_page_to_image(
55
- file_path,
56
- page_idx,
57
- dpi=self.config.dpi,
55
+ if Path(file_path).suffix.lower() in IMAGE_EXTENSIONS:
56
+ image = Image.open(file_path)
57
+ if image.mode != "RGB":
58
+ image = image.convert("L").convert("RGB")
59
+
60
+ elif Path(file_path).suffix.lower() == PDF_EXTENSION:
61
+ with PDFIUM_LOCK:
62
+ image = convert_specific_page_to_image(
63
+ file_path,
64
+ page_idx,
65
+ dpi=self.config.dpi,
66
+ )
67
+
68
+ else:
69
+ raise ValueError(
70
+ f"Unsupported file extension: {Path(file_path).suffix.lower()}"
58
71
  )
72
+
59
73
  image = resize_image(image, self.config.max_image_size)
60
74
  page.buffer_image = image
61
75
  return page
@@ -42,13 +42,13 @@ class ConverterWithServer:
42
42
  docker_config = docker_config_registry.get(
43
43
  self.model, default=self.with_vllm_server
44
44
  )
45
- if self.port is not None:
46
- docker_config.docker_port = self.port
47
45
 
48
46
  if docker_config is not None:
47
+ if self.port is not None:
48
+ docker_config.docker_port = self.port
49
49
  docker_config.gpu_device_ids = gpu_device_ids
50
- server = docker_config.get_server(auto_stop=True)
51
- server.start()
50
+ self.server = docker_config.get_server(auto_stop=True)
51
+ self.server.start()
52
52
 
53
53
  self.client = docker_config.get_client()
54
54
  else:
@@ -56,6 +56,7 @@ class ConverterWithServer:
56
56
 
57
57
  else:
58
58
  client_config = converter_config_registry.get(self.model, uri=self.uri)
59
+
59
60
  self.client = client_config.get_client()
60
61
 
61
62
  def parse(
@@ -108,6 +108,7 @@ for gemini_model in [
108
108
  "gemini-2.5-flash",
109
109
  "gemini-2.5-flash-lite",
110
110
  "gemini-3-pro-preview",
111
+ "gemini-3-flash-preview",
111
112
  ]:
112
113
  converter_config_registry.register(
113
114
  gemini_model,
@@ -120,12 +121,9 @@ for gemini_model in [
120
121
  ),
121
122
  )
122
123
  for openai_model in [
123
- "gpt-5.1",
124
- "gpt-5.1-mini",
125
- "gpt-5.1-nano",
124
+ "gpt-5.2",
126
125
  "gpt-5",
127
126
  "gpt-5-mini",
128
- "gpt-5-nano",
129
127
  ]:
130
128
  converter_config_registry.register(
131
129
  openai_model,
@@ -78,7 +78,7 @@ class VLLMDockerServerConfig(DockerServerConfig):
78
78
  from vlmparse.clients.openai_converter import LLMParams
79
79
 
80
80
  return LLMParams(
81
- base_url=f"http://localhost:{self.docker_port}/v1",
81
+ base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
82
82
  model_name=self.default_model_name,
83
83
  )
84
84
 
@@ -3,9 +3,8 @@ import time
3
3
  from contextlib import contextmanager
4
4
  from pathlib import Path
5
5
 
6
- from loguru import logger
7
-
8
6
  import docker
7
+ from loguru import logger
9
8
 
10
9
 
11
10
  def _ensure_image_exists(
@@ -230,6 +229,8 @@ def get_model_from_uri(uri: str) -> str:
230
229
  for container in containers:
231
230
  c_uri = container.labels.get("vlmparse_uri")
232
231
  c_model = container.labels.get("vlmparse_model_name")
232
+ if c_uri is not None:
233
+ c_uri = c_uri.replace("localhost", "0.0.0.0")
233
234
 
234
235
  # Check if user URI matches container URI (ignoring /v1 suffix if missing)
235
236
  if c_uri and (
@@ -28,12 +28,12 @@ def get_file_paths(inputs: str | list[str]):
28
28
  if "*" in pattern or "?" in pattern:
29
29
  file_paths.extend(glob(pattern, recursive=True))
30
30
  elif os.path.isdir(pattern):
31
- file_paths.extend(glob(os.path.join(pattern, "*.pdf"), recursive=True))
31
+ file_paths.extend(glob(os.path.join(pattern, "*.*"), recursive=True))
32
32
  elif os.path.isfile(pattern):
33
33
  file_paths.append(pattern)
34
34
  else:
35
35
  logger.error(f"Invalid input: {pattern}")
36
- file_paths = [f for f in file_paths if os.path.exists(f) and f.endswith(".pdf")]
36
+ file_paths = [f for f in file_paths if os.path.exists(f) and os.path.isfile(f)]
37
37
 
38
38
  if not file_paths:
39
39
  logger.error("No PDF files found matching the inputs patterns")