vlmparse 0.1.7__tar.gz → 0.1.9__tar.gz

This diff covers the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (62)
  1. {vlmparse-0.1.7 → vlmparse-0.1.9}/PKG-INFO +13 -3
  2. {vlmparse-0.1.7 → vlmparse-0.1.9}/README.md +13 -3
  3. {vlmparse-0.1.7 → vlmparse-0.1.9}/pyproject.toml +1 -1
  4. {vlmparse-0.1.7 → vlmparse-0.1.9}/tests/test_all_converters_mocked.py +123 -130
  5. vlmparse-0.1.9/tests/test_batch_parser.py +144 -0
  6. vlmparse-0.1.9/tests/test_cli.py +844 -0
  7. vlmparse-0.1.9/tests/test_end2end.py +121 -0
  8. vlmparse-0.1.9/tests/test_server_logic.py +153 -0
  9. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/build_doc.py +20 -19
  10. vlmparse-0.1.9/vlmparse/cli.py +488 -0
  11. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/chandra.py +176 -60
  12. vlmparse-0.1.9/vlmparse/clients/deepseekocr.py +384 -0
  13. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/docling.py +0 -1
  14. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/dotsocr.py +34 -31
  15. vlmparse-0.1.9/vlmparse/clients/glmocr.py +243 -0
  16. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/granite_docling.py +9 -36
  17. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/hunyuanocr.py +5 -1
  18. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/lightonocr.py +23 -1
  19. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/mineru.py +0 -1
  20. vlmparse-0.1.9/vlmparse/clients/mistral_converter.py +85 -0
  21. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/nanonetocr.py +5 -1
  22. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/olmocr.py +6 -2
  23. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/openai_converter.py +95 -60
  24. vlmparse-0.1.9/vlmparse/clients/paddleocrvl.py +204 -0
  25. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/converter.py +51 -11
  26. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/converter_with_server.py +92 -19
  27. vlmparse-0.1.9/vlmparse/registries.py +188 -0
  28. vlmparse-0.1.9/vlmparse/servers/base_server.py +127 -0
  29. vlmparse-0.1.9/vlmparse/servers/docker_compose_deployment.py +489 -0
  30. vlmparse-0.1.9/vlmparse/servers/docker_compose_server.py +39 -0
  31. vlmparse-0.1.7/vlmparse/servers/utils.py → vlmparse-0.1.9/vlmparse/servers/docker_run_deployment.py +7 -60
  32. vlmparse-0.1.9/vlmparse/servers/docker_server.py +120 -0
  33. vlmparse-0.1.9/vlmparse/servers/model_identity.py +48 -0
  34. vlmparse-0.1.9/vlmparse/servers/server_registry.py +42 -0
  35. vlmparse-0.1.9/vlmparse/servers/utils.py +143 -0
  36. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/st_viewer/st_viewer.py +1 -1
  37. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/utils.py +15 -2
  38. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/PKG-INFO +13 -3
  39. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/SOURCES.txt +9 -0
  40. vlmparse-0.1.7/tests/test_batch_parser.py +0 -186
  41. vlmparse-0.1.7/tests/test_cli.py +0 -732
  42. vlmparse-0.1.7/tests/test_end2end.py +0 -67
  43. vlmparse-0.1.7/vlmparse/cli.py +0 -319
  44. vlmparse-0.1.7/vlmparse/clients/deepseekocr.py +0 -203
  45. vlmparse-0.1.7/vlmparse/clients/paddleocrvl.py +0 -49
  46. vlmparse-0.1.7/vlmparse/registries.py +0 -170
  47. vlmparse-0.1.7/vlmparse/servers/docker_server.py +0 -212
  48. {vlmparse-0.1.7 → vlmparse-0.1.9}/LICENSE +0 -0
  49. {vlmparse-0.1.7 → vlmparse-0.1.9}/setup.cfg +0 -0
  50. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/base_model.py +0 -0
  51. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
  52. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
  53. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/pipe_utils/utils.py +0 -0
  54. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/prompts.py +0 -0
  55. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/constants.py +0 -0
  56. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/data_model/box.py +0 -0
  57. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/data_model/document.py +0 -0
  58. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/st_viewer/fs_nav.py +0 -0
  59. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/dependency_links.txt +0 -0
  60. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/entry_points.txt +0 -0
  61. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/requires.txt +0 -0
  62. {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: vlmparse
- Version: 0.1.7
+ Version: 0.1.9
  Requires-Python: >=3.11.0
  Description-Content-Type: text/markdown
  License-File: LICENSE
@@ -54,6 +54,12 @@ Dynamic: license-file

  # vlmparse

+ <div align="center">
+
+ [\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
+
+ </div>
+
  A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.

  Features:
@@ -125,13 +131,13 @@ Deployment (requires a gpu + docker installation):
  - Check that the port is not used by another service.

  ```bash
- vlmparse serve --model lightonocr --port 8000 --gpus 1
+ vlmparse serve --model lightonocr2 --port 8000 --gpus 1
  ```

  then convert:

  ```bash
- vlmparse convert --input "*.pdf" --out_folder ./output --model lightonocr --uri http://localhost:8000/v1
+ vlmparse convert "*.pdf" --out_folder ./output --uri http://localhost:8000/v1
  ```

  You can also list all running servers:
@@ -209,3 +215,7 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
  ```

  Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+
+ ## Credits
+
+ This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
@@ -1,5 +1,11 @@
  # vlmparse

+ <div align="center">
+
+ [\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
+
+ </div>
+
  A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.

  Features:
@@ -71,13 +77,13 @@ Deployment (requires a gpu + docker installation):
  - Check that the port is not used by another service.

  ```bash
- vlmparse serve --model lightonocr --port 8000 --gpus 1
+ vlmparse serve --model lightonocr2 --port 8000 --gpus 1
  ```

  then convert:

  ```bash
- vlmparse convert --input "*.pdf" --out_folder ./output --model lightonocr --uri http://localhost:8000/v1
+ vlmparse convert "*.pdf" --out_folder ./output --uri http://localhost:8000/v1
  ```

  You can also list all running servers:
@@ -154,4 +160,8 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
  documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
  ```

- Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+ Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+
+ ## Credits
+
+ This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "vlmparse"
- version = "0.1.7"
+ version = "0.1.9"
  authors = []
  description = ""
  readme = "README.md"
@@ -19,43 +19,8 @@ MOCK_RESPONSES = {
  }


- @pytest.fixture
- def mock_openai_client():
-     """Mock the AsyncOpenAI client used by all converters."""
-     with patch("openai.AsyncOpenAI") as mock_client:
-         # Create mock response object
-         mock_response = MagicMock()
-         mock_response.choices = [MagicMock()]
-         mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
-         mock_response.usage = MagicMock()
-         mock_response.usage.prompt_tokens = 50
-         mock_response.usage.completion_tokens = 150
-         mock_response.usage.reasoning_tokens = 30
-
-         # Configure the async method
-         mock_instance = MagicMock()
-         mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
-         mock_client.return_value = mock_instance
-
-         yield mock_instance
-
-
- @pytest.fixture
- def dotsocr_mock_client():
-     """Mock for DotsOCR with different response types."""
-     with patch("openai.AsyncOpenAI") as mock_client:
-         mock_response = MagicMock()
-         mock_response.choices = [MagicMock()]
-         mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
-         mock_response.usage = MagicMock()
-         mock_response.usage.prompt_tokens = 40
-         mock_response.usage.completion_tokens = 160
-         mock_response.usage.reasoning_tokens = 20
-         mock_instance = MagicMock()
-         mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
-         mock_client.return_value = mock_instance
-
-         yield mock_instance
+ # Note: mock_openai_client and dotsocr_mock_client fixtures are replaced by the
+ # unified mock_openai_api fixture from conftest.py


  # List of all models registered in converter_config_registry
@@ -64,6 +29,13 @@ ALL_MODELS = [
      "lightonocr",
      "dotsocr",
      "nanonets/Nanonets-OCR2-3B",
+     "hunyuanocr",
+     "olmocr-2-fp8",
+     "mineru25",
+     "chandra",
+     "deepseekocr",
+     "granite-docling",
+     "deepseekocr2",
  ]


@@ -91,87 +63,93 @@ class TestConverterConfigs:
          ],
      )
      def test_converter_basic_processing(
-         self, file_path, model_name, mock_openai_client
+         self, file_path, model_name, mock_openai_api, tmp_output_dir
      ):
          """Test basic document processing for OpenAI-compatible converters."""
-         config = converter_config_registry.get(model_name)
-         converter = config.get_client(num_concurrent_pages=2, debug=True)
+         with mock_openai_api() as openai_client:
+             config = converter_config_registry.get(model_name)
+             converter = config.get_client(
+                 num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+             )

-         # Process document
-         document = converter(file_path)
+             # Process document
+             document = converter(file_path)

-         # Verify document structure
-         assert isinstance(document, Document)
-         assert document.file_path == str(file_path)
-         assert len(document.pages) == 2, f"Expected 2 pages, got {len(document.pages)}"
+             # Verify document structure
+             assert isinstance(document, Document)
+             assert document.file_path == str(file_path)
+             assert (
+                 len(document.pages) == 2
+             ), f"Expected 2 pages, got {len(document.pages)}"

-         # Verify pages
-         for page in document.pages:
-             assert isinstance(page, Page)
-             assert page.text is not None, "Page text should not be None"
-             assert len(page.text) > 0, "Page text should not be empty"
+             # Verify pages
+             for page in document.pages:
+                 assert isinstance(page, Page)
+                 assert page.text is not None, "Page text should not be None"
+                 assert len(page.text) > 0, "Page text should not be empty"

-         # Verify API was called
-         assert mock_openai_client.chat.completions.create.call_count == 2
+             # Verify API was called
+             assert openai_client.chat.completions.create.call_count == 2

-     def test_converter_image_processing(self, datadir, mock_openai_client):
+     def test_converter_image_processing(self, datadir, mock_openai_api, tmp_output_dir):
          """Test processing of a single image file."""
-         model_name = "gemini-2.5-flash-lite"
-         image_path = datadir / "page_with_formula.png"
+         with mock_openai_api() as openai_client:
+             model_name = "gemini-2.5-flash-lite"
+             image_path = datadir / "page_with_formula.png"

-         config = converter_config_registry.get(model_name)
-         converter = config.get_client(debug=True)
+             config = converter_config_registry.get(model_name)
+             converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))

-         # Process image
-         document = converter(image_path)
+             # Process image
+             document = converter(image_path)

-         # Verify document structure
-         assert isinstance(document, Document)
-         assert document.file_path == str(image_path)
-         assert len(document.pages) == 1, f"Expected 1 page, got {len(document.pages)}"
+             # Verify document structure
+             assert isinstance(document, Document)
+             assert document.file_path == str(image_path)
+             assert (
+                 len(document.pages) == 1
+             ), f"Expected 1 page, got {len(document.pages)}"

-         # Verify page
-         page = document.pages[0]
-         assert isinstance(page, Page)
-         assert page.text is not None
-         assert len(page.text) > 0
+             # Verify page
+             page = document.pages[0]
+             assert isinstance(page, Page)
+             assert page.text is not None
+             assert len(page.text) > 0

-         # Verify API was called once
-         assert mock_openai_client.chat.completions.create.call_count == 1
+             # Verify API was called once
+             assert openai_client.chat.completions.create.call_count == 1

-     def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
+     def test_dotsocr_ocr_mode(self, file_path, mock_openai_api, tmp_output_dir):
          """Test DotsOCR converter in OCR mode."""
-         config = converter_config_registry.get("dotsocr")
-         converter = config.get_client(num_concurrent_pages=2, debug=True)
+         with mock_openai_api(content=MOCK_RESPONSES["dotsocr_ocr"]) as openai_client:
+             config = converter_config_registry.get("dotsocr")
+             converter = config.get_client(
+                 num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+             )

-         # Process document
-         document = converter(file_path)
+             # Process document
+             document = converter(file_path)

-         # Verify document structure
-         assert isinstance(document, Document)
-         assert len(document.pages) == 2
+             # Verify document structure
+             assert isinstance(document, Document)
+             assert len(document.pages) == 2

-         for page in document.pages:
-             assert isinstance(page, Page)
-             assert page.text is not None
-             assert len(page.text) > 0
+             for page in document.pages:
+                 assert isinstance(page, Page)
+                 assert page.text is not None
+                 assert len(page.text) > 0

-         # Verify API was called
-         assert dotsocr_mock_client.chat.completions.create.call_count == 2
+             # Verify API was called
+             assert openai_client.chat.completions.create.call_count == 2

      @pytest.mark.parametrize("model_name", ALL_MODELS)
-     def test_converter_error_handling(self, file_path, model_name):
+     def test_converter_error_handling(
+         self, file_path, model_name, mock_openai_api, tmp_output_dir
+     ):
          """Test that converters handle errors gracefully."""
-         with patch("openai.AsyncOpenAI") as mock_client:
-             # Configure mock to raise an exception
-             mock_instance = MagicMock()
-             mock_instance.chat.completions.create = AsyncMock(
-                 side_effect=Exception("API Error")
-             )
-             mock_client.return_value = mock_instance
-
+         with mock_openai_api(side_effect=Exception("API Error")):
              config = converter_config_registry.get(model_name)
-             converter = config.get_client(debug=False)
+             converter = config.get_client(debug=False, save_folder=str(tmp_output_dir))

              # Process should not crash
              document = converter(file_path)
@@ -193,25 +171,29 @@ class TestConverterBatchProcessing:
          "lightonocr",
          ],
      )
-     def test_batch_processing(self, file_path, model_name, mock_openai_client):
+     def test_batch_processing(
+         self, file_path, model_name, mock_openai_api, tmp_output_dir
+     ):
          """Test batch processing of multiple files."""
-         config = converter_config_registry.get(model_name)
-         converter = config.get_client(
-             num_concurrent_files=2,
-             num_concurrent_pages=2,
-             return_documents_in_batch_mode=True,
-             debug=True,
-         )
+         with mock_openai_api():
+             config = converter_config_registry.get(model_name)
+             converter = config.get_client(
+                 num_concurrent_files=2,
+                 num_concurrent_pages=2,
+                 return_documents_in_batch_mode=True,
+                 debug=True,
+                 save_folder=str(tmp_output_dir),
+             )

-         # Process multiple files (same file for testing)
-         file_paths = [file_path, file_path]
-         documents = converter.batch(file_paths)
+             # Process multiple files (same file for testing)
+             file_paths = [file_path, file_path]
+             documents = converter.batch(file_paths)

-         # Verify results
-         assert len(documents) == 2
-         for doc in documents:
-             assert isinstance(doc, Document)
-             assert len(doc.pages) == 2
+             # Verify results
+             assert len(documents) == 2
+             for doc in documents:
+                 assert isinstance(doc, Document)
+                 assert len(doc.pages) == 2


  @pytest.fixture
@@ -245,12 +227,16 @@ def mineru_mock_httpx_client():


  class TestMinerUConverterMockedApi:
-     def test_mineru_converter_repeated_call(self, file_path, mineru_mock_httpx_client):
+     def test_mineru_converter_repeated_call(
+         self, file_path, mineru_mock_httpx_client, tmp_output_dir
+     ):
          """Repeated `__call__` should keep working and call API each page."""
          from vlmparse.clients.mineru import MinerUConverterConfig

          config = MinerUConverterConfig(base_url="http://mineru.test")
-         converter = config.get_client(num_concurrent_pages=2, debug=True)
+         converter = config.get_client(
+             num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+         )

          with (
              patch("vlmparse.clients.mineru.clean_response", lambda x: x),
@@ -274,7 +260,7 @@ class TestMinerUConverterMockedApi:
          assert mineru_mock_httpx_client.post.call_count == 4

      def test_mineru_converter_batch_processing(
-         self, file_path, mineru_mock_httpx_client
+         self, file_path, mineru_mock_httpx_client, tmp_output_dir
      ):
          """Batch mode should return documents and call API for each page."""
          from vlmparse.clients.mineru import MinerUConverterConfig
@@ -285,6 +271,7 @@ class TestMinerUConverterMockedApi:
              num_concurrent_pages=2,
              return_documents_in_batch_mode=True,
              debug=True,
+             save_folder=str(tmp_output_dir),
          )

          with (
@@ -306,19 +293,22 @@ class TestMinerUConverterMockedApi:
  class TestCustomURI:
      """Test converter initialization with custom URIs."""

-     def test_custom_uri_config(self, mock_openai_client, file_path):
+     def test_custom_uri_config(self, mock_openai_api, file_path, tmp_output_dir):
          """Test that converters can be initialized with custom URIs."""
-         custom_uri = "http://localhost:8000/v1"
-         config = converter_config_registry.get("gemini-2.5-flash-lite", uri=custom_uri)
+         with mock_openai_api():
+             custom_uri = "http://localhost:8000/v1"
+             config = converter_config_registry.get(
+                 "gemini-2.5-flash-lite", uri=custom_uri
+             )

-         assert config.llm_params.base_url == custom_uri
+             assert config.base_url == custom_uri

-         # Test it works
-         converter = config.get_client(debug=True)
-         document = converter(file_path)
+             # Test it works
+             converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
+             document = converter(file_path)

-         assert isinstance(document, Document)
-         assert len(document.pages) == 2
+             assert isinstance(document, Document)
+             assert len(document.pages) == 2


  class TestConcurrency:
@@ -326,14 +316,17 @@ class TestConcurrency:

      @pytest.mark.parametrize("model_name", ["gemini-2.5-flash-lite", "lightonocr"])
      def test_concurrent_page_processing(
-         self, file_path, model_name, mock_openai_client
+         self, file_path, model_name, mock_openai_api, tmp_output_dir
      ):
          """Test that concurrent page processing limits are respected."""
-         config = converter_config_registry.get(model_name)
-         converter = config.get_client(num_concurrent_pages=1, debug=True)
+         with mock_openai_api() as openai_client:
+             config = converter_config_registry.get(model_name)
+             converter = config.get_client(
+                 num_concurrent_pages=1, debug=True, save_folder=str(tmp_output_dir)
+             )

-         document = converter(file_path)
+             document = converter(file_path)

-         assert len(document.pages) == 2
-         # With concurrency=1, calls should be sequential
-         assert mock_openai_client.chat.completions.create.call_count == 2
+             assert len(document.pages) == 2
+             # With concurrency=1, calls should be sequential
+             assert openai_client.chat.completions.create.call_count == 2
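
The refactored tests above rely on a unified `mock_openai_api` fixture from `conftest.py`, which is not included in this diff. Inferring only from the call sites (`mock_openai_api()`, `content=...`, `side_effect=...`, and the yielded client), a minimal sketch of such a fixture might look like this; the defaults and internals here are assumptions, not the package's actual code:

```python
import contextlib
from unittest.mock import AsyncMock, MagicMock, patch

import pytest


@pytest.fixture
def mock_openai_api():
    """Factory fixture: each call returns a context manager patching openai.AsyncOpenAI."""

    @contextlib.contextmanager
    def _factory(content="mocked page text", side_effect=None):
        with patch("openai.AsyncOpenAI") as mock_client:
            # Canned chat-completion response returned for every page
            mock_response = MagicMock()
            mock_response.choices = [MagicMock()]
            mock_response.choices[0].message.content = content
            mock_response.usage = MagicMock(
                prompt_tokens=50, completion_tokens=150, reasoning_tokens=30
            )
            mock_instance = MagicMock()
            # side_effect (when set) takes precedence over return_value,
            # which is how the error-handling test would trigger failures
            mock_instance.chat.completions.create = AsyncMock(
                return_value=mock_response, side_effect=side_effect
            )
            mock_client.return_value = mock_instance
            yield mock_instance

    return _factory
```

The hunk that follows adds the new tests/test_batch_parser.py.
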
@@ -0,0 +1,144 @@
+ import pytest
+
+ from vlmparse.converter_with_server import ConverterWithServer
+
+
+ class TestBatchParser:
+     """Tests for ConverterWithServer (acting as BatchParser)."""
+
+     def test_init_starts_docker_server(self, mock_docker_operations):
+         """Test that initializing with a model requiring docker starts the server."""
+         # Setup using unified mocking system
+         with mock_docker_operations(include_client=True) as (
+             mock_docker_registry,
+             mock_config,
+             mock_server,
+             mock_client,
+         ):
+             # Initialize
+             with ConverterWithServer(model="test_model", server="hf") as parser:
+                 # Verify interactions
+                 # For server="hf", we expect lookup in registry then fallback if not found
+                 # The mock setup seems to imply it finds it or defaults?
+                 # In start_server, if server="hf" and not in registry, it makes a default config.
+                 # If mock_docker_registry.get returns something, it uses it.
+                 mock_docker_registry.get.assert_called_with("test_model")
+                 mock_config.get_server.assert_called_with(auto_stop=True)
+                 mock_server.start.assert_called_once()
+                 mock_config.get_client.assert_called_once()
+                 assert parser.client == mock_client
+
+     def test_init_no_docker_fallback(self, mock_docker_operations, mock_openai_api):
+         """Test fallback to standard converter when no docker config exists."""
+         # Setup mocks - docker returns None, use real converter registry
+         with mock_docker_operations(
+             model_filter=lambda model: False  # No docker for any model
+         ) as (mock_docker_reg, _, _, _):
+             with mock_openai_api():
+                 # Initialize with a real model from registry
+                 with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                     # Verify interactions
+                     # Because server="registry" (default), we check valid model
+                     # If model is not in docker registry, it shouldn't try to get with default=False anymore
+                     # logic:
+                     # if server="registry": if model in docker_config_registry.list_models() -> start_server
+                     # else -> directly to converter_config_registry
+
+                     # The mock_docker_operations mocks list_models via the model_filter probably?
+                     # Let's check how mock_docker_operations is implemented in conftest
+                     # Assuming it mocks list_models.
+
+                     # Client should be initialized from real converter registry
+                     assert parser.client is not None
+
+     def test_parse_updates_client_config(
+         self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+     ):
+         """Test that parse method updates client configuration and calls batch."""
+         # Use real test file
+         test_file = datadir / "Fiche_Graines_A5.pdf"
+
+         with mock_docker_operations(
+             model_filter=lambda model: False  # No docker for any model
+         ):
+             with mock_openai_api():
+                 with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                     # Call parse with real file
+                     parser.client.return_documents_in_batch_mode = True
+                     documents = parser.parse(
+                         inputs=[str(test_file)],
+                         out_folder=str(tmp_path),
+                         mode="md",
+                         dpi=300,
+                         debug=True,
+                     )
+
+                     # Verify client config updates
+                     assert parser.client.config.dpi == 300
+                     assert parser.client.debug is True
+                     assert parser.client.save_mode == "md"
+                     # Concurrency should be 1 because debug=True
+                     assert parser.client.num_concurrent_files == 1
+                     assert parser.client.num_concurrent_pages == 1
+
+                     # Verify result
+                     assert documents is not None
+                     assert len(documents) > 0
+
+     def test_parse_retry_logic(
+         self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+     ):
+         """Test the retrylast logic filters already processed files."""
+         # Create two copies of the test file
+         test_file = datadir / "Fiche_Graines_A5.pdf"
+         temp_dir = tmp_path / "input_files"
+         temp_dir.mkdir()
+         file1 = temp_dir / "file1.pdf"
+         file2 = temp_dir / "file2.pdf"
+
+         # Copy test file to simulate multiple inputs
+         import shutil
+
+         shutil.copy(test_file, file1)
+         shutil.copy(test_file, file2)
+
+         # Setup folder structure for retry
+         run_folder = tmp_path / "output" / "run1"
+         results_folder = run_folder / "results"
+         results_folder.mkdir(parents=True)
+
+         # Create a processed result for file1
+         (results_folder / "file1.zip").touch()
+
+         with mock_docker_operations(model_filter=lambda model: False):
+             with mock_openai_api():
+                 with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                     parser.client.return_documents_in_batch_mode = True
+                     # Call parse with retrylast - should only process file2
+                     documents = parser.parse(
+                         inputs=[str(file1), str(file2)],
+                         out_folder=str(tmp_path / "output"),
+                         retrylast=True,
+                     )
+
+                     # Should only process file2 (file1 was already processed)
+                     # Verify by checking that only 1 file was processed
+                     assert documents is not None
+                     assert len(documents) == 1
+
+     def test_parse_retry_no_previous_runs(
+         self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+     ):
+         """Test that retrylast raises ValueError if no previous runs found."""
+         test_file = datadir / "Fiche_Graines_A5.pdf"
+
+         with mock_docker_operations(model_filter=lambda model: False):
+             with mock_openai_api():
+                 with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                     # tmp_path is empty, so os.listdir(tmp_path) will be empty
+                     with pytest.raises(ValueError, match="No previous runs found"):
+                         parser.parse(
+                             inputs=[str(test_file)],
+                             out_folder=str(tmp_path),
+                             retrylast=True,
+                         )