vlmparse 0.1.7__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {vlmparse-0.1.7 → vlmparse-0.1.8}/PKG-INFO +11 -1
  2. {vlmparse-0.1.7 → vlmparse-0.1.8}/README.md +11 -1
  3. {vlmparse-0.1.7 → vlmparse-0.1.8}/pyproject.toml +1 -1
  4. {vlmparse-0.1.7 → vlmparse-0.1.8}/tests/test_all_converters_mocked.py +124 -130
  5. vlmparse-0.1.8/tests/test_batch_parser.py +135 -0
  6. vlmparse-0.1.8/tests/test_cli.py +684 -0
  7. vlmparse-0.1.8/tests/test_end2end.py +119 -0
  8. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/build_doc.py +20 -19
  9. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/cli.py +17 -1
  10. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/chandra.py +176 -60
  11. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/deepseekocr.py +23 -12
  12. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/docling.py +0 -1
  13. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/dotsocr.py +34 -31
  14. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/granite_docling.py +9 -36
  15. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/hunyuanocr.py +5 -1
  16. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/lightonocr.py +23 -1
  17. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/mineru.py +0 -1
  18. vlmparse-0.1.8/vlmparse/clients/mistral_converter.py +85 -0
  19. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/nanonetocr.py +5 -1
  20. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/olmocr.py +6 -2
  21. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/openai_converter.py +95 -60
  22. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/paddleocrvl.py +9 -2
  23. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/converter.py +51 -11
  24. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/converter_with_server.py +41 -5
  25. vlmparse-0.1.8/vlmparse/registries.py +178 -0
  26. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/servers/docker_server.py +59 -35
  27. vlmparse-0.1.8/vlmparse/servers/model_identity.py +48 -0
  28. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/utils.py +15 -2
  29. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/PKG-INFO +11 -1
  30. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/SOURCES.txt +2 -0
  31. vlmparse-0.1.7/tests/test_batch_parser.py +0 -186
  32. vlmparse-0.1.7/tests/test_cli.py +0 -732
  33. vlmparse-0.1.7/tests/test_end2end.py +0 -67
  34. vlmparse-0.1.7/vlmparse/registries.py +0 -170
  35. {vlmparse-0.1.7 → vlmparse-0.1.8}/LICENSE +0 -0
  36. {vlmparse-0.1.7 → vlmparse-0.1.8}/setup.cfg +0 -0
  37. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/base_model.py +0 -0
  38. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
  39. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
  40. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/utils.py +0 -0
  41. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/prompts.py +0 -0
  42. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/constants.py +0 -0
  43. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/data_model/box.py +0 -0
  44. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/data_model/document.py +0 -0
  45. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/servers/utils.py +0 -0
  46. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/st_viewer/fs_nav.py +0 -0
  47. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/st_viewer/st_viewer.py +0 -0
  48. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/dependency_links.txt +0 -0
  49. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/entry_points.txt +0 -0
  50. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/requires.txt +0 -0
  51. {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vlmparse
3
- Version: 0.1.7
3
+ Version: 0.1.8
4
4
  Requires-Python: >=3.11.0
5
5
  Description-Content-Type: text/markdown
6
6
  License-File: LICENSE
@@ -54,6 +54,12 @@ Dynamic: license-file
54
54
 
55
55
  # vlmparse
56
56
 
57
+ <div align="center">
58
+
59
+ [\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
60
+
61
+ </div>
62
+
57
63
  A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
58
64
 
59
65
  Features:
@@ -209,3 +215,7 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
209
215
  ```
210
216
 
211
217
  Note that if you pass the URI of a vLLM server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
218
+
219
+ ## Credits
220
+
221
+ This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
@@ -1,5 +1,11 @@
1
1
  # vlmparse
2
2
 
3
+ <div align="center">
4
+
5
+ [\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
6
+
7
+ </div>
8
+
3
9
  A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
4
10
 
5
11
  Features:
@@ -154,4 +160,8 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
154
160
  documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
155
161
  ```
156
162
 
157
- Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
163
+ Note that if you pass the URI of a vLLM server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
164
+
165
+ ## Credits
166
+
167
+ This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "vlmparse"
7
- version = "0.1.7"
7
+ version = "0.1.8"
8
8
  authors = []
9
9
  description = ""
10
10
  readme = "README.md"
@@ -19,43 +19,8 @@ MOCK_RESPONSES = {
19
19
  }
20
20
 
21
21
 
22
- @pytest.fixture
23
- def mock_openai_client():
24
- """Mock the AsyncOpenAI client used by all converters."""
25
- with patch("openai.AsyncOpenAI") as mock_client:
26
- # Create mock response object
27
- mock_response = MagicMock()
28
- mock_response.choices = [MagicMock()]
29
- mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
30
- mock_response.usage = MagicMock()
31
- mock_response.usage.prompt_tokens = 50
32
- mock_response.usage.completion_tokens = 150
33
- mock_response.usage.reasoning_tokens = 30
34
-
35
- # Configure the async method
36
- mock_instance = MagicMock()
37
- mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
38
- mock_client.return_value = mock_instance
39
-
40
- yield mock_instance
41
-
42
-
43
- @pytest.fixture
44
- def dotsocr_mock_client():
45
- """Mock for DotsOCR with different response types."""
46
- with patch("openai.AsyncOpenAI") as mock_client:
47
- mock_response = MagicMock()
48
- mock_response.choices = [MagicMock()]
49
- mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
50
- mock_response.usage = MagicMock()
51
- mock_response.usage.prompt_tokens = 40
52
- mock_response.usage.completion_tokens = 160
53
- mock_response.usage.reasoning_tokens = 20
54
- mock_instance = MagicMock()
55
- mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
56
- mock_client.return_value = mock_instance
57
-
58
- yield mock_instance
22
+ # Note: mock_openai_client and dotsocr_mock_client fixtures are replaced by the
23
+ # unified mock_openai_api fixture from conftest.py
59
24
 
60
25
 
61
26
  # List of all models registered in converter_config_registry
@@ -64,6 +29,14 @@ ALL_MODELS = [
64
29
  "lightonocr",
65
30
  "dotsocr",
66
31
  "nanonets/Nanonets-OCR2-3B",
32
+ "hunyuanocr",
33
+ "olmocr-2-fp8",
34
+ "paddleocrvl",
35
+ "mineru25",
36
+ "chandra",
37
+ "deepseekocr",
38
+ "granite-docling",
39
+ "Qwen/Qwen3-VL-8B-Instruct",
67
40
  ]
68
41
 
69
42
 
@@ -91,87 +64,93 @@ class TestConverterConfigs:
91
64
  ],
92
65
  )
93
66
  def test_converter_basic_processing(
94
- self, file_path, model_name, mock_openai_client
67
+ self, file_path, model_name, mock_openai_api, tmp_output_dir
95
68
  ):
96
69
  """Test basic document processing for OpenAI-compatible converters."""
97
- config = converter_config_registry.get(model_name)
98
- converter = config.get_client(num_concurrent_pages=2, debug=True)
70
+ with mock_openai_api() as openai_client:
71
+ config = converter_config_registry.get(model_name)
72
+ converter = config.get_client(
73
+ num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
74
+ )
99
75
 
100
- # Process document
101
- document = converter(file_path)
76
+ # Process document
77
+ document = converter(file_path)
102
78
 
103
- # Verify document structure
104
- assert isinstance(document, Document)
105
- assert document.file_path == str(file_path)
106
- assert len(document.pages) == 2, f"Expected 2 pages, got {len(document.pages)}"
79
+ # Verify document structure
80
+ assert isinstance(document, Document)
81
+ assert document.file_path == str(file_path)
82
+ assert (
83
+ len(document.pages) == 2
84
+ ), f"Expected 2 pages, got {len(document.pages)}"
107
85
 
108
- # Verify pages
109
- for page in document.pages:
110
- assert isinstance(page, Page)
111
- assert page.text is not None, "Page text should not be None"
112
- assert len(page.text) > 0, "Page text should not be empty"
86
+ # Verify pages
87
+ for page in document.pages:
88
+ assert isinstance(page, Page)
89
+ assert page.text is not None, "Page text should not be None"
90
+ assert len(page.text) > 0, "Page text should not be empty"
113
91
 
114
- # Verify API was called
115
- assert mock_openai_client.chat.completions.create.call_count == 2
92
+ # Verify API was called
93
+ assert openai_client.chat.completions.create.call_count == 2
116
94
 
117
- def test_converter_image_processing(self, datadir, mock_openai_client):
95
+ def test_converter_image_processing(self, datadir, mock_openai_api, tmp_output_dir):
118
96
  """Test processing of a single image file."""
119
- model_name = "gemini-2.5-flash-lite"
120
- image_path = datadir / "page_with_formula.png"
97
+ with mock_openai_api() as openai_client:
98
+ model_name = "gemini-2.5-flash-lite"
99
+ image_path = datadir / "page_with_formula.png"
121
100
 
122
- config = converter_config_registry.get(model_name)
123
- converter = config.get_client(debug=True)
101
+ config = converter_config_registry.get(model_name)
102
+ converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
124
103
 
125
- # Process image
126
- document = converter(image_path)
104
+ # Process image
105
+ document = converter(image_path)
127
106
 
128
- # Verify document structure
129
- assert isinstance(document, Document)
130
- assert document.file_path == str(image_path)
131
- assert len(document.pages) == 1, f"Expected 1 page, got {len(document.pages)}"
107
+ # Verify document structure
108
+ assert isinstance(document, Document)
109
+ assert document.file_path == str(image_path)
110
+ assert (
111
+ len(document.pages) == 1
112
+ ), f"Expected 1 page, got {len(document.pages)}"
132
113
 
133
- # Verify page
134
- page = document.pages[0]
135
- assert isinstance(page, Page)
136
- assert page.text is not None
137
- assert len(page.text) > 0
114
+ # Verify page
115
+ page = document.pages[0]
116
+ assert isinstance(page, Page)
117
+ assert page.text is not None
118
+ assert len(page.text) > 0
138
119
 
139
- # Verify API was called once
140
- assert mock_openai_client.chat.completions.create.call_count == 1
120
+ # Verify API was called once
121
+ assert openai_client.chat.completions.create.call_count == 1
141
122
 
142
- def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
123
+ def test_dotsocr_ocr_mode(self, file_path, mock_openai_api, tmp_output_dir):
143
124
  """Test DotsOCR converter in OCR mode."""
144
- config = converter_config_registry.get("dotsocr")
145
- converter = config.get_client(num_concurrent_pages=2, debug=True)
125
+ with mock_openai_api(content=MOCK_RESPONSES["dotsocr_ocr"]) as openai_client:
126
+ config = converter_config_registry.get("dotsocr")
127
+ converter = config.get_client(
128
+ num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
129
+ )
146
130
 
147
- # Process document
148
- document = converter(file_path)
131
+ # Process document
132
+ document = converter(file_path)
149
133
 
150
- # Verify document structure
151
- assert isinstance(document, Document)
152
- assert len(document.pages) == 2
134
+ # Verify document structure
135
+ assert isinstance(document, Document)
136
+ assert len(document.pages) == 2
153
137
 
154
- for page in document.pages:
155
- assert isinstance(page, Page)
156
- assert page.text is not None
157
- assert len(page.text) > 0
138
+ for page in document.pages:
139
+ assert isinstance(page, Page)
140
+ assert page.text is not None
141
+ assert len(page.text) > 0
158
142
 
159
- # Verify API was called
160
- assert dotsocr_mock_client.chat.completions.create.call_count == 2
143
+ # Verify API was called
144
+ assert openai_client.chat.completions.create.call_count == 2
161
145
 
162
146
  @pytest.mark.parametrize("model_name", ALL_MODELS)
163
- def test_converter_error_handling(self, file_path, model_name):
147
+ def test_converter_error_handling(
148
+ self, file_path, model_name, mock_openai_api, tmp_output_dir
149
+ ):
164
150
  """Test that converters handle errors gracefully."""
165
- with patch("openai.AsyncOpenAI") as mock_client:
166
- # Configure mock to raise an exception
167
- mock_instance = MagicMock()
168
- mock_instance.chat.completions.create = AsyncMock(
169
- side_effect=Exception("API Error")
170
- )
171
- mock_client.return_value = mock_instance
172
-
151
+ with mock_openai_api(side_effect=Exception("API Error")):
173
152
  config = converter_config_registry.get(model_name)
174
- converter = config.get_client(debug=False)
153
+ converter = config.get_client(debug=False, save_folder=str(tmp_output_dir))
175
154
 
176
155
  # Process should not crash
177
156
  document = converter(file_path)
@@ -193,25 +172,29 @@ class TestConverterBatchProcessing:
193
172
  "lightonocr",
194
173
  ],
195
174
  )
196
- def test_batch_processing(self, file_path, model_name, mock_openai_client):
175
+ def test_batch_processing(
176
+ self, file_path, model_name, mock_openai_api, tmp_output_dir
177
+ ):
197
178
  """Test batch processing of multiple files."""
198
- config = converter_config_registry.get(model_name)
199
- converter = config.get_client(
200
- num_concurrent_files=2,
201
- num_concurrent_pages=2,
202
- return_documents_in_batch_mode=True,
203
- debug=True,
204
- )
179
+ with mock_openai_api():
180
+ config = converter_config_registry.get(model_name)
181
+ converter = config.get_client(
182
+ num_concurrent_files=2,
183
+ num_concurrent_pages=2,
184
+ return_documents_in_batch_mode=True,
185
+ debug=True,
186
+ save_folder=str(tmp_output_dir),
187
+ )
205
188
 
206
- # Process multiple files (same file for testing)
207
- file_paths = [file_path, file_path]
208
- documents = converter.batch(file_paths)
189
+ # Process multiple files (same file for testing)
190
+ file_paths = [file_path, file_path]
191
+ documents = converter.batch(file_paths)
209
192
 
210
- # Verify results
211
- assert len(documents) == 2
212
- for doc in documents:
213
- assert isinstance(doc, Document)
214
- assert len(doc.pages) == 2
193
+ # Verify results
194
+ assert len(documents) == 2
195
+ for doc in documents:
196
+ assert isinstance(doc, Document)
197
+ assert len(doc.pages) == 2
215
198
 
216
199
 
217
200
  @pytest.fixture
@@ -245,12 +228,16 @@ def mineru_mock_httpx_client():
245
228
 
246
229
 
247
230
  class TestMinerUConverterMockedApi:
248
- def test_mineru_converter_repeated_call(self, file_path, mineru_mock_httpx_client):
231
+ def test_mineru_converter_repeated_call(
232
+ self, file_path, mineru_mock_httpx_client, tmp_output_dir
233
+ ):
249
234
  """Repeated `__call__` should keep working and call API each page."""
250
235
  from vlmparse.clients.mineru import MinerUConverterConfig
251
236
 
252
237
  config = MinerUConverterConfig(base_url="http://mineru.test")
253
- converter = config.get_client(num_concurrent_pages=2, debug=True)
238
+ converter = config.get_client(
239
+ num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
240
+ )
254
241
 
255
242
  with (
256
243
  patch("vlmparse.clients.mineru.clean_response", lambda x: x),
@@ -274,7 +261,7 @@ class TestMinerUConverterMockedApi:
274
261
  assert mineru_mock_httpx_client.post.call_count == 4
275
262
 
276
263
  def test_mineru_converter_batch_processing(
277
- self, file_path, mineru_mock_httpx_client
264
+ self, file_path, mineru_mock_httpx_client, tmp_output_dir
278
265
  ):
279
266
  """Batch mode should return documents and call API for each page."""
280
267
  from vlmparse.clients.mineru import MinerUConverterConfig
@@ -285,6 +272,7 @@ class TestMinerUConverterMockedApi:
285
272
  num_concurrent_pages=2,
286
273
  return_documents_in_batch_mode=True,
287
274
  debug=True,
275
+ save_folder=str(tmp_output_dir),
288
276
  )
289
277
 
290
278
  with (
@@ -306,19 +294,22 @@ class TestMinerUConverterMockedApi:
306
294
  class TestCustomURI:
307
295
  """Test converter initialization with custom URIs."""
308
296
 
309
- def test_custom_uri_config(self, mock_openai_client, file_path):
297
+ def test_custom_uri_config(self, mock_openai_api, file_path, tmp_output_dir):
310
298
  """Test that converters can be initialized with custom URIs."""
311
- custom_uri = "http://localhost:8000/v1"
312
- config = converter_config_registry.get("gemini-2.5-flash-lite", uri=custom_uri)
299
+ with mock_openai_api():
300
+ custom_uri = "http://localhost:8000/v1"
301
+ config = converter_config_registry.get(
302
+ "gemini-2.5-flash-lite", uri=custom_uri
303
+ )
313
304
 
314
- assert config.llm_params.base_url == custom_uri
305
+ assert config.base_url == custom_uri
315
306
 
316
- # Test it works
317
- converter = config.get_client(debug=True)
318
- document = converter(file_path)
307
+ # Test it works
308
+ converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
309
+ document = converter(file_path)
319
310
 
320
- assert isinstance(document, Document)
321
- assert len(document.pages) == 2
311
+ assert isinstance(document, Document)
312
+ assert len(document.pages) == 2
322
313
 
323
314
 
324
315
  class TestConcurrency:
@@ -326,14 +317,17 @@ class TestConcurrency:
326
317
 
327
318
  @pytest.mark.parametrize("model_name", ["gemini-2.5-flash-lite", "lightonocr"])
328
319
  def test_concurrent_page_processing(
329
- self, file_path, model_name, mock_openai_client
320
+ self, file_path, model_name, mock_openai_api, tmp_output_dir
330
321
  ):
331
322
  """Test that concurrent page processing limits are respected."""
332
- config = converter_config_registry.get(model_name)
333
- converter = config.get_client(num_concurrent_pages=1, debug=True)
323
+ with mock_openai_api() as openai_client:
324
+ config = converter_config_registry.get(model_name)
325
+ converter = config.get_client(
326
+ num_concurrent_pages=1, debug=True, save_folder=str(tmp_output_dir)
327
+ )
334
328
 
335
- document = converter(file_path)
329
+ document = converter(file_path)
336
330
 
337
- assert len(document.pages) == 2
338
- # With concurrency=1, calls should be sequential
339
- assert mock_openai_client.chat.completions.create.call_count == 2
331
+ assert len(document.pages) == 2
332
+ # With concurrency=1, calls should be sequential
333
+ assert openai_client.chat.completions.create.call_count == 2
@@ -0,0 +1,135 @@
1
+ import pytest
2
+
3
+ from vlmparse.converter_with_server import ConverterWithServer
4
+
5
+
6
+ class TestBatchParser:
7
+ """Tests for ConverterWithServer (acting as BatchParser)."""
8
+
9
+ def test_init_starts_docker_server(self, mock_docker_operations):
10
+ """Test that initializing with a model requiring docker starts the server."""
11
+ # Setup using unified mocking system
12
+ with mock_docker_operations(include_client=True) as (
13
+ mock_docker_registry,
14
+ mock_config,
15
+ mock_server,
16
+ mock_client,
17
+ ):
18
+ # Initialize
19
+ with ConverterWithServer(
20
+ model="test_model", with_vllm_server=True
21
+ ) as parser:
22
+ # Verify interactions
23
+ mock_docker_registry.get.assert_called_with("test_model", default=True)
24
+ mock_config.get_server.assert_called_with(auto_stop=True)
25
+ mock_server.start.assert_called_once()
26
+ mock_config.get_client.assert_called_once()
27
+ assert parser.client == mock_client
28
+
29
+ def test_init_no_docker_fallback(self, mock_docker_operations, mock_openai_api):
30
+ """Test fallback to standard converter when no docker config exists."""
31
+ # Setup mocks - docker returns None, use real converter registry
32
+ with mock_docker_operations(
33
+ model_filter=lambda model: False # No docker for any model
34
+ ) as (mock_docker_reg, _, _, _):
35
+ with mock_openai_api():
36
+ # Initialize with a real model from registry
37
+ with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
38
+ # Verify interactions
39
+ mock_docker_reg.get.assert_called_with(
40
+ "gemini-2.5-flash-lite", default=False
41
+ )
42
+ # Client should be initialized from real converter registry
43
+ assert parser.client is not None
44
+
45
+ def test_parse_updates_client_config(
46
+ self, mock_docker_operations, datadir, mock_openai_api, tmp_path
47
+ ):
48
+ """Test that parse method updates client configuration and calls batch."""
49
+ # Use real test file
50
+ test_file = datadir / "Fiche_Graines_A5.pdf"
51
+
52
+ with mock_docker_operations(
53
+ model_filter=lambda model: False # No docker for any model
54
+ ):
55
+ with mock_openai_api():
56
+ with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
57
+ # Call parse with real file
58
+ parser.client.return_documents_in_batch_mode = True
59
+ documents = parser.parse(
60
+ inputs=[str(test_file)],
61
+ out_folder=str(tmp_path),
62
+ mode="md",
63
+ dpi=300,
64
+ debug=True,
65
+ )
66
+
67
+ # Verify client config updates
68
+ assert parser.client.config.dpi == 300
69
+ assert parser.client.debug is True
70
+ assert parser.client.save_mode == "md"
71
+ # Concurrency should be 1 because debug=True
72
+ assert parser.client.num_concurrent_files == 1
73
+ assert parser.client.num_concurrent_pages == 1
74
+
75
+ # Verify result
76
+ assert documents is not None
77
+ assert len(documents) > 0
78
+
79
+ def test_parse_retry_logic(
80
+ self, mock_docker_operations, datadir, mock_openai_api, tmp_path
81
+ ):
82
+ """Test the retrylast logic filters already processed files."""
83
+ # Create two copies of the test file
84
+ test_file = datadir / "Fiche_Graines_A5.pdf"
85
+ temp_dir = tmp_path / "input_files"
86
+ temp_dir.mkdir()
87
+ file1 = temp_dir / "file1.pdf"
88
+ file2 = temp_dir / "file2.pdf"
89
+
90
+ # Copy test file to simulate multiple inputs
91
+ import shutil
92
+
93
+ shutil.copy(test_file, file1)
94
+ shutil.copy(test_file, file2)
95
+
96
+ # Setup folder structure for retry
97
+ run_folder = tmp_path / "output" / "run1"
98
+ results_folder = run_folder / "results"
99
+ results_folder.mkdir(parents=True)
100
+
101
+ # Create a processed result for file1
102
+ (results_folder / "file1.zip").touch()
103
+
104
+ with mock_docker_operations(model_filter=lambda model: False):
105
+ with mock_openai_api():
106
+ with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
107
+ parser.client.return_documents_in_batch_mode = True
108
+ # Call parse with retrylast - should only process file2
109
+ documents = parser.parse(
110
+ inputs=[str(file1), str(file2)],
111
+ out_folder=str(tmp_path / "output"),
112
+ retrylast=True,
113
+ )
114
+
115
+ # Should only process file2 (file1 was already processed)
116
+ # Verify by checking that only 1 file was processed
117
+ assert documents is not None
118
+ assert len(documents) == 1
119
+
120
+ def test_parse_retry_no_previous_runs(
121
+ self, mock_docker_operations, datadir, mock_openai_api, tmp_path
122
+ ):
123
+ """Test that retrylast raises ValueError if no previous runs found."""
124
+ test_file = datadir / "Fiche_Graines_A5.pdf"
125
+
126
+ with mock_docker_operations(model_filter=lambda model: False):
127
+ with mock_openai_api():
128
+ with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
129
+ # tmp_path is empty, so os.listdir(tmp_path) will be empty
130
+ with pytest.raises(ValueError, match="No previous runs found"):
131
+ parser.parse(
132
+ inputs=[str(test_file)],
133
+ out_folder=str(tmp_path),
134
+ retrylast=True,
135
+ )