vlmparse 0.1.7.tar.gz → 0.1.8.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {vlmparse-0.1.7 → vlmparse-0.1.8}/PKG-INFO +11 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/README.md +11 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/pyproject.toml +1 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/tests/test_all_converters_mocked.py +124 -130
- vlmparse-0.1.8/tests/test_batch_parser.py +135 -0
- vlmparse-0.1.8/tests/test_cli.py +684 -0
- vlmparse-0.1.8/tests/test_end2end.py +119 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/build_doc.py +20 -19
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/cli.py +17 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/chandra.py +176 -60
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/deepseekocr.py +23 -12
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/docling.py +0 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/dotsocr.py +34 -31
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/granite_docling.py +9 -36
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/hunyuanocr.py +5 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/lightonocr.py +23 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/mineru.py +0 -1
- vlmparse-0.1.8/vlmparse/clients/mistral_converter.py +85 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/nanonetocr.py +5 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/olmocr.py +6 -2
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/openai_converter.py +95 -60
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/paddleocrvl.py +9 -2
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/converter.py +51 -11
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/converter_with_server.py +41 -5
- vlmparse-0.1.8/vlmparse/registries.py +178 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/servers/docker_server.py +59 -35
- vlmparse-0.1.8/vlmparse/servers/model_identity.py +48 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/utils.py +15 -2
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/PKG-INFO +11 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/SOURCES.txt +2 -0
- vlmparse-0.1.7/tests/test_batch_parser.py +0 -186
- vlmparse-0.1.7/tests/test_cli.py +0 -732
- vlmparse-0.1.7/tests/test_end2end.py +0 -67
- vlmparse-0.1.7/vlmparse/registries.py +0 -170
- {vlmparse-0.1.7 → vlmparse-0.1.8}/LICENSE +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/setup.cfg +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/base_model.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/utils.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/prompts.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/constants.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/data_model/box.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/data_model/document.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/servers/utils.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/st_viewer/fs_nav.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/st_viewer/st_viewer.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/dependency_links.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/entry_points.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/requires.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/top_level.txt +0 -0
{vlmparse-0.1.7 → vlmparse-0.1.8}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vlmparse
-Version: 0.1.7
+Version: 0.1.8
 Requires-Python: >=3.11.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -54,6 +54,12 @@ Dynamic: license-file
 
 # vlmparse
 
+<div align="center">
+
+[\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
+
+</div>
+
 A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
 
 Features:
@@ -209,3 +215,7 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
 ```
 
 Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+
+## Credits
+
+This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
````
{vlmparse-0.1.7 → vlmparse-0.1.8}/README.md

````diff
@@ -1,5 +1,11 @@
 # vlmparse
 
+<div align="center">
+
+[\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
+
+</div>
+
 A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
 
 Features:
@@ -154,4 +160,8 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
     documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
 ```
 
-Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+
+## Credits
+
+This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
````
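The `ConverterWithServer` note added to both PKG-INFO and README above describes URI-based model inference. As a rough illustration, usage might look like the following sketch; the server address is illustrative, and passing the URI through the `model` argument is an assumption drawn from the README wording rather than a documented signature:

```python
from vlmparse.converter_with_server import ConverterWithServer

# Sketch based on the README snippet in the diff above. The address below is
# illustrative; passing a vLLM server URI as `model` is an assumption based on
# the note "the model name is inferred automatically and no server is started".
with ConverterWithServer(model="http://localhost:8000/v1") as converter_with_server:
    documents = converter_with_server.parse(
        inputs=["file1.pdf", "file2.pdf"], out_folder="./output"
    )
```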
{vlmparse-0.1.7 → vlmparse-0.1.8}/tests/test_all_converters_mocked.py

````diff
@@ -19,43 +19,8 @@ MOCK_RESPONSES = {
 }
 
 
-@pytest.fixture
-def mock_openai_client():
-    """Mock the AsyncOpenAI client used by all converters."""
-    with patch("openai.AsyncOpenAI") as mock_client:
-        # Create mock response object
-        mock_response = MagicMock()
-        mock_response.choices = [MagicMock()]
-        mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
-        mock_response.usage = MagicMock()
-        mock_response.usage.prompt_tokens = 50
-        mock_response.usage.completion_tokens = 150
-        mock_response.usage.reasoning_tokens = 30
-
-        # Configure the async method
-        mock_instance = MagicMock()
-        mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
-        mock_client.return_value = mock_instance
-
-        yield mock_instance
-
-
-@pytest.fixture
-def dotsocr_mock_client():
-    """Mock for DotsOCR with different response types."""
-    with patch("openai.AsyncOpenAI") as mock_client:
-        mock_response = MagicMock()
-        mock_response.choices = [MagicMock()]
-        mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
-        mock_response.usage = MagicMock()
-        mock_response.usage.prompt_tokens = 40
-        mock_response.usage.completion_tokens = 160
-        mock_response.usage.reasoning_tokens = 20
-        mock_instance = MagicMock()
-        mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
-        mock_client.return_value = mock_instance
-
-        yield mock_instance
+# Note: mock_openai_client and dotsocr_mock_client fixtures are replaced by the
+# unified mock_openai_api fixture from conftest.py
 
 
 # List of all models registered in converter_config_registry
@@ -64,6 +29,14 @@ ALL_MODELS = [
     "lightonocr",
     "dotsocr",
     "nanonets/Nanonets-OCR2-3B",
+    "hunyuanocr",
+    "olmocr-2-fp8",
+    "paddleocrvl",
+    "mineru25",
+    "chandra",
+    "deepseekocr",
+    "granite-docling",
+    "Qwen/Qwen3-VL-8B-Instruct",
 ]
 
 
@@ -91,87 +64,93 @@ class TestConverterConfigs:
         ],
     )
     def test_converter_basic_processing(
-        self, file_path, model_name,
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
     ):
         """Test basic document processing for OpenAI-compatible converters."""
-
-
+        with mock_openai_api() as openai_client:
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
-
+            # Process document
+            document = converter(file_path)
 
-
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert document.file_path == str(file_path)
+            assert (
+                len(document.pages) == 2
+            ), f"Expected 2 pages, got {len(document.pages)}"
 
-
-
-
-
-
+            # Verify pages
+            for page in document.pages:
+                assert isinstance(page, Page)
+                assert page.text is not None, "Page text should not be None"
+                assert len(page.text) > 0, "Page text should not be empty"
 
-
-
+            # Verify API was called
+            assert openai_client.chat.completions.create.call_count == 2
 
-    def test_converter_image_processing(self, datadir,
+    def test_converter_image_processing(self, datadir, mock_openai_api, tmp_output_dir):
         """Test processing of a single image file."""
-
-
+        with mock_openai_api() as openai_client:
+            model_name = "gemini-2.5-flash-lite"
+            image_path = datadir / "page_with_formula.png"
 
-
-
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
 
-
-
+            # Process image
+            document = converter(image_path)
 
-
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert document.file_path == str(image_path)
+            assert (
+                len(document.pages) == 1
+            ), f"Expected 1 page, got {len(document.pages)}"
 
-
-
-
-
-
+            # Verify page
+            page = document.pages[0]
+            assert isinstance(page, Page)
+            assert page.text is not None
+            assert len(page.text) > 0
 
-
-
+            # Verify API was called once
+            assert openai_client.chat.completions.create.call_count == 1
 
-    def test_dotsocr_ocr_mode(self, file_path,
+    def test_dotsocr_ocr_mode(self, file_path, mock_openai_api, tmp_output_dir):
         """Test DotsOCR converter in OCR mode."""
-
-
+        with mock_openai_api(content=MOCK_RESPONSES["dotsocr_ocr"]) as openai_client:
+            config = converter_config_registry.get("dotsocr")
+            converter = config.get_client(
+                num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
-
+            # Process document
+            document = converter(file_path)
 
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert len(document.pages) == 2
 
-
-
-
-
+            for page in document.pages:
+                assert isinstance(page, Page)
+                assert page.text is not None
+                assert len(page.text) > 0
 
-
-
+            # Verify API was called
+            assert openai_client.chat.completions.create.call_count == 2
 
     @pytest.mark.parametrize("model_name", ALL_MODELS)
-    def test_converter_error_handling(
+    def test_converter_error_handling(
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
+    ):
         """Test that converters handle errors gracefully."""
-        with patch("openai.AsyncOpenAI") as mock_client:
-            # Configure mock to raise an exception
-            mock_instance = MagicMock()
-            mock_instance.chat.completions.create = AsyncMock(
-                side_effect=Exception("API Error")
-            )
-            mock_client.return_value = mock_instance
-
+        with mock_openai_api(side_effect=Exception("API Error")):
             config = converter_config_registry.get(model_name)
-            converter = config.get_client(debug=False)
+            converter = config.get_client(debug=False, save_folder=str(tmp_output_dir))
 
             # Process should not crash
             document = converter(file_path)
@@ -193,25 +172,29 @@ class TestConverterBatchProcessing:
             "lightonocr",
         ],
     )
-    def test_batch_processing(
+    def test_batch_processing(
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
+    ):
         """Test batch processing of multiple files."""
-
-
-
-
-
-
-
+        with mock_openai_api():
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_files=2,
+                num_concurrent_pages=2,
+                return_documents_in_batch_mode=True,
+                debug=True,
+                save_folder=str(tmp_output_dir),
+            )
 
-
-
-
+            # Process multiple files (same file for testing)
+            file_paths = [file_path, file_path]
+            documents = converter.batch(file_paths)
 
-
-
-
-
-
+            # Verify results
+            assert len(documents) == 2
+            for doc in documents:
+                assert isinstance(doc, Document)
+                assert len(doc.pages) == 2
 
 
 @pytest.fixture
@@ -245,12 +228,16 @@ def mineru_mock_httpx_client():
 
 
 class TestMinerUConverterMockedApi:
-    def test_mineru_converter_repeated_call(
+    def test_mineru_converter_repeated_call(
+        self, file_path, mineru_mock_httpx_client, tmp_output_dir
+    ):
         """Repeated `__call__` should keep working and call API each page."""
         from vlmparse.clients.mineru import MinerUConverterConfig
 
         config = MinerUConverterConfig(base_url="http://mineru.test")
-        converter = config.get_client(
+        converter = config.get_client(
+            num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+        )
 
         with (
             patch("vlmparse.clients.mineru.clean_response", lambda x: x),
@@ -274,7 +261,7 @@ class TestMinerUConverterMockedApi:
         assert mineru_mock_httpx_client.post.call_count == 4
 
     def test_mineru_converter_batch_processing(
-        self, file_path, mineru_mock_httpx_client
+        self, file_path, mineru_mock_httpx_client, tmp_output_dir
     ):
         """Batch mode should return documents and call API for each page."""
         from vlmparse.clients.mineru import MinerUConverterConfig
@@ -285,6 +272,7 @@ class TestMinerUConverterMockedApi:
             num_concurrent_pages=2,
             return_documents_in_batch_mode=True,
             debug=True,
+            save_folder=str(tmp_output_dir),
         )
 
         with (
@@ -306,19 +294,22 @@ class TestMinerUConverterMockedApi:
 class TestCustomURI:
     """Test converter initialization with custom URIs."""
 
-    def test_custom_uri_config(self,
+    def test_custom_uri_config(self, mock_openai_api, file_path, tmp_output_dir):
         """Test that converters can be initialized with custom URIs."""
-
-
+        with mock_openai_api():
+            custom_uri = "http://localhost:8000/v1"
+            config = converter_config_registry.get(
+                "gemini-2.5-flash-lite", uri=custom_uri
+            )
 
-
+            assert config.base_url == custom_uri
 
-
-
-
+            # Test it works
+            converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
+            document = converter(file_path)
 
-
-
+            assert isinstance(document, Document)
+            assert len(document.pages) == 2
 
 
 class TestConcurrency:
@@ -326,14 +317,17 @@ class TestConcurrency:
 
     @pytest.mark.parametrize("model_name", ["gemini-2.5-flash-lite", "lightonocr"])
     def test_concurrent_page_processing(
-        self, file_path, model_name,
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
    ):
         """Test that concurrent page processing limits are respected."""
-
-
+        with mock_openai_api() as openai_client:
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_pages=1, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
+            document = converter(file_path)
 
-
-
-
+            assert len(document.pages) == 2
+            # With concurrency=1, calls should be sequential
+            assert openai_client.chat.completions.create.call_count == 2
````
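The rewritten tests above depend on a unified `mock_openai_api` fixture that lives in `conftest.py`, which is not part of this diff. Judging from the call sites (`mock_openai_api()`, `mock_openai_api(content=...)`, `mock_openai_api(side_effect=...)`) and from the per-test fixtures it replaces, a minimal sketch could look like this; the default response text is a placeholder and the real implementation may differ:

```python
import contextlib
from unittest.mock import AsyncMock, MagicMock, patch

import pytest


@pytest.fixture
def mock_openai_api():
    """Factory fixture returning a context manager that patches openai.AsyncOpenAI.

    Sketch reconstructed from the fixtures removed in this diff; the actual
    conftest.py implementation may differ.
    """

    @contextlib.contextmanager
    def _mock(content="mocked page text", side_effect=None):
        # Build a canned chat-completions response, as the old fixtures did
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = content
        mock_response.usage = MagicMock()
        mock_response.usage.prompt_tokens = 50
        mock_response.usage.completion_tokens = 150
        mock_response.usage.reasoning_tokens = 30

        with patch("openai.AsyncOpenAI") as mock_client:
            mock_instance = MagicMock()
            # side_effect (e.g. an Exception) takes precedence over return_value
            mock_instance.chat.completions.create = AsyncMock(
                return_value=mock_response, side_effect=side_effect
            )
            mock_client.return_value = mock_instance
            yield mock_instance

    return _mock
```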
vlmparse-0.1.8/tests/test_batch_parser.py (new file)

````diff
@@ -0,0 +1,135 @@
+import pytest
+
+from vlmparse.converter_with_server import ConverterWithServer
+
+
+class TestBatchParser:
+    """Tests for ConverterWithServer (acting as BatchParser)."""
+
+    def test_init_starts_docker_server(self, mock_docker_operations):
+        """Test that initializing with a model requiring docker starts the server."""
+        # Setup using unified mocking system
+        with mock_docker_operations(include_client=True) as (
+            mock_docker_registry,
+            mock_config,
+            mock_server,
+            mock_client,
+        ):
+            # Initialize
+            with ConverterWithServer(
+                model="test_model", with_vllm_server=True
+            ) as parser:
+                # Verify interactions
+                mock_docker_registry.get.assert_called_with("test_model", default=True)
+                mock_config.get_server.assert_called_with(auto_stop=True)
+                mock_server.start.assert_called_once()
+                mock_config.get_client.assert_called_once()
+                assert parser.client == mock_client
+
+    def test_init_no_docker_fallback(self, mock_docker_operations, mock_openai_api):
+        """Test fallback to standard converter when no docker config exists."""
+        # Setup mocks - docker returns None, use real converter registry
+        with mock_docker_operations(
+            model_filter=lambda model: False  # No docker for any model
+        ) as (mock_docker_reg, _, _, _):
+            with mock_openai_api():
+                # Initialize with a real model from registry
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # Verify interactions
+                    mock_docker_reg.get.assert_called_with(
+                        "gemini-2.5-flash-lite", default=False
+                    )
+                    # Client should be initialized from real converter registry
+                    assert parser.client is not None
+
+    def test_parse_updates_client_config(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test that parse method updates client configuration and calls batch."""
+        # Use real test file
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+
+        with mock_docker_operations(
+            model_filter=lambda model: False  # No docker for any model
+        ):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # Call parse with real file
+                    parser.client.return_documents_in_batch_mode = True
+                    documents = parser.parse(
+                        inputs=[str(test_file)],
+                        out_folder=str(tmp_path),
+                        mode="md",
+                        dpi=300,
+                        debug=True,
+                    )
+
+                    # Verify client config updates
+                    assert parser.client.config.dpi == 300
+                    assert parser.client.debug is True
+                    assert parser.client.save_mode == "md"
+                    # Concurrency should be 1 because debug=True
+                    assert parser.client.num_concurrent_files == 1
+                    assert parser.client.num_concurrent_pages == 1
+
+                    # Verify result
+                    assert documents is not None
+                    assert len(documents) > 0
+
+    def test_parse_retry_logic(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test the retrylast logic filters already processed files."""
+        # Create two copies of the test file
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+        temp_dir = tmp_path / "input_files"
+        temp_dir.mkdir()
+        file1 = temp_dir / "file1.pdf"
+        file2 = temp_dir / "file2.pdf"
+
+        # Copy test file to simulate multiple inputs
+        import shutil
+
+        shutil.copy(test_file, file1)
+        shutil.copy(test_file, file2)
+
+        # Setup folder structure for retry
+        run_folder = tmp_path / "output" / "run1"
+        results_folder = run_folder / "results"
+        results_folder.mkdir(parents=True)
+
+        # Create a processed result for file1
+        (results_folder / "file1.zip").touch()
+
+        with mock_docker_operations(model_filter=lambda model: False):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    parser.client.return_documents_in_batch_mode = True
+                    # Call parse with retrylast - should only process file2
+                    documents = parser.parse(
+                        inputs=[str(file1), str(file2)],
+                        out_folder=str(tmp_path / "output"),
+                        retrylast=True,
+                    )
+
+                    # Should only process file2 (file1 was already processed)
+                    # Verify by checking that only 1 file was processed
+                    assert documents is not None
+                    assert len(documents) == 1
+
+    def test_parse_retry_no_previous_runs(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test that retrylast raises ValueError if no previous runs found."""
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+
+        with mock_docker_operations(model_filter=lambda model: False):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # tmp_path is empty, so os.listdir(tmp_path) will be empty
+                    with pytest.raises(ValueError, match="No previous runs found"):
+                        parser.parse(
+                            inputs=[str(test_file)],
+                            out_folder=str(tmp_path),
+                            retrylast=True,
+                        )
````