vlmparse 0.1.7__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vlmparse-0.1.7 → vlmparse-0.1.9}/PKG-INFO +13 -3
- {vlmparse-0.1.7 → vlmparse-0.1.9}/README.md +13 -3
- {vlmparse-0.1.7 → vlmparse-0.1.9}/pyproject.toml +1 -1
- {vlmparse-0.1.7 → vlmparse-0.1.9}/tests/test_all_converters_mocked.py +123 -130
- vlmparse-0.1.9/tests/test_batch_parser.py +144 -0
- vlmparse-0.1.9/tests/test_cli.py +844 -0
- vlmparse-0.1.9/tests/test_end2end.py +121 -0
- vlmparse-0.1.9/tests/test_server_logic.py +153 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/build_doc.py +20 -19
- vlmparse-0.1.9/vlmparse/cli.py +488 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/chandra.py +176 -60
- vlmparse-0.1.9/vlmparse/clients/deepseekocr.py +384 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/docling.py +0 -1
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/dotsocr.py +34 -31
- vlmparse-0.1.9/vlmparse/clients/glmocr.py +243 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/granite_docling.py +9 -36
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/hunyuanocr.py +5 -1
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/lightonocr.py +23 -1
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/mineru.py +0 -1
- vlmparse-0.1.9/vlmparse/clients/mistral_converter.py +85 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/nanonetocr.py +5 -1
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/olmocr.py +6 -2
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/openai_converter.py +95 -60
- vlmparse-0.1.9/vlmparse/clients/paddleocrvl.py +204 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/converter.py +51 -11
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/converter_with_server.py +92 -19
- vlmparse-0.1.9/vlmparse/registries.py +188 -0
- vlmparse-0.1.9/vlmparse/servers/base_server.py +127 -0
- vlmparse-0.1.9/vlmparse/servers/docker_compose_deployment.py +489 -0
- vlmparse-0.1.9/vlmparse/servers/docker_compose_server.py +39 -0
- vlmparse-0.1.7/vlmparse/servers/utils.py → vlmparse-0.1.9/vlmparse/servers/docker_run_deployment.py +7 -60
- vlmparse-0.1.9/vlmparse/servers/docker_server.py +120 -0
- vlmparse-0.1.9/vlmparse/servers/model_identity.py +48 -0
- vlmparse-0.1.9/vlmparse/servers/server_registry.py +42 -0
- vlmparse-0.1.9/vlmparse/servers/utils.py +143 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/st_viewer/st_viewer.py +1 -1
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/utils.py +15 -2
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/PKG-INFO +13 -3
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/SOURCES.txt +9 -0
- vlmparse-0.1.7/tests/test_batch_parser.py +0 -186
- vlmparse-0.1.7/tests/test_cli.py +0 -732
- vlmparse-0.1.7/tests/test_end2end.py +0 -67
- vlmparse-0.1.7/vlmparse/cli.py +0 -319
- vlmparse-0.1.7/vlmparse/clients/deepseekocr.py +0 -203
- vlmparse-0.1.7/vlmparse/clients/paddleocrvl.py +0 -49
- vlmparse-0.1.7/vlmparse/registries.py +0 -170
- vlmparse-0.1.7/vlmparse/servers/docker_server.py +0 -212
- {vlmparse-0.1.7 → vlmparse-0.1.9}/LICENSE +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/setup.cfg +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/base_model.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/pipe_utils/utils.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/clients/prompts.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/constants.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/data_model/box.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/data_model/document.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse/st_viewer/fs_nav.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/dependency_links.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/entry_points.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/requires.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.9}/vlmparse.egg-info/top_level.txt +0 -0
{vlmparse-0.1.7 → vlmparse-0.1.9}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vlmparse
-Version: 0.1.7
+Version: 0.1.9
 Requires-Python: >=3.11.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -54,6 +54,12 @@ Dynamic: license-file
 
 # vlmparse
 
+<div align="center">
+
+[\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
+
+</div>
+
 A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
 
 Features:
@@ -125,13 +131,13 @@ Deployment (requires a gpu + docker installation):
 - Check that the port is not used by another service.
 
 ```bash
-vlmparse serve --model
+vlmparse serve --model lightonocr2 --port 8000 --gpus 1
 ```
 
 then convert:
 
 ```bash
-vlmparse convert
+vlmparse convert "*.pdf" --out_folder ./output --uri http://localhost:8000/v1
 ```
 
 You can also list all running servers:
@@ -209,3 +215,7 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
 ```
 
 Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+
+## Credits
+
+This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
{vlmparse-0.1.7 → vlmparse-0.1.9}/README.md

@@ -1,5 +1,11 @@
 # vlmparse
 
+<div align="center">
+
+[\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
+
+</div>
+
 A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
 
 Features:
@@ -71,13 +77,13 @@ Deployment (requires a gpu + docker installation):
 - Check that the port is not used by another service.
 
 ```bash
-vlmparse serve --model
+vlmparse serve --model lightonocr2 --port 8000 --gpus 1
 ```
 
 then convert:
 
 ```bash
-vlmparse convert
+vlmparse convert "*.pdf" --out_folder ./output --uri http://localhost:8000/v1
 ```
 
 You can also list all running servers:
@@ -154,4 +160,8 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
     documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
 ```
 
-Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+
+## Credits
+
+This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
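The re-added note above implies a programmatic path that skips server deployment entirely. A minimal sketch, assuming the constructor keyword is `uri=`, mirroring the CLI's `--uri` flag; the keyword name is not confirmed by this diff:

```python
# Assumption: ConverterWithServer accepts `uri=` like the CLI accepts --uri.
from vlmparse.converter_with_server import ConverterWithServer

# With a vllm server already running, the model name is inferred from the
# server and no Docker container is started.
with ConverterWithServer(uri="http://localhost:8000/v1") as converter_with_server:
    documents = converter_with_server.parse(
        inputs=["file1.pdf", "file2.pdf"], out_folder="./output"
    )
```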
{vlmparse-0.1.7 → vlmparse-0.1.9}/tests/test_all_converters_mocked.py

@@ -19,43 +19,8 @@ MOCK_RESPONSES = {
 }
 
 
-
-
-    """Mock the AsyncOpenAI client used by all converters."""
-    with patch("openai.AsyncOpenAI") as mock_client:
-        # Create mock response object
-        mock_response = MagicMock()
-        mock_response.choices = [MagicMock()]
-        mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
-        mock_response.usage = MagicMock()
-        mock_response.usage.prompt_tokens = 50
-        mock_response.usage.completion_tokens = 150
-        mock_response.usage.reasoning_tokens = 30
-
-        # Configure the async method
-        mock_instance = MagicMock()
-        mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
-        mock_client.return_value = mock_instance
-
-        yield mock_instance
-
-
-@pytest.fixture
-def dotsocr_mock_client():
-    """Mock for DotsOCR with different response types."""
-    with patch("openai.AsyncOpenAI") as mock_client:
-        mock_response = MagicMock()
-        mock_response.choices = [MagicMock()]
-        mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
-        mock_response.usage = MagicMock()
-        mock_response.usage.prompt_tokens = 40
-        mock_response.usage.completion_tokens = 160
-        mock_response.usage.reasoning_tokens = 20
-        mock_instance = MagicMock()
-        mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
-        mock_client.return_value = mock_instance
-
-        yield mock_instance
+# Note: mock_openai_client and dotsocr_mock_client fixtures are replaced by the
+# unified mock_openai_api fixture from conftest.py
 
 
 # List of all models registered in converter_config_registry
@@ -64,6 +29,13 @@ ALL_MODELS = [
     "lightonocr",
     "dotsocr",
     "nanonets/Nanonets-OCR2-3B",
+    "hunyuanocr",
+    "olmocr-2-fp8",
+    "mineru25",
+    "chandra",
+    "deepseekocr",
+    "granite-docling",
+    "deepseekocr2",
 ]
 
 
@@ -91,87 +63,93 @@ class TestConverterConfigs:
         ],
     )
     def test_converter_basic_processing(
-        self, file_path, model_name,
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
     ):
         """Test basic document processing for OpenAI-compatible converters."""
-
-
+        with mock_openai_api() as openai_client:
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
-
+            # Process document
+            document = converter(file_path)
 
-
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert document.file_path == str(file_path)
+            assert (
+                len(document.pages) == 2
+            ), f"Expected 2 pages, got {len(document.pages)}"
 
-
-
-
-
-
+            # Verify pages
+            for page in document.pages:
+                assert isinstance(page, Page)
+                assert page.text is not None, "Page text should not be None"
+                assert len(page.text) > 0, "Page text should not be empty"
 
-
-
+            # Verify API was called
+            assert openai_client.chat.completions.create.call_count == 2
 
-    def test_converter_image_processing(self, datadir,
+    def test_converter_image_processing(self, datadir, mock_openai_api, tmp_output_dir):
         """Test processing of a single image file."""
-
-
+        with mock_openai_api() as openai_client:
+            model_name = "gemini-2.5-flash-lite"
+            image_path = datadir / "page_with_formula.png"
 
-
-
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
 
-
-
+            # Process image
+            document = converter(image_path)
 
-
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert document.file_path == str(image_path)
+            assert (
+                len(document.pages) == 1
+            ), f"Expected 1 page, got {len(document.pages)}"
 
-
-
-
-
-
+            # Verify page
+            page = document.pages[0]
+            assert isinstance(page, Page)
+            assert page.text is not None
+            assert len(page.text) > 0
 
-
-
+            # Verify API was called once
+            assert openai_client.chat.completions.create.call_count == 1
 
-    def test_dotsocr_ocr_mode(self, file_path,
+    def test_dotsocr_ocr_mode(self, file_path, mock_openai_api, tmp_output_dir):
         """Test DotsOCR converter in OCR mode."""
-
-
+        with mock_openai_api(content=MOCK_RESPONSES["dotsocr_ocr"]) as openai_client:
+            config = converter_config_registry.get("dotsocr")
+            converter = config.get_client(
+                num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
-
+            # Process document
+            document = converter(file_path)
 
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert len(document.pages) == 2
 
-
-
-
-
+            for page in document.pages:
+                assert isinstance(page, Page)
+                assert page.text is not None
+                assert len(page.text) > 0
 
-
-
+            # Verify API was called
+            assert openai_client.chat.completions.create.call_count == 2
 
     @pytest.mark.parametrize("model_name", ALL_MODELS)
-    def test_converter_error_handling(
+    def test_converter_error_handling(
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
+    ):
         """Test that converters handle errors gracefully."""
-        with
-            # Configure mock to raise an exception
-            mock_instance = MagicMock()
-            mock_instance.chat.completions.create = AsyncMock(
-                side_effect=Exception("API Error")
-            )
-            mock_client.return_value = mock_instance
-
+        with mock_openai_api(side_effect=Exception("API Error")):
             config = converter_config_registry.get(model_name)
-            converter = config.get_client(debug=False)
+            converter = config.get_client(debug=False, save_folder=str(tmp_output_dir))
 
             # Process should not crash
             document = converter(file_path)
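conftest.py itself is not part of this diff, but the call sites above (`mock_openai_api()`, `mock_openai_api(content=...)`, `mock_openai_api(side_effect=...)`) together with the two deleted fixtures pin down its shape fairly well. A hypothetical reconstruction, where names and defaults are assumptions:

```python
# Hypothetical sketch of the unified fixture in conftest.py; the factory shape
# and the default content string are inferred from the tests, not confirmed.
from contextlib import contextmanager
from unittest.mock import AsyncMock, MagicMock, patch

import pytest


@pytest.fixture
def mock_openai_api():
    @contextmanager
    def _mock(content="# Mock markdown", side_effect=None):
        with patch("openai.AsyncOpenAI") as mock_client:
            # Build a chat-completion response carrying the requested content.
            mock_response = MagicMock()
            mock_response.choices = [MagicMock()]
            mock_response.choices[0].message.content = content
            mock_instance = MagicMock()
            # side_effect (e.g. an Exception) takes precedence over return_value.
            mock_instance.chat.completions.create = AsyncMock(
                return_value=mock_response, side_effect=side_effect
            )
            mock_client.return_value = mock_instance
            yield mock_instance

    return _mock
```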
@@ -193,25 +171,29 @@ class TestConverterBatchProcessing:
             "lightonocr",
         ],
     )
-    def test_batch_processing(
+    def test_batch_processing(
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
+    ):
         """Test batch processing of multiple files."""
-
-
-
-
-
-
-
+        with mock_openai_api():
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_files=2,
+                num_concurrent_pages=2,
+                return_documents_in_batch_mode=True,
+                debug=True,
+                save_folder=str(tmp_output_dir),
+            )
 
-
-
-
+            # Process multiple files (same file for testing)
+            file_paths = [file_path, file_path]
+            documents = converter.batch(file_paths)
 
-
-
-
-
-
+            # Verify results
+            assert len(documents) == 2
+            for doc in documents:
+                assert isinstance(doc, Document)
+                assert len(doc.pages) == 2
 
 
     @pytest.fixture
@@ -245,12 +227,16 @@ def mineru_mock_httpx_client():
 
 
 class TestMinerUConverterMockedApi:
-    def test_mineru_converter_repeated_call(
+    def test_mineru_converter_repeated_call(
+        self, file_path, mineru_mock_httpx_client, tmp_output_dir
+    ):
         """Repeated `__call__` should keep working and call API each page."""
         from vlmparse.clients.mineru import MinerUConverterConfig
 
         config = MinerUConverterConfig(base_url="http://mineru.test")
-        converter = config.get_client(
+        converter = config.get_client(
+            num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+        )
 
         with (
             patch("vlmparse.clients.mineru.clean_response", lambda x: x),
@@ -274,7 +260,7 @@ class TestMinerUConverterMockedApi:
         assert mineru_mock_httpx_client.post.call_count == 4
 
     def test_mineru_converter_batch_processing(
-        self, file_path, mineru_mock_httpx_client
+        self, file_path, mineru_mock_httpx_client, tmp_output_dir
     ):
         """Batch mode should return documents and call API for each page."""
         from vlmparse.clients.mineru import MinerUConverterConfig
@@ -285,6 +271,7 @@ class TestMinerUConverterMockedApi:
             num_concurrent_pages=2,
             return_documents_in_batch_mode=True,
             debug=True,
+            save_folder=str(tmp_output_dir),
         )
 
         with (
@@ -306,19 +293,22 @@ class TestMinerUConverterMockedApi:
 class TestCustomURI:
     """Test converter initialization with custom URIs."""
 
-    def test_custom_uri_config(self,
+    def test_custom_uri_config(self, mock_openai_api, file_path, tmp_output_dir):
         """Test that converters can be initialized with custom URIs."""
-
-
+        with mock_openai_api():
+            custom_uri = "http://localhost:8000/v1"
+            config = converter_config_registry.get(
+                "gemini-2.5-flash-lite", uri=custom_uri
+            )
 
-
+            assert config.base_url == custom_uri
 
-
-
-
+            # Test it works
+            converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
+            document = converter(file_path)
 
-
-
+            assert isinstance(document, Document)
+            assert len(document.pages) == 2
 
 
 class TestConcurrency:
@@ -326,14 +316,17 @@ class TestConcurrency:
 
     @pytest.mark.parametrize("model_name", ["gemini-2.5-flash-lite", "lightonocr"])
     def test_concurrent_page_processing(
-        self, file_path, model_name,
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
    ):
         """Test that concurrent page processing limits are respected."""
-
-
+        with mock_openai_api() as openai_client:
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_pages=1, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
+            document = converter(file_path)
 
-
-
-
+            assert len(document.pages) == 2
+            # With concurrency=1, calls should be sequential
+            assert openai_client.chat.completions.create.call_count == 2
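Outside the test suite, the same registry calls these tests exercise can be used directly. A minimal sketch, assuming `converter_config_registry` is exported from the new `vlmparse/registries.py` module (the import line is not shown in this diff):

```python
# Sketch only: the import path is inferred from the new vlmparse/registries.py
# module added in 0.1.9; the actual export location may differ.
from vlmparse.registries import converter_config_registry

# Resolve a converter config by model name, optionally overriding the endpoint,
# exactly as test_custom_uri_config does above.
config = converter_config_registry.get(
    "gemini-2.5-flash-lite", uri="http://localhost:8000/v1"
)
converter = config.get_client(num_concurrent_pages=2)

# __call__ parses one file into a Document; batch() handles a list of files.
document = converter("my_file.pdf")
print(document.pages[0].text)
```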
vlmparse-0.1.9/tests/test_batch_parser.py (new file)

@@ -0,0 +1,144 @@
+import pytest
+
+from vlmparse.converter_with_server import ConverterWithServer
+
+
+class TestBatchParser:
+    """Tests for ConverterWithServer (acting as BatchParser)."""
+
+    def test_init_starts_docker_server(self, mock_docker_operations):
+        """Test that initializing with a model requiring docker starts the server."""
+        # Setup using unified mocking system
+        with mock_docker_operations(include_client=True) as (
+            mock_docker_registry,
+            mock_config,
+            mock_server,
+            mock_client,
+        ):
+            # Initialize
+            with ConverterWithServer(model="test_model", server="hf") as parser:
+                # Verify interactions
+                # For server="hf", the model is looked up in the docker registry,
+                # with a fallback when it is missing: start_server builds a
+                # default config in that case; if mock_docker_registry.get
+                # returns a config, it is used directly.
+                mock_docker_registry.get.assert_called_with("test_model")
+                mock_config.get_server.assert_called_with(auto_stop=True)
+                mock_server.start.assert_called_once()
+                mock_config.get_client.assert_called_once()
+                assert parser.client == mock_client
+
+    def test_init_no_docker_fallback(self, mock_docker_operations, mock_openai_api):
+        """Test fallback to standard converter when no docker config exists."""
+        # Setup mocks - docker returns None, use real converter registry
+        with mock_docker_operations(
+            model_filter=lambda model: False  # No docker for any model
+        ) as (mock_docker_reg, _, _, _):
+            with mock_openai_api():
+                # Initialize with a real model from registry
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # Verify interactions
+                    # With the default server="registry", the dispatch logic is:
+                    # if the model is in docker_config_registry.list_models(),
+                    # start_server is called; otherwise the model goes directly
+                    # to converter_config_registry (no lookup with default=False
+                    # for models missing from the docker registry).
+
+                    # mock_docker_operations is expected to mock list_models via
+                    # model_filter (see the fixture in conftest.py), so
+                    # model_filter=lambda model: False forces the fallback path.
+
+                    # Client should be initialized from real converter registry
+                    assert parser.client is not None
+
+    def test_parse_updates_client_config(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test that parse method updates client configuration and calls batch."""
+        # Use real test file
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+
+        with mock_docker_operations(
+            model_filter=lambda model: False  # No docker for any model
+        ):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # Call parse with real file
+                    parser.client.return_documents_in_batch_mode = True
+                    documents = parser.parse(
+                        inputs=[str(test_file)],
+                        out_folder=str(tmp_path),
+                        mode="md",
+                        dpi=300,
+                        debug=True,
+                    )
+
+                    # Verify client config updates
+                    assert parser.client.config.dpi == 300
+                    assert parser.client.debug is True
+                    assert parser.client.save_mode == "md"
+                    # Concurrency should be 1 because debug=True
+                    assert parser.client.num_concurrent_files == 1
+                    assert parser.client.num_concurrent_pages == 1
+
+                    # Verify result
+                    assert documents is not None
+                    assert len(documents) > 0
+
+    def test_parse_retry_logic(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test the retrylast logic filters already processed files."""
+        # Create two copies of the test file
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+        temp_dir = tmp_path / "input_files"
+        temp_dir.mkdir()
+        file1 = temp_dir / "file1.pdf"
+        file2 = temp_dir / "file2.pdf"
+
+        # Copy test file to simulate multiple inputs
+        import shutil
+
+        shutil.copy(test_file, file1)
+        shutil.copy(test_file, file2)
+
+        # Setup folder structure for retry
+        run_folder = tmp_path / "output" / "run1"
+        results_folder = run_folder / "results"
+        results_folder.mkdir(parents=True)
+
+        # Create a processed result for file1
+        (results_folder / "file1.zip").touch()
+
+        with mock_docker_operations(model_filter=lambda model: False):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    parser.client.return_documents_in_batch_mode = True
+                    # Call parse with retrylast - should only process file2
+                    documents = parser.parse(
+                        inputs=[str(file1), str(file2)],
+                        out_folder=str(tmp_path / "output"),
+                        retrylast=True,
+                    )
+
+                    # Should only process file2 (file1 was already processed)
+                    # Verify by checking that only 1 file was processed
+                    assert documents is not None
+                    assert len(documents) == 1
+
+    def test_parse_retry_no_previous_runs(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test that retrylast raises ValueError if no previous runs found."""
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+
+        with mock_docker_operations(model_filter=lambda model: False):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # tmp_path is empty, so os.listdir(tmp_path) will be empty
+                    with pytest.raises(ValueError, match="No previous runs found"):
+                        parser.parse(
+                            inputs=[str(test_file)],
+                            out_folder=str(tmp_path),
+                            retrylast=True,
+                        )