vlmparse 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vlmparse-0.1.4/vlmparse.egg-info → vlmparse-0.1.6}/PKG-INFO +22 -6
- {vlmparse-0.1.4 → vlmparse-0.1.6}/README.md +21 -5
- {vlmparse-0.1.4 → vlmparse-0.1.6}/pyproject.toml +2 -6
- {vlmparse-0.1.4 → vlmparse-0.1.6}/tests/test_all_converters_mocked.py +104 -6
- {vlmparse-0.1.4 → vlmparse-0.1.6}/tests/test_batch_parser.py +58 -60
- {vlmparse-0.1.4 → vlmparse-0.1.6}/tests/test_cli.py +79 -129
- {vlmparse-0.1.4 → vlmparse-0.1.6}/tests/test_end2end.py +23 -22
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/cli.py +26 -96
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/chandra.py +1 -1
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/docling.py +2 -2
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/dotsocr.py +20 -7
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/hunyuanocr.py +2 -1
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/mineru.py +18 -19
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/olmocr.py +1 -1
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/openai_converter.py +14 -4
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/paddleocrvl.py +2 -1
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/converter_with_server.py +38 -11
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/data_model/document.py +11 -1
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/registries.py +3 -7
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/servers/docker_server.py +16 -2
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/servers/utils.py +3 -2
- {vlmparse-0.1.4 → vlmparse-0.1.6/vlmparse.egg-info}/PKG-INFO +22 -6
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/SOURCES.txt +0 -18
- vlmparse-0.1.4/tests/test_benchmark_tests.py +0 -731
- vlmparse-0.1.4/tests/test_process_and_run_benchmark.py +0 -144
- vlmparse-0.1.4/tests/test_table_tests.py +0 -1516
- vlmparse-0.1.4/vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
- vlmparse-0.1.4/vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse-0.1.4/vlmparse/benchpdf2md/create_dataset.py +0 -60
- vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
- vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
- vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
- vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
- vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
- vlmparse-0.1.4/vlmparse/benchpdf2md/run_benchmark.py +0 -296
- vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
- vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
- vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
- vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
- vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
- vlmparse-0.1.4/vlmparse/benchpdf2md/utils.py +0 -56
- {vlmparse-0.1.4 → vlmparse-0.1.6}/LICENSE +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/setup.cfg +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/base_model.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/build_doc.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/deepseekocr.py +51 -51
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/granite_docling.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/lightonocr.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/nanonetocr.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/pipe_utils/utils.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/prompts.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/constants.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/converter.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/data_model/box.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/st_viewer/fs_nav.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/st_viewer/st_viewer.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/utils.py +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/dependency_links.txt +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/entry_points.txt +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/requires.txt +0 -0
- {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: vlmparse
|
|
3
|
-
Version: 0.1.4
|
|
4
|
-
Requires-Python: >=3.
|
|
3
|
+
Version: 0.1.6
|
|
4
|
+
Requires-Python: >=3.11.0
|
|
5
5
|
Description-Content-Type: text/markdown
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Dist: devtools>=0.12.2
|
|
@@ -72,6 +72,19 @@ Supported Converters:
|
|
|
72
72
|
|
|
73
73
|
## Installation
|
|
74
74
|
|
|
75
|
+
Simplest option, if you only need the CLI:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
uv tool install vlmparse
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
If you want to run the granite-docling model or use the streamlit viewing app:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
uv tool install vlmparse[docling_core,st_app]
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
If you prefer cloning the repository and using the local version:
|
|
75
88
|
```bash
|
|
76
89
|
uv sync
|
|
77
90
|
```
|
|
@@ -86,10 +99,11 @@ Activate the virtual environment:
|
|
|
86
99
|
```bash
|
|
87
100
|
source .venv/bin/activate
|
|
88
101
|
```
|
|
89
|
-
Other solution: append uv run to all the commands below.
|
|
90
102
|
|
|
91
103
|
## CLI Usage
|
|
92
104
|
|
|
105
|
+
Note that you can skip the installation step above and simply prefix each of the commands below with `uvx`.
|
|
106
|
+
|
|
93
107
|
### Convert PDFs
|
|
94
108
|
|
|
95
109
|
With a general VLM (requires setting your api key as an environment variable):
|
|
@@ -185,11 +199,13 @@ server.stop()
|
|
|
185
199
|
```
|
|
186
200
|
|
|
187
201
|
|
|
188
|
-
Converter with automatic server
|
|
202
|
+
Converter with automatic server management:
|
|
189
203
|
|
|
190
204
|
```python
|
|
191
205
|
from vlmparse.converter_with_server import ConverterWithServer
|
|
192
206
|
|
|
193
|
-
|
|
194
|
-
documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
|
|
207
|
+
with ConverterWithServer(model="mineru2.5") as converter_with_server:
|
|
208
|
+
documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
|
|
195
209
|
```
|
|
210
|
+
|
|
211
|
+
Note that if you pass the URI of a running vLLM server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
|
|
@@ -18,6 +18,19 @@ Supported Converters:
|
|
|
18
18
|
|
|
19
19
|
## Installation
|
|
20
20
|
|
|
21
|
+
Simplest option, if you only need the CLI:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uv tool install vlmparse
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
If you want to run the granite-docling model or use the streamlit viewing app:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv tool install vlmparse[docling_core,st_app]
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
If you prefer cloning the repository and using the local version:
|
|
21
34
|
```bash
|
|
22
35
|
uv sync
|
|
23
36
|
```
|
|
@@ -32,10 +45,11 @@ Activate the virtual environment:
|
|
|
32
45
|
```bash
|
|
33
46
|
source .venv/bin/activate
|
|
34
47
|
```
|
|
35
|
-
Other solution: append uv run to all the commands below.
|
|
36
48
|
|
|
37
49
|
## CLI Usage
|
|
38
50
|
|
|
51
|
+
Note that you can skip the installation step above and simply prefix each of the commands below with `uvx`.
|
|
52
|
+
|
|
39
53
|
### Convert PDFs
|
|
40
54
|
|
|
41
55
|
With a general VLM (requires setting your api key as an environment variable):
|
|
@@ -131,11 +145,13 @@ server.stop()
|
|
|
131
145
|
```
|
|
132
146
|
|
|
133
147
|
|
|
134
|
-
Converter with automatic server
|
|
148
|
+
Converter with automatic server management:
|
|
135
149
|
|
|
136
150
|
```python
|
|
137
151
|
from vlmparse.converter_with_server import ConverterWithServer
|
|
138
152
|
|
|
139
|
-
|
|
140
|
-
documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
|
|
141
|
-
```
|
|
153
|
+
with ConverterWithServer(model="mineru2.5") as converter_with_server:
|
|
154
|
+
documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Note that if you pass the URI of a running vLLM server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
|
|
@@ -2,17 +2,13 @@
|
|
|
2
2
|
requires = ["setuptools", "wheel"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
|
-
[metadata]
|
|
6
|
-
name = "vlmparse"
|
|
7
|
-
version = "0.1.0"
|
|
8
|
-
|
|
9
5
|
[project]
|
|
10
6
|
name = "vlmparse"
|
|
11
|
-
version = "0.1.4"
|
|
7
|
+
version = "0.1.6"
|
|
12
8
|
authors = []
|
|
13
9
|
description = ""
|
|
14
10
|
readme = "README.md"
|
|
15
|
-
requires-python = ">=3.
|
|
11
|
+
requires-python = ">=3.11.0"
|
|
16
12
|
dependencies = [
|
|
17
13
|
"devtools>=0.12.2",
|
|
18
14
|
"docker>=7.1.0",
|
|
@@ -5,6 +5,7 @@ This avoids the need to deploy actual Docker servers.
|
|
|
5
5
|
|
|
6
6
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
7
7
|
|
|
8
|
+
import orjson
|
|
8
9
|
import pytest
|
|
9
10
|
|
|
10
11
|
from vlmparse.data_model.document import Document, Page
|
|
@@ -26,6 +27,10 @@ def mock_openai_client():
|
|
|
26
27
|
mock_response = MagicMock()
|
|
27
28
|
mock_response.choices = [MagicMock()]
|
|
28
29
|
mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
|
|
30
|
+
mock_response.usage = MagicMock()
|
|
31
|
+
mock_response.usage.prompt_tokens = 50
|
|
32
|
+
mock_response.usage.completion_tokens = 150
|
|
33
|
+
mock_response.usage.reasoning_tokens = 30
|
|
29
34
|
|
|
30
35
|
# Configure the async method
|
|
31
36
|
mock_instance = MagicMock()
|
|
@@ -42,7 +47,10 @@ def dotsocr_mock_client():
|
|
|
42
47
|
mock_response = MagicMock()
|
|
43
48
|
mock_response.choices = [MagicMock()]
|
|
44
49
|
mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
|
|
45
|
-
|
|
50
|
+
mock_response.usage = MagicMock()
|
|
51
|
+
mock_response.usage.prompt_tokens = 40
|
|
52
|
+
mock_response.usage.completion_tokens = 160
|
|
53
|
+
mock_response.usage.reasoning_tokens = 20
|
|
46
54
|
mock_instance = MagicMock()
|
|
47
55
|
mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
|
|
48
56
|
mock_client.return_value = mock_instance
|
|
@@ -87,7 +95,7 @@ class TestConverterConfigs:
|
|
|
87
95
|
):
|
|
88
96
|
"""Test basic document processing for OpenAI-compatible converters."""
|
|
89
97
|
config = converter_config_registry.get(model_name)
|
|
90
|
-
converter = config.get_client(num_concurrent_pages=2)
|
|
98
|
+
converter = config.get_client(num_concurrent_pages=2, debug=True)
|
|
91
99
|
|
|
92
100
|
# Process document
|
|
93
101
|
document = converter(file_path)
|
|
@@ -112,7 +120,7 @@ class TestConverterConfigs:
|
|
|
112
120
|
image_path = datadir / "page_with_formula.png"
|
|
113
121
|
|
|
114
122
|
config = converter_config_registry.get(model_name)
|
|
115
|
-
converter = config.get_client()
|
|
123
|
+
converter = config.get_client(debug=True)
|
|
116
124
|
|
|
117
125
|
# Process image
|
|
118
126
|
document = converter(image_path)
|
|
@@ -134,7 +142,7 @@ class TestConverterConfigs:
|
|
|
134
142
|
def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
|
|
135
143
|
"""Test DotsOCR converter in OCR mode."""
|
|
136
144
|
config = converter_config_registry.get("dotsocr")
|
|
137
|
-
converter = config.get_client(num_concurrent_pages=2)
|
|
145
|
+
converter = config.get_client(num_concurrent_pages=2, debug=True)
|
|
138
146
|
|
|
139
147
|
# Process document
|
|
140
148
|
document = converter(file_path)
|
|
@@ -192,6 +200,7 @@ class TestConverterBatchProcessing:
|
|
|
192
200
|
num_concurrent_files=2,
|
|
193
201
|
num_concurrent_pages=2,
|
|
194
202
|
return_documents_in_batch_mode=True,
|
|
203
|
+
debug=True,
|
|
195
204
|
)
|
|
196
205
|
|
|
197
206
|
# Process multiple files (same file for testing)
|
|
@@ -205,6 +214,95 @@ class TestConverterBatchProcessing:
|
|
|
205
214
|
assert len(doc.pages) == 2
|
|
206
215
|
|
|
207
216
|
|
|
217
|
+
@pytest.fixture
|
|
218
|
+
def mineru_mock_httpx_client():
|
|
219
|
+
"""Mock the httpx AsyncClient used by MinerUConverter."""
|
|
220
|
+
with patch("httpx.AsyncClient") as mock_async_client:
|
|
221
|
+
mock_client = MagicMock()
|
|
222
|
+
mock_async_client.return_value = mock_client
|
|
223
|
+
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
|
224
|
+
mock_client.__aexit__ = AsyncMock(return_value=None)
|
|
225
|
+
|
|
226
|
+
mock_response = MagicMock()
|
|
227
|
+
mock_response.raise_for_status = MagicMock()
|
|
228
|
+
mock_response.content = orjson.dumps(
|
|
229
|
+
[
|
|
230
|
+
{
|
|
231
|
+
"bbox": [0.1, 0.2, 0.3, 0.4],
|
|
232
|
+
"content": "<p>Hello MinerU</p>",
|
|
233
|
+
"type": "Text",
|
|
234
|
+
},
|
|
235
|
+
{
|
|
236
|
+
"bbox": [0.5, 0.6, 0.7, 0.8],
|
|
237
|
+
"content": "<p>Second block</p>",
|
|
238
|
+
"type": "Text",
|
|
239
|
+
},
|
|
240
|
+
]
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
mock_client.post = AsyncMock(return_value=mock_response)
|
|
244
|
+
yield mock_client
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class TestMinerUConverterMockedApi:
|
|
248
|
+
def test_mineru_converter_repeated_call(self, file_path, mineru_mock_httpx_client):
|
|
249
|
+
"""Repeated `__call__` should keep working and call API each page."""
|
|
250
|
+
from vlmparse.clients.mineru import MinerUConverterConfig
|
|
251
|
+
|
|
252
|
+
config = MinerUConverterConfig(base_url="http://mineru.test")
|
|
253
|
+
converter = config.get_client(num_concurrent_pages=2, debug=True)
|
|
254
|
+
|
|
255
|
+
with (
|
|
256
|
+
patch("vlmparse.clients.mineru.clean_response", lambda x: x),
|
|
257
|
+
patch("vlmparse.clients.mineru.html_to_md_keep_tables", lambda x: x),
|
|
258
|
+
):
|
|
259
|
+
doc1 = converter(file_path)
|
|
260
|
+
doc2 = converter(file_path)
|
|
261
|
+
|
|
262
|
+
assert isinstance(doc1, Document)
|
|
263
|
+
assert isinstance(doc2, Document)
|
|
264
|
+
assert len(doc1.pages) == 2
|
|
265
|
+
assert len(doc2.pages) == 2
|
|
266
|
+
|
|
267
|
+
for page in doc1.pages + doc2.pages:
|
|
268
|
+
assert isinstance(page, Page)
|
|
269
|
+
assert page.text is not None and len(page.text) > 0
|
|
270
|
+
assert page.items is not None
|
|
271
|
+
assert len(page.items) == 2
|
|
272
|
+
|
|
273
|
+
# 2 pages per doc * 2 docs
|
|
274
|
+
assert mineru_mock_httpx_client.post.call_count == 4
|
|
275
|
+
|
|
276
|
+
def test_mineru_converter_batch_processing(
|
|
277
|
+
self, file_path, mineru_mock_httpx_client
|
|
278
|
+
):
|
|
279
|
+
"""Batch mode should return documents and call API for each page."""
|
|
280
|
+
from vlmparse.clients.mineru import MinerUConverterConfig
|
|
281
|
+
|
|
282
|
+
config = MinerUConverterConfig(base_url="http://mineru.test")
|
|
283
|
+
converter = config.get_client(
|
|
284
|
+
num_concurrent_files=2,
|
|
285
|
+
num_concurrent_pages=2,
|
|
286
|
+
return_documents_in_batch_mode=True,
|
|
287
|
+
debug=True,
|
|
288
|
+
)
|
|
289
|
+
|
|
290
|
+
with (
|
|
291
|
+
patch("vlmparse.clients.mineru.clean_response", lambda x: x),
|
|
292
|
+
patch("vlmparse.clients.mineru.html_to_md_keep_tables", lambda x: x),
|
|
293
|
+
):
|
|
294
|
+
docs = converter.batch([file_path, file_path])
|
|
295
|
+
|
|
296
|
+
assert isinstance(docs, list)
|
|
297
|
+
assert len(docs) == 2
|
|
298
|
+
for doc in docs:
|
|
299
|
+
assert isinstance(doc, Document)
|
|
300
|
+
assert len(doc.pages) == 2
|
|
301
|
+
|
|
302
|
+
# 2 pages per doc * 2 docs
|
|
303
|
+
assert mineru_mock_httpx_client.post.call_count == 4
|
|
304
|
+
|
|
305
|
+
|
|
208
306
|
class TestCustomURI:
|
|
209
307
|
"""Test converter initialization with custom URIs."""
|
|
210
308
|
|
|
@@ -216,7 +314,7 @@ class TestCustomURI:
|
|
|
216
314
|
assert config.llm_params.base_url == custom_uri
|
|
217
315
|
|
|
218
316
|
# Test it works
|
|
219
|
-
converter = config.get_client()
|
|
317
|
+
converter = config.get_client(debug=True)
|
|
220
318
|
document = converter(file_path)
|
|
221
319
|
|
|
222
320
|
assert isinstance(document, Document)
|
|
@@ -232,7 +330,7 @@ class TestConcurrency:
|
|
|
232
330
|
):
|
|
233
331
|
"""Test that concurrent page processing limits are respected."""
|
|
234
332
|
config = converter_config_registry.get(model_name)
|
|
235
|
-
converter = config.get_client(num_concurrent_pages=1)
|
|
333
|
+
converter = config.get_client(num_concurrent_pages=1, debug=True)
|
|
236
334
|
|
|
237
335
|
document = converter(file_path)
|
|
238
336
|
|
|
@@ -39,14 +39,13 @@ class TestBatchParser:
|
|
|
39
39
|
mock_docker_registry.get.return_value = mock_config
|
|
40
40
|
|
|
41
41
|
# Initialize
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
assert parser.client == mock_client
|
|
42
|
+
with ConverterWithServer(model="test_model", with_vllm_server=True) as parser:
|
|
43
|
+
# Verify interactions
|
|
44
|
+
mock_docker_registry.get.assert_called_with("test_model", default=True)
|
|
45
|
+
mock_config.get_server.assert_called_with(auto_stop=True)
|
|
46
|
+
mock_server.start.assert_called_once()
|
|
47
|
+
mock_config.get_client.assert_called_once()
|
|
48
|
+
assert parser.client == mock_client
|
|
50
49
|
|
|
51
50
|
def test_init_no_docker_fallback(
|
|
52
51
|
self, mock_docker_registry, mock_converter_registry
|
|
@@ -61,13 +60,12 @@ class TestBatchParser:
|
|
|
61
60
|
mock_converter_registry.get.return_value = mock_converter_config
|
|
62
61
|
|
|
63
62
|
# Initialize
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
assert parser.client == mock_client
|
|
63
|
+
with ConverterWithServer(model="test_model") as parser:
|
|
64
|
+
# Verify interactions
|
|
65
|
+
mock_docker_registry.get.assert_called_with("test_model", default=False)
|
|
66
|
+
mock_converter_registry.get.assert_called_with("test_model")
|
|
67
|
+
mock_converter_config.get_client.assert_called_once()
|
|
68
|
+
assert parser.client == mock_client
|
|
71
69
|
|
|
72
70
|
def test_init_with_uri(self, mock_converter_registry):
|
|
73
71
|
"""Test initialization with explicit URI."""
|
|
@@ -76,13 +74,12 @@ class TestBatchParser:
|
|
|
76
74
|
mock_config.get_client.return_value = mock_client
|
|
77
75
|
mock_converter_registry.get.return_value = mock_config
|
|
78
76
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
assert parser.client == mock_client
|
|
77
|
+
with ConverterWithServer(model="test_model", uri="http://custom.uri") as parser:
|
|
78
|
+
mock_converter_registry.get.assert_called_with(
|
|
79
|
+
"test_model", uri="http://custom.uri"
|
|
80
|
+
)
|
|
81
|
+
mock_config.get_client.assert_called_once()
|
|
82
|
+
assert parser.client == mock_client
|
|
86
83
|
|
|
87
84
|
def test_parse_updates_client_config(
|
|
88
85
|
self, mock_docker_registry, mock_get_file_paths, tmp_path
|
|
@@ -100,27 +97,30 @@ class TestBatchParser:
|
|
|
100
97
|
mock_doc = MagicMock(spec=Document)
|
|
101
98
|
mock_client.batch.return_value = [mock_doc, mock_doc]
|
|
102
99
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
100
|
+
with ConverterWithServer(model="test_model") as parser:
|
|
101
|
+
# Call parse
|
|
102
|
+
documents = parser.parse(
|
|
103
|
+
inputs=["dummy"],
|
|
104
|
+
out_folder=str(tmp_path),
|
|
105
|
+
mode="md",
|
|
106
|
+
dpi=300,
|
|
107
|
+
debug=True,
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
# Verify client config updates
|
|
111
|
+
assert mock_client.config.dpi == 300
|
|
112
|
+
assert mock_client.debug is True
|
|
113
|
+
assert mock_client.save_mode == "md"
|
|
114
|
+
# Concurrency should be 1 because debug=True
|
|
115
|
+
assert mock_client.num_concurrent_files == 1
|
|
116
|
+
assert mock_client.num_concurrent_pages == 1
|
|
117
|
+
|
|
118
|
+
# Verify batch call
|
|
119
|
+
mock_client.batch.assert_called_once_with(["file1.pdf", "file2.pdf"])
|
|
120
|
+
|
|
121
|
+
# Verify result
|
|
122
|
+
assert len(documents) == 2
|
|
123
|
+
assert documents[0] == mock_doc
|
|
124
124
|
|
|
125
125
|
def test_parse_retry_logic(
|
|
126
126
|
self, mock_docker_registry, mock_get_file_paths, tmp_path
|
|
@@ -143,19 +143,18 @@ class TestBatchParser:
|
|
|
143
143
|
# Input has file1 (processed) and file2 (new)
|
|
144
144
|
mock_get_file_paths.return_value = ["path/to/file1.pdf", "path/to/file2.pdf"]
|
|
145
145
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
parser.parse(inputs=["dummy"], out_folder=str(tmp_path), retrylast=True)
|
|
146
|
+
with ConverterWithServer(model="test_model") as parser:
|
|
147
|
+
# Call parse with retrylast
|
|
148
|
+
parser.parse(inputs=["dummy"], out_folder=str(tmp_path), retrylast=True)
|
|
150
149
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
150
|
+
# Verify only file2 was sent to batch
|
|
151
|
+
# file1 should be filtered out because file1.zip exists
|
|
152
|
+
call_args = mock_client.batch.call_args
|
|
153
|
+
assert call_args is not None
|
|
154
|
+
batch_files = call_args[0][0]
|
|
155
|
+
assert len(batch_files) == 1
|
|
156
|
+
assert "file2.pdf" in batch_files[0]
|
|
157
|
+
assert "file1.pdf" not in batch_files[0]
|
|
159
158
|
|
|
160
159
|
def test_parse_retry_no_previous_runs(
|
|
161
160
|
self, mock_docker_registry, mock_get_file_paths, tmp_path
|
|
@@ -166,9 +165,8 @@ class TestBatchParser:
|
|
|
166
165
|
mock_config.get_client.return_value = mock_client
|
|
167
166
|
mock_docker_registry.get.return_value = mock_config
|
|
168
167
|
|
|
169
|
-
|
|
168
|
+
with ConverterWithServer(model="test_model") as parser:
|
|
169
|
+
# tmp_path is empty, so os.listdir(tmp_path) will be empty
|
|
170
170
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
with pytest.raises(ValueError, match="No previous runs found"):
|
|
174
|
-
parser.parse(inputs=["dummy"], out_folder=str(tmp_path), retrylast=True)
|
|
171
|
+
with pytest.raises(ValueError, match="No previous runs found"):
|
|
172
|
+
parser.parse(inputs=["dummy"], out_folder=str(tmp_path), retrylast=True)
|