vlmparse 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. {vlmparse-0.1.4/vlmparse.egg-info → vlmparse-0.1.6}/PKG-INFO +22 -6
  2. {vlmparse-0.1.4 → vlmparse-0.1.6}/README.md +21 -5
  3. {vlmparse-0.1.4 → vlmparse-0.1.6}/pyproject.toml +2 -6
  4. {vlmparse-0.1.4 → vlmparse-0.1.6}/tests/test_all_converters_mocked.py +104 -6
  5. {vlmparse-0.1.4 → vlmparse-0.1.6}/tests/test_batch_parser.py +58 -60
  6. {vlmparse-0.1.4 → vlmparse-0.1.6}/tests/test_cli.py +79 -129
  7. {vlmparse-0.1.4 → vlmparse-0.1.6}/tests/test_end2end.py +23 -22
  8. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/cli.py +26 -96
  9. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/chandra.py +1 -1
  10. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/docling.py +2 -2
  11. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/dotsocr.py +20 -7
  12. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/hunyuanocr.py +2 -1
  13. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/mineru.py +18 -19
  14. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/olmocr.py +1 -1
  15. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/openai_converter.py +14 -4
  16. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/paddleocrvl.py +2 -1
  17. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/converter_with_server.py +38 -11
  18. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/data_model/document.py +11 -1
  19. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/registries.py +3 -7
  20. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/servers/docker_server.py +16 -2
  21. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/servers/utils.py +3 -2
  22. {vlmparse-0.1.4 → vlmparse-0.1.6/vlmparse.egg-info}/PKG-INFO +22 -6
  23. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/SOURCES.txt +0 -18
  24. vlmparse-0.1.4/tests/test_benchmark_tests.py +0 -731
  25. vlmparse-0.1.4/tests/test_process_and_run_benchmark.py +0 -144
  26. vlmparse-0.1.4/tests/test_table_tests.py +0 -1516
  27. vlmparse-0.1.4/vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
  28. vlmparse-0.1.4/vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  29. vlmparse-0.1.4/vlmparse/benchpdf2md/create_dataset.py +0 -60
  30. vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
  31. vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
  32. vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
  33. vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
  34. vlmparse-0.1.4/vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
  35. vlmparse-0.1.4/vlmparse/benchpdf2md/run_benchmark.py +0 -296
  36. vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
  37. vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
  38. vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
  39. vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
  40. vlmparse-0.1.4/vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
  41. vlmparse-0.1.4/vlmparse/benchpdf2md/utils.py +0 -56
  42. {vlmparse-0.1.4 → vlmparse-0.1.6}/LICENSE +0 -0
  43. {vlmparse-0.1.4 → vlmparse-0.1.6}/setup.cfg +0 -0
  44. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/base_model.py +0 -0
  45. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/build_doc.py +0 -0
  46. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/deepseekocr.py +51 -51
  47. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/granite_docling.py +0 -0
  48. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/lightonocr.py +0 -0
  49. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/nanonetocr.py +0 -0
  50. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
  51. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
  52. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/pipe_utils/utils.py +0 -0
  53. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/clients/prompts.py +0 -0
  54. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/constants.py +0 -0
  55. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/converter.py +0 -0
  56. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/data_model/box.py +0 -0
  57. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/st_viewer/fs_nav.py +0 -0
  58. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/st_viewer/st_viewer.py +0 -0
  59. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse/utils.py +0 -0
  60. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/dependency_links.txt +0 -0
  61. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/entry_points.txt +0 -0
  62. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/requires.txt +0 -0
  63. {vlmparse-0.1.4 → vlmparse-0.1.6}/vlmparse.egg-info/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vlmparse
3
- Version: 0.1.4
4
- Requires-Python: >=3.12.0
3
+ Version: 0.1.6
4
+ Requires-Python: >=3.11.0
5
5
  Description-Content-Type: text/markdown
6
6
  License-File: LICENSE
7
7
  Requires-Dist: devtools>=0.12.2
@@ -72,6 +72,19 @@ Supported Converters:
72
72
 
73
73
  ## Installation
74
74
 
75
+ Simplest solution with only the cli:
76
+
77
+ ```bash
78
+ uv tool install vlmparse
79
+ ```
80
+
81
+ If you want to run the granite-docling model or use the streamlit viewing app:
82
+
83
+ ```bash
84
+ uv tool install vlmparse[docling_core,st_app]
85
+ ```
86
+
87
+ If you prefer cloning the repository and using the local version:
75
88
  ```bash
76
89
  uv sync
77
90
  ```
@@ -86,10 +99,11 @@ Activate the virtual environment:
86
99
  ```bash
87
100
  source .venv/bin/activate
88
101
  ```
89
- Other solution: append uv run to all the commands below.
90
102
 
91
103
  ## CLI Usage
92
104
 
105
+ Note that you can skip the installation step above and simply prefix each of the commands below with `uvx`.
106
+
93
107
  ### Convert PDFs
94
108
 
95
109
  With a general VLM (requires setting your api key as an environment variable):
@@ -185,11 +199,13 @@ server.stop()
185
199
  ```
186
200
 
187
201
 
188
- Converter with automatic server deployment:
202
+ Converter with automatic server management:
189
203
 
190
204
  ```python
191
205
  from vlmparse.converter_with_server import ConverterWithServer
192
206
 
193
- converter_with_server = ConverterWithServer(model="mineru2.5")
194
- documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
207
+ with ConverterWithServer(model="mineru2.5") as converter_with_server:
208
+ documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
195
209
  ```
210
+
211
+ Note that if you pass the URI of a vLLM server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
@@ -18,6 +18,19 @@ Supported Converters:
18
18
 
19
19
  ## Installation
20
20
 
21
+ Simplest solution with only the cli:
22
+
23
+ ```bash
24
+ uv tool install vlmparse
25
+ ```
26
+
27
+ If you want to run the granite-docling model or use the streamlit viewing app:
28
+
29
+ ```bash
30
+ uv tool install vlmparse[docling_core,st_app]
31
+ ```
32
+
33
+ If you prefer cloning the repository and using the local version:
21
34
  ```bash
22
35
  uv sync
23
36
  ```
@@ -32,10 +45,11 @@ Activate the virtual environment:
32
45
  ```bash
33
46
  source .venv/bin/activate
34
47
  ```
35
- Other solution: append uv run to all the commands below.
36
48
 
37
49
  ## CLI Usage
38
50
 
51
+ Note that you can skip the installation step above and simply prefix each of the commands below with `uvx`.
52
+
39
53
  ### Convert PDFs
40
54
 
41
55
  With a general VLM (requires setting your api key as an environment variable):
@@ -131,11 +145,13 @@ server.stop()
131
145
  ```
132
146
 
133
147
 
134
- Converter with automatic server deployment:
148
+ Converter with automatic server management:
135
149
 
136
150
  ```python
137
151
  from vlmparse.converter_with_server import ConverterWithServer
138
152
 
139
- converter_with_server = ConverterWithServer(model="mineru2.5")
140
- documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
141
- ```
153
+ with ConverterWithServer(model="mineru2.5") as converter_with_server:
154
+ documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
155
+ ```
156
+
157
+ Note that if you pass the URI of a vLLM server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
@@ -2,17 +2,13 @@
2
2
  requires = ["setuptools", "wheel"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
- [metadata]
6
- name = "vlmparse"
7
- version = "0.1.0"
8
-
9
5
  [project]
10
6
  name = "vlmparse"
11
- version = "0.1.4"
7
+ version = "0.1.6"
12
8
  authors = []
13
9
  description = ""
14
10
  readme = "README.md"
15
- requires-python = ">=3.12.0"
11
+ requires-python = ">=3.11.0"
16
12
  dependencies = [
17
13
  "devtools>=0.12.2",
18
14
  "docker>=7.1.0",
@@ -5,6 +5,7 @@ This avoids the need to deploy actual Docker servers.
5
5
 
6
6
  from unittest.mock import AsyncMock, MagicMock, patch
7
7
 
8
+ import orjson
8
9
  import pytest
9
10
 
10
11
  from vlmparse.data_model.document import Document, Page
@@ -26,6 +27,10 @@ def mock_openai_client():
26
27
  mock_response = MagicMock()
27
28
  mock_response.choices = [MagicMock()]
28
29
  mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
30
+ mock_response.usage = MagicMock()
31
+ mock_response.usage.prompt_tokens = 50
32
+ mock_response.usage.completion_tokens = 150
33
+ mock_response.usage.reasoning_tokens = 30
29
34
 
30
35
  # Configure the async method
31
36
  mock_instance = MagicMock()
@@ -42,7 +47,10 @@ def dotsocr_mock_client():
42
47
  mock_response = MagicMock()
43
48
  mock_response.choices = [MagicMock()]
44
49
  mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
45
-
50
+ mock_response.usage = MagicMock()
51
+ mock_response.usage.prompt_tokens = 40
52
+ mock_response.usage.completion_tokens = 160
53
+ mock_response.usage.reasoning_tokens = 20
46
54
  mock_instance = MagicMock()
47
55
  mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
48
56
  mock_client.return_value = mock_instance
@@ -87,7 +95,7 @@ class TestConverterConfigs:
87
95
  ):
88
96
  """Test basic document processing for OpenAI-compatible converters."""
89
97
  config = converter_config_registry.get(model_name)
90
- converter = config.get_client(num_concurrent_pages=2)
98
+ converter = config.get_client(num_concurrent_pages=2, debug=True)
91
99
 
92
100
  # Process document
93
101
  document = converter(file_path)
@@ -112,7 +120,7 @@ class TestConverterConfigs:
112
120
  image_path = datadir / "page_with_formula.png"
113
121
 
114
122
  config = converter_config_registry.get(model_name)
115
- converter = config.get_client()
123
+ converter = config.get_client(debug=True)
116
124
 
117
125
  # Process image
118
126
  document = converter(image_path)
@@ -134,7 +142,7 @@ class TestConverterConfigs:
134
142
  def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
135
143
  """Test DotsOCR converter in OCR mode."""
136
144
  config = converter_config_registry.get("dotsocr")
137
- converter = config.get_client(num_concurrent_pages=2)
145
+ converter = config.get_client(num_concurrent_pages=2, debug=True)
138
146
 
139
147
  # Process document
140
148
  document = converter(file_path)
@@ -192,6 +200,7 @@ class TestConverterBatchProcessing:
192
200
  num_concurrent_files=2,
193
201
  num_concurrent_pages=2,
194
202
  return_documents_in_batch_mode=True,
203
+ debug=True,
195
204
  )
196
205
 
197
206
  # Process multiple files (same file for testing)
@@ -205,6 +214,95 @@ class TestConverterBatchProcessing:
205
214
  assert len(doc.pages) == 2
206
215
 
207
216
 
217
+ @pytest.fixture
218
+ def mineru_mock_httpx_client():
219
+ """Mock the httpx AsyncClient used by MinerUConverter."""
220
+ with patch("httpx.AsyncClient") as mock_async_client:
221
+ mock_client = MagicMock()
222
+ mock_async_client.return_value = mock_client
223
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
224
+ mock_client.__aexit__ = AsyncMock(return_value=None)
225
+
226
+ mock_response = MagicMock()
227
+ mock_response.raise_for_status = MagicMock()
228
+ mock_response.content = orjson.dumps(
229
+ [
230
+ {
231
+ "bbox": [0.1, 0.2, 0.3, 0.4],
232
+ "content": "<p>Hello MinerU</p>",
233
+ "type": "Text",
234
+ },
235
+ {
236
+ "bbox": [0.5, 0.6, 0.7, 0.8],
237
+ "content": "<p>Second block</p>",
238
+ "type": "Text",
239
+ },
240
+ ]
241
+ )
242
+
243
+ mock_client.post = AsyncMock(return_value=mock_response)
244
+ yield mock_client
245
+
246
+
247
+ class TestMinerUConverterMockedApi:
248
+ def test_mineru_converter_repeated_call(self, file_path, mineru_mock_httpx_client):
249
+ """Repeated `__call__` should keep working and call API each page."""
250
+ from vlmparse.clients.mineru import MinerUConverterConfig
251
+
252
+ config = MinerUConverterConfig(base_url="http://mineru.test")
253
+ converter = config.get_client(num_concurrent_pages=2, debug=True)
254
+
255
+ with (
256
+ patch("vlmparse.clients.mineru.clean_response", lambda x: x),
257
+ patch("vlmparse.clients.mineru.html_to_md_keep_tables", lambda x: x),
258
+ ):
259
+ doc1 = converter(file_path)
260
+ doc2 = converter(file_path)
261
+
262
+ assert isinstance(doc1, Document)
263
+ assert isinstance(doc2, Document)
264
+ assert len(doc1.pages) == 2
265
+ assert len(doc2.pages) == 2
266
+
267
+ for page in doc1.pages + doc2.pages:
268
+ assert isinstance(page, Page)
269
+ assert page.text is not None and len(page.text) > 0
270
+ assert page.items is not None
271
+ assert len(page.items) == 2
272
+
273
+ # 2 pages per doc * 2 docs
274
+ assert mineru_mock_httpx_client.post.call_count == 4
275
+
276
+ def test_mineru_converter_batch_processing(
277
+ self, file_path, mineru_mock_httpx_client
278
+ ):
279
+ """Batch mode should return documents and call API for each page."""
280
+ from vlmparse.clients.mineru import MinerUConverterConfig
281
+
282
+ config = MinerUConverterConfig(base_url="http://mineru.test")
283
+ converter = config.get_client(
284
+ num_concurrent_files=2,
285
+ num_concurrent_pages=2,
286
+ return_documents_in_batch_mode=True,
287
+ debug=True,
288
+ )
289
+
290
+ with (
291
+ patch("vlmparse.clients.mineru.clean_response", lambda x: x),
292
+ patch("vlmparse.clients.mineru.html_to_md_keep_tables", lambda x: x),
293
+ ):
294
+ docs = converter.batch([file_path, file_path])
295
+
296
+ assert isinstance(docs, list)
297
+ assert len(docs) == 2
298
+ for doc in docs:
299
+ assert isinstance(doc, Document)
300
+ assert len(doc.pages) == 2
301
+
302
+ # 2 pages per doc * 2 docs
303
+ assert mineru_mock_httpx_client.post.call_count == 4
304
+
305
+
208
306
  class TestCustomURI:
209
307
  """Test converter initialization with custom URIs."""
210
308
 
@@ -216,7 +314,7 @@ class TestCustomURI:
216
314
  assert config.llm_params.base_url == custom_uri
217
315
 
218
316
  # Test it works
219
- converter = config.get_client()
317
+ converter = config.get_client(debug=True)
220
318
  document = converter(file_path)
221
319
 
222
320
  assert isinstance(document, Document)
@@ -232,7 +330,7 @@ class TestConcurrency:
232
330
  ):
233
331
  """Test that concurrent page processing limits are respected."""
234
332
  config = converter_config_registry.get(model_name)
235
- converter = config.get_client(num_concurrent_pages=1)
333
+ converter = config.get_client(num_concurrent_pages=1, debug=True)
236
334
 
237
335
  document = converter(file_path)
238
336
 
@@ -39,14 +39,13 @@ class TestBatchParser:
39
39
  mock_docker_registry.get.return_value = mock_config
40
40
 
41
41
  # Initialize
42
- parser = ConverterWithServer(model="test_model", with_vllm_server=True)
43
-
44
- # Verify interactions
45
- mock_docker_registry.get.assert_called_with("test_model", default=True)
46
- mock_config.get_server.assert_called_with(auto_stop=True)
47
- mock_server.start.assert_called_once()
48
- mock_config.get_client.assert_called_once()
49
- assert parser.client == mock_client
42
+ with ConverterWithServer(model="test_model", with_vllm_server=True) as parser:
43
+ # Verify interactions
44
+ mock_docker_registry.get.assert_called_with("test_model", default=True)
45
+ mock_config.get_server.assert_called_with(auto_stop=True)
46
+ mock_server.start.assert_called_once()
47
+ mock_config.get_client.assert_called_once()
48
+ assert parser.client == mock_client
50
49
 
51
50
  def test_init_no_docker_fallback(
52
51
  self, mock_docker_registry, mock_converter_registry
@@ -61,13 +60,12 @@ class TestBatchParser:
61
60
  mock_converter_registry.get.return_value = mock_converter_config
62
61
 
63
62
  # Initialize
64
- parser = ConverterWithServer(model="test_model")
65
-
66
- # Verify interactions
67
- mock_docker_registry.get.assert_called_with("test_model", default=False)
68
- mock_converter_registry.get.assert_called_with("test_model")
69
- mock_converter_config.get_client.assert_called_once()
70
- assert parser.client == mock_client
63
+ with ConverterWithServer(model="test_model") as parser:
64
+ # Verify interactions
65
+ mock_docker_registry.get.assert_called_with("test_model", default=False)
66
+ mock_converter_registry.get.assert_called_with("test_model")
67
+ mock_converter_config.get_client.assert_called_once()
68
+ assert parser.client == mock_client
71
69
 
72
70
  def test_init_with_uri(self, mock_converter_registry):
73
71
  """Test initialization with explicit URI."""
@@ -76,13 +74,12 @@ class TestBatchParser:
76
74
  mock_config.get_client.return_value = mock_client
77
75
  mock_converter_registry.get.return_value = mock_config
78
76
 
79
- parser = ConverterWithServer(model="test_model", uri="http://custom.uri")
80
-
81
- mock_converter_registry.get.assert_called_with(
82
- "test_model", uri="http://custom.uri"
83
- )
84
- mock_config.get_client.assert_called_once()
85
- assert parser.client == mock_client
77
+ with ConverterWithServer(model="test_model", uri="http://custom.uri") as parser:
78
+ mock_converter_registry.get.assert_called_with(
79
+ "test_model", uri="http://custom.uri"
80
+ )
81
+ mock_config.get_client.assert_called_once()
82
+ assert parser.client == mock_client
86
83
 
87
84
  def test_parse_updates_client_config(
88
85
  self, mock_docker_registry, mock_get_file_paths, tmp_path
@@ -100,27 +97,30 @@ class TestBatchParser:
100
97
  mock_doc = MagicMock(spec=Document)
101
98
  mock_client.batch.return_value = [mock_doc, mock_doc]
102
99
 
103
- parser = ConverterWithServer(model="test_model")
104
-
105
- # Call parse
106
- documents = parser.parse(
107
- inputs=["dummy"], out_folder=str(tmp_path), mode="md", dpi=300, debug=True
108
- )
109
-
110
- # Verify client config updates
111
- assert mock_client.config.dpi == 300
112
- assert mock_client.debug is True
113
- assert mock_client.save_mode == "md"
114
- # Concurrency should be 1 because debug=True
115
- assert mock_client.num_concurrent_files == 1
116
- assert mock_client.num_concurrent_pages == 1
117
-
118
- # Verify batch call
119
- mock_client.batch.assert_called_once_with(["file1.pdf", "file2.pdf"])
120
-
121
- # Verify result
122
- assert len(documents) == 2
123
- assert documents[0] == mock_doc
100
+ with ConverterWithServer(model="test_model") as parser:
101
+ # Call parse
102
+ documents = parser.parse(
103
+ inputs=["dummy"],
104
+ out_folder=str(tmp_path),
105
+ mode="md",
106
+ dpi=300,
107
+ debug=True,
108
+ )
109
+
110
+ # Verify client config updates
111
+ assert mock_client.config.dpi == 300
112
+ assert mock_client.debug is True
113
+ assert mock_client.save_mode == "md"
114
+ # Concurrency should be 1 because debug=True
115
+ assert mock_client.num_concurrent_files == 1
116
+ assert mock_client.num_concurrent_pages == 1
117
+
118
+ # Verify batch call
119
+ mock_client.batch.assert_called_once_with(["file1.pdf", "file2.pdf"])
120
+
121
+ # Verify result
122
+ assert len(documents) == 2
123
+ assert documents[0] == mock_doc
124
124
 
125
125
  def test_parse_retry_logic(
126
126
  self, mock_docker_registry, mock_get_file_paths, tmp_path
@@ -143,19 +143,18 @@ class TestBatchParser:
143
143
  # Input has file1 (processed) and file2 (new)
144
144
  mock_get_file_paths.return_value = ["path/to/file1.pdf", "path/to/file2.pdf"]
145
145
 
146
- parser = ConverterWithServer(model="test_model")
147
-
148
- # Call parse with retrylast
149
- parser.parse(inputs=["dummy"], out_folder=str(tmp_path), retrylast=True)
146
+ with ConverterWithServer(model="test_model") as parser:
147
+ # Call parse with retrylast
148
+ parser.parse(inputs=["dummy"], out_folder=str(tmp_path), retrylast=True)
150
149
 
151
- # Verify only file2 was sent to batch
152
- # file1 should be filtered out because file1.zip exists
153
- call_args = mock_client.batch.call_args
154
- assert call_args is not None
155
- batch_files = call_args[0][0]
156
- assert len(batch_files) == 1
157
- assert "file2.pdf" in batch_files[0]
158
- assert "file1.pdf" not in batch_files[0]
150
+ # Verify only file2 was sent to batch
151
+ # file1 should be filtered out because file1.zip exists
152
+ call_args = mock_client.batch.call_args
153
+ assert call_args is not None
154
+ batch_files = call_args[0][0]
155
+ assert len(batch_files) == 1
156
+ assert "file2.pdf" in batch_files[0]
157
+ assert "file1.pdf" not in batch_files[0]
159
158
 
160
159
  def test_parse_retry_no_previous_runs(
161
160
  self, mock_docker_registry, mock_get_file_paths, tmp_path
@@ -166,9 +165,8 @@ class TestBatchParser:
166
165
  mock_config.get_client.return_value = mock_client
167
166
  mock_docker_registry.get.return_value = mock_config
168
167
 
169
- parser = ConverterWithServer(model="test_model")
168
+ with ConverterWithServer(model="test_model") as parser:
169
+ # tmp_path is empty, so os.listdir(tmp_path) will be empty
170
170
 
171
- # tmp_path is empty, so os.listdir(tmp_path) will be empty
172
-
173
- with pytest.raises(ValueError, match="No previous runs found"):
174
- parser.parse(inputs=["dummy"], out_folder=str(tmp_path), retrylast=True)
171
+ with pytest.raises(ValueError, match="No previous runs found"):
172
+ parser.parse(inputs=["dummy"], out_folder=str(tmp_path), retrylast=True)