vlmparse-0.1.0.tar.gz

vlmparse-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 WisTex TechSero Ltd. Co.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,184 @@
+ Metadata-Version: 2.4
+ Name: vlmparse
+ Version: 0.1.0
+ Requires-Python: >=3.12.0
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: devtools>=0.12.2
+ Requires-Dist: docker>=7.1.0
+ Requires-Dist: html-to-markdown>=1.9.0
+ Requires-Dist: httpx>=0.27.0
+ Requires-Dist: loguru>=0.7.3
+ Requires-Dist: nest-asyncio>=1.6.0
+ Requires-Dist: numpy>=2.3.2
+ Requires-Dist: openai>=1.102.0
+ Requires-Dist: orjson>=3.11.3
+ Requires-Dist: pillow>=11.3.0
+ Requires-Dist: pydantic
+ Requires-Dist: pypdfium2>=4.30.0
+ Requires-Dist: fire>=0.7.1
+ Requires-Dist: lxml>=6.0.2
+ Requires-Dist: tabulate>=0.9.0
+ Provides-Extra: dev
+ Requires-Dist: jupyter; extra == "dev"
+ Provides-Extra: docling-core
+ Requires-Dist: docling-core; extra == "docling-core"
+ Provides-Extra: st-app
+ Requires-Dist: streamlit>=1.49.0; extra == "st-app"
+ Provides-Extra: bench
+ Requires-Dist: html-to-markdown>=1.9.0; extra == "bench"
+ Requires-Dist: loguru>=0.7.3; extra == "bench"
+ Requires-Dist: nest-asyncio>=1.6.0; extra == "bench"
+ Requires-Dist: numpy>=2.3.2; extra == "bench"
+ Requires-Dist: pillow>=11.3.0; extra == "bench"
+ Requires-Dist: pydantic; extra == "bench"
+ Requires-Dist: rapidfuzz>=3.14.0; extra == "bench"
+ Requires-Dist: unidecode>=1.4.0; extra == "bench"
+ Requires-Dist: fire>=0.7.1; extra == "bench"
+ Requires-Dist: lxml>=6.0.2; extra == "bench"
+ Requires-Dist: datasets>=4.4.1; extra == "bench"
+ Requires-Dist: openpyxl>=3.1.5; extra == "bench"
+ Requires-Dist: joblib>=1.5.2; extra == "bench"
+ Requires-Dist: playwright; extra == "bench"
+ Requires-Dist: fuzzysearch>=0.8.1; extra == "bench"
+ Provides-Extra: test
+ Requires-Dist: pre-commit; extra == "test"
+ Requires-Dist: pytest; extra == "test"
+ Requires-Dist: pytest-cov; extra == "test"
+ Requires-Dist: pytest-mock; extra == "test"
+ Requires-Dist: ruff; extra == "test"
+ Requires-Dist: isort; extra == "test"
+ Dynamic: license-file
+
+ # vlmparse
+
+ A unified wrapper around Vision Language Model (VLM) and OCR solutions for parsing PDF documents into Markdown.
+
+ Features:
+
+ - ⚡ Async/concurrent processing for high throughput
+ - 🐳 Automatic Docker server management for local models
+ - 🔄 Unified interface across all VLM/OCR providers
+ - 📊 Built-in result visualization with Streamlit
+
+ Supported Converters:
+
+ - **Open Source Small VLMs**: `lightonocr`, `mineru2.5`, `hunyuanocr`, `paddleocrvl`, `granite-docling`, `olmocr2-fp8`, `dotsocr`, `chandra`, `deepseekocr`, `nanonets/Nanonets-OCR2-3B`
+ - **Open Source Generalist VLMs**: models such as the Qwen family
+ - **Pipelines**: `docling`
+ - **Proprietary LLMs**: `gemini`, `gpt`
+
+ ## Installation
+
+ ```bash
+ uv sync
+ ```
+
+ With optional dependencies:
+
+ ```bash
+ uv sync --all-extras
+ ```
+
+ Activate the virtual environment:
+
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ Alternatively, prefix all the commands below with `uv run`.
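+
+ For example:
+
+ ```bash
+ uv run vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```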
+
+ ## CLI Usage
+
+ ### Convert PDFs
+
+ With a general VLM (requires setting your API key as an environment variable):
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```
+
+ Convert with automatic deployment of a small VLM (or any Hugging Face VLM model; requires a GPU and a Docker installation):
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model nanonets/Nanonets-OCR2-3B
+ ```
+
+ ### Deploy a local model server
+
+ Deployment (requires a GPU and a Docker installation):
+ - You need a dedicated GPU for this.
+ - Check that the port is not already in use by another service.
+
+ ```bash
+ vlmparse serve --model lightonocr --port 8000 --gpus 1
+ ```
+
+ Then convert:
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model lightonocr --uri http://localhost:8000/v1
+ ```
+
+ You can also list all running servers:
+
+ ```bash
+ vlmparse list
+ ```
+
+ Show the logs of a server (if only one server is running, the container name can be omitted):
+
+ ```bash
+ vlmparse log <container_name>
+ ```
+
+ Stop a server (if only one server is running, the container name can be omitted):
+
+ ```bash
+ vlmparse stop <container_name>
+ ```
+
+ ### View conversion results with Streamlit
+
+ ```bash
+ vlmparse view ./output
+ ```
+
+ ## Configuration
+
+ Set API keys as environment variables:
+
+ ```bash
+ export GOOGLE_API_KEY="your-key"
+ export OPENAI_API_KEY="your-key"
+ ```
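+
+ Since the keys are read from the environment, they can also be scoped to a single invocation:
+
+ ```bash
+ GOOGLE_API_KEY="your-key" vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```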
+
+ ## Python API
+
+ Client interface:
+
+ ```python
+ from vlmparse.registries import converter_config_registry
+
+ # Get a converter configuration
+ config = converter_config_registry.get("gemini-2.5-flash-lite")
+ client = config.get_client()
+
+ # Convert a single PDF
+ document = client("path/to/document.pdf")
+ print(document.to_markdown())
+
+ # Batch convert multiple PDFs
+ documents = client.batch(["file1.pdf", "file2.pdf"])
+ ```
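+
+ A minimal sketch of writing batch results to disk, assuming `client.batch` returns one `Document` per input path in input order:
+
+ ```python
+ from pathlib import Path
+
+ out_dir = Path("./output")
+ out_dir.mkdir(exist_ok=True)
+
+ paths = ["file1.pdf", "file2.pdf"]
+ documents = client.batch(paths)  # assumed to preserve input order
+ for pdf, document in zip(paths, documents):
+     # One markdown file per input PDF, e.g. ./output/file1.md
+     (out_dir / f"{Path(pdf).stem}.md").write_text(document.to_markdown())
+ ```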
+
+ Docker server interface:
+
+ ```python
+ from vlmparse.registries import docker_config_registry
+
+ config = docker_config_registry.get("lightonocr")
+ server = config.get_server()
+ server.start()
+
+ # Client calls...
+
+ server.stop()
+ ```
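+
+ The two interfaces can be combined. A minimal sketch, assuming the server listens on `http://localhost:8000/v1` (as in the CLI example above) and that `converter_config_registry.get` accepts a `uri` override for this model:
+
+ ```python
+ from vlmparse.registries import converter_config_registry, docker_config_registry
+
+ server = docker_config_registry.get("lightonocr").get_server()
+ server.start()
+ try:
+     # Point the converter client at the locally served model
+     config = converter_config_registry.get("lightonocr", uri="http://localhost:8000/v1")
+     document = config.get_client()("path/to/document.pdf")
+     print(document.to_markdown())
+ finally:
+     server.stop()
+ ```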
@@ -0,0 +1,131 @@
+ # vlmparse
+
+ A unified wrapper around Vision Language Model (VLM) and OCR solutions for parsing PDF documents into Markdown.
+
+ Features:
+
+ - ⚡ Async/concurrent processing for high throughput
+ - 🐳 Automatic Docker server management for local models
+ - 🔄 Unified interface across all VLM/OCR providers
+ - 📊 Built-in result visualization with Streamlit
+
+ Supported Converters:
+
+ - **Open Source Small VLMs**: `lightonocr`, `mineru2.5`, `hunyuanocr`, `paddleocrvl`, `granite-docling`, `olmocr2-fp8`, `dotsocr`, `chandra`, `deepseekocr`, `nanonets/Nanonets-OCR2-3B`
+ - **Open Source Generalist VLMs**: models such as the Qwen family
+ - **Pipelines**: `docling`
+ - **Proprietary LLMs**: `gemini`, `gpt`
+
+ ## Installation
+
+ ```bash
+ uv sync
+ ```
+
+ With optional dependencies:
+
+ ```bash
+ uv sync --all-extras
+ ```
+
+ Activate the virtual environment:
+
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ Alternatively, prefix all the commands below with `uv run`.
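+
+ For example:
+
+ ```bash
+ uv run vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```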
+
+ ## CLI Usage
+
+ ### Convert PDFs
+
+ With a general VLM (requires setting your API key as an environment variable):
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```
+
+ Convert with automatic deployment of a small VLM (or any Hugging Face VLM model; requires a GPU and a Docker installation):
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model nanonets/Nanonets-OCR2-3B
+ ```
+
+ ### Deploy a local model server
+
+ Deployment (requires a GPU and a Docker installation):
+ - You need a dedicated GPU for this.
+ - Check that the port is not already in use by another service.
+
+ ```bash
+ vlmparse serve --model lightonocr --port 8000 --gpus 1
+ ```
+
+ Then convert:
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model lightonocr --uri http://localhost:8000/v1
+ ```
+
+ You can also list all running servers:
+
+ ```bash
+ vlmparse list
+ ```
+
+ Show the logs of a server (if only one server is running, the container name can be omitted):
+
+ ```bash
+ vlmparse log <container_name>
+ ```
+
+ Stop a server (if only one server is running, the container name can be omitted):
+
+ ```bash
+ vlmparse stop <container_name>
+ ```
+
+ ### View conversion results with Streamlit
+
+ ```bash
+ vlmparse view ./output
+ ```
+
+ ## Configuration
+
+ Set API keys as environment variables:
+
+ ```bash
+ export GOOGLE_API_KEY="your-key"
+ export OPENAI_API_KEY="your-key"
+ ```
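+
+ Since the keys are read from the environment, they can also be scoped to a single invocation:
+
+ ```bash
+ GOOGLE_API_KEY="your-key" vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```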
+
+ ## Python API
+
+ Client interface:
+
+ ```python
+ from vlmparse.registries import converter_config_registry
+
+ # Get a converter configuration
+ config = converter_config_registry.get("gemini-2.5-flash-lite")
+ client = config.get_client()
+
+ # Convert a single PDF
+ document = client("path/to/document.pdf")
+ print(document.to_markdown())
+
+ # Batch convert multiple PDFs
+ documents = client.batch(["file1.pdf", "file2.pdf"])
+ ```
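+
+ A minimal sketch of writing batch results to disk, assuming `client.batch` returns one `Document` per input path in input order:
+
+ ```python
+ from pathlib import Path
+
+ out_dir = Path("./output")
+ out_dir.mkdir(exist_ok=True)
+
+ paths = ["file1.pdf", "file2.pdf"]
+ documents = client.batch(paths)  # assumed to preserve input order
+ for pdf, document in zip(paths, documents):
+     # One markdown file per input PDF, e.g. ./output/file1.md
+     (out_dir / f"{Path(pdf).stem}.md").write_text(document.to_markdown())
+ ```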
+
+ Docker server interface:
+
+ ```python
+ from vlmparse.registries import docker_config_registry
+
+ config = docker_config_registry.get("lightonocr")
+ server = config.get_server()
+ server.start()
+
+ # Client calls...
+
+ server.stop()
+ ```
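+
+ The two interfaces can be combined. A minimal sketch, assuming the server listens on `http://localhost:8000/v1` (as in the CLI example above) and that `converter_config_registry.get` accepts a `uri` override for this model:
+
+ ```python
+ from vlmparse.registries import converter_config_registry, docker_config_registry
+
+ server = docker_config_registry.get("lightonocr").get_server()
+ server.start()
+ try:
+     # Point the converter client at the locally served model
+     config = converter_config_registry.get("lightonocr", uri="http://localhost:8000/v1")
+     document = config.get_client()("path/to/document.pdf")
+     print(document.to_markdown())
+ finally:
+     server.stop()
+ ```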
@@ -0,0 +1,88 @@
+ [build-system]
+ requires = ["setuptools", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [metadata]
+ name = "vlmparse"
+ version = "0.1.0"
+
+ [project]
+ name = "vlmparse"
+ version = "0.1.0"
+ authors = []
+ description = ""
+ readme = "README.md"
+ requires-python = ">=3.12.0"
+ dependencies = [
+     "devtools>=0.12.2",
+     "docker>=7.1.0",
+     "html-to-markdown>=1.9.0",
+     "httpx>=0.27.0",
+     "loguru>=0.7.3",
+     "nest-asyncio>=1.6.0",
+     "numpy>=2.3.2",
+     "openai>=1.102.0",
+     "orjson>=3.11.3",
+     "pillow>=11.3.0",
+     "pydantic",
+     "pypdfium2>=4.30.0",
+     "fire>=0.7.1",
+     "lxml>=6.0.2",
+     "tabulate>=0.9.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "jupyter",
+ ]
+ docling_core = [
+     "docling-core",
+ ]
+ st_app = [
+     "streamlit>=1.49.0",
+ ]
+ bench = [
+     "html-to-markdown>=1.9.0",
+     "loguru>=0.7.3",
+     "nest-asyncio>=1.6.0",
+     "numpy>=2.3.2",
+     "pillow>=11.3.0",
+     "pydantic",
+     "rapidfuzz>=3.14.0",
+     "unidecode>=1.4.0",
+     "fire>=0.7.1",
+     "lxml>=6.0.2",
+     "datasets>=4.4.1",
+     "openpyxl>=3.1.5",
+     "joblib>=1.5.2",
+     "playwright",
+     "fuzzysearch>=0.8.1",
+ ]
+ test = [
+     "pre-commit",
+     "pytest",
+     "pytest-cov",
+     "pytest-mock",
+     "ruff",
+     "isort",
+ ]
+
+ [tool.setuptools.packages.find]
+ include = ["vlmparse"]
+
+ [project.scripts]
+ vlmparse = "vlmparse.cli:main"
+
+ [tool.ruff.lint]
+ select = ["E4", "E7", "E9", "F", "B", "A"]
+ ignore = ["E741", "B006"]
+
+ [tool.ruff.lint.per-file-ignores]
+ "__init__.py" = ["E402"]
+
+ [tool.uv]
+ config-settings = { editable_mode = "compat" }
+
+ [tool.uv.sources]
+
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,216 @@
+ """
+ Test all converter configs with mocked OpenAI clients.
+ This avoids the need to deploy actual Docker servers.
+ """
+
+ from unittest.mock import AsyncMock, MagicMock, patch
+
+ import pytest
+
+ from vlmparse.data_model.document import Document, Page
+ from vlmparse.registries import converter_config_registry
+
+ # Mock responses for different model types
+ MOCK_RESPONSES = {
+     "default": "# Test Document\n\nThis is a test page with some content.",
+     "dotsocr_layout": '[{"bbox": [10, 10, 100, 50], "category": "Text", "text": "Test content"}]',
+     "dotsocr_ocr": "Test content from DotsOCR",
+ }
+
+
+ @pytest.fixture
+ def mock_openai_client():
+     """Mock the AsyncOpenAI client used by all converters."""
+     with patch("openai.AsyncOpenAI") as mock_client:
+         # Create mock response object
+         mock_response = MagicMock()
+         mock_response.choices = [MagicMock()]
+         mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
+
+         # Configure the async method
+         mock_instance = MagicMock()
+         mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
+         mock_client.return_value = mock_instance
+
+         yield mock_instance
+
+
+ @pytest.fixture
+ def dotsocr_mock_client():
+     """Mock for DotsOCR with different response types."""
+     with patch("openai.AsyncOpenAI") as mock_client:
+         mock_response = MagicMock()
+         mock_response.choices = [MagicMock()]
+         mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
+
+         mock_instance = MagicMock()
+         mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
+         mock_client.return_value = mock_instance
+
+         yield mock_instance
+
+
+ # List of all models registered in converter_config_registry
+ ALL_MODELS = [
+     "gemini-2.5-flash-lite",
+     "lightonocr",
+     "dotsocr",
+     "nanonets/Nanonets-OCR2-3B",
+ ]
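+
+ # NOTE: the tests below also use a `file_path` fixture that is not defined in
+ # this module; it presumably comes from a conftest.py and points to a two-page
+ # sample PDF (the tests assert len(document.pages) == 2).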
+
+
+ class TestConverterConfigs:
+     """Test suite for all converter configs."""
+
+     @pytest.mark.parametrize("model_name", ALL_MODELS)
+     def test_config_retrieval(self, model_name):
+         """Test that all registered models can be retrieved from registry."""
+         config = converter_config_registry.get(model_name)
+         assert config is not None, f"Config for {model_name} should not be None"
+
+     @pytest.mark.parametrize("model_name", ALL_MODELS)
+     def test_config_has_get_client(self, model_name):
+         """Test that all configs have get_client method."""
+         config = converter_config_registry.get(model_name)
+         assert hasattr(config, "get_client"), f"{model_name} config missing get_client"
+
+     @pytest.mark.parametrize(
+         "model_name",
+         [
+             "gemini-2.5-flash-lite",
+             "lightonocr",
+             "nanonets/Nanonets-OCR2-3B",
+         ],
+     )
+     def test_converter_basic_processing(
+         self, file_path, model_name, mock_openai_client
+     ):
+         """Test basic document processing for OpenAI-compatible converters."""
+         config = converter_config_registry.get(model_name)
+         converter = config.get_client(num_concurrent_pages=2)
+
+         # Process document
+         document = converter(file_path)
+
+         # Verify document structure
+         assert isinstance(document, Document)
+         assert document.file_path == str(file_path)
+         assert len(document.pages) == 2, f"Expected 2 pages, got {len(document.pages)}"
+
+         # Verify pages
+         for page in document.pages:
+             assert isinstance(page, Page)
+             assert page.text is not None, "Page text should not be None"
+             assert len(page.text) > 0, "Page text should not be empty"
+
+         # Verify API was called
+         assert mock_openai_client.chat.completions.create.call_count == 2
+
+     def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
+         """Test DotsOCR converter in OCR mode."""
+         config = converter_config_registry.get("dotsocr")
+         converter = config.get_client(num_concurrent_pages=2)
+
+         # Process document
+         document = converter(file_path)
+
+         # Verify document structure
+         assert isinstance(document, Document)
+         assert len(document.pages) == 2
+
+         for page in document.pages:
+             assert isinstance(page, Page)
+             assert page.text is not None
+             assert len(page.text) > 0
+
+         # Verify API was called
+         assert dotsocr_mock_client.chat.completions.create.call_count == 2
+
+     @pytest.mark.parametrize("model_name", ALL_MODELS)
+     def test_converter_error_handling(self, file_path, model_name):
+         """Test that converters handle errors gracefully."""
+         with patch("openai.AsyncOpenAI") as mock_client:
+             # Configure mock to raise an exception
+             mock_instance = MagicMock()
+             mock_instance.chat.completions.create = AsyncMock(
+                 side_effect=Exception("API Error")
+             )
+             mock_client.return_value = mock_instance
+
+             config = converter_config_registry.get(model_name)
+             converter = config.get_client(debug=False)
+
+             # Process should not crash
+             document = converter(file_path)
+
+             # Document should have error info in pages
+             assert isinstance(document, Document)
+             # Check that pages have errors
+             for page in document.pages:
+                 assert page.error is not None
+
+
+ class TestConverterBatchProcessing:
+     """Test batch processing capabilities."""
+
+     @pytest.mark.parametrize(
+         "model_name",
+         [
+             "gemini-2.5-flash-lite",
+             "lightonocr",
+         ],
+     )
+     def test_batch_processing(self, file_path, model_name, mock_openai_client):
+         """Test batch processing of multiple files."""
+         config = converter_config_registry.get(model_name)
+         converter = config.get_client(
+             num_concurrent_files=2,
+             num_concurrent_pages=2,
+             return_documents_in_batch_mode=True,
+         )
+
+         # Process multiple files (same file for testing)
+         file_paths = [file_path, file_path]
+         documents = converter.batch(file_paths)
+
+         # Verify results
+         assert len(documents) == 2
+         for doc in documents:
+             assert isinstance(doc, Document)
+             assert len(doc.pages) == 2
+
+
+ class TestCustomURI:
+     """Test converter initialization with custom URIs."""
+
+     def test_custom_uri_config(self, mock_openai_client, file_path):
+         """Test that converters can be initialized with custom URIs."""
+         custom_uri = "http://localhost:8000/v1"
+         config = converter_config_registry.get("gemini-2.5-flash-lite", uri=custom_uri)
+
+         assert config.llm_params.base_url == custom_uri
+
+         # Test it works
+         converter = config.get_client()
+         document = converter(file_path)
+
+         assert isinstance(document, Document)
+         assert len(document.pages) == 2
+
+
+ class TestConcurrency:
+     """Test concurrent processing settings."""
+
+     @pytest.mark.parametrize("model_name", ["gemini-2.5-flash-lite", "lightonocr"])
+     def test_concurrent_page_processing(
+         self, file_path, model_name, mock_openai_client
+     ):
+         """Test that concurrent page processing limits are respected."""
+         config = converter_config_registry.get(model_name)
+         converter = config.get_client(num_concurrent_pages=1)
+
+         document = converter(file_path)
+
+         assert len(document.pages) == 2
+         # With concurrency=1, calls should be sequential
+         assert mock_openai_client.chat.completions.create.call_count == 2