vlmparse-0.1.0.tar.gz

vlmparse-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 WisTex TechSero Ltd. Co.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,184 @@
+ Metadata-Version: 2.4
+ Name: vlmparse
+ Version: 0.1.0
+ Requires-Python: >=3.12.0
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: devtools>=0.12.2
+ Requires-Dist: docker>=7.1.0
+ Requires-Dist: html-to-markdown>=1.9.0
+ Requires-Dist: httpx>=0.27.0
+ Requires-Dist: loguru>=0.7.3
+ Requires-Dist: nest-asyncio>=1.6.0
+ Requires-Dist: numpy>=2.3.2
+ Requires-Dist: openai>=1.102.0
+ Requires-Dist: orjson>=3.11.3
+ Requires-Dist: pillow>=11.3.0
+ Requires-Dist: pydantic
+ Requires-Dist: pypdfium2>=4.30.0
+ Requires-Dist: fire>=0.7.1
+ Requires-Dist: lxml>=6.0.2
+ Requires-Dist: tabulate>=0.9.0
+ Provides-Extra: dev
+ Requires-Dist: jupyter; extra == "dev"
+ Provides-Extra: docling-core
+ Requires-Dist: docling-core; extra == "docling-core"
+ Provides-Extra: st-app
+ Requires-Dist: streamlit>=1.49.0; extra == "st-app"
+ Provides-Extra: bench
+ Requires-Dist: html-to-markdown>=1.9.0; extra == "bench"
+ Requires-Dist: loguru>=0.7.3; extra == "bench"
+ Requires-Dist: nest-asyncio>=1.6.0; extra == "bench"
+ Requires-Dist: numpy>=2.3.2; extra == "bench"
+ Requires-Dist: pillow>=11.3.0; extra == "bench"
+ Requires-Dist: pydantic; extra == "bench"
+ Requires-Dist: rapidfuzz>=3.14.0; extra == "bench"
+ Requires-Dist: unidecode>=1.4.0; extra == "bench"
+ Requires-Dist: fire>=0.7.1; extra == "bench"
+ Requires-Dist: lxml>=6.0.2; extra == "bench"
+ Requires-Dist: datasets>=4.4.1; extra == "bench"
+ Requires-Dist: openpyxl>=3.1.5; extra == "bench"
+ Requires-Dist: joblib>=1.5.2; extra == "bench"
+ Requires-Dist: playwright; extra == "bench"
+ Requires-Dist: fuzzysearch>=0.8.1; extra == "bench"
+ Provides-Extra: test
+ Requires-Dist: pre-commit; extra == "test"
+ Requires-Dist: pytest; extra == "test"
+ Requires-Dist: pytest-cov; extra == "test"
+ Requires-Dist: pytest-mock; extra == "test"
+ Requires-Dist: ruff; extra == "test"
+ Requires-Dist: isort; extra == "test"
+ Dynamic: license-file
+
+ # vlmparse
+
+ A unified wrapper around Vision Language Model (VLM) and OCR solutions for parsing PDF documents into Markdown.
+
+ Features:
+
+ - ⚡ Async/concurrent processing for high throughput
+ - 🐳 Automatic Docker server management for local models
+ - 🔄 Unified interface across all VLM/OCR providers
+ - 📊 Built-in result visualization with Streamlit
+
+ Supported Converters:
+
+ - **Open Source Small VLMs**: `lightonocr`, `mineru2.5`, `hunyuanocr`, `paddleocrvl`, `granite-docling`, `olmocr2-fp8`, `dotsocr`, `chandra`, `deepseekocr`, `nanonets/Nanonets-OCR2-3B`
+ - **Open Source Generalist VLMs**: models such as the Qwen family
+ - **Pipelines**: `docling`
+ - **Proprietary LLMs**: `gemini`, `gpt`
+
+ ## Installation
+
+ ```bash
+ uv sync
+ ```
+
+ With optional dependencies:
+
+ ```bash
+ uv sync --all-extras
+ ```
+
+ Activate the virtual environment:
+
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ Alternatively, prefix all the commands below with `uv run`.
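+
+ For example:
+
+ ```bash
+ uv run vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```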
+
+ ## CLI Usage
+
+ ### Convert PDFs
+
+ With a general VLM (requires setting your API key as an environment variable):
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```
+
+ Convert with automatic deployment of a small VLM (or any Hugging Face VLM model; requires a GPU and a Docker installation):
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model nanonets/Nanonets-OCR2-3B
+ ```
+
+ ### Deploy a local model server
+
+ Deployment (requires a GPU and a Docker installation):
+ - You need a dedicated GPU for this.
+ - Check that the port is not already in use by another service.
+
+ ```bash
+ vlmparse serve --model lightonocr --port 8000 --gpus 1
+ ```
+
+ Then convert:
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model lightonocr --uri http://localhost:8000/v1
+ ```
+
+ You can also list all running servers:
+
+ ```bash
+ vlmparse list
+ ```
+
+ Show the logs of a server (if only one server is running, the container name can be omitted):
+
+ ```bash
+ vlmparse log <container_name>
+ ```
+
+ Stop a server (if only one server is running, the container name can be omitted):
+
+ ```bash
+ vlmparse stop <container_name>
+ ```
+
+ ### View conversion results with Streamlit
+
+ ```bash
+ vlmparse view ./output
+ ```
+
+ ## Configuration
+
+ Set API keys as environment variables:
+
+ ```bash
+ export GOOGLE_API_KEY="your-key"
+ export OPENAI_API_KEY="your-key"
+ ```
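+
+ Since the keys are read from the environment, they can also be scoped to a single invocation:
+
+ ```bash
+ GOOGLE_API_KEY="your-key" vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```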
+
+ ## Python API
+
+ Client interface:
+
+ ```python
+ from vlmparse.registries import converter_config_registry
+
+ # Get a converter configuration
+ config = converter_config_registry.get("gemini-2.5-flash-lite")
+ client = config.get_client()
+
+ # Convert a single PDF
+ document = client("path/to/document.pdf")
+ print(document.to_markdown())
+
+ # Batch convert multiple PDFs
+ documents = client.batch(["file1.pdf", "file2.pdf"])
+ ```
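+
+ A minimal sketch of writing batch results to disk, assuming `client.batch` returns one `Document` per input path in input order:
+
+ ```python
+ from pathlib import Path
+
+ out_dir = Path("./output")
+ out_dir.mkdir(exist_ok=True)
+
+ paths = ["file1.pdf", "file2.pdf"]
+ documents = client.batch(paths)  # assumed to preserve input order
+ for pdf, document in zip(paths, documents):
+     # One markdown file per input PDF, e.g. ./output/file1.md
+     (out_dir / f"{Path(pdf).stem}.md").write_text(document.to_markdown())
+ ```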
+
+ Docker server interface:
+
+ ```python
+ from vlmparse.registries import docker_config_registry
+
+ config = docker_config_registry.get("lightonocr")
+ server = config.get_server()
+ server.start()
+
+ # Client calls...
+
+ server.stop()
+ ```
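+
+ The two interfaces can be combined. A minimal sketch, assuming the server listens on `http://localhost:8000/v1` (as in the CLI example above) and that `converter_config_registry.get` accepts a `uri` override for this model:
+
+ ```python
+ from vlmparse.registries import converter_config_registry, docker_config_registry
+
+ server = docker_config_registry.get("lightonocr").get_server()
+ server.start()
+ try:
+     # Point the converter client at the locally served model
+     config = converter_config_registry.get("lightonocr", uri="http://localhost:8000/v1")
+     document = config.get_client()("path/to/document.pdf")
+     print(document.to_markdown())
+ finally:
+     server.stop()
+ ```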
@@ -0,0 +1,131 @@
+ # vlmparse
+
+ A unified wrapper around Vision Language Model (VLM) and OCR solutions for parsing PDF documents into Markdown.
+
+ Features:
+
+ - ⚡ Async/concurrent processing for high throughput
+ - 🐳 Automatic Docker server management for local models
+ - 🔄 Unified interface across all VLM/OCR providers
+ - 📊 Built-in result visualization with Streamlit
+
+ Supported Converters:
+
+ - **Open Source Small VLMs**: `lightonocr`, `mineru2.5`, `hunyuanocr`, `paddleocrvl`, `granite-docling`, `olmocr2-fp8`, `dotsocr`, `chandra`, `deepseekocr`, `nanonets/Nanonets-OCR2-3B`
+ - **Open Source Generalist VLMs**: models such as the Qwen family
+ - **Pipelines**: `docling`
+ - **Proprietary LLMs**: `gemini`, `gpt`
+
+ ## Installation
+
+ ```bash
+ uv sync
+ ```
+
+ With optional dependencies:
+
+ ```bash
+ uv sync --all-extras
+ ```
+
+ Activate the virtual environment:
+
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ Alternatively, prefix all the commands below with `uv run`.
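+
+ For example:
+
+ ```bash
+ uv run vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```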
+
+ ## CLI Usage
+
+ ### Convert PDFs
+
+ With a general VLM (requires setting your API key as an environment variable):
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```
+
+ Convert with automatic deployment of a small VLM (or any Hugging Face VLM model; requires a GPU and a Docker installation):
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model nanonets/Nanonets-OCR2-3B
+ ```
+
+ ### Deploy a local model server
+
+ Deployment (requires a GPU and a Docker installation):
+ - You need a dedicated GPU for this.
+ - Check that the port is not already in use by another service.
+
+ ```bash
+ vlmparse serve --model lightonocr --port 8000 --gpus 1
+ ```
+
+ Then convert:
+
+ ```bash
+ vlmparse convert --input "*.pdf" --out_folder ./output --model lightonocr --uri http://localhost:8000/v1
+ ```
+
+ You can also list all running servers:
+
+ ```bash
+ vlmparse list
+ ```
+
+ Show the logs of a server (if only one server is running, the container name can be omitted):
+
+ ```bash
+ vlmparse log <container_name>
+ ```
+
+ Stop a server (if only one server is running, the container name can be omitted):
+
+ ```bash
+ vlmparse stop <container_name>
+ ```
+
+ ### View conversion results with Streamlit
+
+ ```bash
+ vlmparse view ./output
+ ```
+
+ ## Configuration
+
+ Set API keys as environment variables:
+
+ ```bash
+ export GOOGLE_API_KEY="your-key"
+ export OPENAI_API_KEY="your-key"
+ ```
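+
+ Since the keys are read from the environment, they can also be scoped to a single invocation:
+
+ ```bash
+ GOOGLE_API_KEY="your-key" vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
+ ```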
+
+ ## Python API
+
+ Client interface:
+
+ ```python
+ from vlmparse.registries import converter_config_registry
+
+ # Get a converter configuration
+ config = converter_config_registry.get("gemini-2.5-flash-lite")
+ client = config.get_client()
+
+ # Convert a single PDF
+ document = client("path/to/document.pdf")
+ print(document.to_markdown())
+
+ # Batch convert multiple PDFs
+ documents = client.batch(["file1.pdf", "file2.pdf"])
+ ```
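+
+ A minimal sketch of writing batch results to disk, assuming `client.batch` returns one `Document` per input path in input order:
+
+ ```python
+ from pathlib import Path
+
+ out_dir = Path("./output")
+ out_dir.mkdir(exist_ok=True)
+
+ paths = ["file1.pdf", "file2.pdf"]
+ documents = client.batch(paths)  # assumed to preserve input order
+ for pdf, document in zip(paths, documents):
+     # One markdown file per input PDF, e.g. ./output/file1.md
+     (out_dir / f"{Path(pdf).stem}.md").write_text(document.to_markdown())
+ ```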
+
+ Docker server interface:
+
+ ```python
+ from vlmparse.registries import docker_config_registry
+
+ config = docker_config_registry.get("lightonocr")
+ server = config.get_server()
+ server.start()
+
+ # Client calls...
+
+ server.stop()
+ ```
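+
+ The two interfaces can be combined. A minimal sketch, assuming the server listens on `http://localhost:8000/v1` (as in the CLI example above) and that `converter_config_registry.get` accepts a `uri` override for this model:
+
+ ```python
+ from vlmparse.registries import converter_config_registry, docker_config_registry
+
+ server = docker_config_registry.get("lightonocr").get_server()
+ server.start()
+ try:
+     # Point the converter client at the locally served model
+     config = converter_config_registry.get("lightonocr", uri="http://localhost:8000/v1")
+     document = config.get_client()("path/to/document.pdf")
+     print(document.to_markdown())
+ finally:
+     server.stop()
+ ```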
@@ -0,0 +1,88 @@
+ [build-system]
+ requires = ["setuptools", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [metadata]
+ name = "vlmparse"
+ version = "0.1.0"
+
+ [project]
+ name = "vlmparse"
+ version = "0.1.0"
+ authors = []
+ description = ""
+ readme = "README.md"
+ requires-python = ">=3.12.0"
+ dependencies = [
+     "devtools>=0.12.2",
+     "docker>=7.1.0",
+     "html-to-markdown>=1.9.0",
+     "httpx>=0.27.0",
+     "loguru>=0.7.3",
+     "nest-asyncio>=1.6.0",
+     "numpy>=2.3.2",
+     "openai>=1.102.0",
+     "orjson>=3.11.3",
+     "pillow>=11.3.0",
+     "pydantic",
+     "pypdfium2>=4.30.0",
+     "fire>=0.7.1",
+     "lxml>=6.0.2",
+     "tabulate>=0.9.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "jupyter",
+ ]
+ docling_core = [
+     "docling-core",
+ ]
+ st_app = [
+     "streamlit>=1.49.0",
+ ]
+ bench = [
+     "html-to-markdown>=1.9.0",
+     "loguru>=0.7.3",
+     "nest-asyncio>=1.6.0",
+     "numpy>=2.3.2",
+     "pillow>=11.3.0",
+     "pydantic",
+     "rapidfuzz>=3.14.0",
+     "unidecode>=1.4.0",
+     "fire>=0.7.1",
+     "lxml>=6.0.2",
+     "datasets>=4.4.1",
+     "openpyxl>=3.1.5",
+     "joblib>=1.5.2",
+     "playwright",
+     "fuzzysearch>=0.8.1",
+ ]
+ test = [
+     "pre-commit",
+     "pytest",
+     "pytest-cov",
+     "pytest-mock",
+     "ruff",
+     "isort",
+ ]
+
+ [tool.setuptools.packages.find]
+ include = ["vlmparse"]
+
+ [project.scripts]
+ vlmparse = "vlmparse.cli:main"
+
+ [tool.ruff.lint]
+ select = ["E4", "E7", "E9", "F", "B", "A"]
+ ignore = ["E741", "B006"]
+
+ [tool.ruff.lint.per-file-ignores]
+ "__init__.py" = ["E402"]
+
+ [tool.uv]
+ config-settings = { editable_mode = "compat" }
+
+ [tool.uv.sources]
+
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,216 @@
+ """
+ Test all converter configs with mocked OpenAI clients.
+ This avoids the need to deploy actual Docker servers.
+ """
+
+ from unittest.mock import AsyncMock, MagicMock, patch
+
+ import pytest
+
+ from vlmparse.data_model.document import Document, Page
+ from vlmparse.registries import converter_config_registry
+
+ # Mock responses for different model types
+ MOCK_RESPONSES = {
+     "default": "# Test Document\n\nThis is a test page with some content.",
+     "dotsocr_layout": '[{"bbox": [10, 10, 100, 50], "category": "Text", "text": "Test content"}]',
+     "dotsocr_ocr": "Test content from DotsOCR",
+ }
+
+
+ @pytest.fixture
+ def mock_openai_client():
+     """Mock the AsyncOpenAI client used by all converters."""
+     with patch("openai.AsyncOpenAI") as mock_client:
+         # Create mock response object
+         mock_response = MagicMock()
+         mock_response.choices = [MagicMock()]
+         mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
+
+         # Configure the async method
+         mock_instance = MagicMock()
+         mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
+         mock_client.return_value = mock_instance
+
+         yield mock_instance
+
+
+ @pytest.fixture
+ def dotsocr_mock_client():
+     """Mock for DotsOCR with different response types."""
+     with patch("openai.AsyncOpenAI") as mock_client:
+         mock_response = MagicMock()
+         mock_response.choices = [MagicMock()]
+         mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
+
+         mock_instance = MagicMock()
+         mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
+         mock_client.return_value = mock_instance
+
+         yield mock_instance
+
+
+ # List of all models registered in converter_config_registry
+ ALL_MODELS = [
+     "gemini-2.5-flash-lite",
+     "lightonocr",
+     "dotsocr",
+     "nanonets/Nanonets-OCR2-3B",
+ ]
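+
+ # NOTE: the tests below also use a `file_path` fixture that is not defined in
+ # this module; it presumably comes from a conftest.py and points to a two-page
+ # sample PDF (the tests assert len(document.pages) == 2).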
+
+
+ class TestConverterConfigs:
+     """Test suite for all converter configs."""
+
+     @pytest.mark.parametrize("model_name", ALL_MODELS)
+     def test_config_retrieval(self, model_name):
+         """Test that all registered models can be retrieved from registry."""
+         config = converter_config_registry.get(model_name)
+         assert config is not None, f"Config for {model_name} should not be None"
+
+     @pytest.mark.parametrize("model_name", ALL_MODELS)
+     def test_config_has_get_client(self, model_name):
+         """Test that all configs have get_client method."""
+         config = converter_config_registry.get(model_name)
+         assert hasattr(config, "get_client"), f"{model_name} config missing get_client"
+
+     @pytest.mark.parametrize(
+         "model_name",
+         [
+             "gemini-2.5-flash-lite",
+             "lightonocr",
+             "nanonets/Nanonets-OCR2-3B",
+         ],
+     )
+     def test_converter_basic_processing(
+         self, file_path, model_name, mock_openai_client
+     ):
+         """Test basic document processing for OpenAI-compatible converters."""
+         config = converter_config_registry.get(model_name)
+         converter = config.get_client(num_concurrent_pages=2)
+
+         # Process document
+         document = converter(file_path)
+
+         # Verify document structure
+         assert isinstance(document, Document)
+         assert document.file_path == str(file_path)
+         assert len(document.pages) == 2, f"Expected 2 pages, got {len(document.pages)}"
+
+         # Verify pages
+         for page in document.pages:
+             assert isinstance(page, Page)
+             assert page.text is not None, "Page text should not be None"
+             assert len(page.text) > 0, "Page text should not be empty"
+
+         # Verify API was called
+         assert mock_openai_client.chat.completions.create.call_count == 2
+
+     def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
+         """Test DotsOCR converter in OCR mode."""
+         config = converter_config_registry.get("dotsocr")
+         converter = config.get_client(num_concurrent_pages=2)
+
+         # Process document
+         document = converter(file_path)
+
+         # Verify document structure
+         assert isinstance(document, Document)
+         assert len(document.pages) == 2
+
+         for page in document.pages:
+             assert isinstance(page, Page)
+             assert page.text is not None
+             assert len(page.text) > 0
+
+         # Verify API was called
+         assert dotsocr_mock_client.chat.completions.create.call_count == 2
+
+     @pytest.mark.parametrize("model_name", ALL_MODELS)
+     def test_converter_error_handling(self, file_path, model_name):
+         """Test that converters handle errors gracefully."""
+         with patch("openai.AsyncOpenAI") as mock_client:
+             # Configure mock to raise an exception
+             mock_instance = MagicMock()
+             mock_instance.chat.completions.create = AsyncMock(
+                 side_effect=Exception("API Error")
+             )
+             mock_client.return_value = mock_instance
+
+             config = converter_config_registry.get(model_name)
+             converter = config.get_client(debug=False)
+
+             # Process should not crash
+             document = converter(file_path)
+
+             # Document should have error info in pages
+             assert isinstance(document, Document)
+             # Check that pages have errors
+             for page in document.pages:
+                 assert page.error is not None
+
+
+ class TestConverterBatchProcessing:
+     """Test batch processing capabilities."""
+
+     @pytest.mark.parametrize(
+         "model_name",
+         [
+             "gemini-2.5-flash-lite",
+             "lightonocr",
+         ],
+     )
+     def test_batch_processing(self, file_path, model_name, mock_openai_client):
+         """Test batch processing of multiple files."""
+         config = converter_config_registry.get(model_name)
+         converter = config.get_client(
+             num_concurrent_files=2,
+             num_concurrent_pages=2,
+             return_documents_in_batch_mode=True,
+         )
+
+         # Process multiple files (same file for testing)
+         file_paths = [file_path, file_path]
+         documents = converter.batch(file_paths)
+
+         # Verify results
+         assert len(documents) == 2
+         for doc in documents:
+             assert isinstance(doc, Document)
+             assert len(doc.pages) == 2
+
+
+ class TestCustomURI:
+     """Test converter initialization with custom URIs."""
+
+     def test_custom_uri_config(self, mock_openai_client, file_path):
+         """Test that converters can be initialized with custom URIs."""
+         custom_uri = "http://localhost:8000/v1"
+         config = converter_config_registry.get("gemini-2.5-flash-lite", uri=custom_uri)
+
+         assert config.llm_params.base_url == custom_uri
+
+         # Test it works
+         converter = config.get_client()
+         document = converter(file_path)
+
+         assert isinstance(document, Document)
+         assert len(document.pages) == 2
+
+
+ class TestConcurrency:
+     """Test concurrent processing settings."""
+
+     @pytest.mark.parametrize("model_name", ["gemini-2.5-flash-lite", "lightonocr"])
+     def test_concurrent_page_processing(
+         self, file_path, model_name, mock_openai_client
+     ):
+         """Test that concurrent page processing limits are respected."""
+         config = converter_config_registry.get(model_name)
+         converter = config.get_client(num_concurrent_pages=1)
+
+         document = converter(file_path)
+
+         assert len(document.pages) == 2
+         # With concurrency=1, calls should be sequential
+         assert mock_openai_client.chat.completions.create.call_count == 2