vlmparse 0.1.7.tar.gz → 0.1.8.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {vlmparse-0.1.7 → vlmparse-0.1.8}/PKG-INFO +11 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/README.md +11 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/pyproject.toml +1 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/tests/test_all_converters_mocked.py +124 -130
- vlmparse-0.1.8/tests/test_batch_parser.py +135 -0
- vlmparse-0.1.8/tests/test_cli.py +684 -0
- vlmparse-0.1.8/tests/test_end2end.py +119 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/build_doc.py +20 -19
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/cli.py +17 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/chandra.py +176 -60
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/deepseekocr.py +23 -12
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/docling.py +0 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/dotsocr.py +34 -31
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/granite_docling.py +9 -36
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/hunyuanocr.py +5 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/lightonocr.py +23 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/mineru.py +0 -1
- vlmparse-0.1.8/vlmparse/clients/mistral_converter.py +85 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/nanonetocr.py +5 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/olmocr.py +6 -2
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/openai_converter.py +95 -60
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/paddleocrvl.py +9 -2
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/converter.py +51 -11
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/converter_with_server.py +41 -5
- vlmparse-0.1.8/vlmparse/registries.py +178 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/servers/docker_server.py +59 -35
- vlmparse-0.1.8/vlmparse/servers/model_identity.py +48 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/utils.py +15 -2
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/PKG-INFO +11 -1
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/SOURCES.txt +2 -0
- vlmparse-0.1.7/tests/test_batch_parser.py +0 -186
- vlmparse-0.1.7/tests/test_cli.py +0 -732
- vlmparse-0.1.7/tests/test_end2end.py +0 -67
- vlmparse-0.1.7/vlmparse/registries.py +0 -170
- {vlmparse-0.1.7 → vlmparse-0.1.8}/LICENSE +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/setup.cfg +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/base_model.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/pipe_utils/utils.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/clients/prompts.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/constants.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/data_model/box.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/data_model/document.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/servers/utils.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/st_viewer/fs_nav.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse/st_viewer/st_viewer.py +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/dependency_links.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/entry_points.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/requires.txt +0 -0
- {vlmparse-0.1.7 → vlmparse-0.1.8}/vlmparse.egg-info/top_level.txt +0 -0
{vlmparse-0.1.7 → vlmparse-0.1.8}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vlmparse
-Version: 0.1.7
+Version: 0.1.8
 Requires-Python: >=3.11.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -54,6 +54,12 @@ Dynamic: license-file
 
 # vlmparse
 
+<div align="center">
+
+[\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
+
+</div>
+
 A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
 
 Features:
@@ -209,3 +215,7 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
 ```
 
 Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+
+## Credits
+
+This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
````
{vlmparse-0.1.7 → vlmparse-0.1.8}/README.md

````diff
@@ -1,5 +1,11 @@
 # vlmparse
 
+<div align="center">
+
+[\[📜 arXiv coming soon\]] | [[Dataset (🤗Hugging Face)]](https://huggingface.co/datasets/pulsia/fr-bench-pdf2md) | [[pypi]](https://pypi.org/project/vlmparse/) | [[vlmparse]](https://github.com/ld-lab-pulsia/vlmparse) | [[Benchmark]](https://github.com/ld-lab-pulsia/benchpdf2md)
+
+</div>
+
 A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
 
 Features:
@@ -154,4 +160,8 @@ with ConverterWithServer(model="mineru2.5") as converter_with_server:
     documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
 ```
 
-Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+Note that if you pass an uri of a vllm server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
+
+## Credits
+
+This work was financed by La Poste and led by members of Probayes and OpenValue, two subsidiaries (filiales) of La Poste.
````
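The `ConverterWithServer` note added to both PKG-INFO and README above describes URI-based model inference. As a rough illustration, usage might look like the following sketch; the server address is illustrative, and passing the URI through the `model` argument is an assumption drawn from the README wording rather than a documented signature:

```python
from vlmparse.converter_with_server import ConverterWithServer

# Sketch based on the README snippet in the diff above. The address below is
# illustrative; passing a vLLM server URI as `model` is an assumption based on
# the note "the model name is inferred automatically and no server is started".
with ConverterWithServer(model="http://localhost:8000/v1") as converter_with_server:
    documents = converter_with_server.parse(
        inputs=["file1.pdf", "file2.pdf"], out_folder="./output"
    )
```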
{vlmparse-0.1.7 → vlmparse-0.1.8}/tests/test_all_converters_mocked.py

````diff
@@ -19,43 +19,8 @@ MOCK_RESPONSES = {
 }
 
 
-@pytest.fixture
-def mock_openai_client():
-    """Mock the AsyncOpenAI client used by all converters."""
-    with patch("openai.AsyncOpenAI") as mock_client:
-        # Create mock response object
-        mock_response = MagicMock()
-        mock_response.choices = [MagicMock()]
-        mock_response.choices[0].message.content = MOCK_RESPONSES["default"]
-        mock_response.usage = MagicMock()
-        mock_response.usage.prompt_tokens = 50
-        mock_response.usage.completion_tokens = 150
-        mock_response.usage.reasoning_tokens = 30
-
-        # Configure the async method
-        mock_instance = MagicMock()
-        mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
-        mock_client.return_value = mock_instance
-
-        yield mock_instance
-
-
-@pytest.fixture
-def dotsocr_mock_client():
-    """Mock for DotsOCR with different response types."""
-    with patch("openai.AsyncOpenAI") as mock_client:
-        mock_response = MagicMock()
-        mock_response.choices = [MagicMock()]
-        mock_response.choices[0].message.content = MOCK_RESPONSES["dotsocr_ocr"]
-        mock_response.usage = MagicMock()
-        mock_response.usage.prompt_tokens = 40
-        mock_response.usage.completion_tokens = 160
-        mock_response.usage.reasoning_tokens = 20
-        mock_instance = MagicMock()
-        mock_instance.chat.completions.create = AsyncMock(return_value=mock_response)
-        mock_client.return_value = mock_instance
-
-        yield mock_instance
+# Note: mock_openai_client and dotsocr_mock_client fixtures are replaced by the
+# unified mock_openai_api fixture from conftest.py
 
 
 # List of all models registered in converter_config_registry
@@ -64,6 +29,14 @@ ALL_MODELS = [
     "lightonocr",
     "dotsocr",
     "nanonets/Nanonets-OCR2-3B",
+    "hunyuanocr",
+    "olmocr-2-fp8",
+    "paddleocrvl",
+    "mineru25",
+    "chandra",
+    "deepseekocr",
+    "granite-docling",
+    "Qwen/Qwen3-VL-8B-Instruct",
 ]
 
 
@@ -91,87 +64,93 @@ class TestConverterConfigs:
         ],
     )
     def test_converter_basic_processing(
-        self, file_path, model_name,
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
     ):
         """Test basic document processing for OpenAI-compatible converters."""
-
-
+        with mock_openai_api() as openai_client:
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
-
+            # Process document
+            document = converter(file_path)
 
-
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert document.file_path == str(file_path)
+            assert (
+                len(document.pages) == 2
+            ), f"Expected 2 pages, got {len(document.pages)}"
 
-
-
-
-
-
+            # Verify pages
+            for page in document.pages:
+                assert isinstance(page, Page)
+                assert page.text is not None, "Page text should not be None"
+                assert len(page.text) > 0, "Page text should not be empty"
 
-
-
+            # Verify API was called
+            assert openai_client.chat.completions.create.call_count == 2
 
-    def test_converter_image_processing(self, datadir,
+    def test_converter_image_processing(self, datadir, mock_openai_api, tmp_output_dir):
         """Test processing of a single image file."""
-
-
+        with mock_openai_api() as openai_client:
+            model_name = "gemini-2.5-flash-lite"
+            image_path = datadir / "page_with_formula.png"
 
-
-
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
 
-
-
+            # Process image
+            document = converter(image_path)
 
-
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert document.file_path == str(image_path)
+            assert (
+                len(document.pages) == 1
+            ), f"Expected 1 page, got {len(document.pages)}"
 
-
-
-
-
-
+            # Verify page
+            page = document.pages[0]
+            assert isinstance(page, Page)
+            assert page.text is not None
+            assert len(page.text) > 0
 
-
-
+            # Verify API was called once
+            assert openai_client.chat.completions.create.call_count == 1
 
-    def test_dotsocr_ocr_mode(self, file_path,
+    def test_dotsocr_ocr_mode(self, file_path, mock_openai_api, tmp_output_dir):
         """Test DotsOCR converter in OCR mode."""
-
-
+        with mock_openai_api(content=MOCK_RESPONSES["dotsocr_ocr"]) as openai_client:
+            config = converter_config_registry.get("dotsocr")
+            converter = config.get_client(
+                num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
-
+            # Process document
+            document = converter(file_path)
 
-
-
-
+            # Verify document structure
+            assert isinstance(document, Document)
+            assert len(document.pages) == 2
 
-
-
-
-
+            for page in document.pages:
+                assert isinstance(page, Page)
+                assert page.text is not None
+                assert len(page.text) > 0
 
-
-
+            # Verify API was called
+            assert openai_client.chat.completions.create.call_count == 2
 
     @pytest.mark.parametrize("model_name", ALL_MODELS)
-    def test_converter_error_handling(
+    def test_converter_error_handling(
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
+    ):
         """Test that converters handle errors gracefully."""
-        with patch("openai.AsyncOpenAI") as mock_client:
-            # Configure mock to raise an exception
-            mock_instance = MagicMock()
-            mock_instance.chat.completions.create = AsyncMock(
-                side_effect=Exception("API Error")
-            )
-            mock_client.return_value = mock_instance
-
+        with mock_openai_api(side_effect=Exception("API Error")):
             config = converter_config_registry.get(model_name)
-            converter = config.get_client(debug=False)
+            converter = config.get_client(debug=False, save_folder=str(tmp_output_dir))
 
             # Process should not crash
             document = converter(file_path)
@@ -193,25 +172,29 @@ class TestConverterBatchProcessing:
             "lightonocr",
         ],
     )
-    def test_batch_processing(
+    def test_batch_processing(
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
+    ):
         """Test batch processing of multiple files."""
-
-
-
-
-
-
-
+        with mock_openai_api():
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_files=2,
+                num_concurrent_pages=2,
+                return_documents_in_batch_mode=True,
+                debug=True,
+                save_folder=str(tmp_output_dir),
+            )
 
-
-
-
+            # Process multiple files (same file for testing)
+            file_paths = [file_path, file_path]
+            documents = converter.batch(file_paths)
 
-
-
-
-
-
+            # Verify results
+            assert len(documents) == 2
+            for doc in documents:
+                assert isinstance(doc, Document)
+                assert len(doc.pages) == 2
 
 
 @pytest.fixture
@@ -245,12 +228,16 @@ def mineru_mock_httpx_client():
 
 
 class TestMinerUConverterMockedApi:
-    def test_mineru_converter_repeated_call(
+    def test_mineru_converter_repeated_call(
+        self, file_path, mineru_mock_httpx_client, tmp_output_dir
+    ):
         """Repeated `__call__` should keep working and call API each page."""
         from vlmparse.clients.mineru import MinerUConverterConfig
 
         config = MinerUConverterConfig(base_url="http://mineru.test")
-        converter = config.get_client(
+        converter = config.get_client(
+            num_concurrent_pages=2, debug=True, save_folder=str(tmp_output_dir)
+        )
 
         with (
             patch("vlmparse.clients.mineru.clean_response", lambda x: x),
@@ -274,7 +261,7 @@ class TestMinerUConverterMockedApi:
         assert mineru_mock_httpx_client.post.call_count == 4
 
     def test_mineru_converter_batch_processing(
-        self, file_path, mineru_mock_httpx_client
+        self, file_path, mineru_mock_httpx_client, tmp_output_dir
     ):
         """Batch mode should return documents and call API for each page."""
         from vlmparse.clients.mineru import MinerUConverterConfig
@@ -285,6 +272,7 @@ class TestMinerUConverterMockedApi:
             num_concurrent_pages=2,
             return_documents_in_batch_mode=True,
             debug=True,
+            save_folder=str(tmp_output_dir),
         )
 
         with (
@@ -306,19 +294,22 @@ class TestMinerUConverterMockedApi:
 class TestCustomURI:
     """Test converter initialization with custom URIs."""
 
-    def test_custom_uri_config(self,
+    def test_custom_uri_config(self, mock_openai_api, file_path, tmp_output_dir):
         """Test that converters can be initialized with custom URIs."""
-
-
+        with mock_openai_api():
+            custom_uri = "http://localhost:8000/v1"
+            config = converter_config_registry.get(
+                "gemini-2.5-flash-lite", uri=custom_uri
+            )
 
-
+            assert config.base_url == custom_uri
 
-
-
-
+            # Test it works
+            converter = config.get_client(debug=True, save_folder=str(tmp_output_dir))
+            document = converter(file_path)
 
-
-
+            assert isinstance(document, Document)
+            assert len(document.pages) == 2
 
 
 class TestConcurrency:
@@ -326,14 +317,17 @@ class TestConcurrency:
 
     @pytest.mark.parametrize("model_name", ["gemini-2.5-flash-lite", "lightonocr"])
     def test_concurrent_page_processing(
-        self, file_path, model_name,
+        self, file_path, model_name, mock_openai_api, tmp_output_dir
    ):
         """Test that concurrent page processing limits are respected."""
-
-
+        with mock_openai_api() as openai_client:
+            config = converter_config_registry.get(model_name)
+            converter = config.get_client(
+                num_concurrent_pages=1, debug=True, save_folder=str(tmp_output_dir)
+            )
 
-
+            document = converter(file_path)
 
-
-
-
+            assert len(document.pages) == 2
+            # With concurrency=1, calls should be sequential
+            assert openai_client.chat.completions.create.call_count == 2
````
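The rewritten tests above depend on a unified `mock_openai_api` fixture that lives in `conftest.py`, which is not part of this diff. Judging from the call sites (`mock_openai_api()`, `mock_openai_api(content=...)`, `mock_openai_api(side_effect=...)`) and from the per-test fixtures it replaces, a minimal sketch could look like this; the default response text is a placeholder and the real implementation may differ:

```python
import contextlib
from unittest.mock import AsyncMock, MagicMock, patch

import pytest


@pytest.fixture
def mock_openai_api():
    """Factory fixture returning a context manager that patches openai.AsyncOpenAI.

    Sketch reconstructed from the fixtures removed in this diff; the actual
    conftest.py implementation may differ.
    """

    @contextlib.contextmanager
    def _mock(content="mocked page text", side_effect=None):
        # Build a canned chat-completions response, as the old fixtures did
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = content
        mock_response.usage = MagicMock()
        mock_response.usage.prompt_tokens = 50
        mock_response.usage.completion_tokens = 150
        mock_response.usage.reasoning_tokens = 30

        with patch("openai.AsyncOpenAI") as mock_client:
            mock_instance = MagicMock()
            # side_effect (e.g. an Exception) takes precedence over return_value
            mock_instance.chat.completions.create = AsyncMock(
                return_value=mock_response, side_effect=side_effect
            )
            mock_client.return_value = mock_instance
            yield mock_instance

    return _mock
```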
vlmparse-0.1.8/tests/test_batch_parser.py (new file)

````diff
@@ -0,0 +1,135 @@
+import pytest
+
+from vlmparse.converter_with_server import ConverterWithServer
+
+
+class TestBatchParser:
+    """Tests for ConverterWithServer (acting as BatchParser)."""
+
+    def test_init_starts_docker_server(self, mock_docker_operations):
+        """Test that initializing with a model requiring docker starts the server."""
+        # Setup using unified mocking system
+        with mock_docker_operations(include_client=True) as (
+            mock_docker_registry,
+            mock_config,
+            mock_server,
+            mock_client,
+        ):
+            # Initialize
+            with ConverterWithServer(
+                model="test_model", with_vllm_server=True
+            ) as parser:
+                # Verify interactions
+                mock_docker_registry.get.assert_called_with("test_model", default=True)
+                mock_config.get_server.assert_called_with(auto_stop=True)
+                mock_server.start.assert_called_once()
+                mock_config.get_client.assert_called_once()
+                assert parser.client == mock_client
+
+    def test_init_no_docker_fallback(self, mock_docker_operations, mock_openai_api):
+        """Test fallback to standard converter when no docker config exists."""
+        # Setup mocks - docker returns None, use real converter registry
+        with mock_docker_operations(
+            model_filter=lambda model: False  # No docker for any model
+        ) as (mock_docker_reg, _, _, _):
+            with mock_openai_api():
+                # Initialize with a real model from registry
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # Verify interactions
+                    mock_docker_reg.get.assert_called_with(
+                        "gemini-2.5-flash-lite", default=False
+                    )
+                    # Client should be initialized from real converter registry
+                    assert parser.client is not None
+
+    def test_parse_updates_client_config(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test that parse method updates client configuration and calls batch."""
+        # Use real test file
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+
+        with mock_docker_operations(
+            model_filter=lambda model: False  # No docker for any model
+        ):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # Call parse with real file
+                    parser.client.return_documents_in_batch_mode = True
+                    documents = parser.parse(
+                        inputs=[str(test_file)],
+                        out_folder=str(tmp_path),
+                        mode="md",
+                        dpi=300,
+                        debug=True,
+                    )
+
+                    # Verify client config updates
+                    assert parser.client.config.dpi == 300
+                    assert parser.client.debug is True
+                    assert parser.client.save_mode == "md"
+                    # Concurrency should be 1 because debug=True
+                    assert parser.client.num_concurrent_files == 1
+                    assert parser.client.num_concurrent_pages == 1
+
+                    # Verify result
+                    assert documents is not None
+                    assert len(documents) > 0
+
+    def test_parse_retry_logic(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test the retrylast logic filters already processed files."""
+        # Create two copies of the test file
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+        temp_dir = tmp_path / "input_files"
+        temp_dir.mkdir()
+        file1 = temp_dir / "file1.pdf"
+        file2 = temp_dir / "file2.pdf"
+
+        # Copy test file to simulate multiple inputs
+        import shutil
+
+        shutil.copy(test_file, file1)
+        shutil.copy(test_file, file2)
+
+        # Setup folder structure for retry
+        run_folder = tmp_path / "output" / "run1"
+        results_folder = run_folder / "results"
+        results_folder.mkdir(parents=True)
+
+        # Create a processed result for file1
+        (results_folder / "file1.zip").touch()
+
+        with mock_docker_operations(model_filter=lambda model: False):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    parser.client.return_documents_in_batch_mode = True
+                    # Call parse with retrylast - should only process file2
+                    documents = parser.parse(
+                        inputs=[str(file1), str(file2)],
+                        out_folder=str(tmp_path / "output"),
+                        retrylast=True,
+                    )
+
+                    # Should only process file2 (file1 was already processed)
+                    # Verify by checking that only 1 file was processed
+                    assert documents is not None
+                    assert len(documents) == 1
+
+    def test_parse_retry_no_previous_runs(
+        self, mock_docker_operations, datadir, mock_openai_api, tmp_path
+    ):
+        """Test that retrylast raises ValueError if no previous runs found."""
+        test_file = datadir / "Fiche_Graines_A5.pdf"
+
+        with mock_docker_operations(model_filter=lambda model: False):
+            with mock_openai_api():
+                with ConverterWithServer(model="gemini-2.5-flash-lite") as parser:
+                    # tmp_path is empty, so os.listdir(tmp_path) will be empty
+                    with pytest.raises(ValueError, match="No previous runs found"):
+                        parser.parse(
+                            inputs=[str(test_file)],
+                            out_folder=str(tmp_path),
+                            retrylast=True,
+                        )
````