PyPI - vlmparse - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl - Mend

vlmparse 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

vlmparse/build_doc.py +10 -4
vlmparse/clients/deepseekocr.py +155 -4
vlmparse/constants.py +2 -0
vlmparse/converter.py +19 -5
vlmparse/utils.py +2 -2
{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/METADATA +1 -1
{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/RECORD +11 -10
{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/WHEEL +0 -0
{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/entry_points.txt +0 -0
{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/licenses/LICENSE +0 -0
{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/top_level.txt +0 -0

vlmparse/build_doc.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import re
+from pathlib import Path
 import numpy as np
 import PIL
 import pypdfium2 as pdfium
 from loguru import logger
+from .constants import PDF_EXTENSION
 def convert_pdfium(file_path, dpi):
     pdf = pdfium.PdfDocument(file_path)
@@ -64,7 +67,10 @@ def resize_image(image, max_image_size):
 def get_page_count(file_path):
-    pdf = pdfium.PdfDocument(file_path)
-    count = len(pdf)
-    pdf.close()
-    return count
+    if Path(file_path).suffix.lower() == PDF_EXTENSION:
+        pdf = pdfium.PdfDocument(file_path)
+        count = len(pdf)
+        pdf.close()
+        return count
+    else:
+        return 1

vlmparse/clients/deepseekocr.py CHANGED Viewed

@@ -1,7 +1,155 @@
+import re
+from typing import ClassVar, Literal
+from loguru import logger
+from PIL import Image
 from pydantic import Field
-from vlmparse.clients.openai_converter import OpenAIConverterConfig
+from vlmparse.clients.openai_converter import (
+    OpenAIConverterClient,
+    OpenAIConverterConfig,
+)
+from vlmparse.data_model.box import BoundingBox
+from vlmparse.data_model.document import Item, Page
 from vlmparse.servers.docker_server import VLLMDockerServerConfig
+from vlmparse.utils import to_base64
+def re_match(text):
+    pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
+    matches = re.findall(pattern, text, re.DOTALL)
+    matches_image = []
+    matches_other = []
+    for a_match in matches:
+        if "<|ref|>image<|/ref|>" in a_match[0]:
+            matches_image.append(a_match[0])
+        else:
+            matches_other.append(a_match[0])
+    return matches, matches_image, matches_other
+def extract_coordinates_and_label(ref_text):
+    try:
+        label_type = ref_text[1]
+        matches = re.findall(r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]", ref_text[2])
+        cor_list = [[int(x) for x in m] for m in matches]
+    except Exception as e:
+        logger.warning(f"Error parsing coordinates: {e}")
+        return None
+    return (label_type, cor_list)
+class DeepSeekOCRConverterClient(OpenAIConverterClient):
+    """Client for DeepSeekOCR with specific post-processing."""
+    PROMPTS: ClassVar[dict] = {
+        "layout": "<|grounding|>Convert the document to markdown.",
+        "ocr": "Free OCR.",
+        "image_description": "Describe this image in detail.",
+    }
+    def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
+        items = []
+        width, height = image.size
+        for match in matches:
+            # match is tuple: (full_str, label, coords_str)
+            result = extract_coordinates_and_label(match)
+            if not result:
+                continue
+            category, coords = result
+            if not coords:
+                continue
+            # Create boxes
+            boxes = []
+            for point in coords:
+                if len(point) != 4:
+                    continue
+                x1, y1, x2, y2 = point
+                # Scale to image size (0-999 -> pixel)
+                x1 = (x1 / 999) * width
+                y1 = (y1 / 999) * height
+                x2 = (x2 / 999) * width
+                y2 = (y2 / 999) * height
+                boxes.append(
+                    BoundingBox(
+                        l=min(x1, x2), t=min(y1, y2), r=max(x1, x2), b=max(y1, y2)
+                    )
+                )
+            if not boxes:
+                continue
+            # Merge if multiple boxes for one item
+            try:
+                final_box = (
+                    BoundingBox.merge_boxes(boxes) if len(boxes) > 1 else boxes[0]
+                )
+            except Exception as e:
+                logger.warning(f"Error merging boxes: {e}")
+                continue
+            items.append(Item(category=category, text=match[1], box=final_box))
+        return items
+    async def async_call_inside_page(self, page: Page) -> Page:
+        # Prepare messages as in parent class
+        image = page.image
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{to_base64(image)}"
+                        },
+                    },
+                    {"type": "text", "text": self.PROMPTS[self.config.prompt_mode]},
+                ],
+            },
+        ]
+        # Get raw response using parent's method
+        response = await self._get_chat_completion(messages)
+        logger.info("Response length: " + str(len(response)))
+        page.raw_response = response
+        if self.config.prompt_mode == "layout":
+            # Post-processing
+            matches, matches_image, matches_other = re_match(response)
+            # Extract items (bounding boxes)
+            page.items = self.extract_items(page.image, matches)
+            # Clean text
+            outputs = response
+            # Replace image references with a placeholder
+            for a_match_image in matches_image:
+                outputs = outputs.replace(a_match_image, "![image]")
+            # Replace other references (text grounding) and cleanup
+            for a_match_other in matches_other:
+                outputs = (
+                    outputs.replace(a_match_other, "")
+                    .replace("\\coloneqq", ":=")
+                    .replace("\\eqqcolon", "=:")
+                )
+        else:
+            outputs = response
+        page.text = outputs.strip()
+        logger.debug(page.text)
+        return page
 class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
@@ -32,10 +180,11 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
     model_name: str = "deepseek-ai/DeepSeek-OCR"
     aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-    preprompt: str | None = None
-    postprompt: str | None = "<|grounding|>Convert the document to markdown."
+    prompt_mode: Literal["layout", "ocr"] = "ocr"
     completion_kwargs: dict | None = {
         "temperature": 0.0,
+        "max_tokens": 8181,
         "extra_body": {
             "skip_special_tokens": False,
             # args used to control custom logits processor
@@ -47,6 +196,8 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
             },
         },
     }
-    max_image_size: int | None = 1540
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+    def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
+        return DeepSeekOCRConverterClient(config=self, **kwargs)

vlmparse/constants.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"]
2	+ PDF_EXTENSION = ".pdf"

vlmparse/converter.py CHANGED Viewed

@@ -6,10 +6,12 @@ from pathlib import Path
 from typing import Literal
 from loguru import logger
+from PIL import Image
 from pydantic import Field
 from .base_model import VLMParseBaseModel
 from .build_doc import convert_specific_page_to_image, get_page_count, resize_image
+from .constants import IMAGE_EXTENSIONS, PDF_EXTENSION
 from .data_model.document import Document, Page, ProcessingError
 # Add a lock to ensure PDFium is accessed by only one thread/task at a time
@@ -50,12 +52,24 @@ class BaseConverter:
         raise NotImplementedError
     def add_page_image(self, page: Page, file_path, page_idx):
-        with PDFIUM_LOCK:
-            image = convert_specific_page_to_image(
-                file_path,
-                page_idx,
-                dpi=self.config.dpi,
+        if Path(file_path).suffix.lower() in IMAGE_EXTENSIONS:
+            image = Image.open(file_path)
+            if image.mode != "RGB":
+                image = image.convert("L").convert("RGB")
+        elif Path(file_path).suffix.lower() == PDF_EXTENSION:
+            with PDFIUM_LOCK:
+                image = convert_specific_page_to_image(
+                    file_path,
+                    page_idx,
+                    dpi=self.config.dpi,
+                )
+        else:
+            raise ValueError(
+                f"Unsupported file extension: {Path(file_path).suffix.lower()}"
             )
         image = resize_image(image, self.config.max_image_size)
         page.buffer_image = image
         return page

vlmparse/utils.py CHANGED Viewed

@@ -28,12 +28,12 @@ def get_file_paths(inputs: str | list[str]):
         if "*" in pattern or "?" in pattern:
             file_paths.extend(glob(pattern, recursive=True))
         elif os.path.isdir(pattern):
-            file_paths.extend(glob(os.path.join(pattern, "*.pdf"), recursive=True))
+            file_paths.extend(glob(os.path.join(pattern, "*.*"), recursive=True))
         elif os.path.isfile(pattern):
             file_paths.append(pattern)
         else:
             logger.error(f"Invalid input: {pattern}")
-    file_paths = [f for f in file_paths if os.path.exists(f) and f.endswith(".pdf")]
+    file_paths = [f for f in file_paths if os.path.exists(f) and os.path.isfile(f)]
     if not file_paths:
         logger.error("No PDF files found matching the inputs patterns")

{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vlmparse
-Version: 0.1.3
+Version: 0.1.4
 Requires-Python: >=3.12.0
 Description-Content-Type: text/markdown
 License-File: LICENSE

{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/RECORD RENAMED Viewed

@@ -1,10 +1,11 @@
 vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
-vlmparse/build_doc.py,sha256=9evdU6GwVAZU15dZ1Qght6hNo_QxBQN8X3gmYdU2ltg,1965
+vlmparse/build_doc.py,sha256=LAWrnFrqamN5PwJo57AUtQOPrMFGnCGw4gBjEKZ6pYo,2127
 vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
-vlmparse/converter.py,sha256=5wTA_cFyDMDSY8YgLzZV9SVBKmHjEbJCW8KPoJjmVVA,6880
+vlmparse/constants.py,sha256=7-47S01n4MI2ebR09bpdOo3_P16d-z-NVGsm6KJP8ls,110
+vlmparse/converter.py,sha256=F0JSY9sFYUggCvaUCb27kKGJJpnZKW2FStMDVJoIOeQ,7383
 vlmparse/converter_with_server.py,sha256=9yoqfv8akB0xZZ7Snjq3aHW5NPNam2AgbK7_rfFqNkk,3909
 vlmparse/registries.py,sha256=TdSR1fx1Tz3roGk4Tk5ckIK6Iz-e4UD4erWUk96fFpQ,5846
-vlmparse/utils.py,sha256=jZWbNMwpZSZL--ZzvL8wPG_7mwpw9Pi36qTO9TjvHZU,1239
+vlmparse/utils.py,sha256=rcVrtPiQVj_8HAmFQOu___72uYIapp_X89yxrMNCBow,1236
 vlmparse/benchpdf2md/create_dataset.py,sha256=0o4I0O3pHm1W7NYOTnW1JvPmgxJM8KLElKFvAbPAIic,1855
 vlmparse/benchpdf2md/run_benchmark.py,sha256=LMHElWyWIgB4ppBL0s-qjfMz5FZQnZOEm5mXxd0p0C8,9800
 vlmparse/benchpdf2md/utils.py,sha256=Q62vtvLIzxOEzSi-w210d7qnaRz-q_5ykmLNTkmbs-8,1732
@@ -21,7 +22,7 @@ vlmparse/benchpdf2md/st_visu_benchmark/test_form.py,sha256=qNmFZoSdbWcw1EJKesgO7
 vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py,sha256=WkKncexShO3SU-DO7dPT4DOe-8UNjsCaHlj9L1B2mkI,572
 vlmparse/benchpdf2md/st_visu_benchmark/utils.py,sha256=JSmOJQY1DDETtWmjWv07SlQlORE6yBewiMcE5qRZI_Q,1109
 vlmparse/clients/chandra.py,sha256=zfu-A6Slh-fIAyrtrlVoCb6QHLBimnimefap_K9YwYw,9775
-vlmparse/clients/deepseekocr.py,sha256=iCG5wI5yPv98hIPgVJX4gkkkH1OekblZjFhh5ORVWAk,1813
+vlmparse/clients/deepseekocr.py,sha256=rQvaOaPPoDiZ0MzXqfqqH9BgUBfjmlfHu3NlMjSDgiQ,6501
 vlmparse/clients/docling.py,sha256=K-Grl_nZiSdooEdEaflevprE56l3Keby9xSMBtFwdis,5355
 vlmparse/clients/dotsocr.py,sha256=9ygvIVVOi9UhTUJwmrI-h6AjMV9vL9J2vMaBfUyTorY,9895
 vlmparse/clients/granite_docling.py,sha256=EQpsv5qSJG0HtMSacmJStER2sq4TGf1EMU5_NmJsl4g,4634
@@ -42,9 +43,9 @@ vlmparse/servers/docker_server.py,sha256=nI7K8CEzJwSZxLY7Jg9IuYHHLR5YQpOSgY8Ln71
 vlmparse/servers/utils.py,sha256=gMk5Y8FA1nlSxi7JzKxZu7XyljkYUZ5AnsTb3YFqu28,8821
 vlmparse/st_viewer/fs_nav.py,sha256=7GNH68h2Loh5pQ64Pe72-D2cs2BLhqRXevEmKdFmPX0,1616
 vlmparse/st_viewer/st_viewer.py,sha256=m2rQTtk5rlwErNmivNAg-4rkHkvNkvLhoJZxFQi7Dwk,2105
-vlmparse-0.1.3.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
-vlmparse-0.1.3.dist-info/METADATA,sha256=JkSI4uFnnF59WReyhfRFZZVoe6KLk0ZJrjG0FQkUIPI,5112
-vlmparse-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-vlmparse-0.1.3.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
-vlmparse-0.1.3.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
-vlmparse-0.1.3.dist-info/RECORD,,
+vlmparse-0.1.4.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
+vlmparse-0.1.4.dist-info/METADATA,sha256=72_47P1ER-J8tzlEvE91Xf58u35p5eZZD1VvPbXzrqA,5112
+vlmparse-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+vlmparse-0.1.4.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
+vlmparse-0.1.4.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
+vlmparse-0.1.4.dist-info/RECORD,,

{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{vlmparse-0.1.3.dist-info → vlmparse-0.1.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

vlmparse 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

vlmparse 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl