vlmparse 0.1.7-py3-none-any.whl → 0.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/build_doc.py +20 -19
- vlmparse/cli.py +439 -270
- vlmparse/clients/chandra.py +176 -60
- vlmparse/clients/deepseekocr.py +193 -12
- vlmparse/clients/docling.py +0 -1
- vlmparse/clients/dotsocr.py +34 -31
- vlmparse/clients/glmocr.py +243 -0
- vlmparse/clients/granite_docling.py +9 -36
- vlmparse/clients/hunyuanocr.py +5 -1
- vlmparse/clients/lightonocr.py +23 -1
- vlmparse/clients/mineru.py +0 -1
- vlmparse/clients/mistral_converter.py +85 -0
- vlmparse/clients/nanonetocr.py +5 -1
- vlmparse/clients/olmocr.py +6 -2
- vlmparse/clients/openai_converter.py +95 -60
- vlmparse/clients/paddleocrvl.py +195 -40
- vlmparse/converter.py +51 -11
- vlmparse/converter_with_server.py +92 -19
- vlmparse/registries.py +107 -89
- vlmparse/servers/base_server.py +127 -0
- vlmparse/servers/docker_compose_deployment.py +489 -0
- vlmparse/servers/docker_compose_server.py +39 -0
- vlmparse/servers/docker_run_deployment.py +226 -0
- vlmparse/servers/docker_server.py +17 -109
- vlmparse/servers/model_identity.py +48 -0
- vlmparse/servers/server_registry.py +42 -0
- vlmparse/servers/utils.py +83 -219
- vlmparse/st_viewer/st_viewer.py +1 -1
- vlmparse/utils.py +15 -2
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/METADATA +13 -3
- vlmparse-0.1.9.dist-info/RECORD +44 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/WHEEL +1 -1
- vlmparse-0.1.7.dist-info/RECORD +0 -36
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/top_level.txt +0 -0
vlmparse/clients/chandra.py
CHANGED
@@ -1,6 +1,9 @@
+import json
 import math
 import time
+from dataclasses import asdict, dataclass
 
+from bs4 import BeautifulSoup
 from loguru import logger
 from PIL import Image
 from pydantic import Field
@@ -11,7 +14,8 @@ from vlmparse.clients.openai_converter import (
 )
 from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
 from vlmparse.clients.pipe_utils.utils import clean_response
-from vlmparse.data_model.
+from vlmparse.data_model.box import BoundingBox
+from vlmparse.data_model.document import Item, Page
 from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
@@ -110,11 +114,6 @@ OCR this image to HTML.
 {PROMPT_ENDING}
 """.strip()
 
-PROMPT_MAPPING = {
-    "ocr_layout": OCR_LAYOUT_PROMPT,
-    "ocr": OCR_PROMPT,
-}
-
 
 def scale_to_fit(
     img: Image.Image,
@@ -188,11 +187,135 @@ def detect_repeat_token(
     return False
 
 
+@dataclass
+class LayoutBlock:
+    """Represents a layout block with bounding box and content."""
+
+    bbox: list[int]
+    label: str
+    content: str
+
+
+def parse_layout(
+    html: str, image: Image.Image, bbox_scale: int = 1024
+) -> list[LayoutBlock]:
+    """
+    Parse HTML layout blocks with bounding boxes.
+
+    Args:
+        html: HTML string with layout blocks (divs with data-bbox and data-label attributes)
+        image: PIL Image to get dimensions for bbox scaling
+        bbox_scale: The scale used in the prompt for normalized bboxes
+
+    Returns:
+        List of LayoutBlock objects with scaled bounding boxes
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    top_level_divs = soup.find_all("div", recursive=False)
+    width, height = image.size
+    width_scaler = width / bbox_scale
+    height_scaler = height / bbox_scale
+    layout_blocks = []
+
+    for div in top_level_divs:
+        bbox = div.get("data-bbox")
+
+        try:
+            bbox = json.loads(bbox)
+            assert len(bbox) == 4, "Invalid bbox length"
+        except Exception:
+            try:
+                bbox = bbox.split(" ")
+                assert len(bbox) == 4, "Invalid bbox length"
+            except Exception:
+                # Default bbox if parsing fails
+                bbox = [0, 0, bbox_scale, bbox_scale]
+
+        bbox = list(map(int, bbox))
+        # Scale bbox to image dimensions
+        bbox = [
+            max(0, int(bbox[0] * width_scaler)),
+            max(0, int(bbox[1] * height_scaler)),
+            min(int(bbox[2] * width_scaler), width),
+            min(int(bbox[3] * height_scaler), height),
+        ]
+
+        label = div.get("data-label", "block")
+        content = str(div.decode_contents())
+        layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content))
+
+    return layout_blocks
+
+
+def parse_chunks(html: str, image: Image.Image, bbox_scale: int = 1024) -> list[dict]:
+    """
+    Parse HTML layout blocks into dictionaries.
+
+    Args:
+        html: HTML string with layout blocks
+        image: PIL Image to get dimensions for bbox scaling
+        bbox_scale: The scale used in the prompt for normalized bboxes
+
+    Returns:
+        List of dictionaries with bbox, label, and content keys
+    """
+    layout = parse_layout(html, image, bbox_scale=bbox_scale)
+    chunks = [asdict(block) for block in layout]
+    return chunks
+
+
+def layout_blocks_to_items(
+    layout_blocks: list[LayoutBlock],
+) -> list[Item]:
+    """
+    Convert layout blocks to Item objects for the Page model.
+
+    Args:
+        layout_blocks: List of LayoutBlock objects
+
+    Returns:
+        List of Item objects with category, box, and text
+    """
+    items = []
+    for block in layout_blocks:
+        # Convert content HTML to markdown
+        try:
+            text = html_to_md_keep_tables(block.content)
+        except Exception as e:
+            logger.warning(f"Error converting block content to markdown: {e}")
+            text = block.content
+
+        # Create bounding box from [x0, y0, x1, y1] format
+        bbox = BoundingBox(
+            l=block.bbox[0],
+            t=block.bbox[1],
+            r=block.bbox[2],
+            b=block.bbox[3],
+        )
+
+        items.append(
+            Item(
+                category=block.label,
+                box=bbox,
+                text=text.strip(),
+            )
+        )
+
+    return items
+
+
 class ChandraConverterConfig(OpenAIConverterConfig):
     """Chandra converter configuration."""
 
     model_name: str = "datalab-to/chandra"
-
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "ocr": OCR_PROMPT,
+        "ocr_layout": OCR_LAYOUT_PROMPT,
+    }
+    prompt_mode_map: dict[str, str] = {
+        "table": "ocr_layout",
+    }
     bbox_scale: int = 1024
     max_retries: int = 0
     max_failure_retries: int = None
@@ -216,8 +339,7 @@ class ChandraConverterClient(OpenAIConverterClient):
 
     async def async_call_inside_page(self, page: Page) -> Page:
         """Process a single page using Chandra logic."""
-
-        prompt = PROMPT_MAPPING.get(self.config.prompt_type, OCR_PROMPT)
+        prompt = self.get_prompt_for_mode() or OCR_PROMPT
         prompt = prompt.replace("{bbox_scale}", str(self.config.bbox_scale))
 
         image = scale_to_fit(page.image)
@@ -238,61 +360,34 @@ class ChandraConverterClient(OpenAIConverterClient):
 
         retries = 0
         max_retries = self.config.max_retries
-        max_failure_retries = self.config.max_failure_retries
 
         result_content = ""
-        error_occurred = False
 
         while True:
-            try:
-                # Adjust temperature if retrying
-                temperature = self.config.completion_kwargs.get("temperature", 0.0)
-                if retries > 0:
-                    temperature = 0.3  # As per vllm.py logic
-
-                completion_kwargs = self.config.completion_kwargs.copy()
-                completion_kwargs["temperature"] = temperature
-                if retries > 0:
-                    completion_kwargs["top_p"] = 0.95
-
-                result_content = await self._get_chat_completion(
-                    messages, completion_kwargs=completion_kwargs
-                )
-                error_occurred = False
-            except Exception as e:
-                logger.error(f"Error during VLLM generation: {e}")
-                error_occurred = True
-                result_content = ""
-
             should_retry = False
-
-            if
-
-
+            # Adjust temperature if retrying
+            temperature = self.config.completion_kwargs.get("temperature", 0.0)
+            if retries > 0:
+                temperature = 0.3  # As per vllm.py logic
+
+            completion_kwargs = self.config.completion_kwargs.copy()
+            completion_kwargs["temperature"] = temperature
+            if retries > 0:
+                completion_kwargs["top_p"] = 0.95
+
+            result_content, usage = await self._get_chat_completion(
+                messages, completion_kwargs=completion_kwargs
+            )
+
+            has_repeat = detect_repeat_token(result_content) or (
+                len(result_content) > 50
+                and detect_repeat_token(result_content, cut_from_end=50)
+            )
+            if has_repeat and retries < max_retries:
+                logger.warning(
+                    f"Detected repeat token, retrying generation (attempt {retries + 1})..."
                 )
-
-            logger.warning(
-                f"Detected repeat token, retrying generation (attempt {retries + 1})..."
-            )
-            should_retry = True
-
-            # Check for error
-            if error_occurred:
-                if max_failure_retries is not None:
-                    if retries < max_failure_retries:
-                        logger.warning(
-                            f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                        )
-                        should_retry = True
-                elif (
-                    retries < max_retries
-                ):  # Fallback to max_retries if max_failure_retries not set (vllm.py logic varies slightly but this is safe)
-                    logger.warning(
-                        f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                    )
-                    should_retry = True
+                should_retry = True
 
             if should_retry:
                 time.sleep(2 * (retries + 1))
@@ -305,10 +400,27 @@ class ChandraConverterClient(OpenAIConverterClient):
         page.raw_response = result_content
         text = clean_response(result_content)
 
+        # Check if we're in layout mode (ocr_layout prompt)
+        current_prompt_key = self.get_prompt_key()
+        is_layout_mode = current_prompt_key == "ocr_layout"
+
+        if is_layout_mode:
+            # Parse layout blocks and populate items
+            try:
+                layout_blocks = parse_layout(
+                    text, image, bbox_scale=self.config.bbox_scale
+                )
+                page.items = layout_blocks_to_items(layout_blocks)
+                logger.info(f"Parsed {len(page.items)} layout blocks")
+            except Exception as e:
+                logger.warning(f"Error parsing layout blocks: {e}")
+                page.items = []
+
         # Convert HTML to MD
         text = html_to_md_keep_tables(text)
         page.text = text
-
+        page.completion_tokens = usage.completion_tokens
+        page.prompt_tokens = usage.prompt_tokens
         return page
 
 
@@ -320,4 +432,8 @@ class ChandraDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return ChandraConverterConfig(
+        return ChandraConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
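The layout path added above is self-contained enough to exercise directly. A minimal usage sketch (not part of the package) of how the new parse_layout and layout_blocks_to_items helpers fit together, assuming only the imports and signatures shown in the chandra.py diff; the sample HTML, labels, and image size are invented here for illustration:

# Usage sketch: feed the model's HTML layout output through the new helpers.
from PIL import Image

from vlmparse.clients.chandra import layout_blocks_to_items, parse_layout

html = (
    '<div data-bbox="[32, 40, 980, 120]" data-label="Section-header">'
    "<h1>Quarterly report</h1></div>"
    '<div data-bbox="[32, 150, 980, 900]" data-label="Text"><p>Body text</p></div>'
)
page_image = Image.new("RGB", (2048, 2048))  # stand-in for the rendered page

# bbox_scale matches ChandraConverterConfig.bbox_scale; boxes are rescaled
# from the 0-1024 prompt space to the actual image dimensions (x2 here).
blocks = parse_layout(html, page_image, bbox_scale=1024)
items = layout_blocks_to_items(blocks)  # -> Item(category, box, text)
for item in items:
    print(item.category, item.box, item.text[:40])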
vlmparse/clients/deepseekocr.py
CHANGED
@@ -1,5 +1,4 @@
 import re
-from typing import ClassVar, Literal
 
 from loguru import logger
 from PIL import Image
@@ -14,6 +13,10 @@ from vlmparse.data_model.document import Item, Page
 from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
+# ==============================================================================
+# DeepSeek-OCR (v1)
+# ==============================================================================
+
 
 class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
     """Configuration for DeepSeekOCR model."""
@@ -35,7 +38,11 @@ class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return DeepSeekOCRConverterConfig(
+        return DeepSeekOCRConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
@@ -43,8 +50,17 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
 
     model_name: str = "deepseek-ai/DeepSeek-OCR"
     aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "layout": "<|grounding|>Convert the document to markdown.",
+        "ocr": "Free OCR.",
+        "image_description": "Describe this image in detail.",
+    }
+    prompt_mode_map: dict[str, str] = {
+        "ocr_layout": "layout",
+        "table": "layout",
+    }
 
-    prompt_mode: Literal["layout", "ocr"] = "ocr"
     completion_kwargs: dict | None = {
         "temperature": 0.0,
         "max_tokens": 8181,
@@ -95,12 +111,6 @@ def extract_coordinates_and_label(ref_text):
 class DeepSeekOCRConverterClient(OpenAIConverterClient):
     """Client for DeepSeekOCR with specific post-processing."""
 
-    PROMPTS: ClassVar[dict] = {
-        "layout": "<|grounding|>Convert the document to markdown.",
-        "ocr": "Free OCR.",
-        "image_description": "Describe this image in detail.",
-    }
-
     def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
         items = []
         width, height = image.size
@@ -153,6 +163,8 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
         # Prepare messages as in parent class
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "ocr"
+
         messages = [
             {
                 "role": "user",
@@ -163,17 +175,17 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
                             "url": f"data:image/png;base64,{to_base64(image)}"
                         },
                     },
-                    {"type": "text", "text": self.
+                    {"type": "text", "text": self.config.prompts[prompt_key]},
                 ],
             },
         ]
 
         # Get raw response using parent's method
-        response = await self._get_chat_completion(messages)
+        response, usage = await self._get_chat_completion(messages)
         logger.info("Response length: " + str(len(response)))
         page.raw_response = response
 
-        if
+        if prompt_key == "layout":
             # Post-processing
             matches, matches_image, matches_other = re_match(response)
 
@@ -199,5 +211,174 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
 
         page.text = outputs.strip()
         logger.debug(page.text)
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
+
+        return page
+
+
+# ==============================================================================
+# DeepSeek-OCR-2
+# ==============================================================================
+
+
+class DeepSeekOCR2DockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for DeepSeek-OCR-2 model.
+
+    DeepSeek-OCR-2 uses a custom architecture that requires:
+    - Custom model registration via hf_overrides
+    - NoRepeatNGram logits processor with specific whitelist tokens
+    - Custom image processor (DeepseekOCR2Processor)
+    """
+
+    docker_image: str = "vllm/vllm-openai:nightly"
+    model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--hf-overrides",
+            '{"architectures": ["DeepseekOCR2ForCausalLM"]}',
+            "--block-size",
+            "256",
+            "--trust-remote-code",
+            "--max-model-len",
+            "8192",
+            "--swap-space",
+            "0",
+            "--gpu-memory-utilization",
+            "0.9",
+            "--logits_processors",
+            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+        ]
+    )
+    aliases: list[str] = Field(
+        default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+    )
+
+    @property
+    def client_config(self):
+        return DeepSeekOCR2ConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
+
+
+class DeepSeekOCR2ConverterConfig(OpenAIConverterConfig):
+    """DeepSeek-OCR-2 converter configuration.
+
+    Key differences from DeepSeek-OCR v1:
+    - Uses DeepseekOCR2ForCausalLM architecture
+    - Different logits processor parameters (ngram_size=20, window_size=50)
+    - Supports cropping mode for image processing
+    """
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+    aliases: list[str] = Field(
+        default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+    )
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "layout": "<|grounding|>Convert the document to markdown.",
+        "ocr": "Free OCR.",
+        "image_description": "Describe this image in detail.",
+    }
+    prompt_mode_map: dict[str, str] = {
+        "ocr_layout": "layout",
+        "table": "layout",
+    }
+
+    completion_kwargs: dict | None = {
+        "temperature": 0.0,
+        "max_tokens": 8180,
+        "extra_body": {
+            "skip_special_tokens": False,
+            # args used to control custom logits processor
+            "vllm_xargs": {
+                "ngram_size": 20,
+                "window_size": 50,
+                # whitelist: <td>, </td>
+                "whitelist_token_ids": [128821, 128822],
+            },
+        },
+    }
+    dpi: int = 144  # Default DPI used in reference implementation
+
+    def get_client(self, **kwargs) -> "DeepSeekOCR2ConverterClient":
+        return DeepSeekOCR2ConverterClient(config=self, **kwargs)
+
+
+class DeepSeekOCR2ConverterClient(DeepSeekOCRConverterClient):
+    """Client for DeepSeek-OCR-2 with specific post-processing.
+
+    Inherits from DeepSeekOCRConverterClient as the post-processing logic
+    for parsing grounding references and extracting items is the same.
+    The main differences are in the model configuration and logits processor.
+    """
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        # Prepare messages as in parent class
+        image = page.image
+
+        prompt_key = self.get_prompt_key() or "ocr"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{to_base64(image)}"
+                        },
+                    },
+                    {"type": "text", "text": self.config.prompts[prompt_key]},
+                ],
+            },
+        ]
+
+        # Get raw response using parent's method
+        response, usage = await self._get_chat_completion(messages)
+        logger.info("Response length: " + str(len(response)))
+        page.raw_response = response
+
+        if prompt_key == "layout":
+            # Post-processing
+            matches, matches_image, matches_other = re_match(response)
+
+            # Extract items (bounding boxes)
+            page.items = self.extract_items(page.image, matches)
+
+            # Clean text
+            outputs = response
+
+            # Check for sentence end marker (indicates successful completion)
+            # If not present, it might be due to repetition detection
+            if "<|end▁of▁sentence|>" in outputs:
+                outputs = outputs.replace("<|end▁of▁sentence|>", "")
+
+            # Replace image references with a placeholder
+            for a_match_image in matches_image:
+                outputs = outputs.replace(a_match_image, "![image]")
+
+            # Replace other references (text grounding) and cleanup
+            for a_match_other in matches_other:
+                outputs = (
+                    outputs.replace(a_match_other, "")
+                    .replace("\\coloneqq", ":=")
+                    .replace("\\eqqcolon", "=:")
+                    .replace("\n\n\n\n", "\n\n")
+                    .replace("\n\n\n", "\n\n")
+                )
+        else:
+            outputs = response
+
+        page.text = outputs.strip()
+        logger.debug(page.text)
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
 
         return page
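Both DeepSeek configs above, like the Chandra and DotsOCR ones, replace the old per-client PROMPTS / prompt_mode fields with two declarative dicts: prompts maps a prompt key to its prompt text, and prompt_mode_map routes generic modes such as "ocr_layout" or "table" onto those keys. The clients then call get_prompt_key() / get_prompt_for_mode(), whose implementations live in openai_converter.py (changed in this release but not shown here), so the standalone sketch below only illustrates the lookup they appear to perform, inferred from how the clients use the result:

# Sketch of the assumed prompt resolution; not the package's actual helper.
prompts = {
    "layout": "<|grounding|>Convert the document to markdown.",
    "ocr": "Free OCR.",
    "image_description": "Describe this image in detail.",
}
prompt_mode_map = {
    "ocr_layout": "layout",
    "table": "layout",
}

def resolve_prompt(mode: str | None) -> tuple[str, str]:
    # Assumption: map the requested mode through prompt_mode_map, fall back to
    # the mode itself, and default to "ocr" when nothing is requested.
    key = prompt_mode_map.get(mode, mode) if mode else "ocr"
    return key, prompts[key]

print(resolve_prompt("table"))  # ('layout', '<|grounding|>Convert the document to markdown.')
print(resolve_prompt("ocr"))    # ('ocr', 'Free OCR.')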
vlmparse/clients/docling.py
CHANGED
@@ -62,7 +62,6 @@ class DoclingDockerServerConfig(DockerServerConfig):
 class DoclingConverterConfig(ConverterConfig):
     """Configuration for Docling converter client."""
 
-    base_url: str
     model_name: str = "docling"
     timeout: int = 300
     api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
vlmparse/clients/dotsocr.py
CHANGED
@@ -1,14 +1,13 @@
 import json
 import math
 from pathlib import Path
-from typing import ClassVar
+from typing import ClassVar
 
 from loguru import logger
 from PIL import Image
 from pydantic import Field
 
 from vlmparse.clients.openai_converter import (
-    LLMParams,
     OpenAIConverterClient,
     OpenAIConverterConfig,
 )
@@ -48,12 +47,13 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
     )
     add_model_key_to_server: bool = True
     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    default_model_name: str = DEFAULT_MODEL_NAME
 
     @property
     def client_config(self):
         return DotsOCRConverterConfig(
-
-
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
             )
         )
 
@@ -65,29 +65,7 @@ class DotsOCRConverterConfig(OpenAIConverterConfig):
     model_name: str = "rednote-hilab/dots.ocr"
     preprompt: str | None = ""
     postprompt: str | None = None
-
-        "temperature": 0.1,
-        "top_p": 1.0,
-        "max_completion_tokens": 16384,
-    }
-    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
-    dpi: int = 200
-    prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"
-
-    def get_client(self, **kwargs) -> "DotsOCRConverter":
-        return DotsOCRConverter(config=self, **kwargs)
-
-
-class DotsOCRConverter(OpenAIConverterClient):
-    """DotsOCR VLLM converter."""
-
-    # Constants
-    MIN_PIXELS: ClassVar[int] = 3136
-    MAX_PIXELS: ClassVar[int] = 11289600
-    IMAGE_FACTOR: ClassVar[int] = 28
-
-    # Prompts
-    PROMPTS: ClassVar[dict] = {
+    prompts: dict[str, str] = {
         "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
 1. Bbox format: [x1, y1, x2, y2]
@@ -108,6 +86,30 @@ class DotsOCRConverter(OpenAIConverterClient):
 """,
         "prompt_ocr": """Extract the text content from this image.""",
     }
+    prompt_mode_map: dict[str, str] = {
+        "ocr": "prompt_ocr",
+        "ocr_layout": "prompt_layout_all_en",
+        "table": "prompt_layout_all_en",
+    }
+    completion_kwargs: dict | None = {
+        "temperature": 0.1,
+        "top_p": 1.0,
+        "max_completion_tokens": 16384,
+    }
+    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    dpi: int = 200
+
+    def get_client(self, **kwargs) -> "DotsOCRConverter":
+        return DotsOCRConverter(config=self, **kwargs)
+
+
+class DotsOCRConverter(OpenAIConverterClient):
+    """DotsOCR VLLM converter."""
+
+    # Constants
+    MIN_PIXELS: ClassVar[int] = 3136
+    MAX_PIXELS: ClassVar[int] = 11289600
+    IMAGE_FACTOR: ClassVar[int] = 28
 
     @staticmethod
     def round_by_factor(number: int, factor: int) -> int:
@@ -235,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
         image = self.fetch_image(
             origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
         )
-        prompt = self.
+        prompt = self.config.prompts[prompt_mode]
 
         response, usage = await self._async_inference_with_vllm(image, prompt)
 
@@ -258,13 +260,15 @@ class DotsOCRConverter(OpenAIConverterClient):
     async def async_call_inside_page(self, page: Page) -> Page:
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "prompt_ocr"
+
         _, response, _, usage = await self._parse_image_vllm(
-            image, prompt_mode=
+            image, prompt_mode=prompt_key
         )
         logger.info("Response: " + str(response))
 
         items = None
-        if
+        if prompt_key == "prompt_layout_all_en":
             text = "\n\n".join([item.get("text", "") for item in response])
 
             items = []
@@ -286,5 +290,4 @@ class DotsOCRConverter(OpenAIConverterClient):
 
         page.completion_tokens = usage.completion_tokens
         page.prompt_tokens = usage.prompt_tokens
-        page.reasoning_tokens = usage.reasoning_tokens
         return page
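DotsOCRConverter keeps its MIN_PIXELS / MAX_PIXELS / IMAGE_FACTOR constants, but the bodies of fetch_image and round_by_factor are outside this diff. The snippet below is therefore only a generic sketch of the Qwen2-VL-style "smart resize" such constants usually drive: snap both sides to a multiple of IMAGE_FACTOR while keeping the total pixel count within [MIN_PIXELS, MAX_PIXELS]. It is not the package's implementation.

# Generic sketch of factor-aligned resizing; helpers re-implemented for illustration.
import math

IMAGE_FACTOR = 28          # constants as declared on DotsOCRConverter
MIN_PIXELS = 3136
MAX_PIXELS = 11289600

def round_by_factor(number: float, factor: int) -> int:
    return round(number / factor) * factor

def smart_resize(height: int, width: int) -> tuple[int, int]:
    # Snap to the nearest multiple of IMAGE_FACTOR, then rescale if the
    # resulting pixel count falls outside [MIN_PIXELS, MAX_PIXELS].
    h = max(round_by_factor(height, IMAGE_FACTOR), IMAGE_FACTOR)
    w = max(round_by_factor(width, IMAGE_FACTOR), IMAGE_FACTOR)
    if h * w > MAX_PIXELS:
        scale = math.sqrt((h * w) / MAX_PIXELS)
        h = max(IMAGE_FACTOR, math.floor(height / scale / IMAGE_FACTOR) * IMAGE_FACTOR)
        w = max(IMAGE_FACTOR, math.floor(width / scale / IMAGE_FACTOR) * IMAGE_FACTOR)
    elif h * w < MIN_PIXELS:
        scale = math.sqrt(MIN_PIXELS / (h * w))
        h = math.ceil(height * scale / IMAGE_FACTOR) * IMAGE_FACTOR
        w = math.ceil(width * scale / IMAGE_FACTOR) * IMAGE_FACTOR
    return h, w

print(smart_resize(3508, 2480))  # A4 page at 300 dpi, both sides multiples of 28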
|