vlmparse 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
- vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse/benchpdf2md/create_dataset.py +60 -0
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
- vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
- vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
- vlmparse/benchpdf2md/run_benchmark.py +296 -0
- vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
- vlmparse/benchpdf2md/utils.py +56 -0
- vlmparse/clients/chandra.py +323 -0
- vlmparse/clients/deepseekocr.py +52 -0
- vlmparse/clients/docling.py +146 -0
- vlmparse/clients/dotsocr.py +277 -0
- vlmparse/clients/granite_docling.py +132 -0
- vlmparse/clients/hunyuanocr.py +45 -0
- vlmparse/clients/lightonocr.py +43 -0
- vlmparse/clients/mineru.py +119 -0
- vlmparse/clients/nanonetocr.py +29 -0
- vlmparse/clients/olmocr.py +46 -0
- vlmparse/clients/openai_converter.py +173 -0
- vlmparse/clients/paddleocrvl.py +48 -0
- vlmparse/clients/pipe_utils/cleaner.py +74 -0
- vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
- vlmparse/clients/pipe_utils/utils.py +12 -0
- vlmparse/clients/prompts.py +66 -0
- vlmparse/data_model/box.py +551 -0
- vlmparse/data_model/document.py +148 -0
- vlmparse/servers/docker_server.py +199 -0
- vlmparse/servers/utils.py +250 -0
- vlmparse/st_viewer/fs_nav.py +53 -0
- vlmparse/st_viewer/st_viewer.py +80 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/METADATA +12 -1
- vlmparse-0.1.3.dist-info/RECORD +50 -0
- vlmparse-0.1.0.dist-info/RECORD +0 -13
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/WHEEL +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/top_level.txt +0 -0
A selection of the new files is reproduced below.

vlmparse/benchpdf2md/st_visu_benchmark/test_form.py
@@ -0,0 +1,95 @@

```python
import streamlit as st


def edit_test_form(test_obj, test_type):
    st.markdown("### Edit Test Fields")
    with st.form("edit_test_fields"):
        type_fields = {}
        type_fields["max_diffs"] = st.number_input(
            "Max Diffs", value=test_obj.max_diffs, min_value=0, step=1
        )
        type_fields["unidecode"] = st.checkbox("Unidecode", value=test_obj.unidecode)
        type_fields["alphanum"] = st.checkbox("Alphanum", value=test_obj.alphanum)
        type_fields["ignore_str"] = st.text_input(
            "Ignore strings (separated by spaces)",
            value=" ".join(test_obj.ignore_str),
        )
        type_fields["ignore_space"] = st.checkbox(
            "Ignore space", value=test_obj.ignore_space
        )

        type_fields["ignore_str"] = (
            type_fields["ignore_str"].split(" ") if type_fields["ignore_str"] else []
        )

        if test_type == "present" or test_type == "absent":
            type_fields["text"] = st.text_area(
                "Text", value=test_obj.text, height="content"
            )
            layout_cat_options = [
                "text",
                "footer",
                "header",
                "footnote",
                "image",
                "image_caption",
            ]

            type_fields["layout_cat"] = st.selectbox(
                "Layout Category",
                layout_cat_options,
                index=layout_cat_options.index(test_obj.layout_cat),
            )
            type_fields["case_sensitive"] = st.checkbox(
                "Case Sensitive", value=test_obj.case_sensitive
            )
            type_fields["first_n"] = st.number_input(
                "First N",
                value=test_obj.first_n if test_obj.first_n else 0,
                min_value=0,
                step=100,
            )
            type_fields["last_n"] = st.number_input(
                "Last N",
                value=test_obj.last_n if test_obj.last_n else 0,
                min_value=0,
                step=100,
            )
            if type_fields["first_n"] == 0:
                type_fields["first_n"] = None
            if type_fields["last_n"] == 0:
                type_fields["last_n"] = None
        elif test_type == "order":
            type_fields["before"] = st.text_area(
                "Before", value=test_obj.before, height="content"
            )
            type_fields["after"] = st.text_area(
                "After", value=test_obj.after, height="content"
            )
        elif test_type == "table":
            type_fields["cell"] = st.text_input("Cell", value=test_obj.cell)
            type_fields["up"] = st.text_input(
                "Up", value=test_obj.up if test_obj.up else ""
            )
            type_fields["down"] = st.text_input(
                "Down", value=test_obj.down if test_obj.down else ""
            )
            type_fields["left"] = st.text_input(
                "Left", value=test_obj.left if test_obj.left else ""
            )
            type_fields["right"] = st.text_input(
                "Right", value=test_obj.right if test_obj.right else ""
            )
            type_fields["top_heading"] = st.text_input(
                "Top Heading",
                value=test_obj.top_heading if test_obj.top_heading else "",
            )
            type_fields["left_heading"] = st.text_input(
                "Left Heading",
                value=test_obj.left_heading if test_obj.left_heading else "",
            )
        if st.form_submit_button("Save Changes"):
            for field, value in type_fields.items():
                setattr(test_obj, field, value)

    return test_obj
```
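A quick sketch of how this form is typically driven, assuming a test object carrying the attributes the "present"/"absent" branch reads. The stand-in dataclass below is illustrative only; the real test models live in vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py.

```python
from dataclasses import dataclass, field


# Illustrative stand-in, not the package's actual test model.
@dataclass
class PresentTest:
    id: str = "t1"
    max_diffs: int = 0
    unidecode: bool = False
    alphanum: bool = False
    ignore_str: list[str] = field(default_factory=list)
    ignore_space: bool = False
    text: str = "Expected text"
    layout_cat: str = "text"
    case_sensitive: bool = True
    first_n: int | None = None
    last_n: int | None = None


# Inside a Streamlit script (e.g. `streamlit run app.py`):
# edited = edit_test_form(PresentTest(), test_type="present")
```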
vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py
@@ -0,0 +1,20 @@

```python
from pathlib import Path
from typing import Optional

import streamlit as st

from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_pdf_bytes


def download_pdf_page(
    pdf_path: Path, page_no: int = 0, file_name: Optional[str] = None
):
    pdf_bytes = get_pdf_bytes(pdf_path, page_no)
    if pdf_bytes:
        st.download_button(
            label="📄 Download PDF Page",
            data=pdf_bytes,
            file_name=file_name if file_name else f"{pdf_path.stem}.pdf",
            mime="application/pdf",
            use_container_width=True,
        )
```
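Wiring this into a viewer page is a one-liner; a minimal sketch with a hypothetical document path (the button only renders when get_pdf_bytes returns data, i.e. the page exists):

```python
from pathlib import Path

from vlmparse.benchpdf2md.st_visu_benchmark.ui_elements import download_pdf_page

# Hypothetical path; page_no is zero-based.
download_pdf_page(Path("docs/report.pdf"), page_no=2, file_name="report_page3.pdf")
```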
vlmparse/benchpdf2md/st_visu_benchmark/utils.py
@@ -0,0 +1,50 @@

```python
import io
from pathlib import Path

import pypdfium2 as pdfium
import streamlit as st

from vlmparse.data_model.document import Document


@st.cache_data
def get_pdf_bytes(pdf_path, page_no=0):
    pdf_reader = pdfium.PdfDocument(pdf_path)
    if page_no >= len(pdf_reader):
        pdf_reader.close()
        return None

    # Create a new PDF
    new_pdf = pdfium.PdfDocument.new()

    # Import the chosen page into the new PDF
    new_pdf.import_pages(pdf_reader, pages=[page_no])

    bytes_io = io.BytesIO()
    # Get bytes
    new_pdf.save(bytes_io)

    pdf_bytes = bytes_io.getvalue()

    # Clean up
    new_pdf.close()
    pdf_reader.close()

    return pdf_bytes


@st.cache_data
def get_doc(doc_path: Path):
    return Document.from_zip(doc_path)


def save_new_test(tests, test_obj_edited, test_path):
    from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import save_tests

    # Replace the edited test in place (rebinding the loop variable would
    # leave the list unchanged, so assign by index).
    for i, test in enumerate(tests):
        if test.id == test_obj_edited.id:
            tests[i] = test_obj_edited
    save_tests(tests, test_path)
    st.success("Test updated successfully!")
```
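Outside Streamlit, the same helper can split off one page; a minimal sketch with a hypothetical input path (it returns None when page_no is out of range):

```python
from pathlib import Path

from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_pdf_bytes

page_bytes = get_pdf_bytes(Path("samples/paper.pdf"), page_no=0)
if page_bytes is not None:
    Path("paper_page1.pdf").write_bytes(page_bytes)
```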
vlmparse/benchpdf2md/utils.py
@@ -0,0 +1,56 @@

```python
import base64
from io import BytesIO

import numpy as np
import pandas as pd
from PIL import Image


def vectorized_bootstrap_grouped_std(df, group_col, value_col, n_bootstrap=1000):
    group_col = [group_col] if isinstance(group_col, str) else group_col
    grouped = df.groupby(group_col)[value_col]

    def bootstrap_group(group):
        values = group.values
        n = len(values)
        bootstrap_samples = np.random.choice(
            values, size=(n_bootstrap, n), replace=True
        )
        bootstrap_means = np.mean(bootstrap_samples, axis=1)
        return pd.Series(
            {"mean": np.mean(values), "bootstrap_std": np.std(bootstrap_means)}
        )

    result = grouped.apply(bootstrap_group)
    return result.unstack(-1)


def format_results_vectorized(result_df, precision=2):
    means = result_df["mean"].values
    margins = 2 * result_df["bootstrap_std"].values

    formatted = np.char.add(
        np.char.add(np.round(means, precision).astype(str), " ± "),
        np.round(margins, precision).astype(str),
    )

    return pd.DataFrame({"formatted_result": formatted}, index=result_df.index)


def bootstrap_and_format_results(
    df, group_col, value_col, n_bootstrap=1000, precision=2
):
    result_df = vectorized_bootstrap_grouped_std(df, group_col, value_col, n_bootstrap)
    return format_results_vectorized(result_df, precision)


def to_base64(image: Image.Image, extension="PNG"):
    img_byte_arr = BytesIO()
    image.save(img_byte_arr, format=extension)
    img_byte_arr = img_byte_arr.getvalue()
    return base64.b64encode(img_byte_arr).decode("utf-8")


def from_base64(base64_str: str):
    image_data = base64.b64decode(base64_str)
    return Image.open(BytesIO(image_data))
```
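The bootstrap resamples each group n_bootstrap times, takes the mean of every resample, and reports the std of those means; format_results_vectorized then renders "mean ± 2·std" per group. A toy run (the DataFrame and its column names are illustrative, not from the package):

```python
import numpy as np
import pandas as pd

from vlmparse.benchpdf2md.utils import bootstrap_and_format_results

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "model": ["a"] * 100 + ["b"] * 100,
        "score": np.concatenate(
            [rng.normal(0.8, 0.05, 100), rng.normal(0.7, 0.10, 100)]
        ),
    }
)

# One "mean ± 2*bootstrap_std" string per group, e.g. "0.8 ± 0.01".
print(bootstrap_and_format_results(df, "model", "score", n_bootstrap=1000))
```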
vlmparse/clients/chandra.py
@@ -0,0 +1,323 @@

```python
import math
import time

from loguru import logger
from PIL import Image
from pydantic import Field

from vlmparse.clients.openai_converter import (
    OpenAIConverterClient,
    OpenAIConverterConfig,
)
from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
from vlmparse.clients.pipe_utils.utils import clean_response
from vlmparse.data_model.document import Page
from vlmparse.servers.docker_server import VLLMDockerServerConfig
from vlmparse.utils import to_base64

ALLOWED_TAGS = [
    "math",
    "br",
    "i",
    "b",
    "u",
    "del",
    "sup",
    "sub",
    "table",
    "tr",
    "td",
    "p",
    "th",
    "div",
    "pre",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "ul",
    "ol",
    "li",
    "input",
    "a",
    "span",
    "img",
    "hr",
    "tbody",
    "small",
    "caption",
    "strong",
    "thead",
    "big",
    "code",
]
ALLOWED_ATTRIBUTES = [
    "class",
    "colspan",
    "rowspan",
    "display",
    "checked",
    "type",
    "border",
    "value",
    "style",
    "href",
    "alt",
    "align",
]

PROMPT_ENDING = f"""
Only use these tags {ALLOWED_TAGS}, and these attributes {ALLOWED_ATTRIBUTES}.

Guidelines:
* Inline math: Surround math with <math>...</math> tags. Math expressions should be rendered in KaTeX-compatible LaTeX. Use display for block math.
* Tables: Use colspan and rowspan attributes to match table structure.
* Formatting: Maintain consistent formatting with the image, including spacing, indentation, subscripts/superscripts, and special characters.
* Images: Include a description of any images in the alt attribute of an <img> tag. Do not fill out the src property.
* Forms: Mark checkboxes and radio buttons properly.
* Text: join lines together properly into paragraphs using <p>...</p> tags. Use <br> tags for line breaks within paragraphs, but only when absolutely necessary to maintain meaning.
* Use the simplest possible HTML structure that accurately represents the content of the block.
* Make sure the text is accurate and easy for a human to read and interpret. Reading order should be correct and natural.
""".strip()

OCR_LAYOUT_PROMPT = f"""
OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-{{bbox_scale}}. The data-label attribute is the label for the block.

Use the following labels:
- Caption
- Footnote
- Equation-Block
- List-Group
- Page-Header
- Page-Footer
- Image
- Section-Header
- Table
- Text
- Complex-Block
- Code-Block
- Form
- Table-Of-Contents
- Figure

{PROMPT_ENDING}
""".strip()

OCR_PROMPT = f"""
OCR this image to HTML.

{PROMPT_ENDING}
""".strip()

PROMPT_MAPPING = {
    "ocr_layout": OCR_LAYOUT_PROMPT,
    "ocr": OCR_PROMPT,
}


def scale_to_fit(
    img: Image.Image,
    max_size: tuple[int, int] = (3072, 2048),
    min_size: tuple[int, int] = (28, 28),
):
    resample_method = Image.Resampling.LANCZOS
    width, height = img.size
    if width == 0 or height == 0:
        return img
    max_width, max_height = max_size
    min_width, min_height = min_size
    current_pixels = width * height
    max_pixels = max_width * max_height
    min_pixels = min_width * min_height

    if current_pixels > max_pixels:
        scale_factor = (max_pixels / current_pixels) ** 0.5
        new_width = math.floor(width * scale_factor)
        new_height = math.floor(height * scale_factor)
    elif current_pixels < min_pixels:
        scale_factor = (min_pixels / current_pixels) ** 0.5
        new_width = math.ceil(width * scale_factor)
        new_height = math.ceil(height * scale_factor)
    else:
        return img

    return img.resize((new_width, new_height), resample=resample_method)


def detect_repeat_token(
    predicted_tokens: str,
    base_max_repeats: int = 4,
    window_size: int = 500,
    cut_from_end: int = 0,
    scaling_factor: float = 3.0,
):
    try:
        # Use existing html_to_md_keep_tables from vlmparse
        predicted_tokens = html_to_md_keep_tables(predicted_tokens)
    except Exception as e:
        logger.error(f"Error parsing markdown: {e}")
        return True

    if cut_from_end > 0:
        predicted_tokens = predicted_tokens[:-cut_from_end]

    for seq_len in range(1, window_size // 2 + 1):
        # Extract the potential repeating sequence from the end
        candidate_seq = predicted_tokens[-seq_len:]

        # Inverse scaling: shorter sequences need more repeats
        max_repeats = int(base_max_repeats * (1 + scaling_factor / seq_len))

        # Count how many times this sequence appears consecutively at the end
        repeat_count = 0
        pos = len(predicted_tokens) - seq_len
        if pos < 0:
            continue

        while pos >= 0:
            if predicted_tokens[pos : pos + seq_len] == candidate_seq:
                repeat_count += 1
                pos -= seq_len
            else:
                break

        if repeat_count > max_repeats:
            return True

    return False


class ChandraConverterConfig(OpenAIConverterConfig):
    """Chandra converter configuration."""

    model_name: str = "datalab-to/chandra"
    prompt_type: str = "ocr"  # Default prompt type
    bbox_scale: int = 1024
    max_retries: int = 6
    max_failure_retries: int | None = None
    completion_kwargs: dict = Field(
        default_factory=lambda: {
            "temperature": 0.0,
            "max_tokens": 12384,
            "top_p": 0.1,
        }
    )
    aliases: list[str] = Field(default_factory=lambda: ["chandra"])

    def get_client(self, **kwargs) -> "ChandraConverterClient":
        return ChandraConverterClient(config=self, **kwargs)


class ChandraConverterClient(OpenAIConverterClient):
    """Client for Chandra model."""

    config: ChandraConverterConfig

    async def async_call_inside_page(self, page: Page) -> Page:
        """Process a single page using Chandra logic."""

        prompt = PROMPT_MAPPING.get(self.config.prompt_type, OCR_PROMPT)
        prompt = prompt.replace("{bbox_scale}", str(self.config.bbox_scale))

        image = scale_to_fit(page.image)
        image_b64 = to_base64(image)  # vlmparse.utils.to_base64

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        retries = 0
        max_retries = self.config.max_retries
        max_failure_retries = self.config.max_failure_retries

        result_content = ""
        error_occurred = False

        while True:
            try:
                # Adjust temperature if retrying
                temperature = self.config.completion_kwargs.get("temperature", 0.0)
                if retries > 0:
                    temperature = 0.3  # As per vllm.py logic

                completion_kwargs = self.config.completion_kwargs.copy()
                completion_kwargs["temperature"] = temperature
                if retries > 0:
                    completion_kwargs["top_p"] = 0.95

                result_content = await self._get_chat_completion(
                    messages, completion_kwargs=completion_kwargs
                )
                error_occurred = False
            except Exception as e:
                logger.error(f"Error during VLLM generation: {e}")
                error_occurred = True
                result_content = ""

            should_retry = False

            # Check for repeat token
            if not error_occurred:
                has_repeat = detect_repeat_token(result_content) or (
                    len(result_content) > 50
                    and detect_repeat_token(result_content, cut_from_end=50)
                )
                if has_repeat and retries < max_retries:
                    logger.warning(
                        f"Detected repeat token, retrying generation (attempt {retries + 1})..."
                    )
                    should_retry = True

            # Check for error
            if error_occurred:
                if max_failure_retries is not None:
                    if retries < max_failure_retries:
                        logger.warning(
                            f"Detected vllm error, retrying generation (attempt {retries + 1})..."
                        )
                        should_retry = True
                # Fall back to max_retries if max_failure_retries is not set
                # (vllm.py logic varies slightly, but this is safe)
                elif retries < max_retries:
                    logger.warning(
                        f"Detected vllm error, retrying generation (attempt {retries + 1})..."
                    )
                    should_retry = True

            if should_retry:
                time.sleep(2 * (retries + 1))
                retries += 1
                continue
            else:
                break

        logger.info("Response length: " + str(len(result_content)))
        page.raw_response = result_content
        text = clean_response(result_content)

        # Convert HTML to MD
        text = html_to_md_keep_tables(text)
        page.text = text

        return page


class ChandraDockerServerConfig(VLLMDockerServerConfig):
    """Configuration for Chandra Docker server."""

    model_name: str = "datalab-to/chandra"
    aliases: list[str] = Field(default_factory=lambda: ["chandra"])

    @property
    def client_config(self):
        return ChandraConverterConfig(llm_params=self.llm_params)
```
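The retry loop leans on detect_repeat_token, which flags the degenerate repeated tails OCR VLMs sometimes emit; shorter candidate sequences require proportionally more repeats before they count. A toy check (the strings are illustrative, and the exact threshold depends on what html_to_md_keep_tables produces):

```python
from PIL import Image

from vlmparse.clients.chandra import detect_repeat_token, scale_to_fit

looping = "<p>Total: 42</p>" + "repeat " * 60
clean = "<p>A normal paragraph with no degenerate tail.</p>"
print(detect_repeat_token(looping))  # expected True: long repeated tail
print(detect_repeat_token(clean))    # expected False

# scale_to_fit caps the pixel budget at 3072*2048 while keeping aspect ratio.
img = Image.new("RGB", (5000, 4000))
print(scale_to_fit(img).size)  # about (2804, 2243)
```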
vlmparse/clients/deepseekocr.py
@@ -0,0 +1,52 @@

```python
from pydantic import Field

from vlmparse.clients.openai_converter import OpenAIConverterConfig
from vlmparse.servers.docker_server import VLLMDockerServerConfig


class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
    """Configuration for DeepSeekOCR model."""

    model_name: str = "deepseek-ai/DeepSeek-OCR"
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--limit-mm-per-prompt",
            '{"image": 1}',
            "--async-scheduling",
            "--logits_processors",
            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
            "--no-enable-prefix-caching",
            "--mm-processor-cache-gb",
            "0",
        ]
    )
    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])

    @property
    def client_config(self):
        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)


class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
    """DeepSeekOCR converter - backward compatibility alias."""

    model_name: str = "deepseek-ai/DeepSeek-OCR"
    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
    preprompt: str | None = None
    postprompt: str | None = "<|grounding|>Convert the document to markdown."
    completion_kwargs: dict | None = {
        "temperature": 0.0,
        "extra_body": {
            "skip_special_tokens": False,
            # args used to control custom logits processor
            "vllm_xargs": {
                "ngram_size": 30,
                "window_size": 90,
                # whitelist: <td>, </td>
                "whitelist_token_ids": [128821, 128822],
            },
        },
    }
    max_image_size: int | None = 1540
    dpi: int = 200
```
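A sketch of how these configs are meant to compose, assuming VLLMDockerServerConfig is constructible with defaults and populates llm_params for the client config:

```python
from vlmparse.clients.deepseekocr import DeepSeekOCRDockerServerConfig

server_cfg = DeepSeekOCRDockerServerConfig()
print(server_cfg.model_name)        # deepseek-ai/DeepSeek-OCR
print(server_cfg.command_args[:2])  # ['--limit-mm-per-prompt', '{"image": 1}']

# The server config derives the matching converter config for its clients.
client_cfg = server_cfg.client_config
print(client_cfg.postprompt)        # <|grounding|>Convert the document to markdown.
```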