vlmparse 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
- vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse/benchpdf2md/create_dataset.py +60 -0
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
- vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
- vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
- vlmparse/benchpdf2md/run_benchmark.py +296 -0
- vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
- vlmparse/benchpdf2md/utils.py +56 -0
- vlmparse/clients/chandra.py +323 -0
- vlmparse/clients/deepseekocr.py +52 -0
- vlmparse/clients/docling.py +146 -0
- vlmparse/clients/dotsocr.py +277 -0
- vlmparse/clients/granite_docling.py +132 -0
- vlmparse/clients/hunyuanocr.py +45 -0
- vlmparse/clients/lightonocr.py +43 -0
- vlmparse/clients/mineru.py +119 -0
- vlmparse/clients/nanonetocr.py +29 -0
- vlmparse/clients/olmocr.py +46 -0
- vlmparse/clients/openai_converter.py +173 -0
- vlmparse/clients/paddleocrvl.py +48 -0
- vlmparse/clients/pipe_utils/cleaner.py +74 -0
- vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
- vlmparse/clients/pipe_utils/utils.py +12 -0
- vlmparse/clients/prompts.py +66 -0
- vlmparse/data_model/box.py +551 -0
- vlmparse/data_model/document.py +148 -0
- vlmparse/servers/docker_server.py +199 -0
- vlmparse/servers/utils.py +250 -0
- vlmparse/st_viewer/fs_nav.py +53 -0
- vlmparse/st_viewer/st_viewer.py +80 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/METADATA +11 -1
- vlmparse-0.1.2.dist-info/RECORD +50 -0
- vlmparse-0.1.0.dist-info/RECORD +0 -13
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/WHEEL +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/top_level.txt +0 -0
vlmparse/clients/docling.py

@@ -0,0 +1,146 @@
+import asyncio
+from io import BytesIO
+from typing import Literal
+
+import httpx
+from loguru import logger
+from PIL import Image
+from pydantic import Field
+
+from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+from vlmparse.clients.pipe_utils.utils import clean_response
+from vlmparse.converter import BaseConverter, ConverterConfig
+from vlmparse.data_model.document import Page
+from vlmparse.servers.docker_server import DockerServerConfig
+
+
+class DoclingDockerServerConfig(DockerServerConfig):
+    """Configuration for Docling Serve using official image."""
+
+    model_name: str = "docling"
+    docker_image: str = Field(default="")
+    cpu_only: bool = False
+    command_args: list[str] = Field(default_factory=list)
+    server_ready_indicators: list[str] = Field(
+        default_factory=lambda: ["Application startup complete", "Uvicorn running"]
+    )
+    enable_ui: bool = False
+    docker_port: int = 5001
+    container_port: int = 5001
+    environment: dict[str, str] = Field(
+        default_factory=lambda: {
+            "DOCLING_SERVE_HOST": "0.0.0.0",
+            "DOCLING_SERVE_PORT": "5001",
+            "LOG_LEVEL": "DEBUG",  # Enable verbose logging
+            # Performance Tuning
+            # "UVICORN_WORKERS": "4",  # Increase web server workers (Default: 1)
+            # "DOCLING_SERVE_ENG_LOC_NUM_WORKERS": "4",  # Increase processing workers (Default: 2)
+            "DOCLING_NUM_THREADS": "32",  # Increase torch threads (Default: 4)
+        }
+    )
+
+    def model_post_init(self, __context):
+        """Set docker_image and gpu_device_ids based on cpu_only if not explicitly provided."""
+        if not self.docker_image:
+            if self.cpu_only:
+                self.docker_image = "quay.io/docling-project/docling-serve-cpu:latest"
+            else:
+                self.docker_image = "quay.io/docling-project/docling-serve:latest"
+
+        # For CPU-only mode, explicitly disable GPU by setting empty list
+        if self.cpu_only and self.gpu_device_ids is None:
+            self.gpu_device_ids = []
+
+        if self.enable_ui:
+            self.command_args.append("--enable-ui")
+
+    @property
+    def client_config(self):
+        return DoclingConverterConfig(base_url=f"http://localhost:{self.docker_port}")
+
+
+class DoclingConverterConfig(ConverterConfig):
+    """Configuration for Docling converter client."""
+
+    model_name: str = "docling"
+    base_url: str = "http://localhost:5001"
+    timeout: int = 300
+    api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
+
+    def get_client(self, **kwargs) -> "DoclingConverter":
+        return DoclingConverter(config=self, **kwargs)
+
+
+def image_to_bytes(image: Image.Image) -> bytes:
+    # Convert image to bytes for file upload
+    img_byte_arr = BytesIO()
+    image.save(img_byte_arr, format="PNG")
+    img_bytes = img_byte_arr.getvalue()
+    return img_bytes
+
+
+class DoclingConverter(BaseConverter):
+    """Client for Docling Serve API using httpx."""
+
+    def __init__(
+        self,
+        config: DoclingConverterConfig,
+        num_concurrent_files: int = 10,
+        num_concurrent_pages: int = 10,
+        save_folder: str | None = None,
+        save_mode: Literal["document", "md", "md_page"] = "document",
+        debug: bool = False,
+        return_documents_in_batch_mode: bool = False,
+    ):
+        super().__init__(
+            config=config,
+            num_concurrent_files=num_concurrent_files,
+            num_concurrent_pages=num_concurrent_pages,
+            save_folder=save_folder,
+            save_mode=save_mode,
+            debug=debug,
+            return_documents_in_batch_mode=return_documents_in_batch_mode,
+        )
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        """Process a single page using Docling Serve API."""
+        img_bytes = await asyncio.to_thread(image_to_bytes, page.image)
+
+        data = self.config.api_kwargs
+        url = f"{self.config.base_url}/v1/convert/file"
+        logger.debug(f"Calling Docling API at: {url}")
+        files = {"files": ("image.png", img_bytes, "image/png")}
+
+        try:
+            async with httpx.AsyncClient(timeout=self.config.timeout) as client:
+                response = await client.post(
+                    url, files=files, data=data, headers={"Accept": "application/json"}
+                )
+                response.raise_for_status()
+
+            result = response.json()
+            logger.info(f"Docling API response status: {response.status_code}")
+
+            # Extract text from the response
+            # The response structure depends on the output format
+            if self.config.api_kwargs["output_format"] == "markdown":
+                text = result["document"]["md_content"]
+
+            elif self.config.api_kwargs["output_format"] == "text":
+                text = result["document"]["md_content"]
+
+            else:  # json or other formats
+                text = str(result)
+
+            logger.info(f"Extracted text length: {len(text)}")
+
+            # Clean and convert the response
+            text = clean_response(text)
+            text = html_to_md_keep_tables(text)
+            page.text = text
+
+        except Exception as e:
+            logger.error(f"Error processing page with Docling: {e}")
+            page.text = f"Error: {str(e)}"
+
+        return page
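For orientation, here is a minimal usage sketch of the new Docling client. It assumes a docling-serve container is already listening on localhost:5001 and that `Page` can be constructed directly from a PIL image; `Page` lives in `vlmparse/data_model/document.py`, which this hunk does not show, so treat that constructor call as hypothetical.

```python
import asyncio

from PIL import Image

from vlmparse.clients.docling import DoclingConverterConfig
from vlmparse.data_model.document import Page


async def convert_one(image: Image.Image) -> str:
    # Point the client at an already-running docling-serve instance.
    config = DoclingConverterConfig(base_url="http://localhost:5001")
    converter = config.get_client()
    # Page(image=...) is an assumed constructor; the Page model is not in this hunk.
    page = await converter.async_call_inside_page(Page(image=image))
    return page.text


# asyncio.run(convert_one(Image.open("page.png")))
```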
vlmparse/clients/dotsocr.py

@@ -0,0 +1,277 @@
+import json
+import math
+from pathlib import Path
+from typing import ClassVar, Literal
+
+from loguru import logger
+from PIL import Image
+from pydantic import Field
+
+from vlmparse.clients.openai_converter import (
+    OpenAIConverterClient,
+    OpenAIConverterConfig,
+)
+from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+from vlmparse.clients.pipe_utils.utils import clean_response
+from vlmparse.data_model.document import BoundingBox, Item, Page
+from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME, DockerServerConfig
+from vlmparse.utils import to_base64
+
+DOCKERFILE_DIR = Path(__file__).parent.parent.parent / "docker_pipelines"
+
+
+class DotsOCRDockerServerConfig(DockerServerConfig):
+    """Configuration for DotsOCR model."""
+
+    model_name: str = "rednote-hilab/dots.ocr"
+    docker_image: str = "dotsocr:latest"
+    dockerfile_dir: str = str(DOCKERFILE_DIR / "dotsocr")
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--tensor-parallel-size",
+            "1",
+            "--gpu-memory-utilization",
+            "0.8",
+            "--chat-template-content-format",
+            "string",
+            "--served-model-name",
+            DEFAULT_MODEL_NAME,
+            "--trust-remote-code",
+            # "--limit-mm-per-prompt",
+            # '{"image": 1}',
+            # "--no-enable-prefix-caching",
+            # "--max-model-len",
+            # "16384",
+        ]
+    )
+    add_model_key_to_server: bool = False
+    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+
+    @property
+    def client_config(self):
+        return DotsOCRConverterConfig(llm_params=self.llm_params)
+
+
+class DotsOCRConverterConfig(OpenAIConverterConfig):
+    model_name: str = "rednote-hilab/dots.ocr"
+    preprompt: str | None = ""
+    postprompt: str | None = None
+    completion_kwargs: dict | None = {
+        "temperature": 0.1,
+        "top_p": 1.0,
+        "max_completion_tokens": 16384,
+    }
+    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    dpi: int = 200
+    prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"
+
+    def get_client(self, **kwargs) -> "DotsOCRConverter":
+        return DotsOCRConverter(config=self, **kwargs)
+
+
+class DotsOCRConverter(OpenAIConverterClient):
+    """DotsOCR VLLM converter."""
+
+    # Constants
+    MIN_PIXELS: ClassVar[int] = 3136
+    MAX_PIXELS: ClassVar[int] = 11289600
+    IMAGE_FACTOR: ClassVar[int] = 28
+
+    # Prompts
+    PROMPTS: ClassVar[dict] = {
+        "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
+
+1. Bbox format: [x1, y1, x2, y2]
+
+2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
+
+3. Text Extraction & Formatting Rules:
+    - Picture: For the 'Picture' category, the text field should be omitted.
+    - Formula: Format its text as LaTeX.
+    - Table: Format its text as HTML.
+    - All Others (Text, Title, etc.): Format their text as Markdown.
+
+4. Constraints:
+    - The output text must be the original text from the image, with no translation.
+    - All layout elements must be sorted according to human reading order.
+
+5. Final Output: The entire output must be a single JSON object.
+""",
+        "prompt_ocr": """Extract the text content from this image.""",
+    }
+
+    @staticmethod
+    def round_by_factor(number: int, factor: int) -> int:
+        """Returns the closest integer to 'number' that is divisible by 'factor'."""
+        return round(number / factor) * factor
+
+    @staticmethod
+    def ceil_by_factor(number: int, factor: int) -> int:
+        """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+        return math.ceil(number / factor) * factor
+
+    @staticmethod
+    def floor_by_factor(number: int, factor: int) -> int:
+        """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+        return math.floor(number / factor) * factor
+
+    def smart_resize(
+        self,
+        height: int,
+        width: int,
+        factor: int = 28,
+        min_pixels: int = 3136,
+        max_pixels: int = 11289600,
+    ):
+        """Rescales image dimensions to meet factor, pixel range, and aspect ratio constraints."""
+        if max(height, width) / min(height, width) > 200:
+            raise ValueError(
+                f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+            )
+        h_bar = max(factor, self.round_by_factor(height, factor))
+        w_bar = max(factor, self.round_by_factor(width, factor))
+        if h_bar * w_bar > max_pixels:
+            beta = math.sqrt((height * width) / max_pixels)
+            h_bar = max(factor, self.floor_by_factor(height / beta, factor))
+            w_bar = max(factor, self.floor_by_factor(width / beta, factor))
+        elif h_bar * w_bar < min_pixels:
+            beta = math.sqrt(min_pixels / (height * width))
+            h_bar = self.ceil_by_factor(height * beta, factor)
+            w_bar = self.ceil_by_factor(width * beta, factor)
+        if h_bar * w_bar > max_pixels:
+            beta = math.sqrt((h_bar * w_bar) / max_pixels)
+            h_bar = max(factor, self.floor_by_factor(h_bar / beta, factor))
+            w_bar = max(factor, self.floor_by_factor(w_bar / beta, factor))
+        return h_bar, w_bar
+
+    def fetch_image(
+        self,
+        image,
+        min_pixels=None,
+        max_pixels=None,
+    ) -> Image.Image:
+        """Fetch and resize image."""
+        # Resize if needed
+        if min_pixels or max_pixels:
+            width, height = image.size
+            if not min_pixels:
+                min_pixels = self.MIN_PIXELS
+            if not max_pixels:
+                max_pixels = self.MAX_PIXELS
+            resized_height, resized_width = self.smart_resize(
+                height,
+                width,
+                factor=self.IMAGE_FACTOR,
+                min_pixels=min_pixels,
+                max_pixels=max_pixels,
+            )
+            assert resized_height > 0 and resized_width > 0
+            image = image.resize((resized_width, resized_height))
+
+        return image
+
+    def post_process_cells(
+        self,
+        origin_image: Image.Image,
+        cells: list,
+        input_width: int,
+        input_height: int,
+    ) -> list:
+        """Post-process cell bounding boxes to original image dimensions."""
+        if not cells or not isinstance(cells, list):
+            return cells
+
+        original_width, original_height = origin_image.size
+
+        scale_x = input_width / original_width
+        scale_y = input_height / original_height
+
+        cells_out = []
+        for cell in cells:
+            bbox = cell["bbox"]
+            bbox_resized = [
+                int(float(bbox[0]) / scale_x),
+                int(float(bbox[1]) / scale_y),
+                int(float(bbox[2]) / scale_x),
+                int(float(bbox[3]) / scale_y),
+            ]
+            cell_copy = cell.copy()
+            cell_copy["bbox"] = bbox_resized
+            cells_out.append(cell_copy)
+
+        return cells_out
+
+    async def _async_inference_with_vllm(self, image, prompt):
+        """Run async inference with VLLM."""
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{to_base64(image)}"
+                        },
+                    },
+                    {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"},
+                ],
+            }
+        ]
+
+        return await self._get_chat_completion(messages)
+
+    async def _parse_image_vllm(self, origin_image, prompt_mode="prompt_layout_all_en"):
+        """Parse image using VLLM inference."""
+
+        image = self.fetch_image(
+            origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
+        )
+        prompt = self.PROMPTS[prompt_mode]
+
+        response = await self._async_inference_with_vllm(image, prompt)
+
+        if prompt_mode in ["prompt_layout_all_en"]:
+            try:
+                cells = json.loads(response)
+                cells = self.post_process_cells(
+                    origin_image,
+                    cells,
+                    image.width,
+                    image.height,
+                )
+                return {}, cells, False
+            except Exception as e:
+                logger.warning(f"cells post process error: {e}, returning raw response")
+                return {}, response, True
+        else:
+            return {}, response, None
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        image = page.image
+
+        _, response, _ = await self._parse_image_vllm(
+            image, prompt_mode=self.config.prompt_mode
+        )
+        logger.info("Response: " + str(response))
+
+        items = None
+        if self.config.prompt_mode == "prompt_layout_all_en":
+            text = "\n\n".join([item.get("text", "") for item in response])
+
+            items = []
+            for item in response:
+                l, t, r, b = item["bbox"]
+                items.append(
+                    Item(
+                        text=item.get("text", ""),
+                        box=BoundingBox(l=l, t=t, r=r, b=b),
+                        category=item["category"],
+                    )
+                )
+            response = text
+            page.items = items
+
+        text = clean_response(response)
+        text = html_to_md_keep_tables(text)
+        page.text = text
+        return page
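The resize logic above follows the Qwen-style `smart_resize` scheme: snap both sides to multiples of 28, then rescale into the [MIN_PIXELS, MAX_PIXELS] budget. A standalone check of the arithmetic for an illustrative 200-dpi letter page (constants copied from the class):

```python
def round_by_factor(number: int, factor: int) -> int:
    # Mirrors DotsOCRConverter.round_by_factor.
    return round(number / factor) * factor


FACTOR, MIN_PIXELS, MAX_PIXELS = 28, 3136, 11289600

height, width = 2200, 1700  # 8.5x11 in page rendered at 200 dpi (illustrative)
h_bar = max(FACTOR, round_by_factor(height, FACTOR))  # 2212
w_bar = max(FACTOR, round_by_factor(width, FACTOR))   # 1708
# 2212 * 1708 = 3,778,096 pixels: inside the budget, so neither rescale branch fires.
assert MIN_PIXELS <= h_bar * w_bar <= MAX_PIXELS
```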
vlmparse/clients/granite_docling.py

@@ -0,0 +1,132 @@
+from pydantic import Field
+
+from vlmparse.clients.openai_converter import (
+    OpenAIConverterClient,
+    OpenAIConverterConfig,
+)
+from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+from vlmparse.clients.pipe_utils.utils import clean_response
+from vlmparse.data_model.document import Page
+from vlmparse.servers.docker_server import VLLMDockerServerConfig
+from vlmparse.utils import to_base64
+
+
+class GraniteDoclingDockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for Granite Docling model."""
+
+    model_name: str = "ibm-granite/granite-docling-258M"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--revision",
+            "untied",
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--trust-remote-code",
+        ]
+    )
+    aliases: list[str] = Field(default_factory=lambda: ["granite-docling"])
+
+    @property
+    def client_config(self):
+        return GraniteDoclingConverterConfig(llm_params=self.llm_params)
+
+
+class GraniteDoclingConverterConfig(OpenAIConverterConfig):
+    """Granite Docling converter configuration."""
+
+    preprompt: str | None = None
+    postprompt: str | None = "Convert this page to docling."
+    completion_kwargs: dict | None = {
+        "temperature": 0.0,
+        "max_tokens": 8000,
+        "extra_body": {
+            "skip_special_tokens": False,
+        },
+    }
+    aliases: list[str] = Field(default_factory=lambda: ["granite-docling"])
+
+    def get_client(self, **kwargs) -> "GraniteDoclingConverter":
+        return GraniteDoclingConverter(config=self, **kwargs)
+
+
+class GraniteDoclingConverter(OpenAIConverterClient):
+    """Client for Granite Docling model."""
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        image = page.image.convert("RGB")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{to_base64(image)}"
+                        },
+                    },
+                    {"type": "text", "text": self.config.postprompt},
+                ],
+            }
+        ]
+
+        doctags = await self._get_chat_completion_adaptive(
+            messages, completion_kwargs=self.config.completion_kwargs
+        )
+        doctags = clean_response(doctags)
+
+        page.raw_response = doctags
+        page.text = _doctags_to_markdown(doctags, image)
+        return page
+
+    async def _get_chat_completion_adaptive(
+        self, messages: list[dict], completion_kwargs: dict | None
+    ) -> str:
+        """
+        vLLM enforces input+output <= model context length. If `max_tokens` is too
+        high (especially for multimodal prompts), retry with progressively smaller
+        `max_tokens`.
+        """
+        kwargs = (completion_kwargs or {}).copy()
+        max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens")
+
+        for _ in range(6):
+            try:
+                return await self._get_chat_completion(
+                    messages, completion_kwargs=kwargs
+                )
+            except Exception as e:
+                msg = str(e)
+                too_large = (
+                    "max_tokens" in msg
+                    and "maximum context length" in msg
+                    and "is too large" in msg
+                )
+                if not too_large or not isinstance(max_tokens, int):
+                    raise
+
+                max_tokens = max(256, int(max_tokens * 0.75))
+                if "max_tokens" in kwargs:
+                    kwargs["max_tokens"] = max_tokens
+                if "max_completion_tokens" in kwargs:
+                    kwargs["max_completion_tokens"] = max_tokens
+
+        return await self._get_chat_completion(messages, completion_kwargs=kwargs)
+
+
+def _doctags_to_markdown(doctags: str, image):
+    try:
+        from docling_core.types.doc import DoclingDocument
+        from docling_core.types.doc.document import DocTagsDocument
+    except Exception as e:  # pragma: no cover
+        raise RuntimeError(
+            "Missing optional dependency 'docling-core'. "
+            "Install it with: pip install 'vlmparse[docling_core]'"
+        ) from e
+
+    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
+    doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+
+    html = doc.export_to_html()
+    html = clean_response(html)
+    md = html_to_md_keep_tables(html, remove_head=True)
+    return md
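The adaptive helper above shrinks `max_tokens` by 25% per context-length failure, floored at 256, before one final attempt. With the config's default of 8000, the retry schedule works out as below (a quick illustration, not package code):

```python
max_tokens, schedule = 8000, [8000]
for _ in range(6):
    max_tokens = max(256, int(max_tokens * 0.75))
    schedule.append(max_tokens)
print(schedule)  # [8000, 6000, 4500, 3375, 2531, 1898, 1423]
```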
vlmparse/clients/hunyuanocr.py

@@ -0,0 +1,45 @@
+from pydantic import Field
+
+from vlmparse.clients.openai_converter import OpenAIConverterConfig
+from vlmparse.servers.docker_server import VLLMDockerServerConfig
+
+
+class HunyuanOCRDockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for HunyuanOCR model."""
+
+    model_name: str = "tencent/HunyuanOCR"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--async-scheduling",
+            "--no-enable-prefix-caching",
+            "--mm-processor-cache-gb",
+            "0",
+            # Default argument in the hunyuan model, not sure why it is set this low.
+            "--gpu-memory-utilization",
+            "0.2",
+        ]
+    )
+    aliases: list[str] = Field(default_factory=lambda: ["hunyuanocr"])
+
+    @property
+    def client_config(self):
+        return HunyuanOCRConverterConfig(llm_params=self.llm_params)
+
+
+class HunyuanOCRConverterConfig(OpenAIConverterConfig):
+    """HunyuanOCR converter"""
+
+    model_name: str = "tencent/HunyuanOCR"
+    preprompt: str | None = ""
+    postprompt: str | None = (
+        "Extract all information from the main body of the document image and represent it in markdown format, ignoring headers and footers. Tables should be expressed in HTML format, formulas in the document should be represented using LaTeX format, and the parsing should be organized according to the reading order."
+    )
+    completion_kwargs: dict | None = {
+        "temperature": 0.0,
+        "extra_body": {"top_k": 1, "repetition_penalty": 1.0},
+    }
+    max_image_size: int | None = 1540
+    dpi: int = 200
+    aliases: list[str] = Field(default_factory=lambda: ["hunyuanocr"])
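Note the pinned `--gpu-memory-utilization 0.2`, which the in-code comment flags as an upstream default. A caller wanting more KV-cache headroom could rewrite the flag before launching; a sketch assuming these configs are pydantic v2 models (hence `model_copy`):

```python
from vlmparse.clients.hunyuanocr import HunyuanOCRDockerServerConfig

cfg = HunyuanOCRDockerServerConfig()
args = list(cfg.command_args)
args[args.index("--gpu-memory-utilization") + 1] = "0.9"  # raise from the 0.2 default
cfg = cfg.model_copy(update={"command_args": args})
```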
vlmparse/clients/lightonocr.py

@@ -0,0 +1,43 @@
+from pathlib import Path
+
+from pydantic import Field
+
+from vlmparse.clients.openai_converter import OpenAIConverterConfig
+from vlmparse.servers.docker_server import VLLMDockerServerConfig
+
+DOCKERFILE_DIR = Path(__file__).parent.parent.parent / "docker_pipelines"
+
+
+class LightOnOCRDockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for LightOnOCR model."""
+
+    model_name: str = "lightonai/LightOnOCR-1B-1025"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--mm-processor-cache-gb",
+            "0",
+            "--no-enable-prefix-caching",
+        ]
+    )
+    aliases: list[str] = Field(default_factory=lambda: ["lightonocr"])
+
+    @property
+    def client_config(self):
+        return LightOnOCRConverterConfig(llm_params=self.llm_params)
+
+
+class LightOnOCRConverterConfig(OpenAIConverterConfig):
+    """LightOnOCR converter - backward compatibility alias."""
+
+    model_name: str = "lightonai/LightOnOCR-1B-1025"
+    preprompt: str | None = None
+    postprompt: str | None = None
+    completion_kwargs: dict | None = {
+        "temperature": 0.2,
+        "max_tokens": 4096,
+        "top_p": 0.9,
+    }
+    dpi: int = 200
+    aliases: list[str] = Field(default_factory=lambda: ["lightonocr"])
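As with the other vLLM-backed clients in this release, the intended wiring appears to be server config → `client_config` → converter. A hedged sketch; how `VLLMDockerServerConfig` starts the container, and the `get_client` inherited from `OpenAIConverterConfig`, are outside this hunk:

```python
from vlmparse.clients.lightonocr import LightOnOCRDockerServerConfig

server_cfg = LightOnOCRDockerServerConfig()
# client_config is the property defined above; get_client is assumed to be
# inherited from OpenAIConverterConfig (not shown in this diff).
converter = server_cfg.client_config.get_client(num_concurrent_pages=4)
```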