vlmparse 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl
- vlmparse/build_doc.py +20 -19
- vlmparse/cli.py +17 -1
- vlmparse/clients/chandra.py +176 -60
- vlmparse/clients/deepseekocr.py +23 -12
- vlmparse/clients/docling.py +0 -1
- vlmparse/clients/dotsocr.py +34 -31
- vlmparse/clients/granite_docling.py +9 -36
- vlmparse/clients/hunyuanocr.py +5 -1
- vlmparse/clients/lightonocr.py +23 -1
- vlmparse/clients/mineru.py +0 -1
- vlmparse/clients/mistral_converter.py +85 -0
- vlmparse/clients/nanonetocr.py +5 -1
- vlmparse/clients/olmocr.py +6 -2
- vlmparse/clients/openai_converter.py +95 -60
- vlmparse/clients/paddleocrvl.py +9 -2
- vlmparse/converter.py +51 -11
- vlmparse/converter_with_server.py +41 -5
- vlmparse/registries.py +97 -89
- vlmparse/servers/docker_server.py +59 -35
- vlmparse/servers/model_identity.py +48 -0
- vlmparse/utils.py +15 -2
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.8.dist-info}/METADATA +11 -1
- vlmparse-0.1.8.dist-info/RECORD +38 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.8.dist-info}/WHEEL +1 -1
- vlmparse-0.1.7.dist-info/RECORD +0 -36
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.8.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.8.dist-info}/top_level.txt +0 -0
vlmparse/build_doc.py
CHANGED
@@ -10,12 +10,10 @@ from .constants import PDF_EXTENSION
 
 
 def convert_pdfium(file_path, dpi):
-    pdf = pdfium.PdfDocument(file_path)
     pil_images = []
-
-
-
-    pdf.close()
+    with pdfium.PdfDocument(file_path) as pdf:
+        for page in pdf:
+            pil_images.append(page.render(scale=dpi / 72).to_pil())
     return pil_images
 
 
@@ -32,24 +30,29 @@ def convert_pdfium_to_images(file_path, dpi=175):
         ]
 
     except PIL.Image.DecompressionBombError as e:
-        logger.exception(
+        logger.opt(exception=True).warning(
+            "Decompression bomb detected for {file_path}, reducing DPI",
+            file_path=str(file_path),
+        )
         cur_size, limit_size = map(int, re.findall(r"\d+", str(e)))
         factor = custom_ceil(cur_size / limit_size, precision=1)
-
-
+        new_dpi = dpi // factor
+        logger.info(
+            "Retrying {file_path} with reduced DPI: {old_dpi} -> {new_dpi}",
+            file_path=str(file_path),
+            old_dpi=dpi,
+            new_dpi=new_dpi,
         )
-
-        images = convert_pdfium(file_path, dpi=dpi)
+        images = convert_pdfium(file_path, dpi=new_dpi)
 
     return images
 
 
 def convert_specific_page_to_image(file_path, page_number, dpi=175):
-
-
-
-
-    pdf.close()
+    with pdfium.PdfDocument(file_path) as pdf:
+        page = pdf.get_page(page_number)
+        image = page.render(scale=dpi / 72).to_pil()
+        image = image.convert("L").convert("RGB") if image.mode != "RGB" else image
    return image
 
 
@@ -68,9 +71,7 @@ def resize_image(image, max_image_size):
 
 def get_page_count(file_path):
     if Path(file_path).suffix.lower() == PDF_EXTENSION:
-
-
-        pdf.close()
-        return count
+        with pdfium.PdfDocument(file_path) as pdf:
+            return len(pdf)
     else:
         return 1
vlmparse/cli.py
CHANGED
@@ -49,10 +49,19 @@ class DParseCLI:
         uri: str | None = None,
         gpus: str | None = None,
         mode: Literal["document", "md", "md_page"] = "document",
+        conversion_mode: Literal[
+            "ocr",
+            "ocr_layout",
+            "table",
+            "image_description",
+            "formula",
+            "chart",
+        ] = "ocr",
         with_vllm_server: bool = False,
         concurrency: int = 10,
         dpi: int | None = None,
         debug: bool = False,
+        _return_documents: bool = False,
     ):
         """Parse PDF documents and save results.
 
@@ -64,6 +73,7 @@ class DParseCLI:
             uri: URI of the server, if not specified and the pipe is vllm, a local server will be deployed
             gpus: Comma-separated GPU device IDs (e.g., "0" or "0,1,2"). If not specified, all GPUs will be used.
             mode: Output mode - "document" (save as JSON zip), "md" (save as markdown file), "md_page" (save as folder of markdown pages)
+            conversion_mode: Conversion mode - "ocr" (plain), "ocr_layout" (OCR with layout), "table" (table-centric), "image_description" (describe the image), "formula" (formula extraction), "chart" (chart recognition)
             with_vllm_server: If True, a local VLLM server will be deployed if the model is not found in the registry. Note that if the model is in the registry and the uri is None, the server will be anyway deployed.
             dpi: DPI to use for the conversion. If not specified, the default DPI will be used.
             debug: If True, run in debug mode (single-threaded, no concurrency)
@@ -76,9 +86,15 @@ class DParseCLI:
             gpus=gpus,
             with_vllm_server=with_vllm_server,
             concurrency=concurrency,
+            return_documents=_return_documents,
         ) as converter_with_server:
             return converter_with_server.parse(
-                inputs=inputs,
+                inputs=inputs,
+                out_folder=out_folder,
+                mode=mode,
+                conversion_mode=conversion_mode,
+                dpi=dpi,
+                debug=debug,
             )
 
     def list(self):
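The new `conversion_mode` option is forwarded straight to `converter_with_server.parse(...)` together with `out_folder`, `mode`, `dpi`, and `debug`, which were previously dropped. A hedged sketch of invoking the updated entry point from Python (the method name `parse` is an assumption inferred from the docstring; the input path and output folder are made up):

```python
from vlmparse.cli import DParseCLI

cli = DParseCLI()
# `parse` is assumed to be the method carrying the signature patched above;
# the hunk shows only its parameters and docstring.
cli.parse(
    "document.pdf",            # hypothetical input
    out_folder="out/",         # hypothetical output folder
    mode="md",                 # save results as a markdown file
    conversion_mode="table",   # new in 0.1.8: table-centric conversion
)
```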
vlmparse/clients/chandra.py
CHANGED
@@ -1,6 +1,9 @@
+import json
 import math
 import time
+from dataclasses import asdict, dataclass
 
+from bs4 import BeautifulSoup
 from loguru import logger
 from PIL import Image
 from pydantic import Field
@@ -11,7 +14,8 @@ from vlmparse.clients.openai_converter import (
 )
 from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
 from vlmparse.clients.pipe_utils.utils import clean_response
-from vlmparse.data_model.
+from vlmparse.data_model.box import BoundingBox
+from vlmparse.data_model.document import Item, Page
 from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
@@ -110,11 +114,6 @@ OCR this image to HTML.
 {PROMPT_ENDING}
 """.strip()
 
-PROMPT_MAPPING = {
-    "ocr_layout": OCR_LAYOUT_PROMPT,
-    "ocr": OCR_PROMPT,
-}
-
 
 def scale_to_fit(
     img: Image.Image,
@@ -188,11 +187,135 @@ def detect_repeat_token(
     return False
 
 
+@dataclass
+class LayoutBlock:
+    """Represents a layout block with bounding box and content."""
+
+    bbox: list[int]
+    label: str
+    content: str
+
+
+def parse_layout(
+    html: str, image: Image.Image, bbox_scale: int = 1024
+) -> list[LayoutBlock]:
+    """
+    Parse HTML layout blocks with bounding boxes.
+
+    Args:
+        html: HTML string with layout blocks (divs with data-bbox and data-label attributes)
+        image: PIL Image to get dimensions for bbox scaling
+        bbox_scale: The scale used in the prompt for normalized bboxes
+
+    Returns:
+        List of LayoutBlock objects with scaled bounding boxes
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    top_level_divs = soup.find_all("div", recursive=False)
+    width, height = image.size
+    width_scaler = width / bbox_scale
+    height_scaler = height / bbox_scale
+    layout_blocks = []
+
+    for div in top_level_divs:
+        bbox = div.get("data-bbox")
+
+        try:
+            bbox = json.loads(bbox)
+            assert len(bbox) == 4, "Invalid bbox length"
+        except Exception:
+            try:
+                bbox = bbox.split(" ")
+                assert len(bbox) == 4, "Invalid bbox length"
+            except Exception:
+                # Default bbox if parsing fails
+                bbox = [0, 0, bbox_scale, bbox_scale]
+
+        bbox = list(map(int, bbox))
+        # Scale bbox to image dimensions
+        bbox = [
+            max(0, int(bbox[0] * width_scaler)),
+            max(0, int(bbox[1] * height_scaler)),
+            min(int(bbox[2] * width_scaler), width),
+            min(int(bbox[3] * height_scaler), height),
+        ]
+
+        label = div.get("data-label", "block")
+        content = str(div.decode_contents())
+        layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content))
+
+    return layout_blocks
+
+
+def parse_chunks(html: str, image: Image.Image, bbox_scale: int = 1024) -> list[dict]:
+    """
+    Parse HTML layout blocks into dictionaries.
+
+    Args:
+        html: HTML string with layout blocks
+        image: PIL Image to get dimensions for bbox scaling
+        bbox_scale: The scale used in the prompt for normalized bboxes
+
+    Returns:
+        List of dictionaries with bbox, label, and content keys
+    """
+    layout = parse_layout(html, image, bbox_scale=bbox_scale)
+    chunks = [asdict(block) for block in layout]
+    return chunks
+
+
+def layout_blocks_to_items(
+    layout_blocks: list[LayoutBlock],
+) -> list[Item]:
+    """
+    Convert layout blocks to Item objects for the Page model.
+
+    Args:
+        layout_blocks: List of LayoutBlock objects
+
+    Returns:
+        List of Item objects with category, box, and text
+    """
+    items = []
+    for block in layout_blocks:
+        # Convert content HTML to markdown
+        try:
+            text = html_to_md_keep_tables(block.content)
+        except Exception as e:
+            logger.warning(f"Error converting block content to markdown: {e}")
+            text = block.content
+
+        # Create bounding box from [x0, y0, x1, y1] format
+        bbox = BoundingBox(
+            l=block.bbox[0],
+            t=block.bbox[1],
+            r=block.bbox[2],
+            b=block.bbox[3],
+        )
+
+        items.append(
+            Item(
+                category=block.label,
+                box=bbox,
+                text=text.strip(),
+            )
+        )
+
+    return items
+
+
 class ChandraConverterConfig(OpenAIConverterConfig):
     """Chandra converter configuration."""
 
     model_name: str = "datalab-to/chandra"
-
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "ocr": OCR_PROMPT,
+        "ocr_layout": OCR_LAYOUT_PROMPT,
+    }
+    prompt_mode_map: dict[str, str] = {
+        "table": "ocr_layout",
+    }
     bbox_scale: int = 1024
     max_retries: int = 0
     max_failure_retries: int = None
@@ -216,8 +339,7 @@ class ChandraConverterClient(OpenAIConverterClient):
 
     async def async_call_inside_page(self, page: Page) -> Page:
         """Process a single page using Chandra logic."""
-
-        prompt = PROMPT_MAPPING.get(self.config.prompt_type, OCR_PROMPT)
+        prompt = self.get_prompt_for_mode() or OCR_PROMPT
         prompt = prompt.replace("{bbox_scale}", str(self.config.bbox_scale))
 
         image = scale_to_fit(page.image)
@@ -238,61 +360,34 @@ class ChandraConverterClient(OpenAIConverterClient):
 
         retries = 0
         max_retries = self.config.max_retries
-        max_failure_retries = self.config.max_failure_retries
 
         result_content = ""
-        error_occurred = False
 
         while True:
-            try:
-                # Adjust temperature if retrying
-                temperature = self.config.completion_kwargs.get("temperature", 0.0)
-                if retries > 0:
-                    temperature = 0.3  # As per vllm.py logic
-
-                completion_kwargs = self.config.completion_kwargs.copy()
-                completion_kwargs["temperature"] = temperature
-                if retries > 0:
-                    completion_kwargs["top_p"] = 0.95
-
-                result_content = await self._get_chat_completion(
-                    messages, completion_kwargs=completion_kwargs
-                )
-                error_occurred = False
-            except Exception as e:
-                logger.error(f"Error during VLLM generation: {e}")
-                error_occurred = True
-                result_content = ""
-
             should_retry = False
-
-
-            if
-
-
-
+            # Adjust temperature if retrying
+            temperature = self.config.completion_kwargs.get("temperature", 0.0)
+            if retries > 0:
+                temperature = 0.3  # As per vllm.py logic
+
+            completion_kwargs = self.config.completion_kwargs.copy()
+            completion_kwargs["temperature"] = temperature
+            if retries > 0:
+                completion_kwargs["top_p"] = 0.95
+
+            result_content, usage = await self._get_chat_completion(
+                messages, completion_kwargs=completion_kwargs
+            )
+
+            has_repeat = detect_repeat_token(result_content) or (
+                len(result_content) > 50
+                and detect_repeat_token(result_content, cut_from_end=50)
+            )
+            if has_repeat and retries < max_retries:
+                logger.warning(
+                    f"Detected repeat token, retrying generation (attempt {retries + 1})..."
                 )
-
-            logger.warning(
-                f"Detected repeat token, retrying generation (attempt {retries + 1})..."
-            )
-                should_retry = True
-
-            # Check for error
-            if error_occurred:
-                if max_failure_retries is not None:
-                    if retries < max_failure_retries:
-                        logger.warning(
-                            f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                        )
-                        should_retry = True
-                elif (
-                    retries < max_retries
-                ):  # Fallback to max_retries if max_failure_retries not set (vllm.py logic varies slightly but this is safe)
-                    logger.warning(
-                        f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                    )
-                    should_retry = True
+                should_retry = True
 
             if should_retry:
                 time.sleep(2 * (retries + 1))
@@ -305,10 +400,27 @@ class ChandraConverterClient(OpenAIConverterClient):
         page.raw_response = result_content
         text = clean_response(result_content)
 
+        # Check if we're in layout mode (ocr_layout prompt)
+        current_prompt_key = self.get_prompt_key()
+        is_layout_mode = current_prompt_key == "ocr_layout"
+
+        if is_layout_mode:
+            # Parse layout blocks and populate items
+            try:
+                layout_blocks = parse_layout(
+                    text, image, bbox_scale=self.config.bbox_scale
+                )
+                page.items = layout_blocks_to_items(layout_blocks)
+                logger.info(f"Parsed {len(page.items)} layout blocks")
+            except Exception as e:
+                logger.warning(f"Error parsing layout blocks: {e}")
+                page.items = []
+
         # Convert HTML to MD
         text = html_to_md_keep_tables(text)
         page.text = text
-
+        page.completion_tokens = usage.completion_tokens
+        page.prompt_tokens = usage.prompt_tokens
         return page
 
 
@@ -320,4 +432,8 @@ class ChandraDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return ChandraConverterConfig(
+        return ChandraConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
vlmparse/clients/deepseekocr.py
CHANGED
@@ -1,5 +1,4 @@
 import re
-from typing import ClassVar, Literal
 
 from loguru import logger
 from PIL import Image
@@ -35,7 +34,11 @@ class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return DeepSeekOCRConverterConfig(
+        return DeepSeekOCRConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
@@ -43,8 +46,17 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
 
     model_name: str = "deepseek-ai/DeepSeek-OCR"
     aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "layout": "<|grounding|>Convert the document to markdown.",
+        "ocr": "Free OCR.",
+        "image_description": "Describe this image in detail.",
+    }
+    prompt_mode_map: dict[str, str] = {
+        "ocr_layout": "layout",
+        "table": "layout",
+    }
 
-    prompt_mode: Literal["layout", "ocr"] = "ocr"
     completion_kwargs: dict | None = {
         "temperature": 0.0,
         "max_tokens": 8181,
@@ -95,12 +107,6 @@ def extract_coordinates_and_label(ref_text):
 class DeepSeekOCRConverterClient(OpenAIConverterClient):
     """Client for DeepSeekOCR with specific post-processing."""
 
-    PROMPTS: ClassVar[dict] = {
-        "layout": "<|grounding|>Convert the document to markdown.",
-        "ocr": "Free OCR.",
-        "image_description": "Describe this image in detail.",
-    }
-
     def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
         items = []
         width, height = image.size
@@ -153,6 +159,8 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
         # Prepare messages as in parent class
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "ocr"
+
         messages = [
             {
                 "role": "user",
@@ -163,17 +171,17 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
                             "url": f"data:image/png;base64,{to_base64(image)}"
                         },
                     },
-                    {"type": "text", "text": self.
+                    {"type": "text", "text": self.config.prompts[prompt_key]},
                 ],
             },
         ]
 
         # Get raw response using parent's method
-        response = await self._get_chat_completion(messages)
+        response, usage = await self._get_chat_completion(messages)
         logger.info("Response length: " + str(len(response)))
         page.raw_response = response
 
-        if
+        if prompt_key == "layout":
             # Post-processing
             matches, matches_image, matches_other = re_match(response)
 
@@ -199,5 +207,8 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
 
         page.text = outputs.strip()
         logger.debug(page.text)
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
 
         return page
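As in the other clients, the per-mode prompt now lives on the config as a `prompts` dict plus a `prompt_mode_map` alias table, and the client asks `get_prompt_key()` (defined in the shared `openai_converter` base, not shown in this diff) for the key. A hypothetical sketch of the resolution those tables imply:

```python
# Hypothetical re-implementation of the lookup implied by the config tables;
# the real get_prompt_key() lives in vlmparse/clients/openai_converter.py
# and is not part of this diff.
def resolve_prompt_key(prompts, prompt_mode_map, conversion_mode):
    if conversion_mode in prompts:  # direct hit, e.g. "ocr"
        return conversion_mode
    return prompt_mode_map.get(conversion_mode)  # alias, e.g. "table" -> "layout"


prompts = {
    "layout": "<|grounding|>Convert the document to markdown.",
    "ocr": "Free OCR.",
    "image_description": "Describe this image in detail.",
}
prompt_mode_map = {"ocr_layout": "layout", "table": "layout"}

assert resolve_prompt_key(prompts, prompt_mode_map, "ocr") == "ocr"
assert resolve_prompt_key(prompts, prompt_mode_map, "table") == "layout"
# An unmapped mode yields None; the client then falls back to "ocr"
# via `prompt_key = self.get_prompt_key() or "ocr"`.
assert resolve_prompt_key(prompts, prompt_mode_map, "chart") is None
```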
vlmparse/clients/docling.py
CHANGED
@@ -62,7 +62,6 @@ class DoclingDockerServerConfig(DockerServerConfig):
 class DoclingConverterConfig(ConverterConfig):
     """Configuration for Docling converter client."""
 
-    base_url: str
     model_name: str = "docling"
     timeout: int = 300
     api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
vlmparse/clients/dotsocr.py
CHANGED
@@ -1,14 +1,13 @@
 import json
 import math
 from pathlib import Path
-from typing import ClassVar, Literal
+from typing import ClassVar
 
 from loguru import logger
 from PIL import Image
 from pydantic import Field
 
 from vlmparse.clients.openai_converter import (
-    LLMParams,
     OpenAIConverterClient,
     OpenAIConverterConfig,
 )
@@ -48,12 +47,13 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
     )
     add_model_key_to_server: bool = True
     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    default_model_name: str = DEFAULT_MODEL_NAME
 
     @property
     def client_config(self):
         return DotsOCRConverterConfig(
-
-
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
             )
         )
 
@@ -65,29 +65,7 @@ class DotsOCRConverterConfig(OpenAIConverterConfig):
     model_name: str = "rednote-hilab/dots.ocr"
     preprompt: str | None = ""
     postprompt: str | None = None
-    completion_kwargs: dict | None = {
-        "temperature": 0.1,
-        "top_p": 1.0,
-        "max_completion_tokens": 16384,
-    }
-    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
-    dpi: int = 200
-    prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"
-
-    def get_client(self, **kwargs) -> "DotsOCRConverter":
-        return DotsOCRConverter(config=self, **kwargs)
-
-
-class DotsOCRConverter(OpenAIConverterClient):
-    """DotsOCR VLLM converter."""
-
-    # Constants
-    MIN_PIXELS: ClassVar[int] = 3136
-    MAX_PIXELS: ClassVar[int] = 11289600
-    IMAGE_FACTOR: ClassVar[int] = 28
-
-    # Prompts
-    PROMPTS: ClassVar[dict] = {
+    prompts: dict[str, str] = {
         "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
 1. Bbox format: [x1, y1, x2, y2]
@@ -108,6 +86,30 @@ class DotsOCRConverter(OpenAIConverterClient):
 """,
         "prompt_ocr": """Extract the text content from this image.""",
     }
+    prompt_mode_map: dict[str, str] = {
+        "ocr": "prompt_ocr",
+        "ocr_layout": "prompt_layout_all_en",
+        "table": "prompt_layout_all_en",
+    }
+    completion_kwargs: dict | None = {
+        "temperature": 0.1,
+        "top_p": 1.0,
+        "max_completion_tokens": 16384,
+    }
+    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    dpi: int = 200
+
+    def get_client(self, **kwargs) -> "DotsOCRConverter":
+        return DotsOCRConverter(config=self, **kwargs)
+
+
+class DotsOCRConverter(OpenAIConverterClient):
+    """DotsOCR VLLM converter."""
+
+    # Constants
+    MIN_PIXELS: ClassVar[int] = 3136
+    MAX_PIXELS: ClassVar[int] = 11289600
+    IMAGE_FACTOR: ClassVar[int] = 28
 
     @staticmethod
     def round_by_factor(number: int, factor: int) -> int:
@@ -235,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
         image = self.fetch_image(
             origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
         )
-        prompt = self.
+        prompt = self.config.prompts[prompt_mode]
 
         response, usage = await self._async_inference_with_vllm(image, prompt)
 
@@ -258,13 +260,15 @@ class DotsOCRConverter(OpenAIConverterClient):
     async def async_call_inside_page(self, page: Page) -> Page:
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "prompt_ocr"
+
         _, response, _, usage = await self._parse_image_vllm(
-            image, prompt_mode=
+            image, prompt_mode=prompt_key
         )
         logger.info("Response: " + str(response))
 
         items = None
-        if
+        if prompt_key == "prompt_layout_all_en":
             text = "\n\n".join([item.get("text", "") for item in response])
 
             items = []
@@ -286,5 +290,4 @@ class DotsOCRConverter(OpenAIConverterClient):
 
         page.completion_tokens = usage.completion_tokens
         page.prompt_tokens = usage.prompt_tokens
-        page.reasoning_tokens = usage.reasoning_tokens
         return page