vlmparse 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vlmparse-0.1.3/vlmparse.egg-info → vlmparse-0.1.5}/PKG-INFO +17 -3
- {vlmparse-0.1.3 → vlmparse-0.1.5}/README.md +15 -1
- {vlmparse-0.1.3 → vlmparse-0.1.5}/pyproject.toml +2 -6
- {vlmparse-0.1.3 → vlmparse-0.1.5}/tests/test_all_converters_mocked.py +25 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/tests/test_end2end.py +45 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/build_doc.py +10 -4
- vlmparse-0.1.5/vlmparse/clients/deepseekocr.py +203 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/docling.py +2 -2
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/dotsocr.py +11 -2
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/mineru.py +8 -7
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/openai_converter.py +1 -0
- vlmparse-0.1.5/vlmparse/constants.py +2 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/converter.py +19 -5
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/converter_with_server.py +5 -4
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/registries.py +2 -4
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/servers/docker_server.py +1 -1
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/servers/utils.py +3 -2
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/utils.py +2 -2
- {vlmparse-0.1.3 → vlmparse-0.1.5/vlmparse.egg-info}/PKG-INFO +17 -3
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/SOURCES.txt +1 -18
- vlmparse-0.1.3/tests/test_benchmark_tests.py +0 -731
- vlmparse-0.1.3/tests/test_process_and_run_benchmark.py +0 -144
- vlmparse-0.1.3/tests/test_table_tests.py +0 -1516
- vlmparse-0.1.3/vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
- vlmparse-0.1.3/vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse-0.1.3/vlmparse/benchpdf2md/create_dataset.py +0 -60
- vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
- vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
- vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
- vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
- vlmparse-0.1.3/vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
- vlmparse-0.1.3/vlmparse/benchpdf2md/run_benchmark.py +0 -296
- vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
- vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
- vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
- vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
- vlmparse-0.1.3/vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
- vlmparse-0.1.3/vlmparse/benchpdf2md/utils.py +0 -56
- vlmparse-0.1.3/vlmparse/clients/deepseekocr.py +0 -52
- {vlmparse-0.1.3 → vlmparse-0.1.5}/LICENSE +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/setup.cfg +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/tests/test_batch_parser.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/tests/test_cli.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/base_model.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/cli.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/chandra.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/granite_docling.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/hunyuanocr.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/lightonocr.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/nanonetocr.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/olmocr.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/paddleocrvl.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/pipe_utils/cleaner.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/pipe_utils/html_to_md_conversion.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/pipe_utils/utils.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/clients/prompts.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/data_model/box.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/data_model/document.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/st_viewer/fs_nav.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse/st_viewer/st_viewer.py +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/dependency_links.txt +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/entry_points.txt +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/requires.txt +0 -0
- {vlmparse-0.1.3 → vlmparse-0.1.5}/vlmparse.egg-info/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: vlmparse
|
|
3
|
-
Version: 0.1.
|
|
4
|
-
Requires-Python: >=3.
|
|
3
|
+
Version: 0.1.5
|
|
4
|
+
Requires-Python: >=3.11.0
|
|
5
5
|
Description-Content-Type: text/markdown
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Dist: devtools>=0.12.2
|
|
@@ -72,6 +72,19 @@ Supported Converters:
|
|
|
72
72
|
|
|
73
73
|
## Installation
|
|
74
74
|
|
|
75
|
+
Simplest solution with only the cli:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
uv tool install vlmparse
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
If you want to run the granite-docling model or use the streamlit viewing app:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
uv tool install vlmparse[docling_core,st_app]
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
If you prefer cloning the repository and using the local version:
|
|
75
88
|
```bash
|
|
76
89
|
uv sync
|
|
77
90
|
```
|
|
@@ -86,10 +99,11 @@ Activate the virtual environment:
|
|
|
86
99
|
```bash
|
|
87
100
|
source .venv/bin/activate
|
|
88
101
|
```
|
|
89
|
-
Other solution: append uv run to all the commands below.
|
|
90
102
|
|
|
91
103
|
## CLI Usage
|
|
92
104
|
|
|
105
|
+
Note that you can bypass the previous installation step and just add uvx before each of the commands below.
|
|
106
|
+
|
|
93
107
|
### Convert PDFs
|
|
94
108
|
|
|
95
109
|
With a general VLM (requires setting your api key as an environment variable):
|
|
@@ -18,6 +18,19 @@ Supported Converters:
|
|
|
18
18
|
|
|
19
19
|
## Installation
|
|
20
20
|
|
|
21
|
+
Simplest solution with only the cli:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uv tool install vlmparse
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
If you want to run the granite-docling model or use the streamlit viewing app:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv tool install vlmparse[docling_core,st_app]
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
If you prefer cloning the repository and using the local version:
|
|
21
34
|
```bash
|
|
22
35
|
uv sync
|
|
23
36
|
```
|
|
@@ -32,10 +45,11 @@ Activate the virtual environment:
|
|
|
32
45
|
```bash
|
|
33
46
|
source .venv/bin/activate
|
|
34
47
|
```
|
|
35
|
-
Other solution: append uv run to all the commands below.
|
|
36
48
|
|
|
37
49
|
## CLI Usage
|
|
38
50
|
|
|
51
|
+
Note that you can bypass the previous installation step and just add uvx before each of the commands below.
|
|
52
|
+
|
|
39
53
|
### Convert PDFs
|
|
40
54
|
|
|
41
55
|
With a general VLM (requires setting your api key as an environment variable):
|
|
@@ -2,17 +2,13 @@
|
|
|
2
2
|
requires = ["setuptools", "wheel"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
|
-
[metadata]
|
|
6
|
-
name = "vlmparse"
|
|
7
|
-
version = "0.1.0"
|
|
8
|
-
|
|
9
5
|
[project]
|
|
10
6
|
name = "vlmparse"
|
|
11
|
-
version = "0.1.
|
|
7
|
+
version = "0.1.5"
|
|
12
8
|
authors = []
|
|
13
9
|
description = ""
|
|
14
10
|
readme = "README.md"
|
|
15
|
-
requires-python = ">=3.
|
|
11
|
+
requires-python = ">=3.11.0"
|
|
16
12
|
dependencies = [
|
|
17
13
|
"devtools>=0.12.2",
|
|
18
14
|
"docker>=7.1.0",
|
|
@@ -106,6 +106,31 @@ class TestConverterConfigs:
|
|
|
106
106
|
# Verify API was called
|
|
107
107
|
assert mock_openai_client.chat.completions.create.call_count == 2
|
|
108
108
|
|
|
109
|
+
def test_converter_image_processing(self, datadir, mock_openai_client):
|
|
110
|
+
"""Test processing of a single image file."""
|
|
111
|
+
model_name = "gemini-2.5-flash-lite"
|
|
112
|
+
image_path = datadir / "page_with_formula.png"
|
|
113
|
+
|
|
114
|
+
config = converter_config_registry.get(model_name)
|
|
115
|
+
converter = config.get_client()
|
|
116
|
+
|
|
117
|
+
# Process image
|
|
118
|
+
document = converter(image_path)
|
|
119
|
+
|
|
120
|
+
# Verify document structure
|
|
121
|
+
assert isinstance(document, Document)
|
|
122
|
+
assert document.file_path == str(image_path)
|
|
123
|
+
assert len(document.pages) == 1, f"Expected 1 page, got {len(document.pages)}"
|
|
124
|
+
|
|
125
|
+
# Verify page
|
|
126
|
+
page = document.pages[0]
|
|
127
|
+
assert isinstance(page, Page)
|
|
128
|
+
assert page.text is not None
|
|
129
|
+
assert len(page.text) > 0
|
|
130
|
+
|
|
131
|
+
# Verify API was called once
|
|
132
|
+
assert mock_openai_client.chat.completions.create.call_count == 1
|
|
133
|
+
|
|
109
134
|
def test_dotsocr_ocr_mode(self, file_path, dotsocr_mock_client):
|
|
110
135
|
"""Test DotsOCR converter in OCR mode."""
|
|
111
136
|
config = converter_config_registry.get("dotsocr")
|
|
@@ -64,3 +64,48 @@ def test_convert_with_docker(file_path, model):
|
|
|
64
64
|
assert doc.pages[1].text is not None
|
|
65
65
|
|
|
66
66
|
# Server will be automatically stopped due to auto_stop=True
|
|
67
|
+
server.stop()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# @pytest.mark.skipif(
|
|
71
|
+
# "RUN_DEPLOYMENT_VLLM" not in os.environ
|
|
72
|
+
# or os.environ["RUN_DEPLOYMENT_VLLM"] == "false"
|
|
73
|
+
# or "GPU_TEST_VLMPARSE" not in os.environ,
|
|
74
|
+
# reason="Skipping because RUN_DEPLOYMENT_VLLM is not set or is false or GPU_TEST is not set",
|
|
75
|
+
# )
|
|
76
|
+
# @pytest.mark.parametrize(
|
|
77
|
+
# "model",
|
|
78
|
+
# [
|
|
79
|
+
# "docling",
|
|
80
|
+
# "lightonocr",
|
|
81
|
+
# "dotsocr",
|
|
82
|
+
# "nanonets/Nanonets-OCR2-3B",
|
|
83
|
+
# "hunyuanocr",
|
|
84
|
+
# "olmocr-2-fp8",
|
|
85
|
+
# "paddleocrvl",
|
|
86
|
+
# "mineru25",
|
|
87
|
+
# "chandra",
|
|
88
|
+
# "deepseekocr",
|
|
89
|
+
# "granite-docling",
|
|
90
|
+
# ],
|
|
91
|
+
# )
|
|
92
|
+
# def test_converter_with_server_with_docker(file_path, model):
|
|
93
|
+
# """Test conversion with automatic Docker deployment (requires GPU due to vllm limitations)."""
|
|
94
|
+
|
|
95
|
+
# from vlmparse.converter_with_server import ConverterWithServer
|
|
96
|
+
# converter_with_server = ConverterWithServer(
|
|
97
|
+
# model=model,
|
|
98
|
+
# uri=None,
|
|
99
|
+
# gpus=os.environ["GPU_TEST_VLMPARSE"],
|
|
100
|
+
# with_vllm_server=True,
|
|
101
|
+
# concurrency=10,
|
|
102
|
+
# )
|
|
103
|
+
|
|
104
|
+
# docs = converter_with_server.parse([file_path])
|
|
105
|
+
|
|
106
|
+
# # Assertions
|
|
107
|
+
# assert len(docs) == 1
|
|
108
|
+
# doc = docs[0]
|
|
109
|
+
# assert len(doc.pages) == 2
|
|
110
|
+
# assert doc.pages[0].text is not None
|
|
111
|
+
# assert doc.pages[1].text is not None
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import re
|
|
2
|
+
from pathlib import Path
|
|
2
3
|
|
|
3
4
|
import numpy as np
|
|
4
5
|
import PIL
|
|
5
6
|
import pypdfium2 as pdfium
|
|
6
7
|
from loguru import logger
|
|
7
8
|
|
|
9
|
+
from .constants import PDF_EXTENSION
|
|
10
|
+
|
|
8
11
|
|
|
9
12
|
def convert_pdfium(file_path, dpi):
|
|
10
13
|
pdf = pdfium.PdfDocument(file_path)
|
|
@@ -64,7 +67,10 @@ def resize_image(image, max_image_size):
|
|
|
64
67
|
|
|
65
68
|
|
|
66
69
|
def get_page_count(file_path):
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
70
|
+
if Path(file_path).suffix.lower() == PDF_EXTENSION:
|
|
71
|
+
pdf = pdfium.PdfDocument(file_path)
|
|
72
|
+
count = len(pdf)
|
|
73
|
+
pdf.close()
|
|
74
|
+
return count
|
|
75
|
+
else:
|
|
76
|
+
return 1
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import ClassVar, Literal
|
|
3
|
+
|
|
4
|
+
from loguru import logger
|
|
5
|
+
from PIL import Image
|
|
6
|
+
from pydantic import Field
|
|
7
|
+
|
|
8
|
+
from vlmparse.clients.openai_converter import (
|
|
9
|
+
OpenAIConverterClient,
|
|
10
|
+
OpenAIConverterConfig,
|
|
11
|
+
)
|
|
12
|
+
from vlmparse.data_model.box import BoundingBox
|
|
13
|
+
from vlmparse.data_model.document import Item, Page
|
|
14
|
+
from vlmparse.servers.docker_server import VLLMDockerServerConfig
|
|
15
|
+
from vlmparse.utils import to_base64
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def re_match(text):
|
|
19
|
+
pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
|
|
20
|
+
matches = re.findall(pattern, text, re.DOTALL)
|
|
21
|
+
|
|
22
|
+
matches_image = []
|
|
23
|
+
matches_other = []
|
|
24
|
+
for a_match in matches:
|
|
25
|
+
if "<|ref|>image<|/ref|>" in a_match[0]:
|
|
26
|
+
matches_image.append(a_match[0])
|
|
27
|
+
else:
|
|
28
|
+
matches_other.append(a_match[0])
|
|
29
|
+
return matches, matches_image, matches_other
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_coordinates_and_label(ref_text):
|
|
33
|
+
try:
|
|
34
|
+
label_type = ref_text[1]
|
|
35
|
+
matches = re.findall(r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]", ref_text[2])
|
|
36
|
+
cor_list = [[int(x) for x in m] for m in matches]
|
|
37
|
+
except Exception as e:
|
|
38
|
+
logger.warning(f"Error parsing coordinates: {e}")
|
|
39
|
+
return None
|
|
40
|
+
|
|
41
|
+
return (label_type, cor_list)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DeepSeekOCRConverterClient(OpenAIConverterClient):
|
|
45
|
+
"""Client for DeepSeekOCR with specific post-processing."""
|
|
46
|
+
|
|
47
|
+
PROMPTS: ClassVar[dict] = {
|
|
48
|
+
"layout": "<|grounding|>Convert the document to markdown.",
|
|
49
|
+
"ocr": "Free OCR.",
|
|
50
|
+
"image_description": "Describe this image in detail.",
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
|
|
54
|
+
items = []
|
|
55
|
+
width, height = image.size
|
|
56
|
+
|
|
57
|
+
for match in matches:
|
|
58
|
+
# match is tuple: (full_str, label, coords_str)
|
|
59
|
+
result = extract_coordinates_and_label(match)
|
|
60
|
+
if not result:
|
|
61
|
+
continue
|
|
62
|
+
|
|
63
|
+
category, coords = result
|
|
64
|
+
if not coords:
|
|
65
|
+
continue
|
|
66
|
+
|
|
67
|
+
# Create boxes
|
|
68
|
+
boxes = []
|
|
69
|
+
for point in coords:
|
|
70
|
+
if len(point) != 4:
|
|
71
|
+
continue
|
|
72
|
+
x1, y1, x2, y2 = point
|
|
73
|
+
# Scale to image size (0-999 -> pixel)
|
|
74
|
+
x1 = (x1 / 999) * width
|
|
75
|
+
y1 = (y1 / 999) * height
|
|
76
|
+
x2 = (x2 / 999) * width
|
|
77
|
+
y2 = (y2 / 999) * height
|
|
78
|
+
|
|
79
|
+
boxes.append(
|
|
80
|
+
BoundingBox(
|
|
81
|
+
l=min(x1, x2), t=min(y1, y2), r=max(x1, x2), b=max(y1, y2)
|
|
82
|
+
)
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
if not boxes:
|
|
86
|
+
continue
|
|
87
|
+
|
|
88
|
+
# Merge if multiple boxes for one item
|
|
89
|
+
try:
|
|
90
|
+
final_box = (
|
|
91
|
+
BoundingBox.merge_boxes(boxes) if len(boxes) > 1 else boxes[0]
|
|
92
|
+
)
|
|
93
|
+
except Exception as e:
|
|
94
|
+
logger.warning(f"Error merging boxes: {e}")
|
|
95
|
+
continue
|
|
96
|
+
|
|
97
|
+
items.append(Item(category=category, text=match[1], box=final_box))
|
|
98
|
+
|
|
99
|
+
return items
|
|
100
|
+
|
|
101
|
+
async def async_call_inside_page(self, page: Page) -> Page:
|
|
102
|
+
# Prepare messages as in parent class
|
|
103
|
+
image = page.image
|
|
104
|
+
|
|
105
|
+
messages = [
|
|
106
|
+
{
|
|
107
|
+
"role": "user",
|
|
108
|
+
"content": [
|
|
109
|
+
{
|
|
110
|
+
"type": "image_url",
|
|
111
|
+
"image_url": {
|
|
112
|
+
"url": f"data:image/png;base64,{to_base64(image)}"
|
|
113
|
+
},
|
|
114
|
+
},
|
|
115
|
+
{"type": "text", "text": self.PROMPTS[self.config.prompt_mode]},
|
|
116
|
+
],
|
|
117
|
+
},
|
|
118
|
+
]
|
|
119
|
+
|
|
120
|
+
# Get raw response using parent's method
|
|
121
|
+
response = await self._get_chat_completion(messages)
|
|
122
|
+
logger.info("Response length: " + str(len(response)))
|
|
123
|
+
page.raw_response = response
|
|
124
|
+
|
|
125
|
+
if self.config.prompt_mode == "layout":
|
|
126
|
+
# Post-processing
|
|
127
|
+
matches, matches_image, matches_other = re_match(response)
|
|
128
|
+
|
|
129
|
+
# Extract items (bounding boxes)
|
|
130
|
+
page.items = self.extract_items(page.image, matches)
|
|
131
|
+
|
|
132
|
+
# Clean text
|
|
133
|
+
outputs = response
|
|
134
|
+
|
|
135
|
+
# Replace image references with a placeholder
|
|
136
|
+
for a_match_image in matches_image:
|
|
137
|
+
outputs = outputs.replace(a_match_image, "![image]")
|
|
138
|
+
|
|
139
|
+
# Replace other references (text grounding) and cleanup
|
|
140
|
+
for a_match_other in matches_other:
|
|
141
|
+
outputs = (
|
|
142
|
+
outputs.replace(a_match_other, "")
|
|
143
|
+
.replace("\\coloneqq", ":=")
|
|
144
|
+
.replace("\\eqqcolon", "=:")
|
|
145
|
+
)
|
|
146
|
+
else:
|
|
147
|
+
outputs = response
|
|
148
|
+
|
|
149
|
+
page.text = outputs.strip()
|
|
150
|
+
logger.debug(page.text)
|
|
151
|
+
|
|
152
|
+
return page
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
|
|
156
|
+
"""Configuration for DeepSeekOCR model."""
|
|
157
|
+
|
|
158
|
+
model_name: str = "deepseek-ai/DeepSeek-OCR"
|
|
159
|
+
command_args: list[str] = Field(
|
|
160
|
+
default_factory=lambda: [
|
|
161
|
+
"--limit-mm-per-prompt",
|
|
162
|
+
'{"image": 1}',
|
|
163
|
+
"--async-scheduling",
|
|
164
|
+
"--logits_processors",
|
|
165
|
+
"vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
|
|
166
|
+
"--no-enable-prefix-caching",
|
|
167
|
+
"--mm-processor-cache-gb",
|
|
168
|
+
"0",
|
|
169
|
+
]
|
|
170
|
+
)
|
|
171
|
+
aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
|
|
172
|
+
|
|
173
|
+
@property
|
|
174
|
+
def client_config(self):
|
|
175
|
+
return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
|
|
179
|
+
"""DeepSeekOCR converter - backward compatibility alias."""
|
|
180
|
+
|
|
181
|
+
model_name: str = "deepseek-ai/DeepSeek-OCR"
|
|
182
|
+
aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
|
|
183
|
+
|
|
184
|
+
prompt_mode: Literal["layout", "ocr"] = "ocr"
|
|
185
|
+
completion_kwargs: dict | None = {
|
|
186
|
+
"temperature": 0.0,
|
|
187
|
+
"max_tokens": 8181,
|
|
188
|
+
"extra_body": {
|
|
189
|
+
"skip_special_tokens": False,
|
|
190
|
+
# args used to control custom logits processor
|
|
191
|
+
"vllm_xargs": {
|
|
192
|
+
"ngram_size": 30,
|
|
193
|
+
"window_size": 90,
|
|
194
|
+
# whitelist: <td>, </td>
|
|
195
|
+
"whitelist_token_ids": [128821, 128822],
|
|
196
|
+
},
|
|
197
|
+
},
|
|
198
|
+
}
|
|
199
|
+
dpi: int = 200
|
|
200
|
+
aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
|
|
201
|
+
|
|
202
|
+
def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
|
|
203
|
+
return DeepSeekOCRConverterClient(config=self, **kwargs)
|
|
@@ -34,7 +34,7 @@ class DoclingDockerServerConfig(DockerServerConfig):
|
|
|
34
34
|
"LOG_LEVEL": "DEBUG", # Enable verbose logging
|
|
35
35
|
# Performance Tuning
|
|
36
36
|
# "UVICORN_WORKERS": "4", # Increase web server workers (Default: 1)
|
|
37
|
-
|
|
37
|
+
"DOCLING_SERVE_ENG_LOC_NUM_WORKERS": "16", # Increase processing workers (Default: 2)
|
|
38
38
|
"DOCLING_NUM_THREADS": "32", # Increase torch threads (Default: 4)
|
|
39
39
|
}
|
|
40
40
|
)
|
|
@@ -62,8 +62,8 @@ class DoclingDockerServerConfig(DockerServerConfig):
|
|
|
62
62
|
class DoclingConverterConfig(ConverterConfig):
|
|
63
63
|
"""Configuration for Docling converter client."""
|
|
64
64
|
|
|
65
|
+
base_url: str
|
|
65
66
|
model_name: str = "docling"
|
|
66
|
-
base_url: str = "http://localhost:5001"
|
|
67
67
|
timeout: int = 300
|
|
68
68
|
api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
|
|
69
69
|
|
|
@@ -8,6 +8,7 @@ from PIL import Image
|
|
|
8
8
|
from pydantic import Field
|
|
9
9
|
|
|
10
10
|
from vlmparse.clients.openai_converter import (
|
|
11
|
+
LLMParams,
|
|
11
12
|
OpenAIConverterClient,
|
|
12
13
|
OpenAIConverterConfig,
|
|
13
14
|
)
|
|
@@ -28,6 +29,7 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
|
|
|
28
29
|
dockerfile_dir: str = str(DOCKERFILE_DIR / "dotsocr")
|
|
29
30
|
command_args: list[str] = Field(
|
|
30
31
|
default_factory=lambda: [
|
|
32
|
+
"/workspace/weights/DotsOCR",
|
|
31
33
|
"--tensor-parallel-size",
|
|
32
34
|
"1",
|
|
33
35
|
"--gpu-memory-utilization",
|
|
@@ -44,12 +46,19 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
|
|
|
44
46
|
# "16384",
|
|
45
47
|
]
|
|
46
48
|
)
|
|
47
|
-
add_model_key_to_server: bool =
|
|
49
|
+
add_model_key_to_server: bool = True
|
|
48
50
|
aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
|
|
49
51
|
|
|
50
52
|
@property
|
|
51
53
|
def client_config(self):
|
|
52
|
-
return DotsOCRConverterConfig(
|
|
54
|
+
return DotsOCRConverterConfig(
|
|
55
|
+
llm_params=LLMParams(
|
|
56
|
+
base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
|
|
57
|
+
)
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
def get_base_url_suffix(self) -> str:
|
|
61
|
+
return "/v1"
|
|
53
62
|
|
|
54
63
|
|
|
55
64
|
class DotsOCRConverterConfig(OpenAIConverterConfig):
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import io
|
|
3
|
-
import os
|
|
4
3
|
|
|
5
4
|
import orjson
|
|
6
5
|
from loguru import logger
|
|
@@ -20,18 +19,21 @@ class MinerUDockerServerConfig(DockerServerConfig):
|
|
|
20
19
|
docker_image: str = "pulsia/mineru25apipulsia:latest"
|
|
21
20
|
docker_port: int = 4299
|
|
22
21
|
container_port: int = 8000
|
|
22
|
+
server_ready_indicators: list[str] = Field(
|
|
23
|
+
default_factory=lambda: ["Uvicorn running"]
|
|
24
|
+
)
|
|
23
25
|
|
|
24
26
|
@property
|
|
25
27
|
def client_config(self):
|
|
26
|
-
return MinerUConverterConfig(
|
|
28
|
+
return MinerUConverterConfig(base_url=f"http://localhost:{self.docker_port}")
|
|
27
29
|
|
|
28
30
|
|
|
29
31
|
class MinerUConverterConfig(ConverterConfig):
|
|
30
32
|
"""Configuration for MinerU API converter."""
|
|
31
33
|
|
|
32
|
-
base_url: str
|
|
33
|
-
|
|
34
|
-
)
|
|
34
|
+
base_url: str
|
|
35
|
+
model_name: str = "opendatalab/MinerU2.5-2509-1.2B"
|
|
36
|
+
aliases: list[str] = Field(default_factory=lambda: ["mineru25"])
|
|
35
37
|
timeout: int = 600
|
|
36
38
|
|
|
37
39
|
def get_client(self, **kwargs) -> "MinerUConverter":
|
|
@@ -54,13 +56,12 @@ class MinerUConverter(BaseConverter):
|
|
|
54
56
|
super().__init__(config=config, **kwargs)
|
|
55
57
|
from httpx import AsyncClient
|
|
56
58
|
|
|
57
|
-
self.client = AsyncClient(base_url=config.
|
|
59
|
+
self.client = AsyncClient(base_url=config.base_url, timeout=config.timeout)
|
|
58
60
|
|
|
59
61
|
async def _async_inference_with_api(self, image) -> list:
|
|
60
62
|
"""Run async inference with MinerU API."""
|
|
61
63
|
|
|
62
64
|
img_byte_arr = await asyncio.to_thread(to_bytes_io, image)
|
|
63
|
-
|
|
64
65
|
response = await self.client.post(
|
|
65
66
|
"process-image",
|
|
66
67
|
files={"image": ("image.png", img_byte_arr, "image/png")},
|
|
@@ -92,6 +92,7 @@ class OpenAIConverterClient(BaseConverter):
|
|
|
92
92
|
base_url=self.config.llm_params.base_url,
|
|
93
93
|
api_key=self.config.llm_params.api_key,
|
|
94
94
|
timeout=self.config.llm_params.timeout,
|
|
95
|
+
max_retries=self.config.llm_params.max_retries,
|
|
95
96
|
)
|
|
96
97
|
|
|
97
98
|
async def _get_chat_completion(
|
|
@@ -6,10 +6,12 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Literal
|
|
7
7
|
|
|
8
8
|
from loguru import logger
|
|
9
|
+
from PIL import Image
|
|
9
10
|
from pydantic import Field
|
|
10
11
|
|
|
11
12
|
from .base_model import VLMParseBaseModel
|
|
12
13
|
from .build_doc import convert_specific_page_to_image, get_page_count, resize_image
|
|
14
|
+
from .constants import IMAGE_EXTENSIONS, PDF_EXTENSION
|
|
13
15
|
from .data_model.document import Document, Page, ProcessingError
|
|
14
16
|
|
|
15
17
|
# Add a lock to ensure PDFium is accessed by only one thread/task at a time
|
|
@@ -50,12 +52,24 @@ class BaseConverter:
|
|
|
50
52
|
raise NotImplementedError
|
|
51
53
|
|
|
52
54
|
def add_page_image(self, page: Page, file_path, page_idx):
|
|
53
|
-
|
|
54
|
-
image =
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
55
|
+
if Path(file_path).suffix.lower() in IMAGE_EXTENSIONS:
|
|
56
|
+
image = Image.open(file_path)
|
|
57
|
+
if image.mode != "RGB":
|
|
58
|
+
image = image.convert("L").convert("RGB")
|
|
59
|
+
|
|
60
|
+
elif Path(file_path).suffix.lower() == PDF_EXTENSION:
|
|
61
|
+
with PDFIUM_LOCK:
|
|
62
|
+
image = convert_specific_page_to_image(
|
|
63
|
+
file_path,
|
|
64
|
+
page_idx,
|
|
65
|
+
dpi=self.config.dpi,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
else:
|
|
69
|
+
raise ValueError(
|
|
70
|
+
f"Unsupported file extension: {Path(file_path).suffix.lower()}"
|
|
58
71
|
)
|
|
72
|
+
|
|
59
73
|
image = resize_image(image, self.config.max_image_size)
|
|
60
74
|
page.buffer_image = image
|
|
61
75
|
return page
|
|
@@ -42,13 +42,13 @@ class ConverterWithServer:
|
|
|
42
42
|
docker_config = docker_config_registry.get(
|
|
43
43
|
self.model, default=self.with_vllm_server
|
|
44
44
|
)
|
|
45
|
-
if self.port is not None:
|
|
46
|
-
docker_config.docker_port = self.port
|
|
47
45
|
|
|
48
46
|
if docker_config is not None:
|
|
47
|
+
if self.port is not None:
|
|
48
|
+
docker_config.docker_port = self.port
|
|
49
49
|
docker_config.gpu_device_ids = gpu_device_ids
|
|
50
|
-
server = docker_config.get_server(auto_stop=True)
|
|
51
|
-
server.start()
|
|
50
|
+
self.server = docker_config.get_server(auto_stop=True)
|
|
51
|
+
self.server.start()
|
|
52
52
|
|
|
53
53
|
self.client = docker_config.get_client()
|
|
54
54
|
else:
|
|
@@ -56,6 +56,7 @@ class ConverterWithServer:
|
|
|
56
56
|
|
|
57
57
|
else:
|
|
58
58
|
client_config = converter_config_registry.get(self.model, uri=self.uri)
|
|
59
|
+
|
|
59
60
|
self.client = client_config.get_client()
|
|
60
61
|
|
|
61
62
|
def parse(
|
|
@@ -108,6 +108,7 @@ for gemini_model in [
|
|
|
108
108
|
"gemini-2.5-flash",
|
|
109
109
|
"gemini-2.5-flash-lite",
|
|
110
110
|
"gemini-3-pro-preview",
|
|
111
|
+
"gemini-3-flash-preview",
|
|
111
112
|
]:
|
|
112
113
|
converter_config_registry.register(
|
|
113
114
|
gemini_model,
|
|
@@ -120,12 +121,9 @@ for gemini_model in [
|
|
|
120
121
|
),
|
|
121
122
|
)
|
|
122
123
|
for openai_model in [
|
|
123
|
-
"gpt-5.
|
|
124
|
-
"gpt-5.1-mini",
|
|
125
|
-
"gpt-5.1-nano",
|
|
124
|
+
"gpt-5.2",
|
|
126
125
|
"gpt-5",
|
|
127
126
|
"gpt-5-mini",
|
|
128
|
-
"gpt-5-nano",
|
|
129
127
|
]:
|
|
130
128
|
converter_config_registry.register(
|
|
131
129
|
openai_model,
|
|
@@ -78,7 +78,7 @@ class VLLMDockerServerConfig(DockerServerConfig):
|
|
|
78
78
|
from vlmparse.clients.openai_converter import LLMParams
|
|
79
79
|
|
|
80
80
|
return LLMParams(
|
|
81
|
-
base_url=f"http://localhost:{self.docker_port}
|
|
81
|
+
base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
|
|
82
82
|
model_name=self.default_model_name,
|
|
83
83
|
)
|
|
84
84
|
|
|
@@ -3,9 +3,8 @@ import time
|
|
|
3
3
|
from contextlib import contextmanager
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
|
-
from loguru import logger
|
|
7
|
-
|
|
8
6
|
import docker
|
|
7
|
+
from loguru import logger
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
def _ensure_image_exists(
|
|
@@ -230,6 +229,8 @@ def get_model_from_uri(uri: str) -> str:
|
|
|
230
229
|
for container in containers:
|
|
231
230
|
c_uri = container.labels.get("vlmparse_uri")
|
|
232
231
|
c_model = container.labels.get("vlmparse_model_name")
|
|
232
|
+
if c_uri is not None:
|
|
233
|
+
c_uri = c_uri.replace("localhost", "0.0.0.0")
|
|
233
234
|
|
|
234
235
|
# Check if user URI matches container URI (ignoring /v1 suffix if missing)
|
|
235
236
|
if c_uri and (
|
|
@@ -28,12 +28,12 @@ def get_file_paths(inputs: str | list[str]):
|
|
|
28
28
|
if "*" in pattern or "?" in pattern:
|
|
29
29
|
file_paths.extend(glob(pattern, recursive=True))
|
|
30
30
|
elif os.path.isdir(pattern):
|
|
31
|
-
file_paths.extend(glob(os.path.join(pattern, "
|
|
31
|
+
file_paths.extend(glob(os.path.join(pattern, "*.*"), recursive=True))
|
|
32
32
|
elif os.path.isfile(pattern):
|
|
33
33
|
file_paths.append(pattern)
|
|
34
34
|
else:
|
|
35
35
|
logger.error(f"Invalid input: {pattern}")
|
|
36
|
-
file_paths = [f for f in file_paths if os.path.exists(f) and
|
|
36
|
+
file_paths = [f for f in file_paths if os.path.exists(f) and os.path.isfile(f)]
|
|
37
37
|
|
|
38
38
|
if not file_paths:
|
|
39
39
|
logger.error("No PDF files found matching the inputs patterns")
|