vlmparse 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
- vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse/benchpdf2md/create_dataset.py +60 -0
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
- vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
- vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
- vlmparse/benchpdf2md/run_benchmark.py +296 -0
- vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
- vlmparse/benchpdf2md/utils.py +56 -0
- vlmparse/clients/chandra.py +323 -0
- vlmparse/clients/deepseekocr.py +52 -0
- vlmparse/clients/docling.py +146 -0
- vlmparse/clients/dotsocr.py +277 -0
- vlmparse/clients/granite_docling.py +132 -0
- vlmparse/clients/hunyuanocr.py +45 -0
- vlmparse/clients/lightonocr.py +43 -0
- vlmparse/clients/mineru.py +119 -0
- vlmparse/clients/nanonetocr.py +29 -0
- vlmparse/clients/olmocr.py +46 -0
- vlmparse/clients/openai_converter.py +173 -0
- vlmparse/clients/paddleocrvl.py +48 -0
- vlmparse/clients/pipe_utils/cleaner.py +74 -0
- vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
- vlmparse/clients/pipe_utils/utils.py +12 -0
- vlmparse/clients/prompts.py +66 -0
- vlmparse/data_model/box.py +551 -0
- vlmparse/data_model/document.py +148 -0
- vlmparse/servers/docker_server.py +199 -0
- vlmparse/servers/utils.py +250 -0
- vlmparse/st_viewer/fs_nav.py +53 -0
- vlmparse/st_viewer/st_viewer.py +80 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/METADATA +11 -1
- vlmparse-0.1.2.dist-info/RECORD +50 -0
- vlmparse-0.1.0.dist-info/RECORD +0 -13
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/WHEEL +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import io
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import orjson
|
|
6
|
+
from loguru import logger
|
|
7
|
+
from pydantic import Field
|
|
8
|
+
|
|
9
|
+
from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
|
|
10
|
+
from vlmparse.clients.pipe_utils.utils import clean_response
|
|
11
|
+
from vlmparse.converter import BaseConverter, ConverterConfig
|
|
12
|
+
from vlmparse.data_model.document import BoundingBox, Item, Page
|
|
13
|
+
from vlmparse.servers.docker_server import DockerServerConfig
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MinerUDockerServerConfig(DockerServerConfig):
    """Configuration for MinerU Docker server."""

    model_name: str = "mineru25"
    docker_image: str = "pulsia/mineru25apipulsia:latest"
    # Host port the container's API is published on.
    docker_port: int = 4299
    # Port the MinerU API listens on inside the container.
    container_port: int = 8000

    @property
    def client_config(self):
        """Converter config pointing at this server's published port.

        Fix: pass ``base_url`` — the field MinerUConverterConfig actually
        declares. The original passed an undeclared ``api_url`` kwarg, which
        pydantic (default ``extra='ignore'``) would silently drop.
        """
        return MinerUConverterConfig(base_url=f"http://localhost:{self.docker_port}")


class MinerUConverterConfig(ConverterConfig):
    """Configuration for MinerU API converter."""

    # MinerU HTTP API endpoint; overridable via the MINERU_API_URL env var.
    base_url: str = Field(
        default_factory=lambda: os.getenv("MINERU_API_URL", "http://localhost:4299")
    )
    # Per-request timeout in seconds (page parsing can be slow).
    timeout: int = 600

    def get_client(self, **kwargs) -> "MinerUConverter":
        """Instantiate the converter client bound to this config."""
        return MinerUConverter(config=self, **kwargs)


def to_bytes_io(image):
    """Serialize a PIL-style image to an in-memory PNG buffer, rewound to 0."""
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="PNG")
    img_byte_arr.seek(0)
    return img_byte_arr


class MinerUConverter(BaseConverter):
    """MinerU HTTP API converter."""

    config: MinerUConverterConfig

    def __init__(self, config: MinerUConverterConfig, **kwargs):
        super().__init__(config=config, **kwargs)
        from httpx import AsyncClient

        # Fix: read the declared ``base_url`` field — the original accessed a
        # non-existent ``config.api_url`` attribute.
        self.client = AsyncClient(base_url=config.base_url, timeout=config.timeout)

    async def _async_inference_with_api(self, image) -> list:
        """POST one page image to the MinerU API and return the decoded JSON.

        Returns a list of cell dicts (``bbox``, ``type``, ``content`` keys are
        read by the callers below). Raises ``httpx.HTTPStatusError`` on
        non-2xx responses.
        """
        # PNG encoding is CPU-bound; keep it off the event loop.
        img_byte_arr = await asyncio.to_thread(to_bytes_io, image)

        response = await self.client.post(
            "process-image",
            files={"image": ("image.png", img_byte_arr, "image/png")},
        )
        response.raise_for_status()
        return orjson.loads(response.content)

    async def _parse_image_with_api(self, origin_image):
        """Run inference and rescale bboxes to pixel coordinates in place.

        The API appears to return bboxes normalized to the image size
        (multiplied by width/height here) — TODO confirm against the server.
        """
        response = await self._async_inference_with_api(origin_image)

        original_width, original_height = origin_image.size
        for cell in response:
            left, top, right, bottom = cell["bbox"]
            cell["bbox"] = [
                left * original_width,
                top * original_height,
                right * original_width,
                bottom * original_height,
            ]
        return response

    async def async_call_inside_page(self, page: Page) -> Page:
        """Parse one page: fill ``page.items`` (layout cells) and ``page.text``."""
        image = page.image

        # Call MinerU API
        response = await self._parse_image_with_api(image)
        logger.info("Response: " + str(response))

        # Cells may carry ``content: None`` (e.g. figures); they are skipped
        # in the concatenated text but kept as items with empty text.
        contents = [item.get("content", "") for item in response]
        text = "\n\n".join([content for content in contents if content is not None])
        items = []
        for item in response:
            l, t, r, b = item["bbox"]
            txt = item.get("content", "")
            items.append(
                Item(
                    text=txt if txt is not None else "",
                    box=BoundingBox(l=l, t=t, r=r, b=b),
                    category=item["type"],
                )
            )
        page.items = items

        text = clean_response(text)
        text = html_to_md_keep_tables(text)
        page.text = text
        return page
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from pydantic import Field
|
|
2
|
+
|
|
3
|
+
from vlmparse.clients.openai_converter import OpenAIConverterConfig
|
|
4
|
+
from vlmparse.servers.docker_server import VLLMDockerServerConfig
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class NanonetOCR2DockerServerConfig(VLLMDockerServerConfig):
    """Docker/vLLM server configuration for the Nanonets-OCR2-3B model."""

    # HuggingFace model id served by vLLM.
    model_name: str = "nanonets/Nanonets-OCR2-3B"
    # Short lookup aliases for this server config.
    aliases: list[str] = Field(default_factory=lambda: ["nanonetsocr2"])

    @property
    def client_config(self):
        """Matching converter config, reusing this server's connection params.

        ``llm_params`` is presumably provided by VLLMDockerServerConfig —
        not visible in this file.
        """
        return NanonetOCR2ConverterConfig(llm_params=self.llm_params)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class NanonetOCR2ConverterConfig(OpenAIConverterConfig):
    """Converter configuration for the Nanonets-OCR2-3B model.

    Overrides the generic OpenAI-compatible converter defaults with the
    prompt and decoding parameters used for Nanonets OCR2.
    """

    model_name: str = "nanonets/Nanonets-OCR2-3B"
    # No system prompt; the full instruction is sent as the user postprompt.
    preprompt: str | None = None
    postprompt: str | None = (
        "Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."
    )
    # NOTE(review): mutable dict default — pydantic copies field defaults per
    # instance so this is safe here, but a default_factory would be clearer.
    completion_kwargs: dict | None = {"temperature": 0.0, "max_tokens": 15000}
    # None presumably disables image downscaling — confirm in BaseConverter.
    max_image_size: int | None = None
    dpi: int = 200
    aliases: list[str] = Field(default_factory=lambda: ["nanonetsocr2"])
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from pydantic import Field
|
|
2
|
+
|
|
3
|
+
from vlmparse.clients.openai_converter import OpenAIConverterConfig
|
|
4
|
+
from vlmparse.servers.docker_server import VLLMDockerServerConfig
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class OlmOCRDockerServerConfig(VLLMDockerServerConfig):
    """Docker/vLLM server configuration for the olmOCR-2 FP8 model."""

    model_name: str = "allenai/olmOCR-2-7B-1025-FP8"
    # Extra vLLM CLI flags: one image (no video) per prompt, quieter request
    # and uvicorn logging, and a 16k-token context window.
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--limit-mm-per-prompt",
            '{"image": 1, "video": 0}',
            "--disable-log-requests",
            "--uvicorn-log-level",
            "warning",
            "--max-model-len",
            "16384",
        ]
    )
    # Short lookup aliases for this server config.
    aliases: list[str] = Field(default_factory=lambda: ["olmocr-2-fp8"])

    @property
    def client_config(self):
        """Matching converter config, reusing this server's connection params.

        ``llm_params`` is presumably provided by VLLMDockerServerConfig —
        not visible in this file.
        """
        return OlmOCRConverterConfig(llm_params=self.llm_params)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class OlmOCRConverterConfig(OpenAIConverterConfig):
    """Converter configuration for olmOCR-2 (OpenAI-compatible endpoint)."""

    model_name: str = "allenai/olmOCR-2-7B-1025-FP8"
    # System prompt: plain-text transcription with LaTeX equations, HTML
    # tables and a YAML-style front-matter block.
    preprompt: str | None = (
        "Attached is one page of a document that you must process. "
        "Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to HTML.\n"
        "If there are any figures or charts, label them with the following markdown syntax \n"
        "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
    )
    postprompt: str | None = None
    # NOTE(review): mutable dict default — pydantic copies field defaults per
    # instance so this is safe here, but a default_factory would be clearer.
    completion_kwargs: dict | None = {
        "temperature": 0.1,
        "max_tokens": 8000,
    }
    # Presumably the longest allowed image side in pixels before downscaling —
    # confirm in BaseConverter.
    max_image_size: int | None = 1288
    dpi: int = 200
    aliases: list[str] = Field(default_factory=lambda: ["olmocr-2-fp8"])
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from loguru import logger
|
|
5
|
+
from pydantic import Field
|
|
6
|
+
|
|
7
|
+
from vlmparse.base_model import VLMParseBaseModel
|
|
8
|
+
from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
|
|
9
|
+
from vlmparse.clients.pipe_utils.utils import clean_response
|
|
10
|
+
from vlmparse.converter import BaseConverter, ConverterConfig
|
|
11
|
+
from vlmparse.data_model.document import Page
|
|
12
|
+
from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME
|
|
13
|
+
from vlmparse.utils import to_base64
|
|
14
|
+
|
|
15
|
+
from .prompts import PDF2MD_PROMPT
|
|
16
|
+
|
|
17
|
+
GOOGLE_API_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class LLMParams(VLMParseBaseModel):
    """Connection parameters for an OpenAI-compatible LLM endpoint."""

    # API key; empty string for local servers that do not check auth.
    api_key: str = ""
    # None means the client library's default endpoint (see AsyncOpenAI usage
    # in OpenAIConverterClient below).
    base_url: str | None = None
    model_name: str = DEFAULT_MODEL_NAME
    # Request timeout in seconds, forwarded to the AsyncOpenAI client.
    timeout: int | None = 500
    max_retries: int = 1
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_llm_params(model_name: str, uri: str | None = None):
|
|
29
|
+
if uri is not None:
|
|
30
|
+
return LLMParams(base_url=uri, model_name="vllm-model", api_key="")
|
|
31
|
+
if model_name in [
|
|
32
|
+
"gpt-4o",
|
|
33
|
+
"gpt-4o-mini",
|
|
34
|
+
"gpt-4.1",
|
|
35
|
+
"gpt-4.1-mini",
|
|
36
|
+
"gpt-4.1-nano",
|
|
37
|
+
"gpt-5",
|
|
38
|
+
"gpt-5-mini",
|
|
39
|
+
"gpt-5-nano",
|
|
40
|
+
]:
|
|
41
|
+
base_url = None
|
|
42
|
+
api_key = os.getenv("OPENAI_API_KEY")
|
|
43
|
+
else:
|
|
44
|
+
if model_name in [
|
|
45
|
+
"gemini-2.5-flash-lite",
|
|
46
|
+
"gemini-2.5-flash",
|
|
47
|
+
"gemini-2.5-pro",
|
|
48
|
+
]:
|
|
49
|
+
base_url = GOOGLE_API_BASE_URL
|
|
50
|
+
api_key = os.getenv("GOOGLE_API_KEY")
|
|
51
|
+
else:
|
|
52
|
+
return None
|
|
53
|
+
return LLMParams(base_url=base_url, model_name=model_name, api_key=api_key)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class OpenAIConverterConfig(ConverterConfig):
    """Configuration for converters that talk to an OpenAI-compatible API."""

    # Endpoint, credentials and model used to build the AsyncOpenAI client.
    llm_params: LLMParams
    # Optional system prompt (sent as a system message when set).
    preprompt: str | None = None
    # User-message instruction appended after the page image.
    postprompt: str | None = PDF2MD_PROMPT
    # Extra kwargs forwarded verbatim to chat.completions.create.
    completion_kwargs: dict = Field(default_factory=dict)
    # When True, responses are streamed and re-assembled client-side.
    stream: bool = False

    def get_client(self, **kwargs) -> "OpenAIConverterClient":
        """Instantiate the converter client bound to this config."""
        return OpenAIConverterClient(config=self, **kwargs)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class OpenAIConverterClient(BaseConverter):
    """Client for OpenAI-compatible API servers."""

    def __init__(
        self,
        config: OpenAIConverterConfig,
        num_concurrent_files: int = 10,
        num_concurrent_pages: int = 10,
        save_folder: str | None = None,
        save_mode: Literal["document", "md", "md_page"] = "document",
        debug: bool = False,
        return_documents_in_batch_mode: bool = False,
    ):
        """Create the client and its underlying AsyncOpenAI connection.

        All parameters except ``config`` are forwarded unchanged to
        ``BaseConverter.__init__`` (their semantics live there).
        """
        super().__init__(
            config=config,
            num_concurrent_files=num_concurrent_files,
            num_concurrent_pages=num_concurrent_pages,
            save_folder=save_folder,
            save_mode=save_mode,
            debug=debug,
            return_documents_in_batch_mode=return_documents_in_batch_mode,
        )
        # Imported lazily so the openai package is only required when this
        # client is actually instantiated.
        from openai import AsyncOpenAI

        self.model = AsyncOpenAI(
            base_url=self.config.llm_params.base_url,
            api_key=self.config.llm_params.api_key,
            timeout=self.config.llm_params.timeout,
        )

    async def _get_chat_completion(
        self, messages: list[dict], completion_kwargs: dict | None = None
    ) -> str:
        """Helper to handle chat completion with optional streaming.

        Args:
            messages: Chat messages in OpenAI format.
            completion_kwargs: Overrides for the configured completion kwargs;
                falls back to ``config.completion_kwargs`` when None.

        Returns:
            The full response text (streamed chunks are concatenated).

        Raises:
            ValueError: If the non-streaming response content is None
                (e.g. truncated generations); includes the finish reason.
        """
        if completion_kwargs is None:
            completion_kwargs = self.config.completion_kwargs

        if self.config.stream:
            response_stream = await self.model.chat.completions.create(
                model=self.config.llm_params.model_name,
                messages=messages,
                stream=True,
                **completion_kwargs,
            )
            response_parts = []
            async for chunk in response_stream:
                # Some chunks carry no choices/content (e.g. role deltas);
                # only accumulate actual text.
                if chunk.choices and chunk.choices[0].delta.content:
                    response_parts.append(chunk.choices[0].delta.content)
            return "".join(response_parts)
        else:
            response_obj = await self.model.chat.completions.create(
                model=self.config.llm_params.model_name,
                messages=messages,
                **completion_kwargs,
            )

            if response_obj.choices[0].message.content is None:
                raise ValueError(
                    "Response is None, finish reason: "
                    + response_obj.choices[0].finish_reason
                )
            return response_obj.choices[0].message.content

    async def async_call_inside_page(self, page: Page) -> Page:
        """Process a single page using OpenAI-compatible API.

        Sends the page image (base64 data URL) plus the configured pre/post
        prompts, then stores the raw response and the cleaned Markdown text
        on the page.
        """
        image = page.image
        if self.config.preprompt:
            preprompt = [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": self.config.preprompt}],
                }
            ]
        else:
            preprompt = []

        postprompt = (
            [{"type": "text", "text": self.config.postprompt}]
            if self.config.postprompt
            else []
        )

        messages = [
            *preprompt,
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{to_base64(image)}"
                        },
                    },
                    *postprompt,
                ],
            },
        ]

        response = await self._get_chat_completion(messages)
        logger.info("Response: " + str(response))
        # Keep the unmodified model output for debugging/auditing.
        page.raw_response = response
        text = clean_response(response)

        text = html_to_md_keep_tables(text)
        page.text = text

        return page
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from pydantic import Field
|
|
2
|
+
|
|
3
|
+
from vlmparse.clients.openai_converter import OpenAIConverterConfig
|
|
4
|
+
from vlmparse.servers.docker_server import VLLMDockerServerConfig
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PaddleOCRVLDockerServerConfig(VLLMDockerServerConfig):
|
|
8
|
+
"""Configuration for PaddleOCRVL model."""
|
|
9
|
+
|
|
10
|
+
model_name: str = "PaddlePaddle/PaddleOCR-VL"
|
|
11
|
+
command_args: list[str] = Field(
|
|
12
|
+
default_factory=lambda: [
|
|
13
|
+
"--limit-mm-per-prompt",
|
|
14
|
+
'{"image": 1}',
|
|
15
|
+
"--async-scheduling",
|
|
16
|
+
"--trust-remote-code",
|
|
17
|
+
"--mm-processor-cache-gb",
|
|
18
|
+
"0",
|
|
19
|
+
]
|
|
20
|
+
)
|
|
21
|
+
aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def client_config(self):
|
|
25
|
+
return PaddleOCRVLConverterConfig(llm_params=self.llm_params)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Task-specific base prompts
|
|
29
|
+
TASKS = {
|
|
30
|
+
"ocr": "OCR:",
|
|
31
|
+
"table": "Table Recognition:",
|
|
32
|
+
"formula": "Formula Recognition:",
|
|
33
|
+
"chart": "Chart Recognition:",
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class PaddleOCRVLConverterConfig(OpenAIConverterConfig):
|
|
38
|
+
"""PaddleOCRVL converter"""
|
|
39
|
+
|
|
40
|
+
model_name: str = "PaddlePaddle/PaddleOCR-VL"
|
|
41
|
+
preprompt: str | None = None
|
|
42
|
+
postprompt: str | None = TASKS["ocr"]
|
|
43
|
+
completion_kwargs: dict | None = {
|
|
44
|
+
"temperature": 0.0,
|
|
45
|
+
}
|
|
46
|
+
max_image_size: int | None = 1540
|
|
47
|
+
dpi: int = 200
|
|
48
|
+
aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import unicodedata
|
|
3
|
+
|
|
4
|
+
# Code adapted from olmocr.bench.tests.normalize_text
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def normalize_text(
|
|
8
|
+
md_content: str,
|
|
9
|
+
additional_replacements: dict = {},
|
|
10
|
+
only_alphanum: bool = False,
|
|
11
|
+
remove_md_images: bool = True,
|
|
12
|
+
) -> str:
|
|
13
|
+
"""Normalise md text"""
|
|
14
|
+
|
|
15
|
+
if md_content is None:
|
|
16
|
+
return None
|
|
17
|
+
|
|
18
|
+
# Normalize <br> and <br/> to newlines
|
|
19
|
+
md_content = re.sub(r"<br/?>", " ", md_content)
|
|
20
|
+
|
|
21
|
+
# Normalize whitespace in the md_content
|
|
22
|
+
md_content = re.sub(r"[ \t]+", " ", md_content)
|
|
23
|
+
|
|
24
|
+
# remove_title_emphasis:
|
|
25
|
+
md_content = re.sub(r"[-=]{3,}", "", md_content)
|
|
26
|
+
|
|
27
|
+
# remove_space_with_newlines:
|
|
28
|
+
md_content = re.sub(r"\n *", "\n", md_content)
|
|
29
|
+
|
|
30
|
+
# remove_more_than_2_newlines:
|
|
31
|
+
md_content = re.sub(r"\n{2,}", "\n\n", md_content)
|
|
32
|
+
|
|
33
|
+
# Remove markdown bold formatting (** or __ for bold)
|
|
34
|
+
md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
|
|
35
|
+
# md_content = re.sub(r"__(.*?)__", r"\1", md_content)
|
|
36
|
+
md_content = re.sub(r"</?b>", "", md_content) # Remove <b> tags if they exist
|
|
37
|
+
md_content = re.sub(r"</?i>", "", md_content) # Remove <i> tags if they exist
|
|
38
|
+
|
|
39
|
+
# Remove markdown italics formatting (* or _ for italics)
|
|
40
|
+
md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
|
|
41
|
+
# md_content = re.sub(r"_(.*?)_", r"\1", md_content)
|
|
42
|
+
|
|
43
|
+
# remove_more_than_1_spaces:
|
|
44
|
+
md_content = re.sub(r" {2,}", " ", md_content)
|
|
45
|
+
|
|
46
|
+
# Convert down to a consistent unicode form, so é == e + accent, unicode forms
|
|
47
|
+
md_content = unicodedata.normalize("NFC", md_content)
|
|
48
|
+
|
|
49
|
+
# Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
|
|
50
|
+
replacements = {
|
|
51
|
+
"‘": "'",
|
|
52
|
+
"’": "'",
|
|
53
|
+
"‚": "'",
|
|
54
|
+
"“": '"',
|
|
55
|
+
"”": '"',
|
|
56
|
+
"„": '"',
|
|
57
|
+
"_": "_",
|
|
58
|
+
"–": "-",
|
|
59
|
+
"—": "-",
|
|
60
|
+
"‑": "-",
|
|
61
|
+
"‒": "-",
|
|
62
|
+
"−": "-",
|
|
63
|
+
"\u00b5": "\u03bc",
|
|
64
|
+
} | additional_replacements
|
|
65
|
+
|
|
66
|
+
# Apply all replacements from the dictionary
|
|
67
|
+
for fancy_char, ascii_char in replacements.items():
|
|
68
|
+
md_content = md_content.replace(fancy_char, ascii_char)
|
|
69
|
+
|
|
70
|
+
if only_alphanum:
|
|
71
|
+
md_content = re.sub(r"[^a-zA-Z0-9]", "", md_content)
|
|
72
|
+
if remove_md_images:
|
|
73
|
+
md_content = re.sub(r"!\[[\s\S]*?]\([\s\S]*?\)", "", md_content, flags=re.S)
|
|
74
|
+
return md_content.strip()
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from bs4 import BeautifulSoup
|
|
4
|
+
from bs4.element import NavigableString
|
|
5
|
+
from html_to_markdown import convert_to_markdown
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def html_to_md_keep_tables(html: str, remove_head: bool = False) -> str:
    """Convert HTML to Markdown while preserving <table> elements as raw HTML.

    Table cells are converted to Markdown recursively, then each table is
    swapped for a unique placeholder so the surrounding document can be
    converted without mangling it, and finally the tables are restored
    verbatim.

    Args:
        html: HTML fragment or document to convert.
        remove_head: When True, drop a global <head> element before converting.

    Returns:
        Markdown text in which tables appear as inline HTML.
    """
    # remove the whole <img>…</img> block (tag + contents)
    html = re.sub(r"<img>([\s\S]*?)</img>", "", html, flags=re.S)

    soup = BeautifulSoup(html, "lxml")
    if remove_head:
        # Remove global <head> tags if present
        if soup.head:
            soup.head.decompose()

    # --- recurse inside tables first ---------------------------------------
    # Each cell's inner HTML becomes Markdown independently, so formatting
    # inside cells is converted even though the table markup itself is kept.
    for table in soup.find_all("table"):
        for cell in table.find_all(["td", "th"]):
            inner = "".join(map(str, cell.contents))
            cell_md = html_to_md_keep_tables(inner).strip()  # recursion
            cell.clear()
            cell.append(NavigableString(cell_md))

    # --- protect current-level tables --------------------------------------
    # Placeholders must be created AFTER the cell pass above so the stored
    # table HTML already contains Markdown cell contents.
    markers = {}
    for i, t in enumerate(soup.find_all("table")):
        key = f"%%%TAB{i}%%%"
        markers[key] = str(t)
        t.replace_with(key)

    # --- html → markdown (tables excluded) ---------------------------------
    if len(str(soup)) > 0:
        md_txt = convert_to_markdown(
            str(soup),
            strip=["table", "b", "strong", "i", "em"],
            heading_style="atx",
            escape_misc=False,
            bullets="-",
        )
    else:
        md_txt = ""

    # --- restore tables -----------------------------------------------------
    for k, raw in markers.items():
        md_txt = md_txt.replace(k, raw)

    return md_txt
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# %%
|
|
54
|
+
def md_tables_to_html(md_text: str) -> str:
    """
    Convert all Markdown pipe-tables in the text to HTML tables.

    Args:
        md_text: Markdown text containing tables

    Returns:
        Text with Markdown tables replaced by HTML tables
    """
    # Matches a header row, an optional separator row, and any data rows.
    table_pattern = r"(\|[^\n]*\|\n(?:\|[-\s:]*\|\n)?(?:\|[^\n]*\|\n?)*)"

    def _split_cells(row: str) -> list:
        # Drop the empty fragments before the first and after the last pipe.
        return [cell.strip() for cell in row.split("|")[1:-1]]

    def _render_table(match):
        block = match.group(1).strip()
        rows = [ln for ln in block.split("\n") if ln.strip()]

        # A real table needs a header plus a |---|-style separator line.
        if len(rows) < 2 or not re.match(r"^\|[-:\s|]+\|$", rows[1]):
            return block

        header_cells = _split_cells(rows[0])
        data_rows = [_split_cells(ln) for ln in rows[2:] if ln.strip()]

        fragments = ["<table>"]
        if header_cells:
            fragments.append("<thead>")
            fragments.append("<tr>")
            fragments.extend(f"<th>{cell}</th>" for cell in header_cells)
            fragments.append("</tr>")
            fragments.append("</thead>")
        if data_rows:
            fragments.append("<tbody>")
            for row in data_rows:
                fragments.append("<tr>")
                fragments.extend(f"<td>{cell}</td>" for cell in row)
                fragments.append("</tr>")
            fragments.append("</tbody>")
        fragments.append("</table>")
        return "".join(fragments)

    return re.sub(table_pattern, _render_table, md_text, flags=re.MULTILINE)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
if __name__ == "__main__":
    # Manual smoke check: convert a small Markdown table and print the HTML.
    sample = (
        "| Name | Age | City |\n"
        "|------|-----|------|\n"
        "| John | 25 | NYC |\n"
        "| Jane | 30 | LA |"
    )
    print(md_tables_to_html(sample))
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
def clean_response(text):
    """Strip code-fence markers from a model response.

    Handles responses wrapped as ```markdown ... ``` or ```html ... ```
    (and bare ``` fences), returning the inner content with surrounding
    whitespace removed. The prefix/suffix order matters: the fence is
    removed before the language tag it precedes.

    Fix: removed the no-op ``.removeprefix("")`` / ``.removesuffix("")``
    calls — removing an empty affix never changes a string.
    """
    return (
        text.strip()
        .removeprefix("```")
        .removesuffix("```")
        .removeprefix("markdown")
        .removeprefix("html")
        .strip()
    )
|