mineru 2.6.8__py3-none-any.whl → 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/hybrid/__init__.py +1 -0
- mineru/backend/hybrid/hybrid_analyze.py +526 -0
- mineru/backend/hybrid/hybrid_magic_model.py +617 -0
- mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
- mineru/backend/pipeline/batch_analyze.py +9 -1
- mineru/backend/pipeline/model_init.py +96 -1
- mineru/backend/pipeline/pipeline_analyze.py +6 -4
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
- mineru/backend/vlm/utils.py +3 -1
- mineru/backend/vlm/vlm_analyze.py +12 -12
- mineru/backend/vlm/vlm_magic_model.py +24 -89
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
- mineru/cli/client.py +17 -17
- mineru/cli/common.py +169 -20
- mineru/cli/fast_api.py +39 -13
- mineru/cli/gradio_app.py +232 -206
- mineru/model/mfd/yolo_v8.py +12 -6
- mineru/model/mfr/unimernet/Unimernet.py +71 -3
- mineru/resources/header.html +5 -1
- mineru/utils/boxbase.py +23 -0
- mineru/utils/char_utils.py +55 -0
- mineru/utils/engine_utils.py +74 -0
- mineru/utils/enum_class.py +18 -1
- mineru/utils/magic_model_utils.py +85 -2
- mineru/utils/pdf_image_tools.py +37 -17
- mineru/utils/span_pre_proc.py +5 -3
- mineru/utils/table_merge.py +13 -22
- mineru/version.py +1 -1
- mineru-2.7.1.dist-info/METADATA +438 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/RECORD +34 -28
- mineru-2.6.8.dist-info/METADATA +0 -954
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/WHEEL +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/top_level.txt +0 -0
mineru/cli/fast_api.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import sys
|
|
1
2
|
import uuid
|
|
2
3
|
import os
|
|
3
4
|
import re
|
|
@@ -14,6 +15,11 @@ from fastapi.responses import JSONResponse, FileResponse
|
|
|
14
15
|
from starlette.background import BackgroundTask
|
|
15
16
|
from typing import List, Optional
|
|
16
17
|
from loguru import logger
|
|
18
|
+
|
|
19
|
+
log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
|
|
20
|
+
logger.remove() # 移除默认handler
|
|
21
|
+
logger.add(sys.stderr, level=log_level) # 添加新handler
|
|
22
|
+
|
|
17
23
|
from base64 import b64encode
|
|
18
24
|
|
|
19
25
|
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
@@ -105,23 +111,38 @@ async def parse_pdf(
|
|
|
105
111
|
output_dir: str = Form("./output", description="Output local directory"),
|
|
106
112
|
lang_list: List[str] = Form(
|
|
107
113
|
["ch"],
|
|
108
|
-
description="""(Adapted only for pipeline backend)Input the languages in the pdf to improve OCR accuracy.
|
|
109
|
-
|
|
114
|
+
description="""(Adapted only for pipeline and hybrid backend)Input the languages in the pdf to improve OCR accuracy.Options:
|
|
115
|
+
- ch: Chinese, English, Chinese Traditional.
|
|
116
|
+
- ch_lite: Chinese, English, Chinese Traditional, Japanese.
|
|
117
|
+
- ch_server: Chinese, English, Chinese Traditional, Japanese.
|
|
118
|
+
- en: English.
|
|
119
|
+
- korean: Korean, English.
|
|
120
|
+
- japan: Chinese, English, Chinese Traditional, Japanese.
|
|
121
|
+
- chinese_cht: Chinese, English, Chinese Traditional, Japanese.
|
|
122
|
+
- ta: Tamil, English.
|
|
123
|
+
- te: Telugu, English.
|
|
124
|
+
- ka: Kannada.
|
|
125
|
+
- th: Thai, English.
|
|
126
|
+
- el: Greek, English.
|
|
127
|
+
- latin: French, German, Afrikaans, Italian, Spanish, Bosnian, Portuguese, Czech, Welsh, Danish, Estonian, Irish, Croatian, Uzbek, Hungarian, Serbian (Latin), Indonesian, Occitan, Icelandic, Lithuanian, Maori, Malay, Dutch, Norwegian, Polish, Slovak, Slovenian, Albanian, Swedish, Swahili, Tagalog, Turkish, Latin, Azerbaijani, Kurdish, Latvian, Maltese, Pali, Romanian, Vietnamese, Finnish, Basque, Galician, Luxembourgish, Romansh, Catalan, Quechua.
|
|
128
|
+
- arabic: Arabic, Persian, Uyghur, Urdu, Pashto, Kurdish, Sindhi, Balochi, English.
|
|
129
|
+
- east_slavic: Russian, Belarusian, Ukrainian, English.
|
|
130
|
+
- cyrillic: Russian, Belarusian, Ukrainian, Serbian (Cyrillic), Bulgarian, Mongolian, Abkhazian, Adyghe, Kabardian, Avar, Dargin, Ingush, Chechen, Lak, Lezgin, Tabasaran, Kazakh, Kyrgyz, Tajik, Macedonian, Tatar, Chuvash, Bashkir, Malian, Moldovan, Udmurt, Komi, Ossetian, Buryat, Kalmyk, Tuvan, Sakha, Karakalpak, English.
|
|
131
|
+
- devanagari: Hindi, Marathi, Nepali, Bihari, Maithili, Angika, Bhojpuri, Magahi, Santali, Newari, Konkani, Sanskrit, Haryanvi, English.
|
|
110
132
|
"""
|
|
111
133
|
),
|
|
112
134
|
backend: str = Form(
|
|
113
|
-
"
|
|
135
|
+
"hybrid-auto-engine",
|
|
114
136
|
description="""The backend for parsing:
|
|
115
|
-
- pipeline: More general
|
|
116
|
-
- vlm-
|
|
117
|
-
- vlm-
|
|
118
|
-
-
|
|
119
|
-
-
|
|
120
|
-
- vlm-http-client: Faster (client suitable for openai-compatible servers)"""
|
|
137
|
+
- pipeline: More general, supports multiple languages, hallucination-free.
|
|
138
|
+
- vlm-auto-engine: High accuracy via local computing power, supports Chinese and English documents only.
|
|
139
|
+
- vlm-http-client: High accuracy via remote computing power(client suitable for openai-compatible servers), supports Chinese and English documents only.
|
|
140
|
+
- hybrid-auto-engine: Next-generation high accuracy solution via local computing power, supports multiple languages.
|
|
141
|
+
- hybrid-http-client: High accuracy via remote computing power but requires a little local computing power(client suitable for openai-compatible servers), supports multiple languages."""
|
|
121
142
|
),
|
|
122
143
|
parse_method: str = Form(
|
|
123
144
|
"auto",
|
|
124
|
-
description="""(Adapted only for pipeline backend)The method for parsing PDF:
|
|
145
|
+
description="""(Adapted only for pipeline and hybrid backend)The method for parsing PDF:
|
|
125
146
|
- auto: Automatically determine the method based on the file type
|
|
126
147
|
- txt: Use text extraction method
|
|
127
148
|
- ocr: Use OCR method for image-based PDFs
|
|
@@ -131,7 +152,7 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
|
|
|
131
152
|
table_enable: bool = Form(True, description="Enable table parsing."),
|
|
132
153
|
server_url: Optional[str] = Form(
|
|
133
154
|
None,
|
|
134
|
-
description="(Adapted only for vlm
|
|
155
|
+
description="(Adapted only for <vlm/hybrid>-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000"
|
|
135
156
|
),
|
|
136
157
|
return_md: bool = Form(True, description="Return markdown content in response"),
|
|
137
158
|
return_middle_json: bool = Form(False, description="Return middle JSON in response"),
|
|
@@ -220,10 +241,13 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
|
|
|
220
241
|
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
|
221
242
|
for pdf_name in pdf_file_names:
|
|
222
243
|
safe_pdf_name = sanitize_filename(pdf_name)
|
|
244
|
+
|
|
223
245
|
if backend.startswith("pipeline"):
|
|
224
246
|
parse_dir = os.path.join(unique_dir, pdf_name, parse_method)
|
|
225
|
-
|
|
247
|
+
elif backend.startswith("vlm"):
|
|
226
248
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
|
249
|
+
elif backend.startswith("hybrid"):
|
|
250
|
+
parse_dir = os.path.join(unique_dir, pdf_name, f"hybrid_{parse_method}")
|
|
227
251
|
|
|
228
252
|
if not os.path.exists(parse_dir):
|
|
229
253
|
continue
|
|
@@ -271,8 +295,10 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
|
|
|
271
295
|
|
|
272
296
|
if backend.startswith("pipeline"):
|
|
273
297
|
parse_dir = os.path.join(unique_dir, pdf_name, parse_method)
|
|
274
|
-
|
|
298
|
+
elif backend.startswith("vlm"):
|
|
275
299
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
|
300
|
+
elif backend.startswith("hybrid"):
|
|
301
|
+
parse_dir = os.path.join(unique_dir, pdf_name, f"hybrid_{parse_method}")
|
|
276
302
|
|
|
277
303
|
if os.path.exists(parse_dir):
|
|
278
304
|
if return_md:
|
mineru/cli/gradio_app.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import base64
|
|
4
4
|
import os
|
|
5
5
|
import re
|
|
6
|
+
import sys
|
|
6
7
|
import time
|
|
7
8
|
import zipfile
|
|
8
9
|
from pathlib import Path
|
|
@@ -12,9 +13,13 @@ import gradio as gr
|
|
|
12
13
|
from gradio_pdf import PDF
|
|
13
14
|
from loguru import logger
|
|
14
15
|
|
|
16
|
+
log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
|
|
17
|
+
logger.remove() # 移除默认handler
|
|
18
|
+
logger.add(sys.stderr, level=log_level) # 添加新handler
|
|
19
|
+
|
|
15
20
|
from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
|
|
16
|
-
from mineru.utils.check_sys_env import is_mac_os_version_supported
|
|
17
21
|
from mineru.utils.cli_parser import arg_parse
|
|
22
|
+
from mineru.utils.engine_utils import get_vlm_engine
|
|
18
23
|
from mineru.utils.hash_utils import str_sha256
|
|
19
24
|
|
|
20
25
|
|
|
@@ -24,15 +29,20 @@ async def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, t
|
|
|
24
29
|
try:
|
|
25
30
|
file_name = f'{safe_stem(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
|
|
26
31
|
pdf_data = read_fn(doc_path)
|
|
27
|
-
|
|
28
|
-
parse_method = 'ocr'
|
|
29
|
-
else:
|
|
30
|
-
parse_method = 'auto'
|
|
31
|
-
|
|
32
|
+
# 根据 backend 确定 parse_method
|
|
32
33
|
if backend.startswith("vlm"):
|
|
33
34
|
parse_method = "vlm"
|
|
35
|
+
else:
|
|
36
|
+
parse_method = 'ocr' if is_ocr else 'auto'
|
|
37
|
+
|
|
38
|
+
# 根据 backend 类型准备环境目录
|
|
39
|
+
if backend.startswith("hybrid"):
|
|
40
|
+
env_name = f"hybrid_{parse_method}"
|
|
41
|
+
else:
|
|
42
|
+
env_name = parse_method
|
|
43
|
+
|
|
44
|
+
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, env_name)
|
|
34
45
|
|
|
35
|
-
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
|
|
36
46
|
await aio_do_parse(
|
|
37
47
|
output_dir=output_dir,
|
|
38
48
|
pdf_file_names=[file_name],
|
|
@@ -100,6 +110,9 @@ def replace_image_with_base64(markdown_text, image_dir_path):
|
|
|
100
110
|
|
|
101
111
|
|
|
102
112
|
async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True, table_enable=True, language="ch", backend="pipeline", url=None):
|
|
113
|
+
# 如果language包含(),则提取括号前的内容作为实际语言
|
|
114
|
+
if '(' in language and ')' in language:
|
|
115
|
+
language = language.split('(')[0].strip()
|
|
103
116
|
file_path = to_pdf(file_path)
|
|
104
117
|
# 获取识别的md文件以及压缩包文件路径
|
|
105
118
|
local_md_dir, file_name = await parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language, backend, url)
|
|
@@ -130,120 +143,31 @@ latex_delimiters_type_b = [
|
|
|
130
143
|
latex_delimiters_type_all = latex_delimiters_type_a + latex_delimiters_type_b
|
|
131
144
|
|
|
132
145
|
header_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'resources', 'header.html')
|
|
133
|
-
with open(header_path, 'r') as header_file:
|
|
146
|
+
with open(header_path, mode='r', encoding='utf-8') as header_file:
|
|
134
147
|
header = header_file.read()
|
|
135
148
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
"hr",
|
|
150
|
-
"hu",
|
|
151
|
-
"id",
|
|
152
|
-
"is",
|
|
153
|
-
"it",
|
|
154
|
-
"ku",
|
|
155
|
-
"la",
|
|
156
|
-
"lt",
|
|
157
|
-
"lv",
|
|
158
|
-
"mi",
|
|
159
|
-
"ms",
|
|
160
|
-
"mt",
|
|
161
|
-
"nl",
|
|
162
|
-
"no",
|
|
163
|
-
"oc",
|
|
164
|
-
"pi",
|
|
165
|
-
"pl",
|
|
166
|
-
"pt",
|
|
167
|
-
"ro",
|
|
168
|
-
"rs_latin",
|
|
169
|
-
"sk",
|
|
170
|
-
"sl",
|
|
171
|
-
"sq",
|
|
172
|
-
"sv",
|
|
173
|
-
"sw",
|
|
174
|
-
"tl",
|
|
175
|
-
"tr",
|
|
176
|
-
"uz",
|
|
177
|
-
"vi",
|
|
178
|
-
"french",
|
|
179
|
-
"german",
|
|
180
|
-
"fi",
|
|
181
|
-
"eu",
|
|
182
|
-
"gl",
|
|
183
|
-
"lb",
|
|
184
|
-
"rm",
|
|
185
|
-
"ca",
|
|
186
|
-
"qu",
|
|
149
|
+
other_lang = [
|
|
150
|
+
'ch (Chinese, English, Chinese Traditional)',
|
|
151
|
+
'ch_lite (Chinese, English, Chinese Traditional, Japanese)',
|
|
152
|
+
'ch_server (Chinese, English, Chinese Traditional, Japanese)',
|
|
153
|
+
'en (English)',
|
|
154
|
+
'korean (Korean, English)',
|
|
155
|
+
'japan (Chinese, English, Chinese Traditional, Japanese)',
|
|
156
|
+
'chinese_cht (Chinese, English, Chinese Traditional, Japanese)',
|
|
157
|
+
'ta (Tamil, English)',
|
|
158
|
+
'te (Telugu, English)',
|
|
159
|
+
'ka (Kannada)',
|
|
160
|
+
'el (Greek, English)',
|
|
161
|
+
'th (Thai, English)'
|
|
187
162
|
]
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
"uk",
|
|
195
|
-
"mn",
|
|
196
|
-
"abq",
|
|
197
|
-
"ady",
|
|
198
|
-
"kbd",
|
|
199
|
-
"ava",
|
|
200
|
-
"dar",
|
|
201
|
-
"inh",
|
|
202
|
-
"che",
|
|
203
|
-
"lbe",
|
|
204
|
-
"lez",
|
|
205
|
-
"tab",
|
|
206
|
-
"kk",
|
|
207
|
-
"ky",
|
|
208
|
-
"tg",
|
|
209
|
-
"mk",
|
|
210
|
-
"tt",
|
|
211
|
-
"cv",
|
|
212
|
-
"ba",
|
|
213
|
-
"mhr",
|
|
214
|
-
"mo",
|
|
215
|
-
"udm",
|
|
216
|
-
"kv",
|
|
217
|
-
"os",
|
|
218
|
-
"bua",
|
|
219
|
-
"xal",
|
|
220
|
-
"tyv",
|
|
221
|
-
"sah",
|
|
222
|
-
"kaa",
|
|
163
|
+
add_lang = [
|
|
164
|
+
'latin (French, German, Afrikaans, Italian, Spanish, Bosnian, Portuguese, Czech, Welsh, Danish, Estonian, Irish, Croatian, Uzbek, Hungarian, Serbian (Latin), Indonesian, Occitan, Icelandic, Lithuanian, Maori, Malay, Dutch, Norwegian, Polish, Slovak, Slovenian, Albanian, Swedish, Swahili, Tagalog, Turkish, Latin, Azerbaijani, Kurdish, Latvian, Maltese, Pali, Romanian, Vietnamese, Finnish, Basque, Galician, Luxembourgish, Romansh, Catalan, Quechua)',
|
|
165
|
+
'arabic (Arabic, Persian, Uyghur, Urdu, Pashto, Kurdish, Sindhi, Balochi, English)',
|
|
166
|
+
'east_slavic (Russian, Belarusian, Ukrainian, English)',
|
|
167
|
+
'cyrillic (Russian, Belarusian, Ukrainian, Serbian (Cyrillic), Bulgarian, Mongolian, Abkhazian, Adyghe, Kabardian, Avar, Dargin, Ingush, Chechen, Lak, Lezgin, Tabasaran, Kazakh, Kyrgyz, Tajik, Macedonian, Tatar, Chuvash, Bashkir, Malian, Moldovan, Udmurt, Komi, Ossetian, Buryat, Kalmyk, Tuvan, Sakha, Karakalpak, English)',
|
|
168
|
+
'devanagari (Hindi, Marathi, Nepali, Bihari, Maithili, Angika, Bhojpuri, Magahi, Santali, Newari, Konkani, Sanskrit, Haryanvi, English)'
|
|
223
169
|
]
|
|
224
|
-
|
|
225
|
-
devanagari_lang = [
|
|
226
|
-
"hi",
|
|
227
|
-
"mr",
|
|
228
|
-
"ne",
|
|
229
|
-
"bh",
|
|
230
|
-
"mai",
|
|
231
|
-
"ang",
|
|
232
|
-
"bho",
|
|
233
|
-
"mah",
|
|
234
|
-
"sck",
|
|
235
|
-
"new",
|
|
236
|
-
"gom",
|
|
237
|
-
"sa",
|
|
238
|
-
"bgc",
|
|
239
|
-
]
|
|
240
|
-
other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', "el", "th"]
|
|
241
|
-
add_lang = ['latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']
|
|
242
|
-
|
|
243
|
-
# all_lang = ['', 'auto']
|
|
244
|
-
all_lang = []
|
|
245
|
-
# all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
|
|
246
|
-
all_lang.extend([*other_lang, *add_lang])
|
|
170
|
+
all_lang = [*other_lang, *add_lang]
|
|
247
171
|
|
|
248
172
|
|
|
249
173
|
def safe_stem(file_path):
|
|
@@ -272,18 +196,6 @@ def to_pdf(file_path):
|
|
|
272
196
|
return tmp_file_path
|
|
273
197
|
|
|
274
198
|
|
|
275
|
-
# 更新界面函数
|
|
276
|
-
def update_interface(backend_choice):
|
|
277
|
-
if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine", "vlm-lmdeploy-engine", "vlm-mlx-engine"]:
|
|
278
|
-
return gr.update(visible=False), gr.update(visible=False)
|
|
279
|
-
elif backend_choice in ["vlm-http-client"]:
|
|
280
|
-
return gr.update(visible=True), gr.update(visible=False)
|
|
281
|
-
elif backend_choice in ["pipeline"]:
|
|
282
|
-
return gr.update(visible=False), gr.update(visible=True)
|
|
283
|
-
else:
|
|
284
|
-
pass
|
|
285
|
-
|
|
286
|
-
|
|
287
199
|
@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
|
|
288
200
|
@click.pass_context
|
|
289
201
|
@click.option(
|
|
@@ -295,17 +207,10 @@ def update_interface(backend_choice):
|
|
|
295
207
|
default=True,
|
|
296
208
|
)
|
|
297
209
|
@click.option(
|
|
298
|
-
'--enable-
|
|
299
|
-
'
|
|
300
|
-
type=bool,
|
|
301
|
-
help="Enable vLLM engine backend for faster processing.",
|
|
302
|
-
default=False,
|
|
303
|
-
)
|
|
304
|
-
@click.option(
|
|
305
|
-
'--enable-lmdeploy-engine',
|
|
306
|
-
'lmdeploy_engine_enable',
|
|
210
|
+
'--enable-http-client',
|
|
211
|
+
'http_client_enable',
|
|
307
212
|
type=bool,
|
|
308
|
-
help="Enable
|
|
213
|
+
help="Enable http-client backend to link openai-compatible servers.",
|
|
309
214
|
default=False,
|
|
310
215
|
)
|
|
311
216
|
@click.option(
|
|
@@ -345,10 +250,125 @@ def update_interface(backend_choice):
|
|
|
345
250
|
default='all',
|
|
346
251
|
)
|
|
347
252
|
def main(ctx,
|
|
348
|
-
example_enable,
|
|
253
|
+
example_enable,
|
|
254
|
+
http_client_enable,
|
|
255
|
+
api_enable, max_convert_pages,
|
|
349
256
|
server_name, server_port, latex_delimiters_type, **kwargs
|
|
350
257
|
):
|
|
351
258
|
|
|
259
|
+
# 创建 i18n 实例,支持中英文
|
|
260
|
+
i18n = gr.I18n(
|
|
261
|
+
en={
|
|
262
|
+
"upload_file": "Please upload a PDF or image",
|
|
263
|
+
"max_pages": "Max convert pages",
|
|
264
|
+
"backend": "Backend",
|
|
265
|
+
"server_url": "Server URL",
|
|
266
|
+
"server_url_info": "OpenAI-compatible server URL for http-client backend.",
|
|
267
|
+
"recognition_options": "**Recognition Options:**",
|
|
268
|
+
"table_enable": "Enable table recognition",
|
|
269
|
+
"table_info": "If disabled, tables will be shown as images.",
|
|
270
|
+
"formula_label_vlm": "Enable display formula recognition",
|
|
271
|
+
"formula_label_pipeline": "Enable formula recognition",
|
|
272
|
+
"formula_label_hybrid": "Enable inline formula recognition",
|
|
273
|
+
"formula_info_vlm": "If disabled, display formulas will be shown as images.",
|
|
274
|
+
"formula_info_pipeline": "If disabled, display formulas will be shown as images, and inline formulas will not be detected or parsed.",
|
|
275
|
+
"formula_info_hybrid": "If disabled, inline formulas will not be detected or parsed.",
|
|
276
|
+
"ocr_language": "OCR Language",
|
|
277
|
+
"ocr_language_info": "Select the OCR language for image-based PDFs and images.",
|
|
278
|
+
"force_ocr": "Force enable OCR",
|
|
279
|
+
"force_ocr_info": "Enable only if the result is extremely poor. Requires correct OCR language.",
|
|
280
|
+
"convert": "Convert",
|
|
281
|
+
"clear": "Clear",
|
|
282
|
+
"pdf_preview": "PDF preview",
|
|
283
|
+
"examples": "Examples:",
|
|
284
|
+
"convert_result": "Convert result",
|
|
285
|
+
"md_rendering": "Markdown rendering",
|
|
286
|
+
"md_text": "Markdown text",
|
|
287
|
+
"backend_info_vlm": "High-precision parsing via VLM, supports Chinese and English documents only.",
|
|
288
|
+
"backend_info_pipeline": "Traditional Multi-model pipeline parsing, supports multiple languages, hallucination-free.",
|
|
289
|
+
"backend_info_hybrid": "High-precision hybrid parsing, supports multiple languages.",
|
|
290
|
+
"backend_info_default": "Select the backend engine for document parsing.",
|
|
291
|
+
},
|
|
292
|
+
zh={
|
|
293
|
+
"upload_file": "请上传 PDF 或图片",
|
|
294
|
+
"max_pages": "最大转换页数",
|
|
295
|
+
"backend": "解析后端",
|
|
296
|
+
"server_url": "服务器地址",
|
|
297
|
+
"server_url_info": "http-client 后端的 OpenAI 兼容服务器地址。",
|
|
298
|
+
"recognition_options": "**识别选项:**",
|
|
299
|
+
"table_enable": "启用表格识别",
|
|
300
|
+
"table_info": "禁用后,表格将显示为图片。",
|
|
301
|
+
"formula_label_vlm": "启用行间公式识别",
|
|
302
|
+
"formula_label_pipeline": "启用公式识别",
|
|
303
|
+
"formula_label_hybrid": "启用行内公式识别",
|
|
304
|
+
"formula_info_vlm": "禁用后,行间公式将显示为图片。",
|
|
305
|
+
"formula_info_pipeline": "禁用后,行间公式将显示为图片,行内公式将不会被检测或解析。",
|
|
306
|
+
"formula_info_hybrid": "禁用后,行内公式将不会被检测或解析。",
|
|
307
|
+
"ocr_language": "OCR 语言",
|
|
308
|
+
"ocr_language_info": "为扫描版 PDF 和图片选择 OCR 语言。",
|
|
309
|
+
"force_ocr": "强制启用 OCR",
|
|
310
|
+
"force_ocr_info": "仅在识别效果极差时启用,需选择正确的 OCR 语言。",
|
|
311
|
+
"convert": "转换",
|
|
312
|
+
"clear": "清除",
|
|
313
|
+
"pdf_preview": "PDF 预览",
|
|
314
|
+
"examples": "示例:",
|
|
315
|
+
"convert_result": "转换结果",
|
|
316
|
+
"md_rendering": "Markdown 渲染",
|
|
317
|
+
"md_text": "Markdown 文本",
|
|
318
|
+
"backend_info_vlm": "多模态大模型高精度解析,仅支持中英文文档。",
|
|
319
|
+
"backend_info_pipeline": "传统多模型管道解析,支持多语言,无幻觉。",
|
|
320
|
+
"backend_info_hybrid": "高精度混合解析,支持多语言。",
|
|
321
|
+
"backend_info_default": "选择文档解析的后端引擎。",
|
|
322
|
+
},
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
# 根据后端类型获取公式识别标签(闭包函数以支持 i18n)
|
|
326
|
+
def get_formula_label(backend_choice):
|
|
327
|
+
if backend_choice.startswith("vlm"):
|
|
328
|
+
return i18n("formula_label_vlm")
|
|
329
|
+
elif backend_choice == "pipeline":
|
|
330
|
+
return i18n("formula_label_pipeline")
|
|
331
|
+
elif backend_choice.startswith("hybrid"):
|
|
332
|
+
return i18n("formula_label_hybrid")
|
|
333
|
+
else:
|
|
334
|
+
return i18n("formula_label_pipeline")
|
|
335
|
+
|
|
336
|
+
def get_formula_info(backend_choice):
|
|
337
|
+
if backend_choice.startswith("vlm"):
|
|
338
|
+
return i18n("formula_info_vlm")
|
|
339
|
+
elif backend_choice == "pipeline":
|
|
340
|
+
return i18n("formula_info_pipeline")
|
|
341
|
+
elif backend_choice.startswith("hybrid"):
|
|
342
|
+
return i18n("formula_info_hybrid")
|
|
343
|
+
else:
|
|
344
|
+
return ""
|
|
345
|
+
|
|
346
|
+
def get_backend_info(backend_choice):
|
|
347
|
+
if backend_choice.startswith("vlm"):
|
|
348
|
+
return i18n("backend_info_vlm")
|
|
349
|
+
elif backend_choice == "pipeline":
|
|
350
|
+
return i18n("backend_info_pipeline")
|
|
351
|
+
elif backend_choice.startswith("hybrid"):
|
|
352
|
+
return i18n("backend_info_hybrid")
|
|
353
|
+
else:
|
|
354
|
+
return i18n("backend_info_default")
|
|
355
|
+
|
|
356
|
+
# 更新界面函数
|
|
357
|
+
def update_interface(backend_choice):
|
|
358
|
+
formula_label_update = gr.update(label=get_formula_label(backend_choice), info=get_formula_info(backend_choice))
|
|
359
|
+
backend_info_update = gr.update(info=get_backend_info(backend_choice))
|
|
360
|
+
if "http-client" in backend_choice:
|
|
361
|
+
client_options_update = gr.update(visible=True)
|
|
362
|
+
else:
|
|
363
|
+
client_options_update = gr.update(visible=False)
|
|
364
|
+
if "vlm" in backend_choice:
|
|
365
|
+
ocr_options_update = gr.update(visible=False)
|
|
366
|
+
else:
|
|
367
|
+
ocr_options_update = gr.update(visible=True)
|
|
368
|
+
|
|
369
|
+
return client_options_update, ocr_options_update, formula_label_update, backend_info_update
|
|
370
|
+
|
|
371
|
+
|
|
352
372
|
kwargs.update(arg_parse(ctx))
|
|
353
373
|
|
|
354
374
|
if latex_delimiters_type == 'a':
|
|
@@ -360,120 +380,126 @@ def main(ctx,
|
|
|
360
380
|
else:
|
|
361
381
|
raise ValueError(f"Invalid latex delimiters type: {latex_delimiters_type}.")
|
|
362
382
|
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
model_singleton = ModelSingleton()
|
|
368
|
-
predictor = model_singleton.get_model(
|
|
369
|
-
"vllm-async-engine",
|
|
370
|
-
None,
|
|
371
|
-
None,
|
|
372
|
-
**kwargs
|
|
373
|
-
)
|
|
374
|
-
print("vLLM engine init successfully.")
|
|
375
|
-
except Exception as e:
|
|
376
|
-
logger.exception(e)
|
|
377
|
-
elif lmdeploy_engine_enable:
|
|
383
|
+
vlm_engine = get_vlm_engine("auto", is_async=True)
|
|
384
|
+
if vlm_engine in ["transformers", "mlx-engine"]:
|
|
385
|
+
http_client_enable = True
|
|
386
|
+
else:
|
|
378
387
|
try:
|
|
379
|
-
|
|
388
|
+
logger.info(f"Start init {vlm_engine}...")
|
|
380
389
|
from mineru.backend.vlm.vlm_analyze import ModelSingleton
|
|
381
390
|
model_singleton = ModelSingleton()
|
|
382
391
|
predictor = model_singleton.get_model(
|
|
383
|
-
|
|
392
|
+
vlm_engine,
|
|
384
393
|
None,
|
|
385
394
|
None,
|
|
386
395
|
**kwargs
|
|
387
396
|
)
|
|
388
|
-
|
|
397
|
+
logger.info(f"{vlm_engine} init successfully.")
|
|
389
398
|
except Exception as e:
|
|
390
399
|
logger.exception(e)
|
|
400
|
+
|
|
391
401
|
suffixes = [f".{suffix}" for suffix in pdf_suffixes + image_suffixes]
|
|
392
402
|
with gr.Blocks() as demo:
|
|
393
403
|
gr.HTML(header)
|
|
394
404
|
with gr.Row():
|
|
395
405
|
with gr.Column(variant='panel', scale=5):
|
|
396
406
|
with gr.Row():
|
|
397
|
-
input_file = gr.File(label=
|
|
407
|
+
input_file = gr.File(label=i18n("upload_file"), file_types=suffixes)
|
|
398
408
|
with gr.Row():
|
|
399
|
-
max_pages = gr.Slider(1, max_convert_pages,
|
|
409
|
+
max_pages = gr.Slider(1, max_convert_pages, max_convert_pages, step=1, label=i18n("max_pages"))
|
|
400
410
|
with gr.Row():
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
preferred_option = "vlm-lmdeploy-engine"
|
|
407
|
-
else:
|
|
408
|
-
drop_list = ["pipeline", "vlm-transformers", "vlm-http-client"]
|
|
409
|
-
if is_mac_os_version_supported():
|
|
410
|
-
drop_list.append("vlm-mlx-engine")
|
|
411
|
-
preferred_option = "pipeline"
|
|
412
|
-
backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option)
|
|
411
|
+
drop_list = ["pipeline", "vlm-auto-engine", "hybrid-auto-engine"]
|
|
412
|
+
preferred_option = "hybrid-auto-engine"
|
|
413
|
+
if http_client_enable:
|
|
414
|
+
drop_list.extend(["vlm-http-client", "hybrid-http-client"])
|
|
415
|
+
backend = gr.Dropdown(drop_list, label=i18n("backend"), value=preferred_option, info=get_backend_info(preferred_option))
|
|
413
416
|
with gr.Row(visible=False) as client_options:
|
|
414
|
-
url = gr.Textbox(label=
|
|
417
|
+
url = gr.Textbox(label=i18n("server_url"), value='http://localhost:30000', placeholder='http://localhost:30000', info=i18n("server_url_info"))
|
|
415
418
|
with gr.Row(equal_height=True):
|
|
416
419
|
with gr.Column():
|
|
417
|
-
gr.Markdown("
|
|
418
|
-
|
|
419
|
-
|
|
420
|
+
gr.Markdown(i18n("recognition_options"))
|
|
421
|
+
table_enable = gr.Checkbox(label=i18n("table_enable"), value=True, info=i18n("table_info"))
|
|
422
|
+
formula_enable = gr.Checkbox(label=get_formula_label(preferred_option), value=True, info=get_formula_info(preferred_option))
|
|
420
423
|
with gr.Column(visible=False) as ocr_options:
|
|
421
|
-
language = gr.Dropdown(all_lang, label=
|
|
422
|
-
is_ocr = gr.Checkbox(label=
|
|
424
|
+
language = gr.Dropdown(all_lang, label=i18n("ocr_language"), value='ch (Chinese, English, Chinese Traditional)', info=i18n("ocr_language_info"))
|
|
425
|
+
is_ocr = gr.Checkbox(label=i18n("force_ocr"), value=False, info=i18n("force_ocr_info"))
|
|
423
426
|
with gr.Row():
|
|
424
|
-
change_bu = gr.Button(
|
|
425
|
-
clear_bu = gr.ClearButton(value=
|
|
426
|
-
pdf_show = PDF(label=
|
|
427
|
+
change_bu = gr.Button(i18n("convert"))
|
|
428
|
+
clear_bu = gr.ClearButton(value=i18n("clear"))
|
|
429
|
+
pdf_show = PDF(label=i18n("pdf_preview"), interactive=False, visible=True, height=800)
|
|
427
430
|
if example_enable:
|
|
428
431
|
example_root = os.path.join(os.getcwd(), 'examples')
|
|
429
432
|
if os.path.exists(example_root):
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
433
|
+
gr.Examples(
|
|
434
|
+
label=i18n("examples"),
|
|
435
|
+
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
|
|
436
|
+
_.endswith(tuple(suffixes))],
|
|
437
|
+
inputs=input_file
|
|
438
|
+
)
|
|
436
439
|
|
|
437
440
|
with gr.Column(variant='panel', scale=5):
|
|
438
|
-
output_file = gr.File(label=
|
|
441
|
+
output_file = gr.File(label=i18n("convert_result"), interactive=False)
|
|
439
442
|
with gr.Tabs():
|
|
440
|
-
with gr.Tab(
|
|
441
|
-
md = gr.Markdown(
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
443
|
+
with gr.Tab(i18n("md_rendering")):
|
|
444
|
+
md = gr.Markdown(
|
|
445
|
+
label=i18n("md_rendering"),
|
|
446
|
+
height=1200,
|
|
447
|
+
# buttons=["copy"], # gradio 6 以上版本使用
|
|
448
|
+
show_copy_button=True, # gradio 6 以下版本使用
|
|
449
|
+
latex_delimiters=latex_delimiters,
|
|
450
|
+
line_breaks=True
|
|
451
|
+
)
|
|
452
|
+
with gr.Tab(i18n("md_text")):
|
|
453
|
+
md_text = gr.TextArea(
|
|
454
|
+
lines=45,
|
|
455
|
+
# buttons=["copy"], # gradio 6 以上版本使用
|
|
456
|
+
show_copy_button=True, # gradio 6 以下版本使用
|
|
457
|
+
label=i18n("md_text")
|
|
458
|
+
)
|
|
446
459
|
|
|
447
460
|
# 添加事件处理
|
|
448
461
|
backend.change(
|
|
449
462
|
fn=update_interface,
|
|
450
463
|
inputs=[backend],
|
|
451
|
-
outputs=[client_options, ocr_options],
|
|
452
|
-
|
|
464
|
+
outputs=[client_options, ocr_options, formula_enable, backend],
|
|
465
|
+
# api_visibility="private" # gradio 6 以上版本使用
|
|
466
|
+
api_name=False # gradio 6 以下版本使用
|
|
453
467
|
)
|
|
454
468
|
# 添加demo.load事件,在页面加载时触发一次界面更新
|
|
455
469
|
demo.load(
|
|
456
470
|
fn=update_interface,
|
|
457
471
|
inputs=[backend],
|
|
458
|
-
outputs=[client_options, ocr_options],
|
|
459
|
-
|
|
472
|
+
outputs=[client_options, ocr_options, formula_enable, backend],
|
|
473
|
+
# api_visibility="private" # gradio 6 以上版本使用
|
|
474
|
+
api_name=False # gradio 6 以下版本使用
|
|
460
475
|
)
|
|
461
476
|
clear_bu.add([input_file, md, pdf_show, md_text, output_file, is_ocr])
|
|
462
477
|
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
478
|
+
input_file.change(
|
|
479
|
+
fn=to_pdf,
|
|
480
|
+
inputs=input_file,
|
|
481
|
+
outputs=pdf_show,
|
|
482
|
+
api_name="to_pdf" if api_enable else False, # gradio 6 以下版本使用
|
|
483
|
+
# api_visibility="public" if api_enable else "private" # gradio 6 以上版本使用
|
|
484
|
+
)
|
|
469
485
|
change_bu.click(
|
|
470
486
|
fn=to_markdown,
|
|
471
487
|
inputs=[input_file, max_pages, is_ocr, formula_enable, table_enable, language, backend, url],
|
|
472
488
|
outputs=[md, md_text, output_file, pdf_show],
|
|
473
|
-
api_name=
|
|
489
|
+
api_name="to_markdown" if api_enable else False, # gradio 6 以下版本使用
|
|
490
|
+
# api_visibility="public" if api_enable else "private" # gradio 6 以上版本使用
|
|
474
491
|
)
|
|
475
492
|
|
|
476
|
-
|
|
493
|
+
footer_links = ["gradio", "settings"]
|
|
494
|
+
if api_enable:
|
|
495
|
+
footer_links.append("api")
|
|
496
|
+
demo.launch(
|
|
497
|
+
server_name=server_name,
|
|
498
|
+
server_port=server_port,
|
|
499
|
+
# footer_links=footer_links, # gradio 6 以上版本使用
|
|
500
|
+
show_api=api_enable, # gradio 6 以下版本使用
|
|
501
|
+
i18n=i18n
|
|
502
|
+
)
|
|
477
503
|
|
|
478
504
|
|
|
479
505
|
if __name__ == '__main__':
|