mineru 2.2.2__py3-none-any.whl → 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
- mineru/backend/vlm/model_output_to_middle_json.py +123 -0
- mineru/backend/vlm/vlm_analyze.py +97 -16
- mineru/backend/vlm/vlm_magic_model.py +201 -135
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
- mineru/cli/client.py +6 -5
- mineru/cli/common.py +17 -16
- mineru/cli/fast_api.py +9 -7
- mineru/cli/gradio_app.py +15 -16
- mineru/cli/vlm_vllm_server.py +4 -0
- mineru/model/table/rec/unet_table/main.py +8 -0
- mineru/model/vlm_vllm_model/__init__.py +0 -0
- mineru/model/vlm_vllm_model/server.py +51 -0
- mineru/resources/header.html +10 -2
- mineru/utils/draw_bbox.py +32 -10
- mineru/utils/enum_class.py +16 -2
- mineru/utils/guess_suffix_or_lang.py +20 -0
- mineru/utils/span_block_fix.py +4 -2
- mineru/version.py +1 -1
- {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/METADATA +70 -25
- {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/RECORD +25 -38
- {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/entry_points.txt +1 -1
- mineru/backend/vlm/base_predictor.py +0 -186
- mineru/backend/vlm/hf_predictor.py +0 -217
- mineru/backend/vlm/predictor.py +0 -111
- mineru/backend/vlm/sglang_client_predictor.py +0 -443
- mineru/backend/vlm/sglang_engine_predictor.py +0 -246
- mineru/backend/vlm/token_to_middle_json.py +0 -122
- mineru/backend/vlm/utils.py +0 -40
- mineru/cli/vlm_sglang_server.py +0 -4
- mineru/model/vlm_hf_model/__init__.py +0 -9
- mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
- mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
- mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
- mineru/model/vlm_sglang_model/__init__.py +0 -14
- mineru/model/vlm_sglang_model/engine.py +0 -264
- mineru/model/vlm_sglang_model/image_processor.py +0 -213
- mineru/model/vlm_sglang_model/logit_processor.py +0 -90
- mineru/model/vlm_sglang_model/model.py +0 -453
- mineru/model/vlm_sglang_model/server.py +0 -75
- {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/WHEEL +0 -0
- {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/top_level.txt +0 -0
mineru/cli/common.py
CHANGED
|
@@ -11,13 +11,14 @@ from loguru import logger
|
|
|
11
11
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
12
12
|
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
|
|
13
13
|
from mineru.utils.enum_class import MakeMode
|
|
14
|
+
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
|
|
14
15
|
from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
|
15
16
|
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
16
17
|
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
|
17
18
|
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
|
18
19
|
|
|
19
|
-
pdf_suffixes = ["
|
|
20
|
-
image_suffixes = ["
|
|
20
|
+
pdf_suffixes = ["pdf"]
|
|
21
|
+
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
def read_fn(path):
|
|
@@ -25,12 +26,13 @@ def read_fn(path):
|
|
|
25
26
|
path = Path(path)
|
|
26
27
|
with open(str(path), "rb") as input_file:
|
|
27
28
|
file_bytes = input_file.read()
|
|
28
|
-
|
|
29
|
+
file_suffix = guess_suffix_by_bytes(file_bytes)
|
|
30
|
+
if file_suffix in image_suffixes:
|
|
29
31
|
return images_bytes_to_pdf_bytes(file_bytes)
|
|
30
|
-
elif
|
|
32
|
+
elif file_suffix in pdf_suffixes:
|
|
31
33
|
return file_bytes
|
|
32
34
|
else:
|
|
33
|
-
raise Exception(f"Unknown file suffix: {
|
|
35
|
+
raise Exception(f"Unknown file suffix: {file_suffix}")
|
|
34
36
|
|
|
35
37
|
|
|
36
38
|
def prepare_env(output_dir, pdf_file_name, parse_method):
|
|
@@ -145,17 +147,10 @@ def _process_output(
|
|
|
145
147
|
)
|
|
146
148
|
|
|
147
149
|
if f_dump_model_output:
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
)
|
|
153
|
-
else:
|
|
154
|
-
output_text = ("\n" + "-" * 50 + "\n").join(model_output)
|
|
155
|
-
md_writer.write_string(
|
|
156
|
-
f"{pdf_file_name}_model_output.txt",
|
|
157
|
-
output_text,
|
|
158
|
-
)
|
|
150
|
+
md_writer.write_string(
|
|
151
|
+
f"{pdf_file_name}_model.json",
|
|
152
|
+
json.dumps(model_output, ensure_ascii=False, indent=4),
|
|
153
|
+
)
|
|
159
154
|
|
|
160
155
|
logger.info(f"local output dir is {local_md_dir}")
|
|
161
156
|
|
|
@@ -333,6 +328,9 @@ def do_parse(
|
|
|
333
328
|
if backend.startswith("vlm-"):
|
|
334
329
|
backend = backend[4:]
|
|
335
330
|
|
|
331
|
+
if backend == "vllm-async-engine":
|
|
332
|
+
raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
|
|
333
|
+
|
|
336
334
|
os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
|
|
337
335
|
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
338
336
|
|
|
@@ -381,6 +379,9 @@ async def aio_do_parse(
|
|
|
381
379
|
if backend.startswith("vlm-"):
|
|
382
380
|
backend = backend[4:]
|
|
383
381
|
|
|
382
|
+
if backend == "vllm-engine":
|
|
383
|
+
raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
|
|
384
|
+
|
|
384
385
|
os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
|
|
385
386
|
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
386
387
|
|
mineru/cli/fast_api.py
CHANGED
|
@@ -18,6 +18,7 @@ from base64 import b64encode
|
|
|
18
18
|
|
|
19
19
|
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
20
20
|
from mineru.utils.cli_parser import arg_parse
|
|
21
|
+
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
|
21
22
|
from mineru.version import __version__
|
|
22
23
|
|
|
23
24
|
app = FastAPI()
|
|
@@ -95,13 +96,14 @@ async def parse_pdf(
|
|
|
95
96
|
content = await file.read()
|
|
96
97
|
file_path = Path(file.filename)
|
|
97
98
|
|
|
98
|
-
#
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
with open(temp_path, "wb") as f:
|
|
103
|
-
f.write(content)
|
|
99
|
+
# 创建临时文件
|
|
100
|
+
temp_path = Path(unique_dir) / file_path.name
|
|
101
|
+
with open(temp_path, "wb") as f:
|
|
102
|
+
f.write(content)
|
|
104
103
|
|
|
104
|
+
# 如果是图像文件或PDF,使用read_fn处理
|
|
105
|
+
file_suffix = guess_suffix_by_path(temp_path)
|
|
106
|
+
if file_suffix in pdf_suffixes + image_suffixes:
|
|
105
107
|
try:
|
|
106
108
|
pdf_bytes = read_fn(temp_path)
|
|
107
109
|
pdf_bytes_list.append(pdf_bytes)
|
|
@@ -115,7 +117,7 @@ async def parse_pdf(
|
|
|
115
117
|
else:
|
|
116
118
|
return JSONResponse(
|
|
117
119
|
status_code=400,
|
|
118
|
-
content={"error": f"Unsupported file type: {
|
|
120
|
+
content={"error": f"Unsupported file type: {file_suffix}"}
|
|
119
121
|
)
|
|
120
122
|
|
|
121
123
|
|
mineru/cli/gradio_app.py
CHANGED
|
@@ -182,9 +182,9 @@ def to_pdf(file_path):
|
|
|
182
182
|
|
|
183
183
|
# 更新界面函数
|
|
184
184
|
def update_interface(backend_choice):
|
|
185
|
-
if backend_choice in ["vlm-transformers", "vlm-
|
|
185
|
+
if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
|
|
186
186
|
return gr.update(visible=False), gr.update(visible=False)
|
|
187
|
-
elif backend_choice in ["vlm-
|
|
187
|
+
elif backend_choice in ["vlm-http-client"]:
|
|
188
188
|
return gr.update(visible=True), gr.update(visible=False)
|
|
189
189
|
elif backend_choice in ["pipeline"]:
|
|
190
190
|
return gr.update(visible=False), gr.update(visible=True)
|
|
@@ -203,10 +203,10 @@ def update_interface(backend_choice):
|
|
|
203
203
|
default=True,
|
|
204
204
|
)
|
|
205
205
|
@click.option(
|
|
206
|
-
'--enable-
|
|
207
|
-
'
|
|
206
|
+
'--enable-vllm-engine',
|
|
207
|
+
'vllm_engine_enable',
|
|
208
208
|
type=bool,
|
|
209
|
-
help="Enable
|
|
209
|
+
help="Enable vLLM engine backend for faster processing.",
|
|
210
210
|
default=False,
|
|
211
211
|
)
|
|
212
212
|
@click.option(
|
|
@@ -246,7 +246,7 @@ def update_interface(backend_choice):
|
|
|
246
246
|
default='all',
|
|
247
247
|
)
|
|
248
248
|
def main(ctx,
|
|
249
|
-
example_enable,
|
|
249
|
+
example_enable, vllm_engine_enable, api_enable, max_convert_pages,
|
|
250
250
|
server_name, server_port, latex_delimiters_type, **kwargs
|
|
251
251
|
):
|
|
252
252
|
|
|
@@ -261,22 +261,21 @@ def main(ctx,
|
|
|
261
261
|
else:
|
|
262
262
|
raise ValueError(f"Invalid latex delimiters type: {latex_delimiters_type}.")
|
|
263
263
|
|
|
264
|
-
if
|
|
264
|
+
if vllm_engine_enable:
|
|
265
265
|
try:
|
|
266
|
-
print("Start init
|
|
266
|
+
print("Start init vLLM engine...")
|
|
267
267
|
from mineru.backend.vlm.vlm_analyze import ModelSingleton
|
|
268
268
|
model_singleton = ModelSingleton()
|
|
269
269
|
predictor = model_singleton.get_model(
|
|
270
|
-
"
|
|
270
|
+
"vllm-async-engine",
|
|
271
271
|
None,
|
|
272
272
|
None,
|
|
273
273
|
**kwargs
|
|
274
274
|
)
|
|
275
|
-
print("
|
|
275
|
+
print("vLLM engine init successfully.")
|
|
276
276
|
except Exception as e:
|
|
277
277
|
logger.exception(e)
|
|
278
|
-
|
|
279
|
-
suffixes = pdf_suffixes + image_suffixes
|
|
278
|
+
suffixes = [f".{suffix}" for suffix in pdf_suffixes + image_suffixes]
|
|
280
279
|
with gr.Blocks() as demo:
|
|
281
280
|
gr.HTML(header)
|
|
282
281
|
with gr.Row():
|
|
@@ -286,11 +285,11 @@ def main(ctx,
|
|
|
286
285
|
with gr.Row():
|
|
287
286
|
max_pages = gr.Slider(1, max_convert_pages, int(max_convert_pages/2), step=1, label='Max convert pages')
|
|
288
287
|
with gr.Row():
|
|
289
|
-
if
|
|
290
|
-
drop_list = ["pipeline", "vlm-
|
|
291
|
-
preferred_option = "vlm-
|
|
288
|
+
if vllm_engine_enable:
|
|
289
|
+
drop_list = ["pipeline", "vlm-vllm-async-engine"]
|
|
290
|
+
preferred_option = "vlm-vllm-async-engine"
|
|
292
291
|
else:
|
|
293
|
-
drop_list = ["pipeline", "vlm-transformers", "vlm-
|
|
292
|
+
drop_list = ["pipeline", "vlm-transformers", "vlm-http-client"]
|
|
294
293
|
preferred_option = "pipeline"
|
|
295
294
|
backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option)
|
|
296
295
|
with gr.Row(visible=False) as client_options:
|
|
@@ -12,6 +12,7 @@ from PIL import Image
|
|
|
12
12
|
from loguru import logger
|
|
13
13
|
from bs4 import BeautifulSoup
|
|
14
14
|
|
|
15
|
+
from mineru.utils.span_pre_proc import calculate_contrast
|
|
15
16
|
from .table_structure_unet import TSRUnet
|
|
16
17
|
|
|
17
18
|
from mineru.utils.enum_class import ModelPath
|
|
@@ -191,6 +192,13 @@ class WiredTableRecognition:
|
|
|
191
192
|
# logger.warning(f"Box {i} has invalid aspect ratio: {x1, y1, x2, y2}")
|
|
192
193
|
continue
|
|
193
194
|
img_crop = bgr_img[int(y1):int(y2), int(x1):int(x2)]
|
|
195
|
+
|
|
196
|
+
# 计算span的对比度,低于0.20的span不进行ocr
|
|
197
|
+
if calculate_contrast(img_crop, img_mode='bgr') <= 0.17:
|
|
198
|
+
cell_box_map[i] = [[box, "", 0.1]]
|
|
199
|
+
# logger.debug(f"Box {i} skipped due to low contrast.")
|
|
200
|
+
continue
|
|
201
|
+
|
|
194
202
|
img_crop_list.append(img_crop)
|
|
195
203
|
img_crop_info_list.append([i, box])
|
|
196
204
|
|
|
File without changes
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
|
|
3
|
+
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
|
|
4
|
+
from vllm.entrypoints.cli.main import main as vllm_main
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def main():
|
|
8
|
+
args = sys.argv[1:]
|
|
9
|
+
|
|
10
|
+
has_port_arg = False
|
|
11
|
+
has_gpu_memory_utilization_arg = False
|
|
12
|
+
model_path = None
|
|
13
|
+
model_arg_indices = []
|
|
14
|
+
|
|
15
|
+
# 检查现有参数
|
|
16
|
+
for i, arg in enumerate(args):
|
|
17
|
+
if arg == "--port" or arg.startswith("--port="):
|
|
18
|
+
has_port_arg = True
|
|
19
|
+
if arg == "--gpu-memory-utilization" or arg.startswith("--gpu-memory-utilization="):
|
|
20
|
+
has_gpu_memory_utilization_arg = True
|
|
21
|
+
if arg == "--model":
|
|
22
|
+
if i + 1 < len(args):
|
|
23
|
+
model_path = args[i + 1]
|
|
24
|
+
model_arg_indices.extend([i, i + 1])
|
|
25
|
+
elif arg.startswith("--model="):
|
|
26
|
+
model_path = arg.split("=", 1)[1]
|
|
27
|
+
model_arg_indices.append(i)
|
|
28
|
+
|
|
29
|
+
# 从参数列表中移除 --model 参数
|
|
30
|
+
if model_arg_indices:
|
|
31
|
+
for index in sorted(model_arg_indices, reverse=True):
|
|
32
|
+
args.pop(index)
|
|
33
|
+
|
|
34
|
+
# 添加默认参数
|
|
35
|
+
if not has_port_arg:
|
|
36
|
+
args.extend(["--port", "30000"])
|
|
37
|
+
if not has_gpu_memory_utilization_arg:
|
|
38
|
+
args.extend(["--gpu-memory-utilization", "0.5"])
|
|
39
|
+
if not model_path:
|
|
40
|
+
model_path = auto_download_and_get_model_root_path("/", "vlm")
|
|
41
|
+
|
|
42
|
+
# 重构参数,将模型路径作为位置参数
|
|
43
|
+
sys.argv = [sys.argv[0]] + ["serve", model_path] + args
|
|
44
|
+
|
|
45
|
+
# 启动vllm服务器
|
|
46
|
+
print(f"start vllm server: {sys.argv}")
|
|
47
|
+
vllm_main()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
if __name__ == "__main__":
|
|
51
|
+
main()
|
mineru/resources/header.html
CHANGED
|
@@ -54,7 +54,7 @@
|
|
|
54
54
|
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
|
|
55
55
|
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
|
|
56
56
|
">
|
|
57
|
-
MinerU 2: PDF Extraction Demo
|
|
57
|
+
MinerU 2.5: PDF Extraction Demo
|
|
58
58
|
</h1>
|
|
59
59
|
</div>
|
|
60
60
|
</div>
|
|
@@ -88,7 +88,15 @@
|
|
|
88
88
|
<span style="color: white">Code</span>
|
|
89
89
|
</a>
|
|
90
90
|
</span>
|
|
91
|
-
|
|
91
|
+
<!-- Code Link. -->
|
|
92
|
+
<span class="link-block">
|
|
93
|
+
<a href="https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
|
94
|
+
<span class="icon" style="margin-right: 4px">
|
|
95
|
+
<i class="fas fa-archive" style="color: white; margin-right: 4px"></i>
|
|
96
|
+
</span>
|
|
97
|
+
<span style="color: white">Model</span>
|
|
98
|
+
</a>
|
|
99
|
+
</span>
|
|
92
100
|
<!-- arXiv Link. -->
|
|
93
101
|
<span class="link-block">
|
|
94
102
|
<a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
mineru/utils/draw_bbox.py
CHANGED
|
@@ -119,22 +119,26 @@ def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_b
|
|
|
119
119
|
|
|
120
120
|
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
121
121
|
dropped_bbox_list = []
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
122
|
+
tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
|
|
123
|
+
imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
|
|
124
|
+
codes_body_list, codes_caption_list = [], []
|
|
125
125
|
titles_list = []
|
|
126
126
|
texts_list = []
|
|
127
127
|
interequations_list = []
|
|
128
128
|
lists_list = []
|
|
129
|
+
list_items_list = []
|
|
129
130
|
indexs_list = []
|
|
131
|
+
|
|
130
132
|
for page in pdf_info:
|
|
131
133
|
page_dropped_list = []
|
|
132
|
-
|
|
133
|
-
|
|
134
|
+
tables_body, tables_caption, tables_footnote = [], [], []
|
|
135
|
+
imgs_body, imgs_caption, imgs_footnote = [], [], []
|
|
136
|
+
codes_body, codes_caption = [], []
|
|
134
137
|
titles = []
|
|
135
138
|
texts = []
|
|
136
139
|
interequations = []
|
|
137
140
|
lists = []
|
|
141
|
+
list_items = []
|
|
138
142
|
indices = []
|
|
139
143
|
|
|
140
144
|
for dropped_bbox in page['discarded_blocks']:
|
|
@@ -143,7 +147,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
143
147
|
for block in page["para_blocks"]:
|
|
144
148
|
bbox = block["bbox"]
|
|
145
149
|
if block["type"] == BlockType.TABLE:
|
|
146
|
-
tables.append(bbox)
|
|
147
150
|
for nested_block in block["blocks"]:
|
|
148
151
|
bbox = nested_block["bbox"]
|
|
149
152
|
if nested_block["type"] == BlockType.TABLE_BODY:
|
|
@@ -155,7 +158,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
155
158
|
continue
|
|
156
159
|
tables_footnote.append(bbox)
|
|
157
160
|
elif block["type"] == BlockType.IMAGE:
|
|
158
|
-
imgs.append(bbox)
|
|
159
161
|
for nested_block in block["blocks"]:
|
|
160
162
|
bbox = nested_block["bbox"]
|
|
161
163
|
if nested_block["type"] == BlockType.IMAGE_BODY:
|
|
@@ -164,22 +166,31 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
164
166
|
imgs_caption.append(bbox)
|
|
165
167
|
elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:
|
|
166
168
|
imgs_footnote.append(bbox)
|
|
169
|
+
elif block["type"] == BlockType.CODE:
|
|
170
|
+
for nested_block in block["blocks"]:
|
|
171
|
+
if nested_block["type"] == BlockType.CODE_BODY:
|
|
172
|
+
bbox = nested_block["bbox"]
|
|
173
|
+
codes_body.append(bbox)
|
|
174
|
+
elif nested_block["type"] == BlockType.CODE_CAPTION:
|
|
175
|
+
bbox = nested_block["bbox"]
|
|
176
|
+
codes_caption.append(bbox)
|
|
167
177
|
elif block["type"] == BlockType.TITLE:
|
|
168
178
|
titles.append(bbox)
|
|
169
|
-
elif block["type"]
|
|
179
|
+
elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT]:
|
|
170
180
|
texts.append(bbox)
|
|
171
181
|
elif block["type"] == BlockType.INTERLINE_EQUATION:
|
|
172
182
|
interequations.append(bbox)
|
|
173
183
|
elif block["type"] == BlockType.LIST:
|
|
174
184
|
lists.append(bbox)
|
|
185
|
+
if "blocks" in block:
|
|
186
|
+
for sub_block in block["blocks"]:
|
|
187
|
+
list_items.append(sub_block["bbox"])
|
|
175
188
|
elif block["type"] == BlockType.INDEX:
|
|
176
189
|
indices.append(bbox)
|
|
177
190
|
|
|
178
|
-
tables_list.append(tables)
|
|
179
191
|
tables_body_list.append(tables_body)
|
|
180
192
|
tables_caption_list.append(tables_caption)
|
|
181
193
|
tables_footnote_list.append(tables_footnote)
|
|
182
|
-
imgs_list.append(imgs)
|
|
183
194
|
imgs_body_list.append(imgs_body)
|
|
184
195
|
imgs_caption_list.append(imgs_caption)
|
|
185
196
|
imgs_footnote_list.append(imgs_footnote)
|
|
@@ -187,7 +198,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
187
198
|
texts_list.append(texts)
|
|
188
199
|
interequations_list.append(interequations)
|
|
189
200
|
lists_list.append(lists)
|
|
201
|
+
list_items_list.append(list_items)
|
|
190
202
|
indexs_list.append(indices)
|
|
203
|
+
codes_body_list.append(codes_body)
|
|
204
|
+
codes_caption_list.append(codes_caption)
|
|
191
205
|
|
|
192
206
|
layout_bbox_list = []
|
|
193
207
|
|
|
@@ -197,6 +211,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
197
211
|
for block in page["para_blocks"]:
|
|
198
212
|
if block["type"] in [
|
|
199
213
|
BlockType.TEXT,
|
|
214
|
+
BlockType.REF_TEXT,
|
|
200
215
|
BlockType.TITLE,
|
|
201
216
|
BlockType.INTERLINE_EQUATION,
|
|
202
217
|
BlockType.LIST,
|
|
@@ -215,6 +230,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
215
230
|
continue
|
|
216
231
|
bbox = sub_block["bbox"]
|
|
217
232
|
page_block_list.append(bbox)
|
|
233
|
+
elif block["type"] in [BlockType.CODE]:
|
|
234
|
+
for sub_block in block["blocks"]:
|
|
235
|
+
bbox = sub_block["bbox"]
|
|
236
|
+
page_block_list.append(bbox)
|
|
218
237
|
|
|
219
238
|
layout_bbox_list.append(page_block_list)
|
|
220
239
|
|
|
@@ -231,6 +250,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
231
250
|
# 使用原始PDF的尺寸创建canvas
|
|
232
251
|
c = canvas.Canvas(packet, pagesize=custom_page_size)
|
|
233
252
|
|
|
253
|
+
c = draw_bbox_without_number(i, codes_body_list, page, c, [102, 0, 204], True)
|
|
254
|
+
c = draw_bbox_without_number(i, codes_caption_list, page, c, [204, 153, 255], True)
|
|
234
255
|
c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
|
|
235
256
|
c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
|
|
236
257
|
c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
|
|
@@ -242,6 +263,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
242
263
|
c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
|
|
243
264
|
c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
|
|
244
265
|
c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
|
|
266
|
+
c = draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False)
|
|
245
267
|
c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
|
|
246
268
|
c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)
|
|
247
269
|
|
mineru/utils/enum_class.py
CHANGED
|
@@ -14,6 +14,19 @@ class BlockType:
|
|
|
14
14
|
INDEX = 'index'
|
|
15
15
|
DISCARDED = 'discarded'
|
|
16
16
|
|
|
17
|
+
# Added in vlm 2.5
|
|
18
|
+
CODE = "code"
|
|
19
|
+
CODE_BODY = "code_body"
|
|
20
|
+
CODE_CAPTION = "code_caption"
|
|
21
|
+
ALGORITHM = "algorithm"
|
|
22
|
+
REF_TEXT = "ref_text"
|
|
23
|
+
PHONETIC = "phonetic"
|
|
24
|
+
HEADER = "header"
|
|
25
|
+
FOOTER = "footer"
|
|
26
|
+
PAGE_NUMBER = "page_number"
|
|
27
|
+
ASIDE_TEXT = "aside_text"
|
|
28
|
+
PAGE_FOOTNOTE = "page_footnote"
|
|
29
|
+
|
|
17
30
|
|
|
18
31
|
class ContentType:
|
|
19
32
|
IMAGE = 'image'
|
|
@@ -22,6 +35,7 @@ class ContentType:
|
|
|
22
35
|
INTERLINE_EQUATION = 'interline_equation'
|
|
23
36
|
INLINE_EQUATION = 'inline_equation'
|
|
24
37
|
EQUATION = 'equation'
|
|
38
|
+
CODE = 'code'
|
|
25
39
|
|
|
26
40
|
|
|
27
41
|
class CategoryId:
|
|
@@ -49,8 +63,8 @@ class MakeMode:
|
|
|
49
63
|
|
|
50
64
|
|
|
51
65
|
class ModelPath:
|
|
52
|
-
vlm_root_hf = "opendatalab/MinerU2.
|
|
53
|
-
vlm_root_modelscope = "OpenDataLab/MinerU2.
|
|
66
|
+
vlm_root_hf = "opendatalab/MinerU2.5-2509-1.2B"
|
|
67
|
+
vlm_root_modelscope = "OpenDataLab/MinerU2.5-2509-1.2B"
|
|
54
68
|
pipeline_root_modelscope = "OpenDataLab/PDF-Extract-Kit-1.0"
|
|
55
69
|
pipeline_root_hf = "opendatalab/PDF-Extract-Kit-1.0"
|
|
56
70
|
doclayout_yolo = "models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt"
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from magika import Magika
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
DEFAULT_LANG = "txt"
|
|
5
|
+
magika = Magika()
|
|
6
|
+
|
|
7
|
+
def guess_language_by_text(code):
|
|
8
|
+
codebytes = code.encode(encoding="utf-8")
|
|
9
|
+
lang = magika.identify_bytes(codebytes).prediction.output.label
|
|
10
|
+
return lang if lang != "unknown" else DEFAULT_LANG
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def guess_suffix_by_bytes(file_bytes) -> str:
|
|
14
|
+
suffix = magika.identify_bytes(file_bytes).prediction.output.label
|
|
15
|
+
return suffix
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def guess_suffix_by_path(file_path) -> str:
|
|
19
|
+
suffix = magika.identify_path(file_path).prediction.output.label
|
|
20
|
+
return suffix
|
mineru/utils/span_block_fix.py
CHANGED
|
@@ -23,9 +23,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
|
|
|
23
23
|
block_dict['group_id'] = block[-1]
|
|
24
24
|
block_spans = []
|
|
25
25
|
for span in spans:
|
|
26
|
+
temp_radio = radio
|
|
26
27
|
span_bbox = span['bbox']
|
|
27
|
-
if
|
|
28
|
-
|
|
28
|
+
if span['type'] in [ContentType.IMAGE, ContentType.TABLE]:
|
|
29
|
+
temp_radio = 0.9
|
|
30
|
+
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > temp_radio and span_block_type_compatible(span['type'], block_type):
|
|
29
31
|
block_spans.append(span)
|
|
30
32
|
|
|
31
33
|
block_dict['spans'] = block_spans
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.
|
|
1
|
+
__version__ = "2.5.0"
|