mineru 2.6.2__py3-none-any.whl → 2.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/batch_analyze.py +20 -43
- mineru/backend/pipeline/model_init.py +1 -1
- mineru/backend/pipeline/model_json_to_middle_json.py +1 -1
- mineru/backend/pipeline/pipeline_analyze.py +3 -0
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +17 -4
- mineru/backend/vlm/vlm_analyze.py +12 -2
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +6 -3
- mineru/cli/client.py +24 -14
- mineru/cli/common.py +8 -15
- mineru/cli/gradio_app.py +4 -1
- mineru/cli/models_download.py +1 -1
- mineru/model/ocr/{paddleocr2pytorch/pytorch_paddle.py → pytorch_paddle.py} +1 -1
- mineru/model/table/rec/RapidTable.py +1 -1
- mineru/model/table/rec/slanet_plus/table_structure.py +4 -0
- mineru/model/table/rec/unet_table/table_structure_unet.py +5 -0
- mineru/utils/block_sort.py +3 -2
- mineru/utils/check_sys_env.py +34 -0
- mineru/utils/llm_aided.py +13 -8
- mineru/utils/os_env_config.py +30 -0
- mineru/utils/pdf_image_tools.py +118 -14
- mineru/utils/pdf_page_id.py +10 -0
- mineru/version.py +1 -1
- {mineru-2.6.2.dist-info → mineru-2.6.4.dist-info}/METADATA +77 -37
- {mineru-2.6.2.dist-info → mineru-2.6.4.dist-info}/RECORD +28 -26
- mineru/model/ocr/paddleocr2pytorch/__init__.py +0 -1
- {mineru-2.6.2.dist-info → mineru-2.6.4.dist-info}/WHEEL +0 -0
- {mineru-2.6.2.dist-info → mineru-2.6.4.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.2.dist-info → mineru-2.6.4.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.2.dist-info → mineru-2.6.4.dist-info}/top_level.txt +0 -0
|
@@ -281,28 +281,20 @@ class BatchAnalyze:
|
|
|
281
281
|
|
|
282
282
|
# 按分辨率分组并同时完成padding
|
|
283
283
|
# RESOLUTION_GROUP_STRIDE = 32
|
|
284
|
-
RESOLUTION_GROUP_STRIDE = 64
|
|
284
|
+
RESOLUTION_GROUP_STRIDE = 64
|
|
285
285
|
|
|
286
286
|
resolution_groups = defaultdict(list)
|
|
287
287
|
for crop_info in lang_crop_list:
|
|
288
288
|
cropped_img = crop_info[0]
|
|
289
289
|
h, w = cropped_img.shape[:2]
|
|
290
|
-
#
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
group_key = (normalized_h, normalized_w)
|
|
290
|
+
# 直接计算目标尺寸并用作分组键
|
|
291
|
+
target_h = ((h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
|
|
292
|
+
target_w = ((w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
|
|
293
|
+
group_key = (target_h, target_w)
|
|
295
294
|
resolution_groups[group_key].append(crop_info)
|
|
296
295
|
|
|
297
296
|
# 对每个分辨率组进行批处理
|
|
298
|
-
for
|
|
299
|
-
|
|
300
|
-
# 计算目标尺寸(组内最大尺寸,向上取整到32的倍数)
|
|
301
|
-
max_h = max(crop_info[0].shape[0] for crop_info in group_crops)
|
|
302
|
-
max_w = max(crop_info[0].shape[1] for crop_info in group_crops)
|
|
303
|
-
target_h = ((max_h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
|
|
304
|
-
target_w = ((max_w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
|
|
305
|
-
|
|
297
|
+
for (target_h, target_w), group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
|
|
306
298
|
# 对所有图像进行padding到统一尺寸
|
|
307
299
|
batch_images = []
|
|
308
300
|
for crop_info in group_crops:
|
|
@@ -310,49 +302,34 @@ class BatchAnalyze:
|
|
|
310
302
|
h, w = img.shape[:2]
|
|
311
303
|
# 创建目标尺寸的白色背景
|
|
312
304
|
padded_img = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
|
|
313
|
-
# 将原图像粘贴到左上角
|
|
314
305
|
padded_img[:h, :w] = img
|
|
315
306
|
batch_images.append(padded_img)
|
|
316
307
|
|
|
317
308
|
# 批处理检测
|
|
318
|
-
det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE)
|
|
319
|
-
# logger.debug(f"OCR-det batch: {det_batch_size} images, target size: {target_h}x{target_w}")
|
|
309
|
+
det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE)
|
|
320
310
|
batch_results = ocr_model.text_detector.batch_predict(batch_images, det_batch_size)
|
|
321
311
|
|
|
322
312
|
# 处理批处理结果
|
|
323
|
-
for
|
|
313
|
+
for crop_info, (dt_boxes, _) in zip(group_crops, batch_results):
|
|
324
314
|
bgr_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info
|
|
325
315
|
|
|
326
316
|
if dt_boxes is not None and len(dt_boxes) > 0:
|
|
327
|
-
#
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
if
|
|
337
|
-
|
|
338
|
-
else:
|
|
339
|
-
dt_boxes_merged = []
|
|
340
|
-
|
|
341
|
-
# 3. 根据公式位置更新检测框(关键步骤!)
|
|
342
|
-
if dt_boxes_merged and adjusted_mfdetrec_res:
|
|
343
|
-
dt_boxes_final = update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
|
|
344
|
-
else:
|
|
345
|
-
dt_boxes_final = dt_boxes_merged
|
|
346
|
-
|
|
347
|
-
# 构造OCR结果格式
|
|
348
|
-
ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
|
|
349
|
-
|
|
350
|
-
if ocr_res:
|
|
317
|
+
# 处理检测框
|
|
318
|
+
dt_boxes_sorted = sorted_boxes(dt_boxes)
|
|
319
|
+
dt_boxes_merged = merge_det_boxes(dt_boxes_sorted) if dt_boxes_sorted else []
|
|
320
|
+
|
|
321
|
+
# 根据公式位置更新检测框
|
|
322
|
+
dt_boxes_final = (update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
|
|
323
|
+
if dt_boxes_merged and adjusted_mfdetrec_res
|
|
324
|
+
else dt_boxes_merged)
|
|
325
|
+
|
|
326
|
+
if dt_boxes_final:
|
|
327
|
+
ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
|
|
351
328
|
ocr_result_list = get_ocr_result_list(
|
|
352
329
|
ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], bgr_image, _lang
|
|
353
330
|
)
|
|
354
|
-
|
|
355
331
|
ocr_res_list_dict['layout_res'].extend(ocr_result_list)
|
|
332
|
+
|
|
356
333
|
else:
|
|
357
334
|
# 原始单张处理模式
|
|
358
335
|
for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
|
|
@@ -8,7 +8,7 @@ from ...model.layout.doclayoutyolo import DocLayoutYOLOModel
|
|
|
8
8
|
from ...model.mfd.yolo_v8 import YOLOv8MFDModel
|
|
9
9
|
from ...model.mfr.unimernet.Unimernet import UnimernetModel
|
|
10
10
|
from ...model.mfr.pp_formulanet_plus_m.predict_formula import FormulaRecognizer
|
|
11
|
-
from
|
|
11
|
+
from mineru.model.ocr.pytorch_paddle import PytorchPaddleOCR
|
|
12
12
|
from ...model.ori_cls.paddle_ori_cls import PaddleOrientationClsModel
|
|
13
13
|
from ...model.table.cls.paddle_table_cls import PaddleTableClsModel
|
|
14
14
|
# from ...model.table.rec.RapidTable import RapidTableModel
|
|
@@ -148,7 +148,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
|
|
|
148
148
|
fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
|
|
149
149
|
|
|
150
150
|
"""如果当前页面没有有效的bbox则跳过"""
|
|
151
|
-
if len(all_bboxes) == 0:
|
|
151
|
+
if len(all_bboxes) == 0 and len(fix_discarded_blocks) == 0:
|
|
152
152
|
return None
|
|
153
153
|
|
|
154
154
|
"""对image/table/interline_equation截图"""
|
|
@@ -99,7 +99,10 @@ def doc_analyze(
|
|
|
99
99
|
_lang = lang_list[pdf_idx]
|
|
100
100
|
|
|
101
101
|
# 收集每个数据集中的页面
|
|
102
|
+
# load_images_start = time.time()
|
|
102
103
|
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
104
|
+
# load_images_time = round(time.time() - load_images_start, 2)
|
|
105
|
+
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
|
|
103
106
|
all_image_lists.append(images_list)
|
|
104
107
|
all_pdf_docs.append(pdf_doc)
|
|
105
108
|
for page_idx in range(len(images_list)):
|
|
@@ -191,11 +191,20 @@ def merge_para_with_text(para_block):
|
|
|
191
191
|
def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
|
|
192
192
|
para_type = para_block['type']
|
|
193
193
|
para_content = {}
|
|
194
|
-
if para_type in [
|
|
194
|
+
if para_type in [
|
|
195
|
+
BlockType.TEXT,
|
|
196
|
+
BlockType.LIST,
|
|
197
|
+
BlockType.INDEX,
|
|
198
|
+
]:
|
|
195
199
|
para_content = {
|
|
196
200
|
'type': ContentType.TEXT,
|
|
197
201
|
'text': merge_para_with_text(para_block),
|
|
198
202
|
}
|
|
203
|
+
elif para_type == BlockType.DISCARDED:
|
|
204
|
+
para_content = {
|
|
205
|
+
'type': para_type,
|
|
206
|
+
'text': merge_para_with_text(para_block),
|
|
207
|
+
}
|
|
199
208
|
elif para_type == BlockType.TITLE:
|
|
200
209
|
para_content = {
|
|
201
210
|
'type': ContentType.TEXT,
|
|
@@ -268,15 +277,19 @@ def union_make(pdf_info_dict: list,
|
|
|
268
277
|
output_content = []
|
|
269
278
|
for page_info in pdf_info_dict:
|
|
270
279
|
paras_of_layout = page_info.get('para_blocks')
|
|
280
|
+
paras_of_discarded = page_info.get('discarded_blocks')
|
|
271
281
|
page_idx = page_info.get('page_idx')
|
|
272
282
|
page_size = page_info.get('page_size')
|
|
273
|
-
if not paras_of_layout:
|
|
274
|
-
continue
|
|
275
283
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
|
284
|
+
if not paras_of_layout:
|
|
285
|
+
continue
|
|
276
286
|
page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
|
|
277
287
|
output_content.extend(page_markdown)
|
|
278
288
|
elif make_mode == MakeMode.CONTENT_LIST:
|
|
279
|
-
|
|
289
|
+
para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
|
|
290
|
+
if not para_blocks:
|
|
291
|
+
continue
|
|
292
|
+
for para_block in para_blocks:
|
|
280
293
|
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
281
294
|
if para_content:
|
|
282
295
|
output_content.append(para_content)
|
|
@@ -8,6 +8,7 @@ from .utils import enable_custom_logits_processors, set_default_gpu_memory_utili
|
|
|
8
8
|
from .model_output_to_middle_json import result_to_middle_json
|
|
9
9
|
from ...data.data_reader_writer import DataWriter
|
|
10
10
|
from mineru.utils.pdf_image_tools import load_images_from_pdf
|
|
11
|
+
from ...utils.check_sys_env import is_mac_os_version_supported
|
|
11
12
|
from ...utils.config_reader import get_device
|
|
12
13
|
|
|
13
14
|
from ...utils.enum_class import ImageType
|
|
@@ -47,7 +48,7 @@ class ModelSingleton:
|
|
|
47
48
|
for param in ["batch_size", "max_concurrency", "http_timeout"]:
|
|
48
49
|
if param in kwargs:
|
|
49
50
|
del kwargs[param]
|
|
50
|
-
if backend in ['transformers', 'vllm-engine', "vllm-async-engine"] and not model_path:
|
|
51
|
+
if backend in ['transformers', 'vllm-engine', "vllm-async-engine", "mlx-engine"] and not model_path:
|
|
51
52
|
model_path = auto_download_and_get_model_root_path("/","vlm")
|
|
52
53
|
if backend == "transformers":
|
|
53
54
|
try:
|
|
@@ -75,6 +76,15 @@ class ModelSingleton:
|
|
|
75
76
|
)
|
|
76
77
|
if batch_size == 0:
|
|
77
78
|
batch_size = set_default_batch_size()
|
|
79
|
+
elif backend == "mlx-engine":
|
|
80
|
+
mlx_supported = is_mac_os_version_supported()
|
|
81
|
+
if not mlx_supported:
|
|
82
|
+
raise EnvironmentError("mlx-engine backend is only supported on macOS 13.5+ with Apple Silicon.")
|
|
83
|
+
try:
|
|
84
|
+
from mlx_vlm import load as mlx_load
|
|
85
|
+
except ImportError:
|
|
86
|
+
raise ImportError("Please install mlx-vlm to use the mlx-engine backend.")
|
|
87
|
+
model, processor = mlx_load(model_path)
|
|
78
88
|
else:
|
|
79
89
|
if os.getenv('OMP_NUM_THREADS') is None:
|
|
80
90
|
os.environ["OMP_NUM_THREADS"] = "1"
|
|
@@ -167,7 +177,7 @@ async def aio_doc_analyze(
|
|
|
167
177
|
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
168
178
|
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
|
169
179
|
# load_images_time = round(time.time() - load_images_start, 2)
|
|
170
|
-
# logger.
|
|
180
|
+
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
|
171
181
|
|
|
172
182
|
# infer_start = time.time()
|
|
173
183
|
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
|
|
@@ -248,13 +248,16 @@ def union_make(pdf_info_dict: list,
|
|
|
248
248
|
paras_of_discarded = page_info.get('discarded_blocks')
|
|
249
249
|
page_idx = page_info.get('page_idx')
|
|
250
250
|
page_size = page_info.get('page_size')
|
|
251
|
-
if not paras_of_layout:
|
|
252
|
-
continue
|
|
253
251
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
|
252
|
+
if not paras_of_layout:
|
|
253
|
+
continue
|
|
254
254
|
page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
|
|
255
255
|
output_content.extend(page_markdown)
|
|
256
256
|
elif make_mode == MakeMode.CONTENT_LIST:
|
|
257
|
-
|
|
257
|
+
para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
|
|
258
|
+
if not para_blocks:
|
|
259
|
+
continue
|
|
260
|
+
for para_block in para_blocks:
|
|
258
261
|
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
259
262
|
output_content.append(para_content)
|
|
260
263
|
|
mineru/cli/client.py
CHANGED
|
@@ -4,6 +4,7 @@ import click
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from loguru import logger
|
|
6
6
|
|
|
7
|
+
from mineru.utils.check_sys_env import is_mac_os_version_supported
|
|
7
8
|
from mineru.utils.cli_parser import arg_parse
|
|
8
9
|
from mineru.utils.config_reader import get_device
|
|
9
10
|
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
|
@@ -11,6 +12,11 @@ from mineru.utils.model_utils import get_vram
|
|
|
11
12
|
from ..version import __version__
|
|
12
13
|
from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
13
14
|
|
|
15
|
+
|
|
16
|
+
backends = ['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']
|
|
17
|
+
if is_mac_os_version_supported():
|
|
18
|
+
backends.append("vlm-mlx-engine")
|
|
19
|
+
|
|
14
20
|
@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
|
|
15
21
|
@click.pass_context
|
|
16
22
|
@click.version_option(__version__,
|
|
@@ -38,25 +44,28 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
|
38
44
|
'--method',
|
|
39
45
|
'method',
|
|
40
46
|
type=click.Choice(['auto', 'txt', 'ocr']),
|
|
41
|
-
help="""
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
47
|
+
help="""\b
|
|
48
|
+
the method for parsing pdf:
|
|
49
|
+
auto: Automatically determine the method based on the file type.
|
|
50
|
+
txt: Use text extraction method.
|
|
51
|
+
ocr: Use OCR method for image-based PDFs.
|
|
45
52
|
Without method specified, 'auto' will be used by default.
|
|
46
|
-
Adapted only for the case where the backend is set to
|
|
53
|
+
Adapted only for the case where the backend is set to 'pipeline'.""",
|
|
47
54
|
default='auto',
|
|
48
55
|
)
|
|
49
56
|
@click.option(
|
|
50
57
|
'-b',
|
|
51
58
|
'--backend',
|
|
52
59
|
'backend',
|
|
53
|
-
type=click.Choice(
|
|
54
|
-
help="""
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
+
type=click.Choice(backends),
|
|
61
|
+
help="""\b
|
|
62
|
+
the backend for parsing pdf:
|
|
63
|
+
pipeline: More general.
|
|
64
|
+
vlm-transformers: More general, but slower.
|
|
65
|
+
vlm-mlx-engine: Faster than transformers.
|
|
66
|
+
vlm-vllm-engine: Faster(engine).
|
|
67
|
+
vlm-http-client: Faster(client).
|
|
68
|
+
Without method specified, pipeline will be used by default.""",
|
|
60
69
|
default='pipeline',
|
|
61
70
|
)
|
|
62
71
|
@click.option(
|
|
@@ -66,7 +75,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
|
66
75
|
type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'th', 'el',
|
|
67
76
|
'latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']),
|
|
68
77
|
help="""
|
|
69
|
-
Input the languages in the pdf (if known) to improve OCR accuracy.
|
|
78
|
+
Input the languages in the pdf (if known) to improve OCR accuracy.
|
|
70
79
|
Without languages specified, 'ch' will be used by default.
|
|
71
80
|
Adapted only for the case where the backend is set to "pipeline".
|
|
72
81
|
""",
|
|
@@ -119,7 +128,8 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
|
119
128
|
'--device',
|
|
120
129
|
'device_mode',
|
|
121
130
|
type=str,
|
|
122
|
-
help=
|
|
131
|
+
help="""Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps".
|
|
132
|
+
Adapted only for the case where the backend is set to "pipeline" and "vlm-transformers". """,
|
|
123
133
|
default=None,
|
|
124
134
|
)
|
|
125
135
|
@click.option(
|
mineru/cli/common.py
CHANGED
|
@@ -5,8 +5,8 @@ import os
|
|
|
5
5
|
import copy
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
import pypdfium2 as pdfium
|
|
9
8
|
from loguru import logger
|
|
9
|
+
import pypdfium2 as pdfium
|
|
10
10
|
|
|
11
11
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
12
12
|
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
|
|
@@ -16,10 +16,12 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
|
|
16
16
|
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
17
17
|
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
|
18
18
|
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
|
19
|
+
from mineru.utils.pdf_page_id import get_end_page_id
|
|
19
20
|
|
|
20
21
|
pdf_suffixes = ["pdf"]
|
|
21
22
|
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
|
|
22
23
|
|
|
24
|
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
23
25
|
|
|
24
26
|
def read_fn(path):
|
|
25
27
|
if not isinstance(path, Path):
|
|
@@ -44,18 +46,10 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
|
|
|
44
46
|
|
|
45
47
|
|
|
46
48
|
def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
|
|
49
|
+
pdf = pdfium.PdfDocument(pdf_bytes)
|
|
50
|
+
output_pdf = pdfium.PdfDocument.new()
|
|
47
51
|
try:
|
|
48
|
-
|
|
49
|
-
pdf = pdfium.PdfDocument(pdf_bytes)
|
|
50
|
-
|
|
51
|
-
# 确定结束页
|
|
52
|
-
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
|
|
53
|
-
if end_page_id > len(pdf) - 1:
|
|
54
|
-
logger.warning("end_page_id is out of range, use pdf_docs length")
|
|
55
|
-
end_page_id = len(pdf) - 1
|
|
56
|
-
|
|
57
|
-
# 创建一个新的PDF文档
|
|
58
|
-
output_pdf = pdfium.PdfDocument.new()
|
|
52
|
+
end_page_id = get_end_page_id(end_page_id, len(pdf))
|
|
59
53
|
|
|
60
54
|
# 选择要导入的页面索引
|
|
61
55
|
page_indices = list(range(start_page_id, end_page_id + 1))
|
|
@@ -69,13 +63,12 @@ def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page
|
|
|
69
63
|
|
|
70
64
|
# 获取字节数据
|
|
71
65
|
output_bytes = output_buffer.getvalue()
|
|
72
|
-
|
|
73
|
-
pdf.close() # 关闭原PDF文档以释放资源
|
|
74
|
-
output_pdf.close() # 关闭新PDF文档以释放资源
|
|
75
66
|
except Exception as e:
|
|
76
67
|
logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
|
|
77
68
|
output_bytes = pdf_bytes
|
|
78
69
|
|
|
70
|
+
pdf.close()
|
|
71
|
+
output_pdf.close()
|
|
79
72
|
return output_bytes
|
|
80
73
|
|
|
81
74
|
|
mineru/cli/gradio_app.py
CHANGED
|
@@ -13,6 +13,7 @@ from gradio_pdf import PDF
|
|
|
13
13
|
from loguru import logger
|
|
14
14
|
|
|
15
15
|
from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
|
|
16
|
+
from mineru.utils.check_sys_env import is_mac_os_version_supported
|
|
16
17
|
from mineru.utils.cli_parser import arg_parse
|
|
17
18
|
from mineru.utils.hash_utils import str_sha256
|
|
18
19
|
|
|
@@ -273,7 +274,7 @@ def to_pdf(file_path):
|
|
|
273
274
|
|
|
274
275
|
# 更新界面函数
|
|
275
276
|
def update_interface(backend_choice):
|
|
276
|
-
if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
|
|
277
|
+
if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine", "vlm-mlx-engine"]:
|
|
277
278
|
return gr.update(visible=False), gr.update(visible=False)
|
|
278
279
|
elif backend_choice in ["vlm-http-client"]:
|
|
279
280
|
return gr.update(visible=True), gr.update(visible=False)
|
|
@@ -381,6 +382,8 @@ def main(ctx,
|
|
|
381
382
|
preferred_option = "vlm-vllm-async-engine"
|
|
382
383
|
else:
|
|
383
384
|
drop_list = ["pipeline", "vlm-transformers", "vlm-http-client"]
|
|
385
|
+
if is_mac_os_version_supported():
|
|
386
|
+
drop_list.append("vlm-mlx-engine")
|
|
384
387
|
preferred_option = "pipeline"
|
|
385
388
|
backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option)
|
|
386
389
|
with gr.Row(visible=False) as client_options:
|
mineru/cli/models_download.py
CHANGED
|
@@ -21,7 +21,7 @@ def download_and_modify_json(url, local_filename, modifications):
|
|
|
21
21
|
if os.path.exists(local_filename):
|
|
22
22
|
data = json.load(open(local_filename))
|
|
23
23
|
config_version = data.get('config_version', '0.0.0')
|
|
24
|
-
if config_version < '1.3.
|
|
24
|
+
if config_version < '1.3.1':
|
|
25
25
|
data = download_json(url)
|
|
26
26
|
else:
|
|
27
27
|
data = download_json(url)
|
|
@@ -134,7 +134,7 @@ def get_model_params(lang, config):
|
|
|
134
134
|
raise Exception (f'Language {lang} not supported')
|
|
135
135
|
|
|
136
136
|
|
|
137
|
-
root_dir = os.path.join(Path(__file__).resolve().parent.parent
|
|
137
|
+
root_dir = os.path.join(Path(__file__).resolve().parent.parent, 'utils')
|
|
138
138
|
|
|
139
139
|
|
|
140
140
|
class PytorchPaddleOCR(TextSystem):
|
|
@@ -11,7 +11,7 @@ from rapid_table import ModelType, RapidTable, RapidTableInput
|
|
|
11
11
|
from rapid_table.utils import RapidTableOutput
|
|
12
12
|
from tqdm import tqdm
|
|
13
13
|
|
|
14
|
-
from mineru.model.ocr.
|
|
14
|
+
from mineru.model.ocr.pytorch_paddle import PytorchPaddleOCR
|
|
15
15
|
from mineru.utils.enum_class import ModelPath
|
|
16
16
|
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
|
|
17
17
|
|
|
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Tuple
|
|
|
16
16
|
|
|
17
17
|
import numpy as np
|
|
18
18
|
|
|
19
|
+
from mineru.utils.os_env_config import get_op_num_threads
|
|
19
20
|
from .table_structure_utils import (
|
|
20
21
|
OrtInferSession,
|
|
21
22
|
TableLabelDecode,
|
|
@@ -29,6 +30,9 @@ class TableStructurer:
|
|
|
29
30
|
self.preprocess_op = TablePreprocess()
|
|
30
31
|
self.batch_preprocess_op = BatchTablePreprocess()
|
|
31
32
|
|
|
33
|
+
config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
|
|
34
|
+
config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
|
|
35
|
+
|
|
32
36
|
self.session = OrtInferSession(config)
|
|
33
37
|
|
|
34
38
|
self.character = self.session.get_metadata()
|
|
@@ -5,6 +5,8 @@ from typing import Optional, Dict, Any, Tuple
|
|
|
5
5
|
import cv2
|
|
6
6
|
import numpy as np
|
|
7
7
|
from skimage import measure
|
|
8
|
+
|
|
9
|
+
from mineru.utils.os_env_config import get_op_num_threads
|
|
8
10
|
from .utils import OrtInferSession, resize_img
|
|
9
11
|
from .utils_table_line_rec import (
|
|
10
12
|
get_table_line,
|
|
@@ -28,6 +30,9 @@ class TSRUnet:
|
|
|
28
30
|
self.inp_height = 1024
|
|
29
31
|
self.inp_width = 1024
|
|
30
32
|
|
|
33
|
+
config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
|
|
34
|
+
config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
|
|
35
|
+
|
|
31
36
|
self.session = OrtInferSession(config)
|
|
32
37
|
|
|
33
38
|
def __call__(
|
mineru/utils/block_sort.py
CHANGED
|
@@ -179,13 +179,14 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
|
|
|
179
179
|
def model_init(model_name: str):
|
|
180
180
|
from transformers import LayoutLMv3ForTokenClassification
|
|
181
181
|
device_name = get_device()
|
|
182
|
+
device = torch.device(device_name)
|
|
182
183
|
bf_16_support = False
|
|
183
184
|
if device_name.startswith("cuda"):
|
|
184
|
-
|
|
185
|
+
if torch.cuda.get_device_properties(device).major >= 8:
|
|
186
|
+
bf_16_support = True
|
|
185
187
|
elif device_name.startswith("mps"):
|
|
186
188
|
bf_16_support = True
|
|
187
189
|
|
|
188
|
-
device = torch.device(device_name)
|
|
189
190
|
if model_name == 'layoutreader':
|
|
190
191
|
# 检测modelscope的缓存目录是否存在
|
|
191
192
|
layoutreader_model_dir = os.path.join(auto_download_and_get_model_root_path(ModelPath.layout_reader), ModelPath.layout_reader)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
import platform
|
|
3
|
+
|
|
4
|
+
from packaging import version
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def is_windows_environment() -> bool:
|
|
8
|
+
return platform.system() == "Windows"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Detect if the current environment is a Mac computer
|
|
12
|
+
def is_mac_environment() -> bool:
|
|
13
|
+
return platform.system() == "Darwin"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Detect if CPU is Apple Silicon architecture
|
|
17
|
+
def is_apple_silicon_cpu() -> bool:
|
|
18
|
+
return platform.machine() in ["arm64", "aarch64"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# If Mac computer with Apple Silicon architecture, check if macOS version is 13.5 or above
|
|
22
|
+
def is_mac_os_version_supported(min_version: str = "13.5") -> bool:
|
|
23
|
+
if not is_mac_environment() or not is_apple_silicon_cpu():
|
|
24
|
+
return False
|
|
25
|
+
mac_version = platform.mac_ver()[0]
|
|
26
|
+
if not mac_version:
|
|
27
|
+
return False
|
|
28
|
+
# print("Mac OS Version:", mac_version)
|
|
29
|
+
return version.parse(mac_version) >= version.parse(min_version)
|
|
30
|
+
|
|
31
|
+
if __name__ == "__main__":
|
|
32
|
+
print("Is Mac Environment:", is_mac_environment())
|
|
33
|
+
print("Is Apple Silicon CPU:", is_apple_silicon_cpu())
|
|
34
|
+
print("Is Mac OS Version Supported (>=13.5):", is_mac_os_version_supported())
|
mineru/utils/llm_aided.py
CHANGED
|
@@ -84,16 +84,21 @@ Corrected title list:
|
|
|
84
84
|
max_retries = 3
|
|
85
85
|
dict_completion = None
|
|
86
86
|
|
|
87
|
+
# Build API call parameters
|
|
88
|
+
api_params = {
|
|
89
|
+
"model": title_aided_config["model"],
|
|
90
|
+
"messages": [{'role': 'user', 'content': title_optimize_prompt}],
|
|
91
|
+
"temperature": 0.7,
|
|
92
|
+
"stream": True,
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
# Only add extra_body when explicitly specified in config
|
|
96
|
+
if "enable_thinking" in title_aided_config:
|
|
97
|
+
api_params["extra_body"] = {"enable_thinking": title_aided_config["enable_thinking"]}
|
|
98
|
+
|
|
87
99
|
while retry_count < max_retries:
|
|
88
100
|
try:
|
|
89
|
-
completion = client.chat.completions.create(
|
|
90
|
-
model=title_aided_config["model"],
|
|
91
|
-
messages=[
|
|
92
|
-
{'role': 'user', 'content': title_optimize_prompt}],
|
|
93
|
-
extra_body={"enable_thinking": False},
|
|
94
|
-
temperature=0.7,
|
|
95
|
-
stream=True,
|
|
96
|
-
)
|
|
101
|
+
completion = client.chat.completions.create(**api_params)
|
|
97
102
|
content_pieces = []
|
|
98
103
|
for chunk in completion:
|
|
99
104
|
if chunk.choices and chunk.choices[0].delta.content is not None:
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def get_op_num_threads(env_name: str) -> int:
|
|
5
|
+
env_value = os.getenv(env_name, None)
|
|
6
|
+
return get_value_from_string(env_value, -1)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_load_images_timeout() -> int:
|
|
10
|
+
env_value = os.getenv('MINERU_PDF_RENDER_TIMEOUT', None)
|
|
11
|
+
return get_value_from_string(env_value, 300)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_value_from_string(env_value: str, default_value: int) -> int:
|
|
15
|
+
if env_value is not None:
|
|
16
|
+
try:
|
|
17
|
+
num_threads = int(env_value)
|
|
18
|
+
if num_threads > 0:
|
|
19
|
+
return num_threads
|
|
20
|
+
except ValueError:
|
|
21
|
+
return default_value
|
|
22
|
+
return default_value
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
if __name__ == '__main__':
|
|
26
|
+
print(get_value_from_string('1', -1))
|
|
27
|
+
print(get_value_from_string('0', -1))
|
|
28
|
+
print(get_value_from_string('-1', -1))
|
|
29
|
+
print(get_value_from_string('abc', -1))
|
|
30
|
+
print(get_load_images_timeout())
|
mineru/utils/pdf_image_tools.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
import os
|
|
2
3
|
from io import BytesIO
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
@@ -7,9 +8,14 @@ from loguru import logger
|
|
|
7
8
|
from PIL import Image
|
|
8
9
|
|
|
9
10
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
11
|
+
from mineru.utils.check_sys_env import is_windows_environment
|
|
12
|
+
from mineru.utils.os_env_config import get_load_images_timeout
|
|
10
13
|
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
|
11
|
-
from .enum_class import ImageType
|
|
12
|
-
from .hash_utils import str_sha256
|
|
14
|
+
from mineru.utils.enum_class import ImageType
|
|
15
|
+
from mineru.utils.hash_utils import str_sha256
|
|
16
|
+
from mineru.utils.pdf_page_id import get_end_page_id
|
|
17
|
+
|
|
18
|
+
from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
|
|
13
19
|
|
|
14
20
|
|
|
15
21
|
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
|
|
@@ -35,7 +41,106 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
|
|
|
35
41
|
return image_dict
|
|
36
42
|
|
|
37
43
|
|
|
44
|
+
def _load_images_from_pdf_worker(pdf_bytes, dpi, start_page_id, end_page_id, image_type):
|
|
45
|
+
"""用于进程池的包装函数"""
|
|
46
|
+
return load_images_from_pdf_core(pdf_bytes, dpi, start_page_id, end_page_id, image_type)
|
|
47
|
+
|
|
48
|
+
|
|
38
49
|
def load_images_from_pdf(
|
|
50
|
+
pdf_bytes: bytes,
|
|
51
|
+
dpi=200,
|
|
52
|
+
start_page_id=0,
|
|
53
|
+
end_page_id=None,
|
|
54
|
+
image_type=ImageType.PIL,
|
|
55
|
+
timeout=None,
|
|
56
|
+
threads=4,
|
|
57
|
+
):
|
|
58
|
+
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
pdf_bytes (bytes): PDF 文件的 bytes
|
|
62
|
+
dpi (int, optional): Resolution (DPI) used when rendering each page to an image. Defaults to 200.
|
|
63
|
+
start_page_id (int, optional): 起始页码. Defaults to 0.
|
|
64
|
+
end_page_id (int | None, optional): 结束页码. Defaults to None.
|
|
65
|
+
image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
|
|
66
|
+
timeout (int | None, optional): Timeout in seconds. If None, it is read from the environment variable MINERU_PDF_RENDER_TIMEOUT, defaulting to 300 seconds when that variable is unset or invalid.
|
|
67
|
+
threads (int): Maximum number of worker processes (capped by the CPU count and the number of pages). Defaults to 4.
|
|
68
|
+
|
|
69
|
+
Raises:
|
|
70
|
+
TimeoutError: 当转换超时时抛出
|
|
71
|
+
"""
|
|
72
|
+
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
|
73
|
+
if is_windows_environment():
|
|
74
|
+
# Windows 环境下不使用多进程
|
|
75
|
+
return load_images_from_pdf_core(
|
|
76
|
+
pdf_bytes,
|
|
77
|
+
dpi,
|
|
78
|
+
start_page_id,
|
|
79
|
+
get_end_page_id(end_page_id, len(pdf_doc)),
|
|
80
|
+
image_type
|
|
81
|
+
), pdf_doc
|
|
82
|
+
else:
|
|
83
|
+
if timeout is None:
|
|
84
|
+
timeout = get_load_images_timeout()
|
|
85
|
+
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
|
|
86
|
+
|
|
87
|
+
# 计算总页数
|
|
88
|
+
total_pages = end_page_id - start_page_id + 1
|
|
89
|
+
|
|
90
|
+
# 实际使用的进程数不超过总页数
|
|
91
|
+
actual_threads = min(os.cpu_count() or 1, threads, total_pages)
|
|
92
|
+
|
|
93
|
+
# 根据实际进程数分组页面范围
|
|
94
|
+
pages_per_thread = max(1, total_pages // actual_threads)
|
|
95
|
+
page_ranges = []
|
|
96
|
+
|
|
97
|
+
for i in range(actual_threads):
|
|
98
|
+
range_start = start_page_id + i * pages_per_thread
|
|
99
|
+
if i == actual_threads - 1:
|
|
100
|
+
# 最后一个进程处理剩余所有页面
|
|
101
|
+
range_end = end_page_id
|
|
102
|
+
else:
|
|
103
|
+
range_end = start_page_id + (i + 1) * pages_per_thread - 1
|
|
104
|
+
|
|
105
|
+
page_ranges.append((range_start, range_end))
|
|
106
|
+
|
|
107
|
+
# logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
|
|
108
|
+
|
|
109
|
+
with ProcessPoolExecutor(max_workers=actual_threads) as executor:
|
|
110
|
+
# 提交所有任务
|
|
111
|
+
futures = []
|
|
112
|
+
for range_start, range_end in page_ranges:
|
|
113
|
+
future = executor.submit(
|
|
114
|
+
_load_images_from_pdf_worker,
|
|
115
|
+
pdf_bytes,
|
|
116
|
+
dpi,
|
|
117
|
+
range_start,
|
|
118
|
+
range_end,
|
|
119
|
+
image_type
|
|
120
|
+
)
|
|
121
|
+
futures.append((range_start, future))
|
|
122
|
+
|
|
123
|
+
try:
|
|
124
|
+
# 收集结果并按页码排序
|
|
125
|
+
all_results = []
|
|
126
|
+
for range_start, future in futures:
|
|
127
|
+
images_list = future.result(timeout=timeout)
|
|
128
|
+
all_results.append((range_start, images_list))
|
|
129
|
+
|
|
130
|
+
# 按起始页码排序并合并结果
|
|
131
|
+
all_results.sort(key=lambda x: x[0])
|
|
132
|
+
images_list = []
|
|
133
|
+
for _, imgs in all_results:
|
|
134
|
+
images_list.extend(imgs)
|
|
135
|
+
|
|
136
|
+
return images_list, pdf_doc
|
|
137
|
+
except FuturesTimeoutError:
|
|
138
|
+
pdf_doc.close()
|
|
139
|
+
executor.shutdown(wait=False, cancel_futures=True)
|
|
140
|
+
raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def load_images_from_pdf_core(
|
|
39
144
|
pdf_bytes: bytes,
|
|
40
145
|
dpi=200,
|
|
41
146
|
start_page_id=0,
|
|
@@ -45,18 +150,17 @@ def load_images_from_pdf(
|
|
|
45
150
|
images_list = []
|
|
46
151
|
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
|
47
152
|
pdf_page_num = len(pdf_doc)
|
|
48
|
-
end_page_id = end_page_id
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
return images_list, pdf_doc
|
|
153
|
+
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
|
|
154
|
+
|
|
155
|
+
for index in range(start_page_id, end_page_id + 1):
|
|
156
|
+
# logger.debug(f"Converting page {index}/{pdf_page_num} to image")
|
|
157
|
+
page = pdf_doc[index]
|
|
158
|
+
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
|
|
159
|
+
images_list.append(image_dict)
|
|
160
|
+
|
|
161
|
+
pdf_doc.close()
|
|
162
|
+
|
|
163
|
+
return images_list
|
|
60
164
|
|
|
61
165
|
|
|
62
166
|
def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
from loguru import logger
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_end_page_id(end_page_id, pdf_page_num):
    """Resolve a requested end page into a usable inclusive page index.

    Args:
        end_page_id: Requested index of the last page to process (inclusive).
            ``None`` or a negative value selects the last page of the document.
        pdf_page_num: Total number of pages in the document.

    Returns:
        The inclusive end page index, never greater than ``pdf_page_num - 1``.
        When ``pdf_page_num`` is 0 the result is ``-1``.
    """
    last_page_id = pdf_page_num - 1

    # None / negative means "through the end of the document".
    if end_page_id is None or end_page_id < 0:
        return last_page_id

    if end_page_id > last_page_id:
        # Clamp instead of failing so an over-long request still processes
        # the pages that actually exist; report both values for diagnosis.
        logger.warning(
            f"end_page_id {end_page_id} is out of range, "
            f"using the last page index {last_page_id}"
        )
        return last_page_id

    return end_page_id
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.6.2"
|
|
1
|
+
__version__ = "2.6.4"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.6.2
|
|
3
|
+
Version: 2.6.4
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -37,7 +37,7 @@ Requires-Dist: scikit-image<1.0.0,>=0.25.0
|
|
|
37
37
|
Requires-Dist: openai<3,>=1.70.0
|
|
38
38
|
Requires-Dist: beautifulsoup4<5,>=4.13.5
|
|
39
39
|
Requires-Dist: magika<0.7.0,>=0.6.2
|
|
40
|
-
Requires-Dist: mineru-vl-utils<1,>=0.1.
|
|
40
|
+
Requires-Dist: mineru-vl-utils<1,>=0.1.15
|
|
41
41
|
Provides-Extra: test
|
|
42
42
|
Requires-Dist: mineru[core]; extra == "test"
|
|
43
43
|
Requires-Dist: pytest; extra == "test"
|
|
@@ -50,6 +50,8 @@ Requires-Dist: transformers<5.0.0,>=4.51.1; extra == "vlm"
|
|
|
50
50
|
Requires-Dist: accelerate>=1.5.1; extra == "vlm"
|
|
51
51
|
Provides-Extra: vllm
|
|
52
52
|
Requires-Dist: vllm<0.12,>=0.10.1.1; extra == "vllm"
|
|
53
|
+
Provides-Extra: mlx
|
|
54
|
+
Requires-Dist: mlx-vlm<0.4,>=0.3.3; extra == "mlx"
|
|
53
55
|
Provides-Extra: pipeline
|
|
54
56
|
Requires-Dist: matplotlib<4,>=3.10; extra == "pipeline"
|
|
55
57
|
Requires-Dist: ultralytics<9,>=8.3.48; extra == "pipeline"
|
|
@@ -76,6 +78,7 @@ Requires-Dist: mineru[vlm]; extra == "core"
|
|
|
76
78
|
Requires-Dist: mineru[pipeline]; extra == "core"
|
|
77
79
|
Requires-Dist: mineru[api]; extra == "core"
|
|
78
80
|
Requires-Dist: mineru[gradio]; extra == "core"
|
|
81
|
+
Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "core"
|
|
79
82
|
Provides-Extra: all
|
|
80
83
|
Requires-Dist: mineru[core]; extra == "all"
|
|
81
84
|
Requires-Dist: mineru[vllm]; extra == "all"
|
|
@@ -127,6 +130,14 @@ Dynamic: license-file
|
|
|
127
130
|
</div>
|
|
128
131
|
|
|
129
132
|
# Changelog
|
|
133
|
+
- 2025/11/04 2.6.4 Release
|
|
134
|
+
- Added a timeout for PDF image rendering. The default is 300 seconds and can be changed via the environment variable `MINERU_PDF_RENDER_TIMEOUT`; this prevents abnormal PDF files from blocking the rendering process for a long time.
|
|
135
|
+
- Added CPU thread count options for ONNX models. The default is the system CPU core count and can be changed via the environment variables `MINERU_INTRA_OP_NUM_THREADS` and `MINERU_INTER_OP_NUM_THREADS`, which reduces CPU resource contention in high-concurrency scenarios.
|
|
136
|
+
|
|
137
|
+
- 2025/10/31 2.6.3 Release
|
|
138
|
+
- Added support for a new backend `vlm-mlx-engine`, enabling MLX-accelerated inference for the MinerU2.5 model on Apple Silicon devices. Compared to the `vlm-transformers` backend, `vlm-mlx-engine` delivers a 100%–200% speed improvement.
|
|
139
|
+
- Bug fixes: #3849, #3859
|
|
140
|
+
|
|
130
141
|
- 2025/10/24 2.6.2 Release
|
|
131
142
|
- `pipeline` backend optimizations
|
|
132
143
|
- Added experimental support for Chinese formulas, which can be enabled by setting the environment variable `export MINERU_FORMULA_CH_SUPPORT=1`. This feature may cause a slight decrease in MFR speed and failures in recognizing some long formulas. It is recommended to enable it only when parsing Chinese formulas is needed. To disable this feature, set the environment variable to `0`.
|
|
@@ -666,7 +677,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
|
|
|
666
677
|
- Automatically recognize and convert formulas in the document to LaTeX format.
|
|
667
678
|
- Automatically recognize and convert tables in the document to HTML format.
|
|
668
679
|
- Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
|
|
669
|
-
- OCR supports detection and recognition of
|
|
680
|
+
- OCR supports detection and recognition of 109 languages.
|
|
670
681
|
- Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
|
|
671
682
|
- Supports various visualization results, including layout visualization and span visualization, for efficient confirmation of output quality.
|
|
672
683
|
- Supports running in a pure CPU environment, and also supports GPU(CUDA)/NPU(CANN)/MPS acceleration
|
|
@@ -703,41 +714,70 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
|
|
|
703
714
|
> In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
|
|
704
715
|
|
|
705
716
|
<table>
|
|
706
|
-
<
|
|
707
|
-
<
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
<
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
717
|
+
<thead>
|
|
718
|
+
<tr>
|
|
719
|
+
<th rowspan="2">Parsing Backend</th>
|
|
720
|
+
<th rowspan="2">pipeline <br> (Accuracy<sup>1</sup> 82+)</th>
|
|
721
|
+
<th colspan="4">vlm (Accuracy<sup>1</sup> 90+)</th>
|
|
722
|
+
</tr>
|
|
723
|
+
<tr>
|
|
724
|
+
<th>transformers</th>
|
|
725
|
+
<th>mlx-engine</th>
|
|
726
|
+
<th>vllm-engine / <br>vllm-async-engine</th>
|
|
727
|
+
<th>http-client</th>
|
|
728
|
+
</tr>
|
|
729
|
+
</thead>
|
|
730
|
+
<tbody>
|
|
731
|
+
<tr>
|
|
732
|
+
<th>Backend Features</th>
|
|
733
|
+
<td>Fast, no hallucinations</td>
|
|
734
|
+
<td>Good compatibility, <br>but slower</td>
|
|
735
|
+
<td>Faster than transformers</td>
|
|
736
|
+
<td>Fast, compatible with the vLLM ecosystem</td>
|
|
737
|
+
<td>Suitable for OpenAI-compatible servers<sup>5</sup></td>
|
|
738
|
+
</tr>
|
|
739
|
+
<tr>
|
|
740
|
+
<th>Operating System</th>
|
|
741
|
+
<td colspan="2" style="text-align:center;">Linux<sup>2</sup> / Windows / macOS</td>
|
|
742
|
+
<td style="text-align:center;">macOS<sup>3</sup></td>
|
|
743
|
+
<td style="text-align:center;">Linux<sup>2</sup> / Windows<sup>4</sup> </td>
|
|
744
|
+
<td>Any</td>
|
|
745
|
+
</tr>
|
|
746
|
+
<tr>
|
|
747
|
+
<th>CPU inference support</th>
|
|
748
|
+
<td colspan="2" style="text-align:center;">✅</td>
|
|
749
|
+
<td colspan="2" style="text-align:center;">❌</td>
|
|
750
|
+
<td>Not required</td>
|
|
751
|
+
</tr>
|
|
752
|
+
<tr>
|
|
753
|
+
<th>GPU Requirements</th><td colspan="2" style="text-align:center;">Volta or later architectures, 6 GB VRAM or more, or Apple Silicon</td>
|
|
754
|
+
<td>Apple Silicon</td>
|
|
755
|
+
<td>Volta or later architectures, 8 GB VRAM or more</td>
|
|
756
|
+
<td>Not required</td>
|
|
757
|
+
</tr>
|
|
758
|
+
<tr>
|
|
759
|
+
<th>Memory Requirements</th>
|
|
760
|
+
<td colspan="4" style="text-align:center;">Minimum 16 GB, 32 GB recommended</td>
|
|
761
|
+
<td>8 GB</td>
|
|
762
|
+
</tr>
|
|
763
|
+
<tr>
|
|
764
|
+
<th>Disk Space Requirements</th>
|
|
765
|
+
<td colspan="4" style="text-align:center;">20 GB or more, SSD recommended</td>
|
|
766
|
+
<td>2 GB</td>
|
|
767
|
+
</tr>
|
|
768
|
+
<tr>
|
|
769
|
+
<th>Python Version</th>
|
|
770
|
+
<td colspan="5" style="text-align:center;">3.10-3.13</td>
|
|
771
|
+
</tr>
|
|
772
|
+
</tbody>
|
|
740
773
|
</table>
|
|
774
|
+
|
|
775
|
+
<sup>1</sup> Accuracy metric is the End-to-End Evaluation Overall score of OmniDocBench (v1.5), tested on the latest `MinerU` version.
|
|
776
|
+
<sup>2</sup> Linux supports only distributions released in 2019 or later.
|
|
777
|
+
<sup>3</sup> MLX requires macOS 13.5 or later; macOS 14.0 or later is recommended.
|
|
778
|
+
<sup>4</sup> On Windows, vLLM is supported via WSL2 (Windows Subsystem for Linux).
|
|
779
|
+
<sup>5</sup> Servers compatible with the OpenAI API, such as local or remote model services deployed via inference frameworks like `vLLM`, `SGLang`, or `LMDeploy`.
|
|
780
|
+
|
|
741
781
|
|
|
742
782
|
### Install MinerU
|
|
743
783
|
|
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=ODIwI6SfzWmx_FdtwCfr6k5TmpNuA5JdvGyV-9G9YrM,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
6
|
-
mineru/backend/pipeline/batch_analyze.py,sha256=
|
|
7
|
-
mineru/backend/pipeline/model_init.py,sha256=
|
|
8
|
-
mineru/backend/pipeline/model_json_to_middle_json.py,sha256=
|
|
6
|
+
mineru/backend/pipeline/batch_analyze.py,sha256=gnilKhFlMe8-55X2PJnb-ZSVeZIS-5DxIbMpHnwLne8,20889
|
|
7
|
+
mineru/backend/pipeline/model_init.py,sha256=OAylOcQD9gu5TBcX7nMt7X5NpJMtQICI5IvEQ648lpI,9358
|
|
8
|
+
mineru/backend/pipeline/model_json_to_middle_json.py,sha256=reXkUR_wKmJD64d7vRNXMxFviwkzDlGjRshpdwsVquI,10951
|
|
9
9
|
mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
|
|
10
10
|
mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
|
|
11
|
-
mineru/backend/pipeline/pipeline_analyze.py,sha256=
|
|
11
|
+
mineru/backend/pipeline/pipeline_analyze.py,sha256=O_HGifodg03VZbmTve-U6Cmo0T03AmuK86t1v1J9X-Q,6897
|
|
12
12
|
mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
|
|
13
|
-
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=
|
|
13
|
+
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=YlnEbbUnkniZXS13aLo5mjfFQvQM5SrIVvTAGBZsLmw,14478
|
|
14
14
|
mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
15
15
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
16
16
|
mineru/backend/vlm/utils.py,sha256=woGqyRI4S7p69daLCU07XNXWTV27aLf7YBjjVH1x-5o,2794
|
|
17
|
-
mineru/backend/vlm/vlm_analyze.py,sha256=
|
|
17
|
+
mineru/backend/vlm/vlm_analyze.py,sha256=EQKNtc12pQ6so5NuUE-ppUtWI1QH_CQnsx1QfHdzAwA,8790
|
|
18
18
|
mineru/backend/vlm/vlm_magic_model.py,sha256=Pd0sOr7G1crAJIVeq6h_03gNSuxmV5U8dvGTGT_rrjs,23452
|
|
19
|
-
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=
|
|
19
|
+
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=5V-AU9KkxxMn0DDSQBrb15I4GVpEyiQy8uNI_tQhS6M,13498
|
|
20
20
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
21
|
-
mineru/cli/client.py,sha256=
|
|
22
|
-
mineru/cli/common.py,sha256=
|
|
21
|
+
mineru/cli/client.py,sha256=ul2Twu-MWT2pCPrtvWbhIwWnoR6aurHJ3KhFOmElP90,6915
|
|
22
|
+
mineru/cli/common.py,sha256=3kd6sF6BlnBNL_UeMjXKJ11fGQA4Y9lOckznWNiIWY8,13988
|
|
23
23
|
mineru/cli/fast_api.py,sha256=t5bda769VbM5iokAboiJfPIOnm-r5GTFReE-KQy8L3g,10941
|
|
24
|
-
mineru/cli/gradio_app.py,sha256=
|
|
25
|
-
mineru/cli/models_download.py,sha256=
|
|
24
|
+
mineru/cli/gradio_app.py,sha256=hyhI38y-JahMJgYZiikC3CYUVrtYVjbZb67Q4RUKbw4,14731
|
|
25
|
+
mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
|
|
26
26
|
mineru/cli/vlm_vllm_server.py,sha256=fQJyD-gIPQ41hR_6aIaDJczl66N310t0CiZEBAfX5mc,90
|
|
27
27
|
mineru/data/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
28
28
|
mineru/data/data_reader_writer/__init__.py,sha256=9qnGNrsuGBMwwfsQy6oChdkz--a_LPdYWE0VZZr0yr4,490
|
|
@@ -62,8 +62,7 @@ mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py
|
|
|
62
62
|
mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py,sha256=a9kCvwzJJSRrKQNtW2oOpTwrapzep8BjGFWLhLF1T0k,6036
|
|
63
63
|
mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py,sha256=Q_fdmFHUBtEoAfWp9aowdwTCE2MIFMOPbYjoSyXK2iU,48929
|
|
64
64
|
mineru/model/ocr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
65
|
-
mineru/model/ocr/
|
|
66
|
-
mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py,sha256=wZOw82q1NARNHBW2Lk5zumjdAqzPZqnhV6rvMULvLs8,9207
|
|
65
|
+
mineru/model/ocr/pytorch_paddle.py,sha256=cHMTl5sKyn4BY2207-7GQ4eZl9BQUcs5ucxw_NFezII,9200
|
|
67
66
|
mineru/model/ori_cls/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
68
67
|
mineru/model/ori_cls/paddle_ori_cls.py,sha256=VIS22IerHST7g60AC9r2PEQIG6NQWeQaH1OrXIxNTsg,11943
|
|
69
68
|
mineru/model/reading_order/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
@@ -72,18 +71,18 @@ mineru/model/reading_order/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2u
|
|
|
72
71
|
mineru/model/table/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
73
72
|
mineru/model/table/cls/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
74
73
|
mineru/model/table/cls/paddle_table_cls.py,sha256=5PtieKQnAzgMNRTZFgnqQsGWKTEQ3yyFWQnBRIjfQ4A,5781
|
|
75
|
-
mineru/model/table/rec/RapidTable.py,sha256=
|
|
74
|
+
mineru/model/table/rec/RapidTable.py,sha256=2dNdGJsVdsGfRm6r3deERUMst5RIxH0YuiGALkQbNTw,5955
|
|
76
75
|
mineru/model/table/rec/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
77
76
|
mineru/model/table/rec/slanet_plus/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
78
77
|
mineru/model/table/rec/slanet_plus/main.py,sha256=vfrcvQ9JBf32YZU9eNoetoqdpcrFNsA1WNqQBsG8i2o,7646
|
|
79
78
|
mineru/model/table/rec/slanet_plus/matcher.py,sha256=uwF-wCLaYlaQ3JQ_-YywGVl1XQYnx7G_RTuWLW8JlBk,7321
|
|
80
79
|
mineru/model/table/rec/slanet_plus/matcher_utils.py,sha256=9wt_ydeeViLd57bU6g3lnXXni49qLSra2C6wSFQZkiw,9597
|
|
81
|
-
mineru/model/table/rec/slanet_plus/table_structure.py,sha256=
|
|
80
|
+
mineru/model/table/rec/slanet_plus/table_structure.py,sha256=qt-HPYIQyp0aWG_MmnM_sMQCV8ZLb4rALSueyCohPgM,4085
|
|
82
81
|
mineru/model/table/rec/slanet_plus/table_structure_utils.py,sha256=YYSkwN2WdLx7qkWMSGkPY7yXOH5ENVhg5CsRGhtZ5Wk,19281
|
|
83
82
|
mineru/model/table/rec/unet_table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
84
83
|
mineru/model/table/rec/unet_table/main.py,sha256=J13Q7_6stYyedmVedf9CZD7R0tuguGfTg3Z3ob4GDuM,15565
|
|
85
84
|
mineru/model/table/rec/unet_table/table_recover.py,sha256=rSyeWyuP10M8dLKA5e0n4P2DXMYbVbmgLxEcdZA8_0E,9059
|
|
86
|
-
mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=
|
|
85
|
+
mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=hnmYLzZFRlK0Y4gr874G9GaLahcKnNZYNun869FdmH8,8150
|
|
87
86
|
mineru/model/table/rec/unet_table/utils.py,sha256=CYAqJW0wePJk4NAemb8W203N7E32v0ujiWbxanDhd8I,16083
|
|
88
87
|
mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=zrCdPwI4M8nu0FEfd7lRJAe0z8kYq3KFbzwElM82USE,11174
|
|
89
88
|
mineru/model/table/rec/unet_table/utils_table_recover.py,sha256=XksJsY82ZS0kqUnNT-jvaYzxJ3V3svMSzj0puwIau1k,10651
|
|
@@ -152,8 +151,9 @@ mineru/resources/header.html,sha256=PUselBXLBn8gfeP3zwEtj6zIxfhcCN4vN_B796nQFNQ,
|
|
|
152
151
|
mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
|
153
152
|
mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
154
153
|
mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
|
|
155
|
-
mineru/utils/block_sort.py,sha256=
|
|
154
|
+
mineru/utils/block_sort.py,sha256=5e1mOLB3W7xu5Y1hmhvGSHPL_aQ41R_4VXcP4vjYAOU,12976
|
|
156
155
|
mineru/utils/boxbase.py,sha256=moP660AmZq_udHEsfvFkTQdJ4gjrrBwN7t0Enx7CIL8,6903
|
|
156
|
+
mineru/utils/check_sys_env.py,sha256=1o7Do3k84Hnwvlnmzx8JqkcGJA3UqiGfucMv9sPgPyI,1113
|
|
157
157
|
mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
|
|
158
158
|
mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
|
|
159
159
|
mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
|
|
@@ -163,22 +163,24 @@ mineru/utils/format_utils.py,sha256=2s89vHcSISjuolk8Hvg3K-5-rRbiT3Us7eFLzUKrNKs,
|
|
|
163
163
|
mineru/utils/guess_suffix_or_lang.py,sha256=nznyQpUn1BSA8JNw9HuG3pVV-xtVAtrtcGuHZ-VXt9M,856
|
|
164
164
|
mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
|
|
165
165
|
mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
|
|
166
|
-
mineru/utils/llm_aided.py,sha256=
|
|
166
|
+
mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
|
|
167
167
|
mineru/utils/magic_model_utils.py,sha256=2xOvi4oqg3MSw1FUrJTnYDtWeFrrm6qbmlEorLZSaYs,5650
|
|
168
168
|
mineru/utils/model_utils.py,sha256=6OsgFLsABX5JuShSzCMSNHWV-yi-1cjwHweafyxIgRo,18448
|
|
169
169
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
170
170
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
171
|
+
mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
|
|
171
172
|
mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
|
|
172
|
-
mineru/utils/pdf_image_tools.py,sha256=
|
|
173
|
+
mineru/utils/pdf_image_tools.py,sha256=86_xvsGOEde5QGlKz5uJemjoO1upr6n_K7o3lCdyIjQ,7981
|
|
174
|
+
mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
|
|
173
175
|
mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
|
|
174
176
|
mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
|
|
175
177
|
mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
|
|
176
178
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
177
179
|
mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
|
|
178
180
|
mineru/utils/table_merge.py,sha256=d98zNbM1ZQ8V1kUt6RugParNUNPv7DGL-XKIzR3iJVQ,15360
|
|
179
|
-
mineru-2.6.
|
|
180
|
-
mineru-2.6.
|
|
181
|
-
mineru-2.6.
|
|
182
|
-
mineru-2.6.
|
|
183
|
-
mineru-2.6.
|
|
184
|
-
mineru-2.6.
|
|
181
|
+
mineru-2.6.4.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
182
|
+
mineru-2.6.4.dist-info/METADATA,sha256=igOwr_rwmoJGD4KXKyEBgpESlUr6CZHThNXXE2PQ59U,71241
|
|
183
|
+
mineru-2.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
184
|
+
mineru-2.6.4.dist-info/entry_points.txt,sha256=luXmbhPiZK_tKlRgWuYOaW_V6EFpG-yJcAevVv9MEqE,252
|
|
185
|
+
mineru-2.6.4.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
186
|
+
mineru-2.6.4.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Opendatalab. All rights reserved.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|