mineru 2.6.3__py3-none-any.whl → 2.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/pipeline_analyze.py +3 -0
- mineru/backend/vlm/vlm_analyze.py +2 -2
- mineru/cli/client.py +1 -1
- mineru/cli/common.py +8 -15
- mineru/cli/gradio_app.py +1 -1
- mineru/model/table/rec/slanet_plus/table_structure.py +4 -0
- mineru/model/table/rec/unet_table/table_structure_unet.py +5 -0
- mineru/utils/{check_mac_env.py → check_sys_env.py} +4 -0
- mineru/utils/os_env_config.py +30 -0
- mineru/utils/pdf_image_tools.py +118 -14
- mineru/utils/pdf_page_id.py +10 -0
- mineru/version.py +1 -1
- {mineru-2.6.3.dist-info → mineru-2.6.4.dist-info}/METADATA +5 -1
- {mineru-2.6.3.dist-info → mineru-2.6.4.dist-info}/RECORD +18 -16
- {mineru-2.6.3.dist-info → mineru-2.6.4.dist-info}/WHEEL +0 -0
- {mineru-2.6.3.dist-info → mineru-2.6.4.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.3.dist-info → mineru-2.6.4.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.3.dist-info → mineru-2.6.4.dist-info}/top_level.txt +0 -0
|
@@ -99,7 +99,10 @@ def doc_analyze(
|
|
|
99
99
|
_lang = lang_list[pdf_idx]
|
|
100
100
|
|
|
101
101
|
# 收集每个数据集中的页面
|
|
102
|
+
# load_images_start = time.time()
|
|
102
103
|
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
104
|
+
# load_images_time = round(time.time() - load_images_start, 2)
|
|
105
|
+
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
|
|
103
106
|
all_image_lists.append(images_list)
|
|
104
107
|
all_pdf_docs.append(pdf_doc)
|
|
105
108
|
for page_idx in range(len(images_list)):
|
|
@@ -8,7 +8,7 @@ from .utils import enable_custom_logits_processors, set_default_gpu_memory_utili
|
|
|
8
8
|
from .model_output_to_middle_json import result_to_middle_json
|
|
9
9
|
from ...data.data_reader_writer import DataWriter
|
|
10
10
|
from mineru.utils.pdf_image_tools import load_images_from_pdf
|
|
11
|
-
from ...utils.check_mac_env import is_mac_os_version_supported
|
|
11
|
+
from ...utils.check_sys_env import is_mac_os_version_supported
|
|
12
12
|
from ...utils.config_reader import get_device
|
|
13
13
|
|
|
14
14
|
from ...utils.enum_class import ImageType
|
|
@@ -177,7 +177,7 @@ async def aio_doc_analyze(
|
|
|
177
177
|
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
178
178
|
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
|
179
179
|
# load_images_time = round(time.time() - load_images_start, 2)
|
|
180
|
-
# logger.
|
|
180
|
+
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
|
181
181
|
|
|
182
182
|
# infer_start = time.time()
|
|
183
183
|
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
|
mineru/cli/client.py
CHANGED
|
@@ -4,7 +4,7 @@ import click
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from loguru import logger
|
|
6
6
|
|
|
7
|
-
from mineru.utils.check_mac_env import is_mac_os_version_supported
|
|
7
|
+
from mineru.utils.check_sys_env import is_mac_os_version_supported
|
|
8
8
|
from mineru.utils.cli_parser import arg_parse
|
|
9
9
|
from mineru.utils.config_reader import get_device
|
|
10
10
|
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
mineru/cli/common.py
CHANGED
|
@@ -5,8 +5,8 @@ import os
|
|
|
5
5
|
import copy
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
import pypdfium2 as pdfium
|
|
9
8
|
from loguru import logger
|
|
9
|
+
import pypdfium2 as pdfium
|
|
10
10
|
|
|
11
11
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
12
12
|
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
|
|
@@ -16,10 +16,12 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
|
|
16
16
|
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
17
17
|
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
|
18
18
|
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
|
19
|
+
from mineru.utils.pdf_page_id import get_end_page_id
|
|
19
20
|
|
|
20
21
|
pdf_suffixes = ["pdf"]
|
|
21
22
|
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
|
|
22
23
|
|
|
24
|
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
23
25
|
|
|
24
26
|
def read_fn(path):
|
|
25
27
|
if not isinstance(path, Path):
|
|
@@ -44,18 +46,10 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
|
|
|
44
46
|
|
|
45
47
|
|
|
46
48
|
def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
|
|
49
|
+
pdf = pdfium.PdfDocument(pdf_bytes)
|
|
50
|
+
output_pdf = pdfium.PdfDocument.new()
|
|
47
51
|
try:
|
|
48
|
-
|
|
49
|
-
pdf = pdfium.PdfDocument(pdf_bytes)
|
|
50
|
-
|
|
51
|
-
# 确定结束页
|
|
52
|
-
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
|
|
53
|
-
if end_page_id > len(pdf) - 1:
|
|
54
|
-
logger.warning("end_page_id is out of range, use pdf_docs length")
|
|
55
|
-
end_page_id = len(pdf) - 1
|
|
56
|
-
|
|
57
|
-
# 创建一个新的PDF文档
|
|
58
|
-
output_pdf = pdfium.PdfDocument.new()
|
|
52
|
+
end_page_id = get_end_page_id(end_page_id, len(pdf))
|
|
59
53
|
|
|
60
54
|
# 选择要导入的页面索引
|
|
61
55
|
page_indices = list(range(start_page_id, end_page_id + 1))
|
|
@@ -69,13 +63,12 @@ def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page
|
|
|
69
63
|
|
|
70
64
|
# 获取字节数据
|
|
71
65
|
output_bytes = output_buffer.getvalue()
|
|
72
|
-
|
|
73
|
-
pdf.close() # 关闭原PDF文档以释放资源
|
|
74
|
-
output_pdf.close() # 关闭新PDF文档以释放资源
|
|
75
66
|
except Exception as e:
|
|
76
67
|
logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
|
|
77
68
|
output_bytes = pdf_bytes
|
|
78
69
|
|
|
70
|
+
pdf.close()
|
|
71
|
+
output_pdf.close()
|
|
79
72
|
return output_bytes
|
|
80
73
|
|
|
81
74
|
|
mineru/cli/gradio_app.py
CHANGED
|
@@ -13,7 +13,7 @@ from gradio_pdf import PDF
|
|
|
13
13
|
from loguru import logger
|
|
14
14
|
|
|
15
15
|
from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
|
|
16
|
-
from mineru.utils.check_mac_env import is_mac_os_version_supported
|
|
16
|
+
from mineru.utils.check_sys_env import is_mac_os_version_supported
|
|
17
17
|
from mineru.utils.cli_parser import arg_parse
|
|
18
18
|
from mineru.utils.hash_utils import str_sha256
|
|
19
19
|
|
|
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Tuple
|
|
|
16
16
|
|
|
17
17
|
import numpy as np
|
|
18
18
|
|
|
19
|
+
from mineru.utils.os_env_config import get_op_num_threads
|
|
19
20
|
from .table_structure_utils import (
|
|
20
21
|
OrtInferSession,
|
|
21
22
|
TableLabelDecode,
|
|
@@ -29,6 +30,9 @@ class TableStructurer:
|
|
|
29
30
|
self.preprocess_op = TablePreprocess()
|
|
30
31
|
self.batch_preprocess_op = BatchTablePreprocess()
|
|
31
32
|
|
|
33
|
+
config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
|
|
34
|
+
config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
|
|
35
|
+
|
|
32
36
|
self.session = OrtInferSession(config)
|
|
33
37
|
|
|
34
38
|
self.character = self.session.get_metadata()
|
|
@@ -5,6 +5,8 @@ from typing import Optional, Dict, Any, Tuple
|
|
|
5
5
|
import cv2
|
|
6
6
|
import numpy as np
|
|
7
7
|
from skimage import measure
|
|
8
|
+
|
|
9
|
+
from mineru.utils.os_env_config import get_op_num_threads
|
|
8
10
|
from .utils import OrtInferSession, resize_img
|
|
9
11
|
from .utils_table_line_rec import (
|
|
10
12
|
get_table_line,
|
|
@@ -28,6 +30,9 @@ class TSRUnet:
|
|
|
28
30
|
self.inp_height = 1024
|
|
29
31
|
self.inp_width = 1024
|
|
30
32
|
|
|
33
|
+
config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
|
|
34
|
+
config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
|
|
35
|
+
|
|
31
36
|
self.session = OrtInferSession(config)
|
|
32
37
|
|
|
33
38
|
def __call__(
|
|
@@ -4,6 +4,10 @@ import platform
|
|
|
4
4
|
from packaging import version
|
|
5
5
|
|
|
6
6
|
|
|
7
|
+
def is_windows_environment() -> bool:
    """Report whether the interpreter is running on Windows.

    Returns:
        bool: True when ``platform.system()`` equals ``"Windows"``.
    """
    system_name = platform.system()
    return system_name == "Windows"
|
|
9
|
+
|
|
10
|
+
|
|
7
11
|
# Detect if the current environment is a Mac computer
|
|
8
12
|
def is_mac_environment() -> bool:
|
|
9
13
|
return platform.system() == "Darwin"
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def get_op_num_threads(env_name: str) -> int:
    """Read a thread-count setting from the environment variable ``env_name``.

    Args:
        env_name: Name of the environment variable to look up.

    Returns:
        int: The value produced by ``get_value_from_string`` for the raw
        setting, using -1 as the fallback.
    """
    fallback = -1
    raw_setting = os.getenv(env_name, None)
    return get_value_from_string(raw_setting, fallback)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_load_images_timeout() -> int:
    """Read the PDF render timeout from ``MINERU_PDF_RENDER_TIMEOUT``.

    Returns:
        int: The value produced by ``get_value_from_string`` for the raw
        setting, using 300 as the fallback.
    """
    fallback_seconds = 300
    raw_setting = os.getenv('MINERU_PDF_RENDER_TIMEOUT', None)
    return get_value_from_string(raw_setting, fallback_seconds)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_value_from_string(env_value: "str | None", default_value: int) -> int:
    """Parse a strictly positive integer from ``env_value``.

    Args:
        env_value: Raw text to parse (callers pass the result of
            ``os.getenv``), or None when the setting is absent.
        default_value: Value returned whenever no positive integer can be
            obtained from ``env_value``.

    Returns:
        int: ``int(env_value)`` when it is greater than zero; otherwise
        ``default_value`` (input is None, not an integer, zero or negative).
    """
    if env_value is None:
        return default_value
    try:
        parsed = int(env_value)
    except ValueError:
        return default_value
    # Zero and negative numbers are treated the same as "not configured".
    return parsed if parsed > 0 else default_value
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
if __name__ == '__main__':
    # Manual smoke check: a valid value, zero, a negative value and
    # non-numeric text, followed by the timeout resolved from the environment.
    for sample in ('1', '0', '-1', 'abc'):
        print(get_value_from_string(sample, -1))
    print(get_load_images_timeout())
|
mineru/utils/pdf_image_tools.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
import os
|
|
2
3
|
from io import BytesIO
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
@@ -7,9 +8,14 @@ from loguru import logger
|
|
|
7
8
|
from PIL import Image
|
|
8
9
|
|
|
9
10
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
11
|
+
from mineru.utils.check_sys_env import is_windows_environment
|
|
12
|
+
from mineru.utils.os_env_config import get_load_images_timeout
|
|
10
13
|
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
|
11
|
-
from .enum_class import ImageType
|
|
12
|
-
from .hash_utils import str_sha256
|
|
14
|
+
from mineru.utils.enum_class import ImageType
|
|
15
|
+
from mineru.utils.hash_utils import str_sha256
|
|
16
|
+
from mineru.utils.pdf_page_id import get_end_page_id
|
|
17
|
+
|
|
18
|
+
from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
|
|
13
19
|
|
|
14
20
|
|
|
15
21
|
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
|
|
@@ -35,7 +41,106 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
|
|
|
35
41
|
return image_dict
|
|
36
42
|
|
|
37
43
|
|
|
44
|
+
def _load_images_from_pdf_worker(pdf_bytes, dpi, start_page_id, end_page_id, image_type):
    """Wrapper submitted to the process pool; forwards to ``load_images_from_pdf_core``."""
    page_images = load_images_from_pdf_core(pdf_bytes, dpi, start_page_id, end_page_id, image_type)
    return page_images
|
|
47
|
+
|
|
48
|
+
|
|
38
49
|
def load_images_from_pdf(
    pdf_bytes: bytes,
    dpi=200,
    start_page_id=0,
    end_page_id=None,
    image_type=ImageType.PIL,
    timeout=None,
    threads=4,
):
    """Render a page range of a PDF to images, with a timeout and worker processes.

    Args:
        pdf_bytes (bytes): Raw bytes of the PDF file.
        dpi (int, optional): Rendering resolution. Defaults to 200.
        start_page_id (int, optional): Index of the first page to render.
            Defaults to 0.
        end_page_id (int | None, optional): Index of the last page to render
            (inclusive). Normalized by ``get_end_page_id``: None or a negative
            value selects the last page, and values past the end are clamped.
            Defaults to None.
        image_type (ImageType, optional): Image representation to produce.
            Defaults to ImageType.PIL.
        timeout (int | None, optional): Overall time budget in seconds for
            collecting the rendered pages. When None, the value is taken from
            the environment variable MINERU_PDF_RENDER_TIMEOUT (300 when it is
            unset or invalid). Not applied on Windows, where rendering runs
            in the current process.
        threads (int): Upper bound on the number of worker processes.
            Defaults to 4.

    Returns:
        tuple: ``(images_list, pdf_doc)`` where ``images_list`` holds the page
        images in page order and ``pdf_doc`` is the ``pdfium.PdfDocument``
        opened from ``pdf_bytes``; it is returned open on success.

    Raises:
        TimeoutError: When rendering does not finish within ``timeout`` seconds.
    """
    import time  # local import: only needed for the deadline bookkeeping below

    pdf_doc = pdfium.PdfDocument(pdf_bytes)
    try:
        end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
        total_pages = end_page_id - start_page_id + 1

        # An empty selection (empty document or start past the end) has
        # nothing to render; bail out before dividing by a zero worker count.
        if total_pages <= 0:
            return [], pdf_doc

        if is_windows_environment():
            # Multiprocessing is not used on Windows.
            images_list = load_images_from_pdf_core(
                pdf_bytes, dpi, start_page_id, end_page_id, image_type
            )
            return images_list, pdf_doc

        if timeout is None:
            timeout = get_load_images_timeout()

        # Never more workers than pages, and always at least one worker even
        # if the caller passes threads <= 0.
        actual_threads = max(1, min(os.cpu_count() or 1, threads, total_pages))

        # Split the pages into contiguous, ascending ranges whose sizes differ
        # by at most one page so the workers carry a similar load.
        base_size, extra_pages = divmod(total_pages, actual_threads)
        page_ranges = []
        range_start = start_page_id
        for worker_idx in range(actual_threads):
            range_size = base_size + (1 if worker_idx < extra_pages else 0)
            page_ranges.append((range_start, range_start + range_size - 1))
            range_start += range_size

        # logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")

        with ProcessPoolExecutor(max_workers=actual_threads) as executor:
            futures = [
                executor.submit(
                    _load_images_from_pdf_worker,
                    pdf_bytes,
                    dpi,
                    first_page,
                    last_page,
                    image_type,
                )
                for first_page, last_page in page_ranges
            ]

            # One shared deadline, so the total wait cannot grow to
            # (number of ranges) x timeout.
            deadline = time.monotonic() + timeout
            try:
                images_list = []
                # Futures were submitted in ascending page order, so reading
                # them in that order yields the pages already sorted.
                for future in futures:
                    remaining = max(0.0, deadline - time.monotonic())
                    images_list.extend(future.result(timeout=remaining))
            except FuturesTimeoutError:
                executor.shutdown(wait=False, cancel_futures=True)
                raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")

        return images_list, pdf_doc
    except Exception:
        # Do not leak the opened document when rendering fails or times out.
        pdf_doc.close()
        raise
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def load_images_from_pdf_core(
|
|
39
144
|
pdf_bytes: bytes,
|
|
40
145
|
dpi=200,
|
|
41
146
|
start_page_id=0,
|
|
@@ -45,18 +150,17 @@ def load_images_from_pdf(
|
|
|
45
150
|
images_list = []
|
|
46
151
|
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
|
47
152
|
pdf_page_num = len(pdf_doc)
|
|
48
|
-
end_page_id = end_page_id
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
return images_list, pdf_doc
|
|
153
|
+
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
|
|
154
|
+
|
|
155
|
+
for index in range(start_page_id, end_page_id + 1):
|
|
156
|
+
# logger.debug(f"Converting page {index}/{pdf_page_num} to image")
|
|
157
|
+
page = pdf_doc[index]
|
|
158
|
+
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
|
|
159
|
+
images_list.append(image_dict)
|
|
160
|
+
|
|
161
|
+
pdf_doc.close()
|
|
162
|
+
|
|
163
|
+
return images_list
|
|
60
164
|
|
|
61
165
|
|
|
62
166
|
def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
from loguru import logger
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_end_page_id(end_page_id, pdf_page_num):
    """Resolve the inclusive index of the last page to process.

    Args:
        end_page_id: Requested last page index; None or a negative value
            means "up to the final page".
        pdf_page_num: Number of pages available.

    Returns:
        The requested index when it lies within the document, otherwise
        ``pdf_page_num - 1`` (a warning is logged when the request was too large).
    """
    last_index = pdf_page_num - 1
    if end_page_id is None or end_page_id < 0:
        return last_index
    if end_page_id > last_index:
        logger.warning("end_page_id is out of range, use images length")
        return last_index
    return end_page_id
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.6.3"
|
|
1
|
+
__version__ = "2.6.4"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.6.3
|
|
3
|
+
Version: 2.6.4
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -130,6 +130,10 @@ Dynamic: license-file
|
|
|
130
130
|
</div>
|
|
131
131
|
|
|
132
132
|
# Changelog
|
|
133
|
+
- 2025/11/04 2.6.4 Release
|
|
134
|
+
- Added timeout configuration for PDF image rendering, default is 300 seconds, can be configured via environment variable `MINERU_PDF_RENDER_TIMEOUT` to prevent long blocking of the rendering process caused by some abnormal PDF files.
|
|
135
|
+
- Added CPU thread count configuration options for ONNX models, default is the system CPU core count, can be configured via environment variables `MINERU_INTRA_OP_NUM_THREADS` and `MINERU_INTER_OP_NUM_THREADS` to reduce CPU resource contention conflicts in high concurrency scenarios.
|
|
136
|
+
|
|
133
137
|
- 2025/10/31 2.6.3 Release
|
|
134
138
|
- Added support for a new backend `vlm-mlx-engine`, enabling MLX-accelerated inference for the MinerU2.5 model on Apple Silicon devices. Compared to the `vlm-transformers` backend, `vlm-mlx-engine` delivers a 100%–200% speed improvement.
|
|
135
139
|
- Bug fixes: #3849, #3859
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=ODIwI6SfzWmx_FdtwCfr6k5TmpNuA5JdvGyV-9G9YrM,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
@@ -8,20 +8,20 @@ mineru/backend/pipeline/model_init.py,sha256=OAylOcQD9gu5TBcX7nMt7X5NpJMtQICI5Iv
|
|
|
8
8
|
mineru/backend/pipeline/model_json_to_middle_json.py,sha256=reXkUR_wKmJD64d7vRNXMxFviwkzDlGjRshpdwsVquI,10951
|
|
9
9
|
mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
|
|
10
10
|
mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
|
|
11
|
-
mineru/backend/pipeline/pipeline_analyze.py,sha256=
|
|
11
|
+
mineru/backend/pipeline/pipeline_analyze.py,sha256=O_HGifodg03VZbmTve-U6Cmo0T03AmuK86t1v1J9X-Q,6897
|
|
12
12
|
mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
|
|
13
13
|
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=YlnEbbUnkniZXS13aLo5mjfFQvQM5SrIVvTAGBZsLmw,14478
|
|
14
14
|
mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
15
15
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
16
16
|
mineru/backend/vlm/utils.py,sha256=woGqyRI4S7p69daLCU07XNXWTV27aLf7YBjjVH1x-5o,2794
|
|
17
|
-
mineru/backend/vlm/vlm_analyze.py,sha256=
|
|
17
|
+
mineru/backend/vlm/vlm_analyze.py,sha256=EQKNtc12pQ6so5NuUE-ppUtWI1QH_CQnsx1QfHdzAwA,8790
|
|
18
18
|
mineru/backend/vlm/vlm_magic_model.py,sha256=Pd0sOr7G1crAJIVeq6h_03gNSuxmV5U8dvGTGT_rrjs,23452
|
|
19
19
|
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=5V-AU9KkxxMn0DDSQBrb15I4GVpEyiQy8uNI_tQhS6M,13498
|
|
20
20
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
21
|
-
mineru/cli/client.py,sha256=
|
|
22
|
-
mineru/cli/common.py,sha256=
|
|
21
|
+
mineru/cli/client.py,sha256=ul2Twu-MWT2pCPrtvWbhIwWnoR6aurHJ3KhFOmElP90,6915
|
|
22
|
+
mineru/cli/common.py,sha256=3kd6sF6BlnBNL_UeMjXKJ11fGQA4Y9lOckznWNiIWY8,13988
|
|
23
23
|
mineru/cli/fast_api.py,sha256=t5bda769VbM5iokAboiJfPIOnm-r5GTFReE-KQy8L3g,10941
|
|
24
|
-
mineru/cli/gradio_app.py,sha256=
|
|
24
|
+
mineru/cli/gradio_app.py,sha256=hyhI38y-JahMJgYZiikC3CYUVrtYVjbZb67Q4RUKbw4,14731
|
|
25
25
|
mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
|
|
26
26
|
mineru/cli/vlm_vllm_server.py,sha256=fQJyD-gIPQ41hR_6aIaDJczl66N310t0CiZEBAfX5mc,90
|
|
27
27
|
mineru/data/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
@@ -77,12 +77,12 @@ mineru/model/table/rec/slanet_plus/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
|
|
|
77
77
|
mineru/model/table/rec/slanet_plus/main.py,sha256=vfrcvQ9JBf32YZU9eNoetoqdpcrFNsA1WNqQBsG8i2o,7646
|
|
78
78
|
mineru/model/table/rec/slanet_plus/matcher.py,sha256=uwF-wCLaYlaQ3JQ_-YywGVl1XQYnx7G_RTuWLW8JlBk,7321
|
|
79
79
|
mineru/model/table/rec/slanet_plus/matcher_utils.py,sha256=9wt_ydeeViLd57bU6g3lnXXni49qLSra2C6wSFQZkiw,9597
|
|
80
|
-
mineru/model/table/rec/slanet_plus/table_structure.py,sha256=
|
|
80
|
+
mineru/model/table/rec/slanet_plus/table_structure.py,sha256=qt-HPYIQyp0aWG_MmnM_sMQCV8ZLb4rALSueyCohPgM,4085
|
|
81
81
|
mineru/model/table/rec/slanet_plus/table_structure_utils.py,sha256=YYSkwN2WdLx7qkWMSGkPY7yXOH5ENVhg5CsRGhtZ5Wk,19281
|
|
82
82
|
mineru/model/table/rec/unet_table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
83
83
|
mineru/model/table/rec/unet_table/main.py,sha256=J13Q7_6stYyedmVedf9CZD7R0tuguGfTg3Z3ob4GDuM,15565
|
|
84
84
|
mineru/model/table/rec/unet_table/table_recover.py,sha256=rSyeWyuP10M8dLKA5e0n4P2DXMYbVbmgLxEcdZA8_0E,9059
|
|
85
|
-
mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=
|
|
85
|
+
mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=hnmYLzZFRlK0Y4gr874G9GaLahcKnNZYNun869FdmH8,8150
|
|
86
86
|
mineru/model/table/rec/unet_table/utils.py,sha256=CYAqJW0wePJk4NAemb8W203N7E32v0ujiWbxanDhd8I,16083
|
|
87
87
|
mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=zrCdPwI4M8nu0FEfd7lRJAe0z8kYq3KFbzwElM82USE,11174
|
|
88
88
|
mineru/model/table/rec/unet_table/utils_table_recover.py,sha256=XksJsY82ZS0kqUnNT-jvaYzxJ3V3svMSzj0puwIau1k,10651
|
|
@@ -153,7 +153,7 @@ mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
|
153
153
|
mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
|
|
154
154
|
mineru/utils/block_sort.py,sha256=5e1mOLB3W7xu5Y1hmhvGSHPL_aQ41R_4VXcP4vjYAOU,12976
|
|
155
155
|
mineru/utils/boxbase.py,sha256=moP660AmZq_udHEsfvFkTQdJ4gjrrBwN7t0Enx7CIL8,6903
|
|
156
|
-
mineru/utils/
|
|
156
|
+
mineru/utils/check_sys_env.py,sha256=1o7Do3k84Hnwvlnmzx8JqkcGJA3UqiGfucMv9sPgPyI,1113
|
|
157
157
|
mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
|
|
158
158
|
mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
|
|
159
159
|
mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
|
|
@@ -168,17 +168,19 @@ mineru/utils/magic_model_utils.py,sha256=2xOvi4oqg3MSw1FUrJTnYDtWeFrrm6qbmlEorLZ
|
|
|
168
168
|
mineru/utils/model_utils.py,sha256=6OsgFLsABX5JuShSzCMSNHWV-yi-1cjwHweafyxIgRo,18448
|
|
169
169
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
170
170
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
171
|
+
mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
|
|
171
172
|
mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
|
|
172
|
-
mineru/utils/pdf_image_tools.py,sha256=
|
|
173
|
+
mineru/utils/pdf_image_tools.py,sha256=86_xvsGOEde5QGlKz5uJemjoO1upr6n_K7o3lCdyIjQ,7981
|
|
174
|
+
mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
|
|
173
175
|
mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
|
|
174
176
|
mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
|
|
175
177
|
mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
|
|
176
178
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
177
179
|
mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
|
|
178
180
|
mineru/utils/table_merge.py,sha256=d98zNbM1ZQ8V1kUt6RugParNUNPv7DGL-XKIzR3iJVQ,15360
|
|
179
|
-
mineru-2.6.
|
|
180
|
-
mineru-2.6.
|
|
181
|
-
mineru-2.6.
|
|
182
|
-
mineru-2.6.
|
|
183
|
-
mineru-2.6.
|
|
184
|
-
mineru-2.6.
|
|
181
|
+
mineru-2.6.4.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
182
|
+
mineru-2.6.4.dist-info/METADATA,sha256=igOwr_rwmoJGD4KXKyEBgpESlUr6CZHThNXXE2PQ59U,71241
|
|
183
|
+
mineru-2.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
184
|
+
mineru-2.6.4.dist-info/entry_points.txt,sha256=luXmbhPiZK_tKlRgWuYOaW_V6EFpG-yJcAevVv9MEqE,252
|
|
185
|
+
mineru-2.6.4.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
186
|
+
mineru-2.6.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|