magic-pdf 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/batch_build_dataset.py +156 -0
- magic_pdf/data/dataset.py +56 -25
- magic_pdf/data/utils.py +108 -9
- magic_pdf/dict2md/ocr_mkcontent.py +4 -3
- magic_pdf/libs/pdf_image_tools.py +11 -6
- magic_pdf/libs/performance_stats.py +12 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +175 -201
- magic_pdf/model/doc_analyze_by_custom_model.py +142 -92
- magic_pdf/model/pdf_extract_kit.py +5 -38
- magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
- magic_pdf/model/sub_modules/model_init.py +50 -37
- magic_pdf/model/sub_modules/model_utils.py +18 -12
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +15 -19
- magic_pdf/pdf_parse_union_core_v2.py +112 -74
- magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
- magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
- magic_pdf/resources/model_config/model_configs.yaml +1 -1
- magic_pdf/resources/slanet_plus/slanet-plus.onnx +0 -0
- magic_pdf/tools/cli.py +30 -12
- magic_pdf/tools/common.py +90 -12
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/METADATA +92 -59
- magic_pdf-1.3.1.dist-info/RECORD +203 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/WHEEL +1 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
- magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
- magic_pdf-1.2.2.dist-info/RECORD +0 -147
- /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
- /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
- /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
import concurrent.futures
|
2
|
+
|
3
|
+
import fitz
|
4
|
+
|
5
|
+
from magic_pdf.data.dataset import PymuDocDataset
|
6
|
+
from magic_pdf.data.utils import fitz_doc_to_image # PyMuPDF
|
7
|
+
|
8
|
+
|
9
|
+
def partition_array_greedy(arr, k):
    """Split weighted items into at most ``k`` groups of similar total weight.

    Items are visited from heaviest to lightest and each one is placed in
    the group whose running total is currently the smallest (the first such
    group on ties).

    Parameters:
    -----------
    arr : list
        Sequence of items whose element at position 1 is the numeric weight
        (the caller in this module passes ``(pdf_path, num_pages)`` tuples).
    k : int
        Requested number of groups; clamped to ``len(arr)`` when larger.

    Returns:
    --------
    partitions : list of lists
        Groups of *indices into ``arr``* (not the items themselves).

    Raises:
    -------
    ValueError
        If ``k`` is zero or negative.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer')

    n_items = len(arr)
    # Never create more groups than there are items.
    k = min(k, n_items)

    # Trivial layouts: everything in one group, or one item per group.
    if k == 1:
        return [list(range(n_items))]
    if k == n_items:
        return [[pos] for pos in range(n_items)]

    weights = [item[1] for item in arr]

    # Heaviest items first; the sort is stable, so equal weights keep
    # their original relative order.
    heaviest_first = sorted(range(n_items), key=weights.__getitem__, reverse=True)

    buckets = [[] for _ in range(k)]
    loads = [0] * k

    for item_idx in heaviest_first:
        # min() returns the first group with the smallest load.
        lightest = min(range(k), key=loads.__getitem__)
        buckets[lightest].append(item_idx)
        loads[lightest] += weights[item_idx]

    return buckets
|
51
|
+
|
52
|
+
|
53
|
+
def process_pdf_batch(pdf_jobs, idx):
    """Render every page of each PDF in a batch, one PDF after another.

    Parameters:
    -----------
    pdf_jobs : list of tuples
        List of ``(pdf_path, num_pages)`` tuples. Only the path is used;
        the page count is re-read from the opened document.
    idx : int
        Identifier of this batch, echoed back unchanged so the caller can
        match results to the partition they came from.

    Returns:
    --------
    (idx, images) : tuple
        ``images`` holds one list per PDF (in ``pdf_jobs`` order), each
        containing the ``fitz_doc_to_image`` result for every page in
        page order.
    """
    images = []

    for pdf_path, _ in pdf_jobs:
        # Context manager closes the document even if rendering fails,
        # so file handles are not leaked in long-lived workers.
        with fitz.open(pdf_path) as doc:
            images.append([fitz_doc_to_image(page) for page in doc])
    return (idx, images)
|
82
|
+
|
83
|
+
|
84
|
+
def batch_build_dataset(pdf_paths, k, lang=None):
    """Build one dataset per PDF, pre-rendering page images in ``k`` processes.

    The PDFs are grouped into at most ``k`` partitions balanced by page
    count, each partition is rendered in its own worker process, and the
    rendered images are attached to a ``PymuDocDataset`` per PDF.

    Parameters:
    -----------
    pdf_paths : list
        List of paths to PDF files
    k : int
        Number of partitions / worker processes to use. Must be positive
        (``partition_array_greedy`` raises ``ValueError`` otherwise).
    lang : str or None
        Forwarded unchanged to ``PymuDocDataset``.

    Returns:
    --------
    results : list
        One entry per element of ``pdf_paths``, in the same order. An
        entry is ``None`` when the corresponding PDF could not be opened.
    """
    # Collect page counts; remember where each readable PDF sits in
    # ``pdf_paths`` so results stay aligned even when some files fail.
    pdf_info = []
    source_positions = []

    for position, pdf_path in enumerate(pdf_paths):
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = len(doc)
        except Exception as e:
            print(f'Error opening {pdf_path}: {e}')
            continue
        pdf_info.append((pdf_path, num_pages))
        source_positions.append(position)

    # Partition the PDFs so each worker gets a similar number of pages.
    partitions = partition_array_greedy(pdf_info, k)

    # Rendered images keyed by partition number.
    all_images_h = {}

    with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
        # Submit one task per partition.
        futures = [
            executor.submit(
                process_pdf_batch,
                [pdf_info[idx] for idx in partition],
                sn,
            )
            for sn, partition in enumerate(partitions)
        ]
        # Collect results as they complete.
        for future in concurrent.futures.as_completed(futures):
            try:
                idx, images = future.result()
                all_images_h[idx] = images
            except Exception as e:
                print(f'Error processing partition: {e}')

    results = [None] * len(pdf_paths)
    for sn, partition in enumerate(partitions):
        # None when the worker for this partition failed (already logged).
        partition_images = all_images_h.get(sn)
        for j, info_idx in enumerate(partition):
            with open(pdf_info[info_idx][0], 'rb') as f:
                pdf_bytes = f.read()
            dataset = PymuDocDataset(pdf_bytes, lang=lang)
            if partition_images is not None:
                dataset.set_images(partition_images[j])
            # NOTE(review): without pre-rendered images the dataset is
            # presumably rendered lazily on first page access - confirm
            # against Doc.get_image.
            results[source_positions[info_idx]] = dataset
    return results
|
magic_pdf/data/dataset.py
CHANGED
@@ -97,10 +97,10 @@ class Dataset(ABC):
|
|
97
97
|
|
98
98
|
@abstractmethod
|
99
99
|
def dump_to_file(self, file_path: str):
|
100
|
-
"""Dump the file
|
100
|
+
"""Dump the file.
|
101
101
|
|
102
|
-
Args:
|
103
|
-
file_path (str): the file path
|
102
|
+
Args:
|
103
|
+
file_path (str): the file path
|
104
104
|
"""
|
105
105
|
pass
|
106
106
|
|
@@ -119,7 +119,7 @@ class Dataset(ABC):
|
|
119
119
|
|
120
120
|
@abstractmethod
|
121
121
|
def classify(self) -> SupportedPdfParseMethod:
|
122
|
-
"""classify the dataset
|
122
|
+
"""classify the dataset.
|
123
123
|
|
124
124
|
Returns:
|
125
125
|
SupportedPdfParseMethod: _description_
|
@@ -128,8 +128,7 @@ class Dataset(ABC):
|
|
128
128
|
|
129
129
|
@abstractmethod
|
130
130
|
def clone(self):
|
131
|
-
"""clone this dataset
|
132
|
-
"""
|
131
|
+
"""clone this dataset."""
|
133
132
|
pass
|
134
133
|
|
135
134
|
|
@@ -144,16 +143,19 @@ class PymuDocDataset(Dataset):
|
|
144
143
|
self._records = [Doc(v) for v in self._raw_fitz]
|
145
144
|
self._data_bits = bits
|
146
145
|
self._raw_data = bits
|
146
|
+
self._classify_result = None
|
147
147
|
|
148
148
|
if lang == '':
|
149
149
|
self._lang = None
|
150
150
|
elif lang == 'auto':
|
151
|
-
from magic_pdf.model.sub_modules.language_detection.utils import
|
151
|
+
from magic_pdf.model.sub_modules.language_detection.utils import \
|
152
|
+
auto_detect_lang
|
152
153
|
self._lang = auto_detect_lang(bits)
|
153
|
-
logger.info(f
|
154
|
+
logger.info(f'lang: {lang}, detect_lang: {self._lang}')
|
154
155
|
else:
|
155
156
|
self._lang = lang
|
156
|
-
logger.info(f
|
157
|
+
logger.info(f'lang: {lang}')
|
158
|
+
|
157
159
|
def __len__(self) -> int:
|
158
160
|
"""The page number of the pdf."""
|
159
161
|
return len(self._records)
|
@@ -186,12 +188,12 @@ class PymuDocDataset(Dataset):
|
|
186
188
|
return self._records[page_id]
|
187
189
|
|
188
190
|
def dump_to_file(self, file_path: str):
|
189
|
-
"""Dump the file
|
191
|
+
"""Dump the file.
|
190
192
|
|
191
|
-
Args:
|
192
|
-
file_path (str): the file path
|
193
|
+
Args:
|
194
|
+
file_path (str): the file path
|
193
195
|
"""
|
194
|
-
|
196
|
+
|
195
197
|
dir_name = os.path.dirname(file_path)
|
196
198
|
if dir_name not in ('', '.', '..'):
|
197
199
|
os.makedirs(dir_name, exist_ok=True)
|
@@ -212,21 +214,25 @@ class PymuDocDataset(Dataset):
|
|
212
214
|
return proc(self, *args, **kwargs)
|
213
215
|
|
214
216
|
def classify(self) -> SupportedPdfParseMethod:
|
215
|
-
"""classify the dataset
|
217
|
+
"""classify the dataset.
|
216
218
|
|
217
219
|
Returns:
|
218
220
|
SupportedPdfParseMethod: _description_
|
219
221
|
"""
|
220
|
-
|
222
|
+
if self._classify_result is None:
|
223
|
+
self._classify_result = classify(self._data_bits)
|
224
|
+
return self._classify_result
|
221
225
|
|
222
226
|
def clone(self):
|
223
|
-
"""clone this dataset
|
224
|
-
"""
|
227
|
+
"""clone this dataset."""
|
225
228
|
return PymuDocDataset(self._raw_data)
|
226
229
|
|
230
|
+
def set_images(self, images):
    """Attach a pre-rendered image to every page record.

    Args:
        images: indexable collection holding one image per page, in page
            order; ``images[i]`` is handed to the i-th record.
    """
    for page_no, record in enumerate(self._records):
        record.set_image(images[page_no])
|
227
233
|
|
228
234
|
class ImageDataset(Dataset):
|
229
|
-
def __init__(self, bits: bytes):
|
235
|
+
def __init__(self, bits: bytes, lang=None):
|
230
236
|
"""Initialize the dataset, which wraps the pymudoc documents.
|
231
237
|
|
232
238
|
Args:
|
@@ -238,6 +244,17 @@ class ImageDataset(Dataset):
|
|
238
244
|
self._raw_data = bits
|
239
245
|
self._data_bits = pdf_bytes
|
240
246
|
|
247
|
+
if lang == '':
|
248
|
+
self._lang = None
|
249
|
+
elif lang == 'auto':
|
250
|
+
from magic_pdf.model.sub_modules.language_detection.utils import \
|
251
|
+
auto_detect_lang
|
252
|
+
self._lang = auto_detect_lang(bits)
|
253
|
+
logger.info(f'lang: {lang}, detect_lang: {self._lang}')
|
254
|
+
else:
|
255
|
+
self._lang = lang
|
256
|
+
logger.info(f'lang: {lang}')
|
257
|
+
|
241
258
|
def __len__(self) -> int:
|
242
259
|
"""The length of the dataset."""
|
243
260
|
return len(self._records)
|
@@ -270,10 +287,10 @@ class ImageDataset(Dataset):
|
|
270
287
|
return self._records[page_id]
|
271
288
|
|
272
289
|
def dump_to_file(self, file_path: str):
|
273
|
-
"""Dump the file
|
290
|
+
"""Dump the file.
|
274
291
|
|
275
|
-
Args:
|
276
|
-
file_path (str): the file path
|
292
|
+
Args:
|
293
|
+
file_path (str): the file path
|
277
294
|
"""
|
278
295
|
dir_name = os.path.dirname(file_path)
|
279
296
|
if dir_name not in ('', '.', '..'):
|
@@ -293,7 +310,7 @@ class ImageDataset(Dataset):
|
|
293
310
|
return proc(self, *args, **kwargs)
|
294
311
|
|
295
312
|
def classify(self) -> SupportedPdfParseMethod:
|
296
|
-
"""classify the dataset
|
313
|
+
"""classify the dataset.
|
297
314
|
|
298
315
|
Returns:
|
299
316
|
SupportedPdfParseMethod: _description_
|
@@ -301,15 +318,19 @@ class ImageDataset(Dataset):
|
|
301
318
|
return SupportedPdfParseMethod.OCR
|
302
319
|
|
303
320
|
def clone(self):
|
304
|
-
"""clone this dataset
|
305
|
-
"""
|
321
|
+
"""clone this dataset."""
|
306
322
|
return ImageDataset(self._raw_data)
|
307
323
|
|
324
|
+
def set_images(self, images):
    """Attach a pre-rendered image to every page record.

    Args:
        images: indexable collection holding one image per page, in page
            order; ``images[i]`` is handed to the i-th record.
    """
    for page_no, record in enumerate(self._records):
        record.set_image(images[page_no])
|
327
|
+
|
308
328
|
class Doc(PageableData):
|
309
329
|
"""Initialized with pymudoc object."""
|
310
330
|
|
311
331
|
def __init__(self, doc: fitz.Page):
|
312
332
|
self._doc = doc
|
333
|
+
self._img = None
|
313
334
|
|
314
335
|
def get_image(self):
|
315
336
|
"""Return the image info.
|
@@ -321,7 +342,17 @@ class Doc(PageableData):
|
|
321
342
|
height: int
|
322
343
|
}
|
323
344
|
"""
|
324
|
-
|
345
|
+
if self._img is None:
|
346
|
+
self._img = fitz_doc_to_image(self._doc)
|
347
|
+
return self._img
|
348
|
+
|
349
|
+
def set_image(self, img):
    """Store a pre-rendered image for this page, unless one is already cached.

    Args:
        img: the image to cache. NOTE(review): callers in this package
            appear to pass the dict produced by ``fitz_doc_to_image``
            rather than a bare array - confirm.
    """
    if self._img is not None:
        # An image is already cached; keep the first one.
        return
    self._img = img
|
325
356
|
|
326
357
|
def get_doc(self) -> fitz.Page:
|
327
358
|
"""Get the pymudoc object.
|
magic_pdf/data/utils.py
CHANGED
@@ -1,12 +1,15 @@
|
|
1
1
|
|
2
|
+
import multiprocessing as mp
|
3
|
+
import threading
|
4
|
+
from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
|
5
|
+
as_completed)
|
6
|
+
|
2
7
|
import fitz
|
3
8
|
import numpy as np
|
4
9
|
from loguru import logger
|
5
10
|
|
6
|
-
from magic_pdf.utils.annotations import ImportPIL
|
7
11
|
|
8
12
|
|
9
|
-
@ImportPIL
|
10
13
|
def fitz_doc_to_image(doc, dpi=200) -> dict:
|
11
14
|
"""Convert fitz.Document to image, Then convert the image to numpy array.
|
12
15
|
|
@@ -17,7 +20,6 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
|
17
20
|
Returns:
|
18
21
|
dict: {'img': numpy array, 'width': width, 'height': height }
|
19
22
|
"""
|
20
|
-
from PIL import Image
|
21
23
|
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
22
24
|
pm = doc.get_pixmap(matrix=mat, alpha=False)
|
23
25
|
|
@@ -25,16 +27,14 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
|
25
27
|
if pm.width > 4500 or pm.height > 4500:
|
26
28
|
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
27
29
|
|
28
|
-
|
29
|
-
img = np.
|
30
|
+
# Convert pixmap samples directly to numpy array
|
31
|
+
img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
|
30
32
|
|
31
33
|
img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
|
32
34
|
|
33
35
|
return img_dict
|
34
36
|
|
35
|
-
@ImportPIL
|
36
37
|
def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
|
37
|
-
from PIL import Image
|
38
38
|
images = []
|
39
39
|
with fitz.open('pdf', pdf_bytes) as doc:
|
40
40
|
pdf_page_num = doc.page_count
|
@@ -57,11 +57,110 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
|
|
57
57
|
if pm.width > 4500 or pm.height > 4500:
|
58
58
|
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
59
59
|
|
60
|
-
|
61
|
-
img = np.
|
60
|
+
# Convert pixmap samples directly to numpy array
|
61
|
+
img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
|
62
|
+
|
62
63
|
img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
|
63
64
|
else:
|
64
65
|
img_dict = {'img': [], 'width': 0, 'height': 0}
|
65
66
|
|
66
67
|
images.append(img_dict)
|
67
68
|
return images
|
69
|
+
|
70
|
+
|
71
|
+
def convert_page(bytes_page):
    """Render the first page of an in-memory PDF to an image.

    Args:
        bytes_page (bytes): raw bytes of a PDF document; only its first
            page is rendered.

    Returns:
        The result of ``fitz_doc_to_image`` for that page.
    """
    # Close the document once rendering is done instead of leaving the
    # handle open for the lifetime of the worker process.
    with fitz.open('pdf', bytes_page) as pdfs:
        return fitz_doc_to_image(pdfs[0])
|
75
|
+
|
76
|
+
def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
    """Render serialized single-page PDFs in a pool of worker processes.

    Args:
        pages: iterable of PDF byte strings, each passed to ``convert_page``.
        num_workers (int or None): pool size; defaults to the CPU count.
        **kwargs: accepted but not used by this function.

    Returns:
        list: ``convert_page`` results in the same order as ``pages``.
    """
    worker_count = mp.cpu_count() if num_workers is None else num_workers

    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        # Materialize inside the ``with`` so every page is converted
        # before the pool shuts down.
        rendered = list(pool.map(convert_page, pages))

    return rendered
|
90
|
+
|
91
|
+
|
92
|
+
def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
    """Process all pages of a PDF using multiple threads.

    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    num_threads : int
        Number of threads to use
    **kwargs :
        Additional arguments for fitz_doc_to_image

    Returns:
    --------
    images : list
        List of processed images, in page order. The entry for a page
        whose rendering raised is ``None`` (the error is printed).
    """
    # NOTE(review): PyMuPDF's documentation warns against using the
    # library from multiple threads - confirm this is safe before relying
    # on it outside benchmarking.
    # The context manager closes the document even if submission fails.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)

        # Pre-sized so results can be filled in out of completion order.
        results = [None] * num_pages

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Map each future back to the page it renders.
            futures = {
                executor.submit(fitz_doc_to_image, doc[page_num], **kwargs): page_num
                for page_num in range(num_pages)
            }
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    results[page_num] = future.result()
                except Exception as e:
                    # Slot stays None for the failed page.
                    print(f'Error processing page {page_num}: {e}')

    return results
|
135
|
+
|
136
|
+
if __name__ == '__main__':
    # Ad-hoc benchmark driver: expects a sample PDF at a fixed local path.
    pdf = fitz.open('/tmp/[MS-DOC].pdf')

    # Split the source into one single-page document per page so each page
    # can be serialized to bytes and shipped to a worker process.
    pdf_page = []
    for i in range(pdf.page_count):
        single_page_doc = fitz.open()
        single_page_doc.insert_pdf(pdf, from_page=i, to_page=i)
        pdf_page.append(single_page_doc)

    pdf_page = [v.tobytes() for v in pdf_page]
    results = parallel_process_pdf_safe(pdf_page, num_workers=16)

    # Thread-based alternative used for the comparison figures:
    # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)
|
147
|
+
|
148
|
+
""" benchmark results of multi-threaded processing (fitz page to image)
|
149
|
+
total page nums: 578
|
150
|
+
thread nums, time cost
|
151
|
+
1 7.351 sec
|
152
|
+
2 6.334 sec
|
153
|
+
4 5.968 sec
|
154
|
+
8 6.728 sec
|
155
|
+
16 8.085 sec
|
156
|
+
"""
|
157
|
+
|
158
|
+
""" benchmark results of multi-processor processing (fitz page to image)
|
159
|
+
total page nums: 578
|
160
|
+
processor nums, time cost
|
161
|
+
1 17.170 sec
|
162
|
+
2 10.170 sec
|
163
|
+
4 7.841 sec
|
164
|
+
8 7.900 sec
|
165
|
+
16 7.984 sec
|
166
|
+
"""
|
@@ -208,12 +208,13 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
|
|
208
208
|
'text': merge_para_with_text(para_block),
|
209
209
|
}
|
210
210
|
elif para_type == BlockType.Title:
|
211
|
-
title_level = get_title_level(para_block)
|
212
211
|
para_content = {
|
213
212
|
'type': 'text',
|
214
213
|
'text': merge_para_with_text(para_block),
|
215
|
-
'text_level': title_level,
|
216
214
|
}
|
215
|
+
title_level = get_title_level(para_block)
|
216
|
+
if title_level != 0:
|
217
|
+
para_content['text_level'] = title_level
|
217
218
|
elif para_type == BlockType.InterlineEquation:
|
218
219
|
para_content = {
|
219
220
|
'type': 'equation',
|
@@ -319,5 +320,5 @@ def get_title_level(block):
|
|
319
320
|
if title_level > 4:
|
320
321
|
title_level = 4
|
321
322
|
elif title_level < 1:
|
322
|
-
title_level =
|
323
|
+
title_level = 0
|
323
324
|
return title_level
|
@@ -44,14 +44,19 @@ def cut_image_to_pil_image(bbox: tuple, page: fitz.Page, mode="pillow"):
|
|
44
44
|
# 截取图片
|
45
45
|
pix = page.get_pixmap(clip=rect, matrix=zoom)
|
46
46
|
|
47
|
-
# 将字节数据转换为文件对象
|
48
|
-
image_file = BytesIO(pix.tobytes(output='png'))
|
49
|
-
# 使用 Pillow 打开图像
|
50
|
-
pil_image = Image.open(image_file)
|
51
47
|
if mode == "cv2":
|
52
|
-
|
48
|
+
# 直接转换为numpy数组供cv2使用
|
49
|
+
img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
|
50
|
+
# PyMuPDF使用RGB顺序,而cv2使用BGR顺序
|
51
|
+
if pix.n == 3 or pix.n == 4:
|
52
|
+
image_result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
|
53
|
+
else:
|
54
|
+
image_result = img_array
|
53
55
|
elif mode == "pillow":
|
54
|
-
|
56
|
+
# 将字节数据转换为文件对象
|
57
|
+
image_file = BytesIO(pix.tobytes(output='png'))
|
58
|
+
# 使用 Pillow 打开图像
|
59
|
+
image_result = Image.open(image_file)
|
55
60
|
else:
|
56
61
|
raise ValueError(f"mode: {mode} is not supported.")
|
57
62
|
|
@@ -48,7 +48,18 @@ def measure_time(func):
|
|
48
48
|
start_time = time.time()
|
49
49
|
result = func(*args, **kwargs)
|
50
50
|
execution_time = time.time() - start_time
|
51
|
-
|
51
|
+
|
52
|
+
# 获取更详细的函数标识
|
53
|
+
if hasattr(func, "__self__"): # 实例方法
|
54
|
+
class_name = func.__self__.__class__.__name__
|
55
|
+
full_name = f"{class_name}.{func.__name__}"
|
56
|
+
elif hasattr(func, "__qualname__"): # 类方法或静态方法
|
57
|
+
full_name = func.__qualname__
|
58
|
+
else:
|
59
|
+
module_name = func.__module__
|
60
|
+
full_name = f"{module_name}.{func.__name__}"
|
61
|
+
|
62
|
+
PerformanceStats.add_execution_time(full_name, execution_time)
|
52
63
|
return result
|
53
64
|
|
54
65
|
return wrapper
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.
|
1
|
+
__version__ = "1.3.1"
|