magic-pdf 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. magic_pdf/data/batch_build_dataset.py +156 -0
  2. magic_pdf/data/dataset.py +56 -25
  3. magic_pdf/data/utils.py +108 -9
  4. magic_pdf/dict2md/ocr_mkcontent.py +4 -3
  5. magic_pdf/libs/pdf_image_tools.py +11 -6
  6. magic_pdf/libs/performance_stats.py +12 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/batch_analyze.py +175 -201
  9. magic_pdf/model/doc_analyze_by_custom_model.py +142 -92
  10. magic_pdf/model/pdf_extract_kit.py +5 -38
  11. magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
  12. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
  13. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
  14. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
  15. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
  16. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
  17. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
  18. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
  19. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
  20. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
  21. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
  22. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
  25. magic_pdf/model/sub_modules/model_init.py +50 -37
  26. magic_pdf/model/sub_modules/model_utils.py +18 -12
  27. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
  29. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
  32. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
  33. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
  34. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
  35. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
  36. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
  37. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
  38. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
  39. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
  40. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
  41. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
  42. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
  43. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
  44. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
  45. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
  46. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
  47. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
  48. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
  49. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
  50. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
  51. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
  52. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
  53. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
  54. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
  55. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
  56. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
  57. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
  58. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
  59. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
  60. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
  61. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
  62. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
  63. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
  64. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
  65. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
  66. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
  67. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
  68. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
  69. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
  70. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
  71. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
  72. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
  73. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
  74. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
  75. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
  76. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
  77. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
  78. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
  79. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +15 -19
  80. magic_pdf/pdf_parse_union_core_v2.py +112 -74
  81. magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
  82. magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
  83. magic_pdf/resources/model_config/model_configs.yaml +1 -1
  84. magic_pdf/resources/slanet_plus/slanet-plus.onnx +0 -0
  85. magic_pdf/tools/cli.py +30 -12
  86. magic_pdf/tools/common.py +90 -12
  87. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/METADATA +92 -59
  88. magic_pdf-1.3.1.dist-info/RECORD +203 -0
  89. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/WHEEL +1 -1
  90. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
  91. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
  92. magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
  93. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
  94. magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
  95. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
  96. magic_pdf-1.2.2.dist-info/RECORD +0 -147
  97. /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
  98. /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
  99. /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
  100. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/LICENSE.md +0 -0
  101. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/entry_points.txt +0 -0
  102. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,156 @@
1
+ import concurrent.futures
2
+
3
+ import fitz
4
+
5
+ from magic_pdf.data.dataset import PymuDocDataset
6
+ from magic_pdf.data.utils import fitz_doc_to_image # PyMuPDF
7
+
8
+
9
def partition_array_greedy(arr, k):
    """Split weighted items into at most ``k`` load-balanced groups.

    Greedy heuristic: items are visited from heaviest to lightest and each
    one is placed in the group whose accumulated weight is currently the
    smallest (ties go to the lowest-numbered group).

    Parameters:
    -----------
    arr : list of tuples
        Items to distribute. Only ``item[1]`` (the weight, e.g. a page
        count) is read; ``item[0]`` is not inspected.
    k : int
        Requested number of partitions. Clamped to ``len(arr)`` when larger.

    Returns:
    --------
    partitions : list of lists
        Each inner list holds indices into ``arr`` (not the items
        themselves). An empty ``arr`` yields an empty list.

    Raises:
    -------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer')

    n = len(arr)
    if n == 0:
        return []

    k = min(k, n)  # never create more partitions than there are items
    if k == 1:
        return [list(range(n))]
    if k == n:
        return [[i] for i in range(n)]

    weights = [item[1] for item in arr]

    # Heaviest first; sorted() is stable so equal weights keep input order.
    order = sorted(range(n), key=weights.__getitem__, reverse=True)

    partitions = [[] for _ in range(k)]
    loads = [0] * k

    for idx in order:
        # min() returns the first minimal element, i.e. the lowest-numbered
        # partition among those tied for the smallest load.
        target = min(range(k), key=loads.__getitem__)
        partitions[target].append(idx)  # store the original index
        loads[target] += weights[idx]

    return partitions
51
+
52
+
53
def process_pdf_batch(pdf_jobs, idx):
    """Render every page of every PDF in one partition to an image.

    Parameters:
    -----------
    pdf_jobs : list of tuples
        List of (pdf_path, num_pages) tuples. Only the path is used; the
        page count is re-read from the opened document.
    idx : int
        Partition id, returned unchanged so the caller can match results
        that complete out of order.

    Returns:
    --------
    (idx, images) : tuple
        ``images[i]`` is the list of ``fitz_doc_to_image`` results for the
        pages of ``pdf_jobs[i]``, in page order.
    """
    images = []

    for pdf_path, _ in pdf_jobs:
        # Context manager guarantees the document is closed even if rendering
        # a page raises; the original left every document open.
        with fitz.open(pdf_path) as doc:
            images.append(
                [fitz_doc_to_image(doc[page_num]) for page_num in range(len(doc))]
            )
    return (idx, images)
82
+
83
+
84
def batch_build_dataset(pdf_paths, k, lang=None):
    """Build one dataset per PDF, pre-rendering page images in parallel.

    The PDFs are split into up to ``k`` partitions balanced by page count;
    each partition is rendered in its own worker process and the resulting
    images are attached to the datasets via ``set_images``.

    Parameters:
    -----------
    pdf_paths : list
        List of paths to PDF files
    k : int
        Maximum number of partitions / worker processes
    lang : str or None
        Forwarded unchanged to ``PymuDocDataset``

    Returns:
    --------
    results : list
        Same length and order as ``pdf_paths``. Entry ``i`` is the
        ``PymuDocDataset`` for ``pdf_paths[i]``, or ``None`` when that file
        could not be opened.

    Raises:
    -------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer')

    # pdf_info keeps the (path, num_pages) shape expected by
    # partition_array_greedy / process_pdf_batch. source_positions[n] is the
    # index in pdf_paths of pdf_info[n], so results stay aligned with the
    # input even when some files fail to open.
    pdf_info = []
    source_positions = []

    for position, pdf_path in enumerate(pdf_paths):
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = len(doc)
        except Exception as e:
            print(f'Error opening {pdf_path}: {e}')
            continue
        pdf_info.append((pdf_path, num_pages))
        source_positions.append(position)

    results = [None] * len(pdf_paths)
    if not pdf_info:
        # Nothing readable: skip spawning a process pool.
        return results

    # Partition the PDFs so each worker gets a similar total page count
    partitions = partition_array_greedy(pdf_info, k)

    # partition id -> per-PDF image lists, filled as workers finish
    all_images_h = {}

    with concurrent.futures.ProcessPoolExecutor(max_workers=len(partitions)) as executor:
        # Submit one task per partition
        futures = [
            executor.submit(
                process_pdf_batch,
                [pdf_info[info_idx] for info_idx in partition],
                sn,
            )
            for sn, partition in enumerate(partitions)
        ]
        # Collect results as they complete
        for future in concurrent.futures.as_completed(futures):
            try:
                sn, images = future.result()
                all_images_h[sn] = images
            except Exception as e:
                print(f'Error processing partition: {e}')

    for sn, partition in enumerate(partitions):
        # None when this partition's worker failed. The datasets are still
        # built, just without pre-rendered images, instead of crashing with
        # a KeyError after the failure was already reported above.
        partition_images = all_images_h.get(sn)
        for j, info_idx in enumerate(partition):
            with open(pdf_info[info_idx][0], 'rb') as f:
                pdf_bytes = f.read()
            dataset = PymuDocDataset(pdf_bytes, lang=lang)
            if partition_images is not None:
                dataset.set_images(partition_images[j])
            results[source_positions[info_idx]] = dataset
    return results
magic_pdf/data/dataset.py CHANGED
@@ -97,10 +97,10 @@ class Dataset(ABC):
97
97
 
98
98
  @abstractmethod
99
99
  def dump_to_file(self, file_path: str):
100
- """Dump the file
100
+ """Dump the file.
101
101
 
102
- Args:
103
- file_path (str): the file path
102
+ Args:
103
+ file_path (str): the file path
104
104
  """
105
105
  pass
106
106
 
@@ -119,7 +119,7 @@ class Dataset(ABC):
119
119
 
120
120
  @abstractmethod
121
121
  def classify(self) -> SupportedPdfParseMethod:
122
- """classify the dataset
122
+ """classify the dataset.
123
123
 
124
124
  Returns:
125
125
  SupportedPdfParseMethod: _description_
@@ -128,8 +128,7 @@ class Dataset(ABC):
128
128
 
129
129
  @abstractmethod
130
130
  def clone(self):
131
- """clone this dataset
132
- """
131
+ """clone this dataset."""
133
132
  pass
134
133
 
135
134
 
@@ -144,16 +143,19 @@ class PymuDocDataset(Dataset):
144
143
  self._records = [Doc(v) for v in self._raw_fitz]
145
144
  self._data_bits = bits
146
145
  self._raw_data = bits
146
+ self._classify_result = None
147
147
 
148
148
  if lang == '':
149
149
  self._lang = None
150
150
  elif lang == 'auto':
151
- from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
151
+ from magic_pdf.model.sub_modules.language_detection.utils import \
152
+ auto_detect_lang
152
153
  self._lang = auto_detect_lang(bits)
153
- logger.info(f"lang: {lang}, detect_lang: {self._lang}")
154
+ logger.info(f'lang: {lang}, detect_lang: {self._lang}')
154
155
  else:
155
156
  self._lang = lang
156
- logger.info(f"lang: {lang}")
157
+ logger.info(f'lang: {lang}')
158
+
157
159
  def __len__(self) -> int:
158
160
  """The page number of the pdf."""
159
161
  return len(self._records)
@@ -186,12 +188,12 @@ class PymuDocDataset(Dataset):
186
188
  return self._records[page_id]
187
189
 
188
190
  def dump_to_file(self, file_path: str):
189
- """Dump the file
191
+ """Dump the file.
190
192
 
191
- Args:
192
- file_path (str): the file path
193
+ Args:
194
+ file_path (str): the file path
193
195
  """
194
-
196
+
195
197
  dir_name = os.path.dirname(file_path)
196
198
  if dir_name not in ('', '.', '..'):
197
199
  os.makedirs(dir_name, exist_ok=True)
@@ -212,21 +214,25 @@ class PymuDocDataset(Dataset):
212
214
  return proc(self, *args, **kwargs)
213
215
 
214
216
  def classify(self) -> SupportedPdfParseMethod:
215
- """classify the dataset
217
+ """classify the dataset.
216
218
 
217
219
  Returns:
218
220
  SupportedPdfParseMethod: _description_
219
221
  """
220
- return classify(self._data_bits)
222
+ if self._classify_result is None:
223
+ self._classify_result = classify(self._data_bits)
224
+ return self._classify_result
221
225
 
222
226
  def clone(self):
223
- """clone this dataset
224
- """
227
+ """clone this dataset."""
225
228
  return PymuDocDataset(self._raw_data)
226
229
 
230
def set_images(self, images):
    """Attach pre-rendered page images to the pages of this dataset.

    Args:
        images (list): one entry per page, in page order; entry ``i`` is
            passed to ``set_image`` of page ``i``. Extra trailing entries
            are ignored.

    Raises:
        IndexError: if fewer images than pages are supplied. Raised before
            any page is touched, so the dataset is never left
            half-populated.
    """
    if len(images) < len(self._records):
        raise IndexError(
            f'expected at least {len(self._records)} images, got {len(images)}'
        )
    for record, image in zip(self._records, images):
        record.set_image(image)
227
233
 
228
234
  class ImageDataset(Dataset):
229
- def __init__(self, bits: bytes):
235
+ def __init__(self, bits: bytes, lang=None):
230
236
  """Initialize the dataset, which wraps the pymudoc documents.
231
237
 
232
238
  Args:
@@ -238,6 +244,17 @@ class ImageDataset(Dataset):
238
244
  self._raw_data = bits
239
245
  self._data_bits = pdf_bytes
240
246
 
247
+ if lang == '':
248
+ self._lang = None
249
+ elif lang == 'auto':
250
+ from magic_pdf.model.sub_modules.language_detection.utils import \
251
+ auto_detect_lang
252
+ self._lang = auto_detect_lang(bits)
253
+ logger.info(f'lang: {lang}, detect_lang: {self._lang}')
254
+ else:
255
+ self._lang = lang
256
+ logger.info(f'lang: {lang}')
257
+
241
258
  def __len__(self) -> int:
242
259
  """The length of the dataset."""
243
260
  return len(self._records)
@@ -270,10 +287,10 @@ class ImageDataset(Dataset):
270
287
  return self._records[page_id]
271
288
 
272
289
  def dump_to_file(self, file_path: str):
273
- """Dump the file
290
+ """Dump the file.
274
291
 
275
- Args:
276
- file_path (str): the file path
292
+ Args:
293
+ file_path (str): the file path
277
294
  """
278
295
  dir_name = os.path.dirname(file_path)
279
296
  if dir_name not in ('', '.', '..'):
@@ -293,7 +310,7 @@ class ImageDataset(Dataset):
293
310
  return proc(self, *args, **kwargs)
294
311
 
295
312
  def classify(self) -> SupportedPdfParseMethod:
296
- """classify the dataset
313
+ """classify the dataset.
297
314
 
298
315
  Returns:
299
316
  SupportedPdfParseMethod: _description_
@@ -301,15 +318,19 @@ class ImageDataset(Dataset):
301
318
  return SupportedPdfParseMethod.OCR
302
319
 
303
320
  def clone(self):
304
- """clone this dataset
305
- """
321
+ """clone this dataset."""
306
322
  return ImageDataset(self._raw_data)
307
323
 
324
def set_images(self, images):
    """Attach pre-rendered images to the records of this dataset.

    Args:
        images (list): one entry per record, in order; entry ``i`` is
            passed to ``set_image`` of record ``i``. Extra trailing entries
            are ignored.

    Raises:
        IndexError: if fewer images than records are supplied. Raised
            before any record is touched, so the dataset is never left
            half-populated.
    """
    if len(images) < len(self._records):
        raise IndexError(
            f'expected at least {len(self._records)} images, got {len(images)}'
        )
    for record, image in zip(self._records, images):
        record.set_image(image)
327
+
308
328
  class Doc(PageableData):
309
329
  """Initialized with pymudoc object."""
310
330
 
311
331
  def __init__(self, doc: fitz.Page):
312
332
  self._doc = doc
333
+ self._img = None
313
334
 
314
335
  def get_image(self):
315
336
  """Return the image info.
@@ -321,7 +342,17 @@ class Doc(PageableData):
321
342
  height: int
322
343
  }
323
344
  """
324
- return fitz_doc_to_image(self._doc)
345
+ if self._img is None:
346
+ self._img = fitz_doc_to_image(self._doc)
347
+ return self._img
348
+
349
def set_image(self, img):
    """Cache a pre-rendered image for this page unless one is already cached.

    A no-op when an image is already stored, so an earlier render is never
    overwritten.

    Args:
        img: the page image to cache; returned as-is by later
            ``get_image`` calls.
            NOTE(review): ``get_image`` caches the dict produced by
            ``fitz_doc_to_image`` ({'img', 'width', 'height'}), so callers
            presumably pass that dict rather than a bare np.ndarray - confirm.
    """
    if self._img is None:
        self._img = img
325
356
 
326
357
  def get_doc(self) -> fitz.Page:
327
358
  """Get the pymudoc object.
magic_pdf/data/utils.py CHANGED
@@ -1,12 +1,15 @@
1
1
 
2
+ import multiprocessing as mp
3
+ import threading
4
+ from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
5
+ as_completed)
6
+
2
7
  import fitz
3
8
  import numpy as np
4
9
  from loguru import logger
5
10
 
6
- from magic_pdf.utils.annotations import ImportPIL
7
11
 
8
12
 
9
- @ImportPIL
10
13
  def fitz_doc_to_image(doc, dpi=200) -> dict:
11
14
  """Convert fitz.Document to image, Then convert the image to numpy array.
12
15
 
@@ -17,7 +20,6 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
17
20
  Returns:
18
21
  dict: {'img': numpy array, 'width': width, 'height': height }
19
22
  """
20
- from PIL import Image
21
23
  mat = fitz.Matrix(dpi / 72, dpi / 72)
22
24
  pm = doc.get_pixmap(matrix=mat, alpha=False)
23
25
 
@@ -25,16 +27,14 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
25
27
  if pm.width > 4500 or pm.height > 4500:
26
28
  pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
27
29
 
28
- img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
29
- img = np.array(img)
30
+ # Convert pixmap samples directly to numpy array
31
+ img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
30
32
 
31
33
  img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
32
34
 
33
35
  return img_dict
34
36
 
35
- @ImportPIL
36
37
  def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
37
- from PIL import Image
38
38
  images = []
39
39
  with fitz.open('pdf', pdf_bytes) as doc:
40
40
  pdf_page_num = doc.page_count
@@ -57,11 +57,110 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
57
57
  if pm.width > 4500 or pm.height > 4500:
58
58
  pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
59
59
 
60
- img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
61
- img = np.array(img)
60
+ # Convert pixmap samples directly to numpy array
61
+ img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
62
+
62
63
  img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
63
64
  else:
64
65
  img_dict = {'img': [], 'width': 0, 'height': 0}
65
66
 
66
67
  images.append(img_dict)
67
68
  return images
69
+
70
+
71
def convert_page(bytes_page):
    """Render the first page of an in-memory PDF to an image dict.

    Args:
        bytes_page (bytes): a serialized PDF; only its first page is rendered.

    Returns:
        dict: the ``fitz_doc_to_image`` result for that page.
    """
    # Runs inside long-lived pool workers, so close each document
    # deterministically instead of leaving it for the garbage collector.
    with fitz.open('pdf', bytes_page) as pdfs:
        return fitz_doc_to_image(pdfs[0])
75
+
76
def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
    """Render single-page PDF blobs to images in worker processes.

    Pages are passed as bytes (not fitz objects) so they can be pickled and
    sent to the workers.

    Args:
        pages (iterable of bytes): serialized PDFs; ``convert_page`` renders
            the first page of each.
        num_workers (int or None): number of worker processes. Defaults to
            the CPU count; never more than the number of pages.
        **kwargs: accepted for call compatibility but currently ignored.

    Returns:
        list: one ``convert_page`` result per input, in input order.
    """
    pages = list(pages)
    if not pages:
        # Avoid spinning up a process pool for nothing.
        return []

    if num_workers is None:
        num_workers = mp.cpu_count()
    # Extra workers beyond one per page would only sit idle.
    num_workers = min(num_workers, len(pages))

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        return list(executor.map(convert_page, pages))
90
+
91
+
92
def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
    """Process all pages of a PDF using multiple threads.

    NOTE(review): PyMuPDF's documentation states the library is not
    thread-safe; confirm that rendering pages of one shared document from
    several threads is acceptable before relying on this outside benchmarks.

    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    num_threads : int
        Number of threads to use
    **kwargs :
        Additional arguments for fitz_doc_to_image

    Returns:
    --------
    images : list
        List of processed images, in page order. A page whose rendering
        raised is reported on stdout and left as ``None``.
    """
    # The context manager closes the document even if submission fails.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)

        # Pre-sized so results can be stored by page number as they arrive
        results = [None] * num_pages

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Map each future back to the page it renders
            futures = {
                executor.submit(fitz_doc_to_image, doc[page_num], **kwargs): page_num
                for page_num in range(num_pages)
            }
            # Store results as they complete
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    results[page_num] = future.result()
                except Exception as e:
                    print(f'Error processing page {page_num}: {e}')
                    results[page_num] = None

    # The original built this list but never returned it.
    return results
135
+
136
if __name__ == '__main__':
    # Ad-hoc benchmark: split a local PDF into single-page PDFs, serialize
    # each to bytes (so it can be sent to worker processes) and render them
    # with parallel_process_pdf_safe.
    with fitz.open('/tmp/[MS-DOC].pdf') as pdf:
        pdf_page = []
        for i in range(pdf.page_count):
            # One temporary single-page document per source page, closed as
            # soon as it has been serialized.
            with fitz.open() as single_page:
                single_page.insert_pdf(pdf, from_page=i, to_page=i)
                pdf_page.append(single_page.tobytes())

    results = parallel_process_pdf_safe(pdf_page, num_workers=16)
145
+
146
+ # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)
147
+
148
+ """ benchmark results of multi-threaded processing (fitz page to image)
149
+ total page nums: 578
150
+ thread nums, time cost
151
+ 1 7.351 sec
152
+ 2 6.334 sec
153
+ 4 5.968 sec
154
+ 8 6.728 sec
155
+ 16 8.085 sec
156
+ """
157
+
158
+ """ benchmark results of multi-processor processing (fitz page to image)
159
+ total page nums: 578
160
+ processor nums, time cost
161
+ 1 17.170 sec
162
+ 2 10.170 sec
163
+ 4 7.841 sec
164
+ 8 7.900 sec
165
+ 16 7.984 sec
166
+ """
@@ -208,12 +208,13 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
208
208
  'text': merge_para_with_text(para_block),
209
209
  }
210
210
  elif para_type == BlockType.Title:
211
- title_level = get_title_level(para_block)
212
211
  para_content = {
213
212
  'type': 'text',
214
213
  'text': merge_para_with_text(para_block),
215
- 'text_level': title_level,
216
214
  }
215
+ title_level = get_title_level(para_block)
216
+ if title_level != 0:
217
+ para_content['text_level'] = title_level
217
218
  elif para_type == BlockType.InterlineEquation:
218
219
  para_content = {
219
220
  'type': 'equation',
@@ -319,5 +320,5 @@ def get_title_level(block):
319
320
  if title_level > 4:
320
321
  title_level = 4
321
322
  elif title_level < 1:
322
- title_level = 1
323
+ title_level = 0
323
324
  return title_level
@@ -44,14 +44,19 @@ def cut_image_to_pil_image(bbox: tuple, page: fitz.Page, mode="pillow"):
44
44
  # 截取图片
45
45
  pix = page.get_pixmap(clip=rect, matrix=zoom)
46
46
 
47
- # 将字节数据转换为文件对象
48
- image_file = BytesIO(pix.tobytes(output='png'))
49
- # 使用 Pillow 打开图像
50
- pil_image = Image.open(image_file)
51
47
  if mode == "cv2":
52
- image_result = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)
48
+ # 直接转换为numpy数组供cv2使用
49
+ img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
50
+ # PyMuPDF使用RGB顺序,而cv2使用BGR顺序
51
+ if pix.n == 3 or pix.n == 4:
52
+ image_result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
53
+ else:
54
+ image_result = img_array
53
55
  elif mode == "pillow":
54
- image_result = pil_image
56
+ # 将字节数据转换为文件对象
57
+ image_file = BytesIO(pix.tobytes(output='png'))
58
+ # 使用 Pillow 打开图像
59
+ image_result = Image.open(image_file)
55
60
  else:
56
61
  raise ValueError(f"mode: {mode} is not supported.")
57
62
 
@@ -48,7 +48,18 @@ def measure_time(func):
48
48
  start_time = time.time()
49
49
  result = func(*args, **kwargs)
50
50
  execution_time = time.time() - start_time
51
- PerformanceStats.add_execution_time(func.__name__, execution_time)
51
+
52
+ # 获取更详细的函数标识
53
+ if hasattr(func, "__self__"): # 实例方法
54
+ class_name = func.__self__.__class__.__name__
55
+ full_name = f"{class_name}.{func.__name__}"
56
+ elif hasattr(func, "__qualname__"): # 类方法或静态方法
57
+ full_name = func.__qualname__
58
+ else:
59
+ module_name = func.__module__
60
+ full_name = f"{module_name}.{func.__name__}"
61
+
62
+ PerformanceStats.add_execution_time(full_name, execution_time)
52
63
  return result
53
64
 
54
65
  return wrapper
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.2"
1
+ __version__ = "1.3.1"