magic-pdf 0.10.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. magic_pdf/config/constants.py +2 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  4. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  5. magic_pdf/data/dataset.py +13 -1
  6. magic_pdf/data/read_api.py +59 -12
  7. magic_pdf/data/utils.py +35 -0
  8. magic_pdf/dict2md/ocr_mkcontent.py +14 -13
  9. magic_pdf/libs/clean_memory.py +11 -4
  10. magic_pdf/libs/config_reader.py +9 -0
  11. magic_pdf/libs/draw_bbox.py +8 -12
  12. magic_pdf/libs/language.py +3 -0
  13. magic_pdf/libs/version.py +1 -1
  14. magic_pdf/model/__init__.py +1 -125
  15. magic_pdf/model/batch_analyze.py +275 -0
  16. magic_pdf/model/doc_analyze_by_custom_model.py +4 -51
  17. magic_pdf/model/magic_model.py +4 -435
  18. magic_pdf/model/model_list.py +1 -0
  19. magic_pdf/model/pdf_extract_kit.py +33 -22
  20. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  21. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  22. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  23. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  24. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  25. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  26. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  27. magic_pdf/model/sub_modules/model_init.py +30 -4
  28. magic_pdf/model/sub_modules/model_utils.py +8 -2
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  31. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  32. magic_pdf/operators/__init__.py +94 -0
  33. magic_pdf/{model/operators.py → operators/models.py} +2 -38
  34. magic_pdf/{pipe/operators.py → operators/pipes.py} +70 -17
  35. magic_pdf/pdf_parse_union_core_v2.py +68 -17
  36. magic_pdf/post_proc/__init__.py +1 -0
  37. magic_pdf/post_proc/llm_aided.py +133 -0
  38. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  39. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  40. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  41. magic_pdf/tools/cli.py +36 -11
  42. magic_pdf/tools/common.py +28 -18
  43. magic_pdf/utils/office_to_pdf.py +29 -0
  44. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +73 -23
  45. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +50 -53
  46. magic_pdf/para/__init__.py +0 -0
  47. magic_pdf/pdf_parse_by_ocr.py +0 -22
  48. magic_pdf/pdf_parse_by_txt.py +0 -23
  49. magic_pdf/pipe/AbsPipe.py +0 -99
  50. magic_pdf/pipe/OCRPipe.py +0 -80
  51. magic_pdf/pipe/TXTPipe.py +0 -42
  52. magic_pdf/pipe/UNIPipe.py +0 -150
  53. magic_pdf/pipe/__init__.py +0 -0
  54. magic_pdf/rw/AbsReaderWriter.py +0 -17
  55. magic_pdf/rw/DiskReaderWriter.py +0 -74
  56. magic_pdf/rw/S3ReaderWriter.py +0 -142
  57. magic_pdf/rw/__init__.py +0 -0
  58. magic_pdf/user_api.py +0 -144
  59. /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  60. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  61. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  62. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  63. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,275 @@
1
+ import time
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import torch
6
+ from loguru import logger
7
+ from PIL import Image
8
+
9
+ from magic_pdf.config.constants import MODEL_NAME
10
+ from magic_pdf.config.exceptions import CUDA_NOT_AVAILABLE
11
+ from magic_pdf.data.dataset import Dataset
12
+ from magic_pdf.libs.clean_memory import clean_memory
13
+ from magic_pdf.libs.config_reader import get_device
14
+ from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
15
+ from magic_pdf.model.pdf_extract_kit import CustomPEKModel
16
+ from magic_pdf.model.sub_modules.model_utils import (
17
+ clean_vram, crop_img, get_res_list_from_layout_res)
18
+ from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import (
19
+ get_adjusted_mfdetrec_res, get_ocr_result_list)
20
+ from magic_pdf.operators.models import InferenceResult
21
+
22
+ YOLO_LAYOUT_BASE_BATCH_SIZE = 4
23
+ MFD_BASE_BATCH_SIZE = 1
24
+ MFR_BASE_BATCH_SIZE = 16
25
+
26
+
27
class BatchAnalyze:
    """Callable that runs the model pipeline over a batch of page images.

    Stages, in order: layout detection, optional formula detection and
    recognition, OCR (or text detection only) and optional table
    recognition. Which stages run is decided by the ``apply_*`` flags read
    from ``model``.
    """

    def __init__(self, model: CustomPEKModel, batch_ratio: int):
        """Keep the model bundle and the batch-size multiplier.

        Args:
            model: Model bundle providing the sub-models (``layout_model``,
                ``mfd_model``, ``mfr_model``, ``ocr_model``, ``table_model``)
                and the flags/names consulted by each stage.
            batch_ratio: Multiplier applied to the ``*_BASE_BATCH_SIZE``
                constants to obtain the batch size of each batched stage.
        """
        self.model = model
        self.batch_ratio = batch_ratio

    def __call__(self, images: list) -> list:
        """Analyze ``images`` and return one detection list per image.

        Args:
            images: Page images; each one is handed to ``Image.fromarray``
                (assumed to be RGB arrays -- TODO confirm with the caller).

        Returns:
            A list in which entry ``i`` is the layout result of
            ``images[i]``, extended with formula and OCR results. Table
            regions receive an ``'html'`` key when recognition succeeds.
        """
        images_layout_res = self._detect_layout(images)

        if self.model.apply_formula:
            self._detect_and_recognize_formulas(images, images_layout_res)

        # Release GPU memory before the per-page OCR/table stages.
        clean_vram(self.model.device, vram_threshold=8)

        ocr_time = 0
        ocr_count = 0
        table_time = 0
        table_count = 0
        # reference: magic_pdf/model/doc_analyze_by_custom_model.py:doc_analyze
        for index, image in enumerate(images):
            # Indexed on purpose: a layout list shorter than ``images`` must
            # fail loudly here instead of being silently truncated.
            layout_res = images_layout_res[index]
            pil_img = Image.fromarray(image)

            ocr_res_list, table_res_list, single_page_mfdetrec_res = (
                get_res_list_from_layout_res(layout_res)
            )

            # Text recognition (or detection only) for every OCR region.
            ocr_start = time.time()
            self._ocr_regions(
                pil_img, layout_res, ocr_res_list, single_page_mfdetrec_res
            )
            ocr_time += time.time() - ocr_start
            ocr_count += len(ocr_res_list)

            # Table recognition.
            if self.model.apply_table:
                table_start = time.time()
                self._recognize_tables(pil_img, table_res_list)
                table_time += time.time() - table_start
                table_count += len(table_res_list)

        # Without recognition only detection ran, hence the 'det' label.
        stage = 'ocr' if self.model.apply_ocr else 'det'
        logger.info(f'{stage} time: {round(ocr_time, 2)}, image num: {ocr_count}')
        if self.model.apply_table:
            logger.info(f'table time: {round(table_time, 2)}, image num: {table_count}')

        return images_layout_res

    def _detect_layout(self, images: list) -> list:
        """Run the configured layout model and return one result per image.

        For any layout model name other than the two handled below the
        returned list stays empty, exactly as before.
        """
        images_layout_res = []
        layout_start_time = time.time()
        layout_model_name = self.model.layout_model_name

        if layout_model_name == MODEL_NAME.LAYOUTLMv3:
            # layoutlmv3 is called one image at a time.
            images_layout_res = [
                self.model.layout_model(image, ignore_catids=[])
                for image in images
            ]
        elif layout_model_name == MODEL_NAME.DocLayout_YOLO:
            # doclayout_yolo is called once for the whole batch.
            layout_images = []
            modified_images = []
            for image_index, image in enumerate(images):
                pil_img = Image.fromarray(image)
                width, height = pil_img.size
                if height > width:
                    # Portrait page: re-cropped with a horizontal paste
                    # offset of half the width (presumably padding towards
                    # a squarer input -- confirm in crop_img).
                    input_res = {'poly': [0, 0, width, 0, width, height, 0, height]}
                    new_image, useful_list = crop_img(
                        input_res, pil_img, crop_paste_x=width // 2, crop_paste_y=0
                    )
                    layout_images.append(new_image)
                    modified_images.append([image_index, useful_list])
                else:
                    layout_images.append(pil_img)

            images_layout_res += self.model.layout_model.batch_predict(
                layout_images, self.batch_ratio * YOLO_LAYOUT_BASE_BATCH_SIZE
            )

            # Map detections on re-cropped pages back to the coordinates of
            # the original page, using the offsets returned by crop_img.
            for image_index, useful_list in modified_images:
                for res in images_layout_res[image_index]:
                    poly = res['poly']
                    for i, value in enumerate(poly):
                        if i % 2 == 0:
                            # even positions are x coordinates
                            poly[i] = value - useful_list[0] + useful_list[2]
                        else:
                            # odd positions are y coordinates
                            poly[i] = value - useful_list[1] + useful_list[3]

        logger.info(
            f'layout time: {round(time.time() - layout_start_time, 2)}, image num: {len(images)}'
        )
        return images_layout_res

    def _detect_and_recognize_formulas(self, images: list, images_layout_res: list) -> None:
        """Append formula results of every image to its layout result."""
        # Formula detection.
        mfd_start_time = time.time()
        images_mfd_res = self.model.mfd_model.batch_predict(
            images, self.batch_ratio * MFD_BASE_BATCH_SIZE
        )
        logger.info(
            f'mfd time: {round(time.time() - mfd_start_time, 2)}, image num: {len(images)}'
        )

        # Formula recognition.
        mfr_start_time = time.time()
        images_formula_list = self.model.mfr_model.batch_predict(
            images_mfd_res,
            images,
            batch_size=self.batch_ratio * MFR_BASE_BATCH_SIZE,
        )
        # Index-based so that a length mismatch between the parallel lists
        # raises instead of being truncated.
        for image_index in range(len(images)):
            images_layout_res[image_index] += images_formula_list[image_index]
        logger.info(
            f'mfr time: {round(time.time() - mfr_start_time, 2)}, image num: {len(images)}'
        )

    def _ocr_regions(self, pil_img, layout_res: list, ocr_res_list: list,
                     single_page_mfdetrec_res: list) -> None:
        """Run the OCR model on every OCR region and extend ``layout_res``."""
        # With OCR disabled the model is still called, but with rec=False.
        ocr_kwargs = {} if self.model.apply_ocr else {'rec': False}

        for res in ocr_res_list:
            new_image, useful_list = crop_img(
                res, pil_img, crop_paste_x=50, crop_paste_y=50
            )
            adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
                single_page_mfdetrec_res, useful_list
            )

            # The crop is converted from RGB to BGR channel order for OCR.
            new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
            ocr_res = self.model.ocr_model.ocr(
                new_image, mfd_res=adjusted_mfdetrec_res, **ocr_kwargs
            )[0]

            # Merge the region results into the page result.
            if ocr_res:
                layout_res.extend(get_ocr_result_list(ocr_res, useful_list))

    def _recognize_tables(self, pil_img, table_res_list: list) -> None:
        """Recognize each table region and store valid HTML under ``'html'``."""
        for res in table_res_list:
            new_image, _ = crop_img(res, pil_img)
            single_table_start_time = time.time()
            html_code = None
            table_model_name = self.model.table_model_name
            if table_model_name == MODEL_NAME.STRUCT_EQTABLE:
                with torch.no_grad():
                    table_result = self.model.table_model.predict(new_image, 'html')
                if len(table_result) > 0:
                    html_code = table_result[0]
            elif table_model_name == MODEL_NAME.TABLE_MASTER:
                html_code = self.model.table_model.img2html(new_image)
            elif table_model_name == MODEL_NAME.RAPID_TABLE:
                # Only the HTML of the returned triple is used here.
                html_code, _, _ = self.model.table_model.predict(new_image)

            run_time = time.time() - single_table_start_time
            if run_time > self.model.table_max_time:
                logger.warning(
                    f'table recognition processing exceeds max time {self.model.table_max_time}s'
                )

            # Accept the output only if it ends like a complete HTML table.
            if not html_code:
                logger.warning(
                    'table recognition processing fails, not get html return'
                )
            elif html_code.strip().endswith(('</html>', '</table>')):
                res['html'] = html_code
            else:
                logger.warning(
                    'table recognition processing fails, not found expected HTML table end'
                )
196
+
197
+
198
def doc_batch_analyze(
    dataset: Dataset,
    ocr: bool = False,
    show_log: bool = False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
    batch_ratio: int | None = None,
) -> InferenceResult:
    """Perform batch analysis on a document dataset.

    Args:
        dataset (Dataset): The dataset containing document pages to be analyzed.
        ocr (bool, optional): Flag to enable OCR (Optical Character Recognition). Defaults to False.
        show_log (bool, optional): Flag to enable logging. Defaults to False.
        start_page_id (int, optional): Index of the first page to analyze. Defaults to 0.
        end_page_id (int, optional): Index of the last page to analyze (inclusive).
            Defaults to None; None or a negative value means analyze till the last page.
        lang (str, optional): Language for OCR. An empty string is treated as None. Defaults to None.
        layout_model (optional): Layout model to be used for analysis. Defaults to None.
        formula_enable (optional): Flag to enable formula detection. Defaults to None.
        table_enable (optional): Flag to enable table detection. Defaults to None.
        batch_ratio (int | None, optional): Ratio for batch processing. Defaults to None, which sets it to 1.

    Raises:
        CUDA_NOT_AVAILABLE: If CUDA is not available, raises an exception as batch analysis is not supported in CPU mode.

    Returns:
        InferenceResult: One entry per page of the dataset (pages outside the
        requested range get empty ``layout_dets``) together with the dataset.
    """

    if not torch.cuda.is_available():
        raise CUDA_NOT_AVAILABLE('batch analyze not support in CPU mode')

    lang = None if lang == '' else lang
    # TODO: auto detect batch size
    batch_ratio = 1 if batch_ratio is None else batch_ratio
    # Test against None explicitly: a plain truthiness check would turn
    # end_page_id=0 ("first page only") into "all pages".
    if end_page_id is None or end_page_id < 0:
        end_page_id = len(dataset) - 1

    model_manager = ModelSingleton()
    custom_model: CustomPEKModel = model_manager.get_model(
        ocr, show_log, lang, layout_model, formula_enable, table_enable
    )
    batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)

    # Fetch every page image exactly once: remember its size, and for pages
    # inside the range also its position in the batch handed to the model.
    images = []
    pages = []  # (width, height, position in ``images`` or None)
    for index in range(len(dataset)):
        img_dict = dataset.get_page(index).get_image()
        if start_page_id <= index <= end_page_id:
            slot = len(images)
            images.append(img_dict['img'])
        else:
            slot = None
        pages.append((img_dict['width'], img_dict['height'], slot))

    # batch analyze
    analyze_result = batch_model(images)

    model_json = []
    for index, (page_width, page_height, slot) in enumerate(pages):
        # Pages outside the requested range keep an empty detection list.
        result = [] if slot is None else analyze_result[slot]
        page_info = {'page_no': index, 'height': page_height, 'width': page_width}
        model_json.append({'layout_dets': result, 'page_info': page_info})

    # TODO: clean memory when gpu memory is not enough
    clean_memory_start_time = time.time()
    clean_memory(get_device())
    logger.info(f'clean memory time: {round(time.time() - clean_memory_start_time, 2)}')

    return InferenceResult(model_json, dataset)
@@ -1,16 +1,13 @@
1
1
  import os
2
2
  import time
3
3
 
4
- import fitz
5
- import numpy as np
6
- from loguru import logger
7
-
8
4
  # 关闭paddle的信号处理
9
5
  import paddle
6
+ from loguru import logger
7
+
10
8
  paddle.disable_signal_handler()
11
9
 
12
10
  os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
13
- os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
14
11
 
15
12
  try:
16
13
  import torchtext
@@ -28,7 +25,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
28
25
  get_local_models_dir,
29
26
  get_table_recog_config)
30
27
  from magic_pdf.model.model_list import MODEL
31
- from magic_pdf.model.operators import InferenceResult
28
+ from magic_pdf.operators.models import InferenceResult
32
29
 
33
30
 
34
31
  def dict_compare(d1, d2):
@@ -45,47 +42,6 @@ def remove_duplicates_dicts(lst):
45
42
  return unique_dicts
46
43
 
47
44
 
48
- def load_images_from_pdf(
49
- pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None
50
- ) -> list:
51
- try:
52
- from PIL import Image
53
- except ImportError:
54
- logger.error('Pillow not installed, please install by pip.')
55
- exit(1)
56
-
57
- images = []
58
- with fitz.open('pdf', pdf_bytes) as doc:
59
- pdf_page_num = doc.page_count
60
- end_page_id = (
61
- end_page_id
62
- if end_page_id is not None and end_page_id >= 0
63
- else pdf_page_num - 1
64
- )
65
- if end_page_id > pdf_page_num - 1:
66
- logger.warning('end_page_id is out of range, use images length')
67
- end_page_id = pdf_page_num - 1
68
-
69
- for index in range(0, doc.page_count):
70
- if start_page_id <= index <= end_page_id:
71
- page = doc[index]
72
- mat = fitz.Matrix(dpi / 72, dpi / 72)
73
- pm = page.get_pixmap(matrix=mat, alpha=False)
74
-
75
- # If the width or height exceeds 4500 after scaling, do not scale further.
76
- if pm.width > 4500 or pm.height > 4500:
77
- pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
78
-
79
- img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
80
- img = np.array(img)
81
- img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
82
- else:
83
- img_dict = {'img': [], 'width': 0, 'height': 0}
84
-
85
- images.append(img_dict)
86
- return images
87
-
88
-
89
45
  class ModelSingleton:
90
46
  _instance = None
91
47
  _models = {}
@@ -198,9 +154,6 @@ def doc_analyze(
198
154
  table_enable=None,
199
155
  ) -> InferenceResult:
200
156
 
201
- if lang == '':
202
- lang = None
203
-
204
157
  model_manager = ModelSingleton()
205
158
  custom_model = model_manager.get_model(
206
159
  ocr, show_log, lang, layout_model, formula_enable, table_enable
@@ -230,7 +183,7 @@ def doc_analyze(
230
183
  model_json.append(page_dict)
231
184
 
232
185
  gc_start = time.time()
233
- clean_memory()
186
+ clean_memory(get_device())
234
187
  gc_time = round(time.time() - gc_start, 2)
235
188
  logger.info(f'gc time: {gc_time}')
236
189