magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. magic_pdf/config/constants.py +7 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/base.py +13 -1
  4. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  5. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  6. magic_pdf/data/dataset.py +188 -5
  7. magic_pdf/data/read_api.py +59 -12
  8. magic_pdf/data/utils.py +35 -0
  9. magic_pdf/dict2md/ocr_mkcontent.py +16 -15
  10. magic_pdf/filter/__init__.py +32 -0
  11. magic_pdf/filter/pdf_meta_scan.py +3 -2
  12. magic_pdf/libs/clean_memory.py +11 -4
  13. magic_pdf/libs/config_reader.py +9 -0
  14. magic_pdf/libs/draw_bbox.py +19 -22
  15. magic_pdf/libs/language.py +3 -0
  16. magic_pdf/libs/pdf_check.py +30 -30
  17. magic_pdf/libs/version.py +1 -1
  18. magic_pdf/model/__init__.py +1 -1
  19. magic_pdf/model/batch_analyze.py +275 -0
  20. magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
  21. magic_pdf/model/magic_model.py +4 -435
  22. magic_pdf/model/model_list.py +1 -0
  23. magic_pdf/model/pdf_extract_kit.py +35 -5
  24. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  25. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  26. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  27. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  29. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  30. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  31. magic_pdf/model/sub_modules/model_init.py +43 -7
  32. magic_pdf/model/sub_modules/model_utils.py +17 -5
  33. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  34. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  35. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  36. magic_pdf/operators/__init__.py +94 -0
  37. magic_pdf/operators/models.py +154 -0
  38. magic_pdf/operators/pipes.py +191 -0
  39. magic_pdf/pdf_parse_union_core_v2.py +77 -27
  40. magic_pdf/post_proc/__init__.py +1 -0
  41. magic_pdf/post_proc/llm_aided.py +133 -0
  42. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  43. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  44. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  45. magic_pdf/tools/cli.py +36 -11
  46. magic_pdf/tools/common.py +120 -61
  47. magic_pdf/utils/office_to_pdf.py +29 -0
  48. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
  49. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
  50. magic_pdf/para/__init__.py +0 -0
  51. magic_pdf/pdf_parse_by_ocr.py +0 -23
  52. magic_pdf/pdf_parse_by_txt.py +0 -24
  53. magic_pdf/pipe/AbsPipe.py +0 -98
  54. magic_pdf/pipe/OCRPipe.py +0 -41
  55. magic_pdf/pipe/TXTPipe.py +0 -41
  56. magic_pdf/pipe/UNIPipe.py +0 -98
  57. magic_pdf/pipe/__init__.py +0 -0
  58. magic_pdf/rw/AbsReaderWriter.py +0 -17
  59. magic_pdf/rw/DiskReaderWriter.py +0 -74
  60. magic_pdf/rw/S3ReaderWriter.py +0 -142
  61. magic_pdf/rw/__init__.py +0 -0
  62. magic_pdf/user_api.py +0 -121
  63. magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  64. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  65. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  66. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,31 @@
1
+ import os
1
2
  import time
2
3
 
3
- import fitz
4
- import numpy as np
4
+ # 关闭paddle的信号处理
5
+ import paddle
5
6
  from loguru import logger
6
7
 
8
+ paddle.disable_signal_handler()
9
+
10
+ os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
11
+
12
+ try:
13
+ import torchtext
14
+
15
+ if torchtext.__version__ >= '0.18.0':
16
+ torchtext.disable_torchtext_deprecation_warning()
17
+ except ImportError:
18
+ pass
19
+
20
+ import magic_pdf.model as model_config
21
+ from magic_pdf.data.dataset import Dataset
7
22
  from magic_pdf.libs.clean_memory import clean_memory
8
- from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config, get_layout_config, \
9
- get_formula_config
23
+ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
24
+ get_layout_config,
25
+ get_local_models_dir,
26
+ get_table_recog_config)
10
27
  from magic_pdf.model.model_list import MODEL
11
- import magic_pdf.model as model_config
28
+ from magic_pdf.operators.models import InferenceResult
12
29
 
13
30
 
14
31
  def dict_compare(d1, d2):
@@ -19,47 +36,12 @@ def remove_duplicates_dicts(lst):
19
36
  unique_dicts = []
20
37
  for dict_item in lst:
21
38
  if not any(
22
- dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
39
+ dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
23
40
  ):
24
41
  unique_dicts.append(dict_item)
25
42
  return unique_dicts
26
43
 
27
44
 
28
- def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
29
- try:
30
- from PIL import Image
31
- except ImportError:
32
- logger.error("Pillow not installed, please install by pip.")
33
- exit(1)
34
-
35
- images = []
36
- with fitz.open("pdf", pdf_bytes) as doc:
37
- pdf_page_num = doc.page_count
38
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
39
- if end_page_id > pdf_page_num - 1:
40
- logger.warning("end_page_id is out of range, use images length")
41
- end_page_id = pdf_page_num - 1
42
-
43
- for index in range(0, doc.page_count):
44
- if start_page_id <= index <= end_page_id:
45
- page = doc[index]
46
- mat = fitz.Matrix(dpi / 72, dpi / 72)
47
- pm = page.get_pixmap(matrix=mat, alpha=False)
48
-
49
- # If the width or height exceeds 4500 after scaling, do not scale further.
50
- if pm.width > 4500 or pm.height > 4500:
51
- pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
52
-
53
- img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
54
- img = np.array(img)
55
- img_dict = {"img": img, "width": pm.width, "height": pm.height}
56
- else:
57
- img_dict = {"img": [], "width": 0, "height": 0}
58
-
59
- images.append(img_dict)
60
- return images
61
-
62
-
63
45
  class ModelSingleton:
64
46
  _instance = None
65
47
  _models = {}
@@ -69,117 +51,147 @@ class ModelSingleton:
69
51
  cls._instance = super().__new__(cls)
70
52
  return cls._instance
71
53
 
72
- def get_model(self, ocr: bool, show_log: bool, lang=None, layout_model=None, formula_enable=None, table_enable=None):
54
+ def get_model(
55
+ self,
56
+ ocr: bool,
57
+ show_log: bool,
58
+ lang=None,
59
+ layout_model=None,
60
+ formula_enable=None,
61
+ table_enable=None,
62
+ ):
73
63
  key = (ocr, show_log, lang, layout_model, formula_enable, table_enable)
74
64
  if key not in self._models:
75
- self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang, layout_model=layout_model,
76
- formula_enable=formula_enable, table_enable=table_enable)
65
+ self._models[key] = custom_model_init(
66
+ ocr=ocr,
67
+ show_log=show_log,
68
+ lang=lang,
69
+ layout_model=layout_model,
70
+ formula_enable=formula_enable,
71
+ table_enable=table_enable,
72
+ )
77
73
  return self._models[key]
78
74
 
79
75
 
80
- def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None,
81
- layout_model=None, formula_enable=None, table_enable=None):
76
+ def custom_model_init(
77
+ ocr: bool = False,
78
+ show_log: bool = False,
79
+ lang=None,
80
+ layout_model=None,
81
+ formula_enable=None,
82
+ table_enable=None,
83
+ ):
82
84
 
83
85
  model = None
84
86
 
85
- if model_config.__model_mode__ == "lite":
86
- logger.warning("The Lite mode is provided for developers to conduct testing only, and the output quality is "
87
- "not guaranteed to be reliable.")
87
+ if model_config.__model_mode__ == 'lite':
88
+ logger.warning(
89
+ 'The Lite mode is provided for developers to conduct testing only, and the output quality is '
90
+ 'not guaranteed to be reliable.'
91
+ )
88
92
  model = MODEL.Paddle
89
- elif model_config.__model_mode__ == "full":
93
+ elif model_config.__model_mode__ == 'full':
90
94
  model = MODEL.PEK
91
95
 
92
96
  if model_config.__use_inside_model__:
93
97
  model_init_start = time.time()
94
98
  if model == MODEL.Paddle:
95
99
  from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
100
+
96
101
  custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log, lang=lang)
97
102
  elif model == MODEL.PEK:
98
103
  from magic_pdf.model.pdf_extract_kit import CustomPEKModel
104
+
99
105
  # 从配置文件读取model-dir和device
100
106
  local_models_dir = get_local_models_dir()
101
107
  device = get_device()
102
108
 
103
109
  layout_config = get_layout_config()
104
110
  if layout_model is not None:
105
- layout_config["model"] = layout_model
111
+ layout_config['model'] = layout_model
106
112
 
107
113
  formula_config = get_formula_config()
108
114
  if formula_enable is not None:
109
- formula_config["enable"] = formula_enable
115
+ formula_config['enable'] = formula_enable
110
116
 
111
117
  table_config = get_table_recog_config()
112
118
  if table_enable is not None:
113
- table_config["enable"] = table_enable
119
+ table_config['enable'] = table_enable
114
120
 
115
121
  model_input = {
116
- "ocr": ocr,
117
- "show_log": show_log,
118
- "models_dir": local_models_dir,
119
- "device": device,
120
- "table_config": table_config,
121
- "layout_config": layout_config,
122
- "formula_config": formula_config,
123
- "lang": lang,
122
+ 'ocr': ocr,
123
+ 'show_log': show_log,
124
+ 'models_dir': local_models_dir,
125
+ 'device': device,
126
+ 'table_config': table_config,
127
+ 'layout_config': layout_config,
128
+ 'formula_config': formula_config,
129
+ 'lang': lang,
124
130
  }
125
131
 
126
132
  custom_model = CustomPEKModel(**model_input)
127
133
  else:
128
- logger.error("Not allow model_name!")
134
+ logger.error('Not allow model_name!')
129
135
  exit(1)
130
136
  model_init_cost = time.time() - model_init_start
131
- logger.info(f"model init cost: {model_init_cost}")
137
+ logger.info(f'model init cost: {model_init_cost}')
132
138
  else:
133
- logger.error("use_inside_model is False, not allow to use inside model")
139
+ logger.error('use_inside_model is False, not allow to use inside model')
134
140
  exit(1)
135
141
 
136
142
  return custom_model
137
143
 
138
144
 
139
- def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
140
- start_page_id=0, end_page_id=None, lang=None,
141
- layout_model=None, formula_enable=None, table_enable=None):
142
-
143
- if lang == "":
144
- lang = None
145
+ def doc_analyze(
146
+ dataset: Dataset,
147
+ ocr: bool = False,
148
+ show_log: bool = False,
149
+ start_page_id=0,
150
+ end_page_id=None,
151
+ lang=None,
152
+ layout_model=None,
153
+ formula_enable=None,
154
+ table_enable=None,
155
+ ) -> InferenceResult:
145
156
 
146
157
  model_manager = ModelSingleton()
147
- custom_model = model_manager.get_model(ocr, show_log, lang, layout_model, formula_enable, table_enable)
148
-
149
- with fitz.open("pdf", pdf_bytes) as doc:
150
- pdf_page_num = doc.page_count
151
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
152
- if end_page_id > pdf_page_num - 1:
153
- logger.warning("end_page_id is out of range, use images length")
154
- end_page_id = pdf_page_num - 1
155
-
156
- images = load_images_from_pdf(pdf_bytes, start_page_id=start_page_id, end_page_id=end_page_id)
158
+ custom_model = model_manager.get_model(
159
+ ocr, show_log, lang, layout_model, formula_enable, table_enable
160
+ )
157
161
 
158
162
  model_json = []
159
163
  doc_analyze_start = time.time()
160
164
 
161
- for index, img_dict in enumerate(images):
162
- img = img_dict["img"]
163
- page_width = img_dict["width"]
164
- page_height = img_dict["height"]
165
+ if end_page_id is None:
166
+ end_page_id = len(dataset)
167
+
168
+ for index in range(len(dataset)):
169
+ page_data = dataset.get_page(index)
170
+ img_dict = page_data.get_image()
171
+ img = img_dict['img']
172
+ page_width = img_dict['width']
173
+ page_height = img_dict['height']
165
174
  if start_page_id <= index <= end_page_id:
166
175
  page_start = time.time()
167
176
  result = custom_model(img)
168
177
  logger.info(f'-----page_id : {index}, page total time: {round(time.time() - page_start, 2)}-----')
169
178
  else:
170
179
  result = []
171
- page_info = {"page_no": index, "height": page_height, "width": page_width}
172
- page_dict = {"layout_dets": result, "page_info": page_info}
180
+
181
+ page_info = {'page_no': index, 'height': page_height, 'width': page_width}
182
+ page_dict = {'layout_dets': result, 'page_info': page_info}
173
183
  model_json.append(page_dict)
174
184
 
175
185
  gc_start = time.time()
176
- clean_memory()
186
+ clean_memory(get_device())
177
187
  gc_time = round(time.time() - gc_start, 2)
178
- logger.info(f"gc time: {gc_time}")
188
+ logger.info(f'gc time: {gc_time}')
179
189
 
180
190
  doc_analyze_time = round(time.time() - doc_analyze_start, 2)
181
- doc_analyze_speed = round( (end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
182
- logger.info(f"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
183
- f" speed: {doc_analyze_speed} pages/second")
191
+ doc_analyze_speed = round((end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
192
+ logger.info(
193
+ f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
194
+ f' speed: {doc_analyze_speed} pages/second'
195
+ )
184
196
 
185
- return model_json
197
+ return InferenceResult(model_json, dataset)