magic-pdf 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. magic_pdf/config/__init__.py +0 -0
  2. magic_pdf/config/enums.py +7 -0
  3. magic_pdf/config/exceptions.py +32 -0
  4. magic_pdf/data/__init__.py +0 -0
  5. magic_pdf/data/data_reader_writer/__init__.py +12 -0
  6. magic_pdf/data/data_reader_writer/base.py +51 -0
  7. magic_pdf/data/data_reader_writer/filebase.py +59 -0
  8. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
  9. magic_pdf/data/data_reader_writer/s3.py +69 -0
  10. magic_pdf/data/dataset.py +194 -0
  11. magic_pdf/data/io/__init__.py +0 -0
  12. magic_pdf/data/io/base.py +42 -0
  13. magic_pdf/data/io/http.py +37 -0
  14. magic_pdf/data/io/s3.py +114 -0
  15. magic_pdf/data/read_api.py +95 -0
  16. magic_pdf/data/schemas.py +15 -0
  17. magic_pdf/data/utils.py +32 -0
  18. magic_pdf/dict2md/ocr_mkcontent.py +74 -234
  19. magic_pdf/libs/Constants.py +21 -8
  20. magic_pdf/libs/MakeContentConfig.py +1 -0
  21. magic_pdf/libs/boxbase.py +54 -0
  22. magic_pdf/libs/clean_memory.py +10 -0
  23. magic_pdf/libs/config_reader.py +53 -23
  24. magic_pdf/libs/draw_bbox.py +150 -65
  25. magic_pdf/libs/ocr_content_type.py +2 -0
  26. magic_pdf/libs/version.py +1 -1
  27. magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
  28. magic_pdf/model/magic_model.py +418 -51
  29. magic_pdf/model/pdf_extract_kit.py +164 -80
  30. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
  31. magic_pdf/model/ppTableModel.py +2 -2
  32. magic_pdf/model/pp_structure_v2.py +5 -2
  33. magic_pdf/model/v3/__init__.py +0 -0
  34. magic_pdf/model/v3/helpers.py +125 -0
  35. magic_pdf/para/para_split_v3.py +296 -0
  36. magic_pdf/pdf_parse_by_ocr.py +6 -3
  37. magic_pdf/pdf_parse_by_txt.py +6 -3
  38. magic_pdf/pdf_parse_union_core_v2.py +644 -0
  39. magic_pdf/pipe/AbsPipe.py +5 -1
  40. magic_pdf/pipe/OCRPipe.py +10 -4
  41. magic_pdf/pipe/TXTPipe.py +10 -4
  42. magic_pdf/pipe/UNIPipe.py +16 -7
  43. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
  44. magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
  45. magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
  46. magic_pdf/resources/model_config/model_configs.yaml +5 -13
  47. magic_pdf/tools/cli.py +14 -1
  48. magic_pdf/tools/common.py +19 -9
  49. magic_pdf/user_api.py +25 -6
  50. magic_pdf/utils/__init__.py +0 -0
  51. magic_pdf/utils/annotations.py +11 -0
  52. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
  53. magic_pdf-0.9.0.dist-info/METADATA +507 -0
  54. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
  55. magic_pdf-0.8.0.dist-info/METADATA +0 -459
  56. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
  57. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
  58. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,46 +1,44 @@
1
- """
2
- 根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
3
-
4
- """
1
+ """根据bucket的名字返回对应的s3 AK, SK,endpoint三元组."""
5
2
 
6
3
  import json
7
4
  import os
8
5
 
9
6
  from loguru import logger
10
7
 
8
+ from magic_pdf.libs.Constants import MODEL_NAME
11
9
  from magic_pdf.libs.commons import parse_bucket_key
12
10
 
13
11
  # 定义配置文件名常量
14
- CONFIG_FILE_NAME = "magic-pdf.json"
12
+ CONFIG_FILE_NAME = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'magic-pdf.json')
15
13
 
16
14
 
17
15
  def read_config():
18
- home_dir = os.path.expanduser("~")
19
-
20
- config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
16
+ if os.path.isabs(CONFIG_FILE_NAME):
17
+ config_file = CONFIG_FILE_NAME
18
+ else:
19
+ home_dir = os.path.expanduser('~')
20
+ config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
21
21
 
22
22
  if not os.path.exists(config_file):
23
- raise FileNotFoundError(f"{config_file} not found")
23
+ raise FileNotFoundError(f'{config_file} not found')
24
24
 
25
- with open(config_file, "r", encoding="utf-8") as f:
25
+ with open(config_file, 'r', encoding='utf-8') as f:
26
26
  config = json.load(f)
27
27
  return config
28
28
 
29
29
 
30
30
  def get_s3_config(bucket_name: str):
31
- """
32
- ~/magic-pdf.json 读出来
33
- """
31
+ """~/magic-pdf.json 读出来."""
34
32
  config = read_config()
35
33
 
36
- bucket_info = config.get("bucket_info")
34
+ bucket_info = config.get('bucket_info')
37
35
  if bucket_name not in bucket_info:
38
- access_key, secret_key, storage_endpoint = bucket_info["[default]"]
36
+ access_key, secret_key, storage_endpoint = bucket_info['[default]']
39
37
  else:
40
38
  access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
41
39
 
42
40
  if access_key is None or secret_key is None or storage_endpoint is None:
43
- raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
41
+ raise Exception(f'ak, sk or endpoint not found in {CONFIG_FILE_NAME}')
44
42
 
45
43
  # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
46
44
 
@@ -49,7 +47,7 @@ def get_s3_config(bucket_name: str):
49
47
 
50
48
  def get_s3_config_dict(path: str):
51
49
  access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
52
- return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint}
50
+ return {'ak': access_key, 'sk': secret_key, 'endpoint': storage_endpoint}
53
51
 
54
52
 
55
53
  def get_bucket_name(path):
@@ -59,33 +57,65 @@ def get_bucket_name(path):
59
57
 
60
58
  def get_local_models_dir():
61
59
  config = read_config()
62
- models_dir = config.get("models-dir")
60
+ models_dir = config.get('models-dir')
63
61
  if models_dir is None:
64
62
  logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
65
- return "/tmp/models"
63
+ return '/tmp/models'
66
64
  else:
67
65
  return models_dir
68
66
 
69
67
 
68
+ def get_local_layoutreader_model_dir():
69
+ config = read_config()
70
+ layoutreader_model_dir = config.get('layoutreader-model-dir')
71
+ if layoutreader_model_dir is None or not os.path.exists(layoutreader_model_dir):
72
+ home_dir = os.path.expanduser('~')
73
+ layoutreader_at_modelscope_dir_path = os.path.join(home_dir, '.cache/modelscope/hub/ppaanngggg/layoutreader')
74
+ logger.warning(f"'layoutreader-model-dir' not exists, use {layoutreader_at_modelscope_dir_path} as default")
75
+ return layoutreader_at_modelscope_dir_path
76
+ else:
77
+ return layoutreader_model_dir
78
+
79
+
70
80
  def get_device():
71
81
  config = read_config()
72
- device = config.get("device-mode")
82
+ device = config.get('device-mode')
73
83
  if device is None:
74
84
  logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
75
- return "cpu"
85
+ return 'cpu'
76
86
  else:
77
87
  return device
78
88
 
79
89
 
80
90
  def get_table_recog_config():
81
91
  config = read_config()
82
- table_config = config.get("table-config")
92
+ table_config = config.get('table-config')
83
93
  if table_config is None:
84
94
  logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
85
- return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
95
+ return json.loads(f'{{"model": "{MODEL_NAME.TABLE_MASTER}","enable": false, "max_time": 400}}')
86
96
  else:
87
97
  return table_config
88
98
 
89
99
 
100
+ def get_layout_config():
101
+ config = read_config()
102
+ layout_config = config.get("layout-config")
103
+ if layout_config is None:
104
+ logger.warning(f"'layout-config' not found in {CONFIG_FILE_NAME}, use '{MODEL_NAME.LAYOUTLMv3}' as default")
105
+ return json.loads(f'{{"model": "{MODEL_NAME.LAYOUTLMv3}"}}')
106
+ else:
107
+ return layout_config
108
+
109
+
110
+ def get_formula_config():
111
+ config = read_config()
112
+ formula_config = config.get("formula-config")
113
+ if formula_config is None:
114
+ logger.warning(f"'formula-config' not found in {CONFIG_FILE_NAME}, use 'True' as default")
115
+ return json.loads(f'{{"mfd_model": "{MODEL_NAME.YOLO_V8_MFD}","mfr_model": "{MODEL_NAME.UniMerNet_v2_Small}","enable": true}}')
116
+ else:
117
+ return formula_config
118
+
119
+
90
120
  if __name__ == "__main__":
91
121
  ak, sk, endpoint = get_s3_config("llm-raw")
@@ -1,3 +1,4 @@
1
+ from magic_pdf.data.dataset import PymuDocDataset
1
2
  from magic_pdf.libs.commons import fitz # PyMuPDF
2
3
  from magic_pdf.libs.Constants import CROSS_PAGE
3
4
  from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
@@ -33,7 +34,7 @@ def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
33
34
  ) # Draw the rectangle
34
35
 
35
36
 
36
- def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
37
+ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
37
38
  new_rgb = []
38
39
  for item in rgb_config:
39
40
  item = float(item) / 255
@@ -42,31 +43,31 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
42
43
  for j, bbox in enumerate(page_data):
43
44
  x0, y0, x1, y1 = bbox
44
45
  rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
45
- if fill_config:
46
- page.draw_rect(
47
- rect_coords,
48
- color=None,
49
- fill=new_rgb,
50
- fill_opacity=0.3,
51
- width=0.5,
52
- overlay=True,
53
- ) # Draw the rectangle
54
- else:
55
- page.draw_rect(
56
- rect_coords,
57
- color=new_rgb,
58
- fill=None,
59
- fill_opacity=1,
60
- width=0.5,
61
- overlay=True,
62
- ) # Draw the rectangle
46
+ if draw_bbox:
47
+ if fill_config:
48
+ page.draw_rect(
49
+ rect_coords,
50
+ color=None,
51
+ fill=new_rgb,
52
+ fill_opacity=0.3,
53
+ width=0.5,
54
+ overlay=True,
55
+ ) # Draw the rectangle
56
+ else:
57
+ page.draw_rect(
58
+ rect_coords,
59
+ color=new_rgb,
60
+ fill=None,
61
+ fill_opacity=1,
62
+ width=0.5,
63
+ overlay=True,
64
+ ) # Draw the rectangle
63
65
  page.insert_text(
64
- (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
66
+ (x1 + 2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
65
67
  ) # Insert the index in the top left corner of the rectangle
66
68
 
67
69
 
68
70
  def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
69
- layout_bbox_list = []
70
71
  dropped_bbox_list = []
71
72
  tables_list, tables_body_list = [], []
72
73
  tables_caption_list, tables_footnote_list = [], []
@@ -75,17 +76,19 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
75
76
  titles_list = []
76
77
  texts_list = []
77
78
  interequations_list = []
79
+ lists_list = []
80
+ indexs_list = []
78
81
  for page in pdf_info:
79
- page_layout_list = []
82
+
80
83
  page_dropped_list = []
81
84
  tables, tables_body, tables_caption, tables_footnote = [], [], [], []
82
85
  imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
83
86
  titles = []
84
87
  texts = []
85
88
  interequations = []
86
- for layout in page['layout_bboxes']:
87
- page_layout_list.append(layout['layout_bbox'])
88
- layout_bbox_list.append(page_layout_list)
89
+ lists = []
90
+ indices = []
91
+
89
92
  for dropped_bbox in page['discarded_blocks']:
90
93
  page_dropped_list.append(dropped_bbox['bbox'])
91
94
  dropped_bbox_list.append(page_dropped_list)
@@ -117,6 +120,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
117
120
  texts.append(bbox)
118
121
  elif block['type'] == BlockType.InterlineEquation:
119
122
  interequations.append(bbox)
123
+ elif block['type'] == BlockType.List:
124
+ lists.append(bbox)
125
+ elif block['type'] == BlockType.Index:
126
+ indices.append(bbox)
127
+
120
128
  tables_list.append(tables)
121
129
  tables_body_list.append(tables_body)
122
130
  tables_caption_list.append(tables_caption)
@@ -128,30 +136,62 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
128
136
  titles_list.append(titles)
129
137
  texts_list.append(texts)
130
138
  interequations_list.append(interequations)
139
+ lists_list.append(lists)
140
+ indexs_list.append(indices)
141
+
142
+ layout_bbox_list = []
143
+
144
+ table_type_order = {
145
+ 'table_caption': 1,
146
+ 'table_body': 2,
147
+ 'table_footnote': 3
148
+ }
149
+ for page in pdf_info:
150
+ page_block_list = []
151
+ for block in page['para_blocks']:
152
+ if block['type'] in [
153
+ BlockType.Text,
154
+ BlockType.Title,
155
+ BlockType.InterlineEquation,
156
+ BlockType.List,
157
+ BlockType.Index,
158
+ ]:
159
+ bbox = block['bbox']
160
+ page_block_list.append(bbox)
161
+ elif block['type'] in [BlockType.Image]:
162
+ for sub_block in block['blocks']:
163
+ bbox = sub_block['bbox']
164
+ page_block_list.append(bbox)
165
+ elif block['type'] in [BlockType.Table]:
166
+ sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
167
+ for sub_block in sorted_blocks:
168
+ bbox = sub_block['bbox']
169
+ page_block_list.append(bbox)
170
+
171
+ layout_bbox_list.append(page_block_list)
131
172
 
132
173
  pdf_docs = fitz.open('pdf', pdf_bytes)
174
+
133
175
  for i, page in enumerate(pdf_docs):
134
- draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
135
- draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
136
- True)
137
- draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
138
- True) # color !
139
- draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
140
- True)
141
- draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
142
- True)
143
- draw_bbox_without_number(i, tables_footnote_list, page,
144
- [229, 255, 204], True)
145
- draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
176
+
177
+ draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
178
+ # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
179
+ draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
180
+ draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
181
+ draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
182
+ # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
146
183
  draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
147
- draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
148
- True)
149
- draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
150
- True),
184
+ draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
185
+ draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
151
186
  draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
152
187
  draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
153
- draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
154
- True)
188
+ draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
189
+ draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
190
+ draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
191
+
192
+ draw_bbox_with_number(
193
+ i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False
194
+ )
155
195
 
156
196
  # Save the PDF
157
197
  pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
@@ -209,11 +249,14 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
209
249
  page_dropped_list.append(span['bbox'])
210
250
  dropped_list.append(page_dropped_list)
211
251
  # 构造其余useful_list
212
- for block in page['para_blocks']:
252
+ # for block in page['para_blocks']: # span直接用分段合并前的结果就可以
253
+ for block in page['preproc_blocks']:
213
254
  if block['type'] in [
214
- BlockType.Text,
215
- BlockType.Title,
216
- BlockType.InterlineEquation,
255
+ BlockType.Text,
256
+ BlockType.Title,
257
+ BlockType.InterlineEquation,
258
+ BlockType.List,
259
+ BlockType.Index,
217
260
  ]:
218
261
  for line in block['lines']:
219
262
  for span in line['spans']:
@@ -232,10 +275,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
232
275
  for i, page in enumerate(pdf_docs):
233
276
  # 获取当前页面的数据
234
277
  draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
235
- draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0],
236
- False)
237
- draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255],
238
- False)
278
+ draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
279
+ draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
239
280
  draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
240
281
  draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
241
282
  draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
@@ -244,7 +285,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
244
285
  pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
245
286
 
246
287
 
247
- def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
288
+ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
248
289
  dropped_bbox_list = []
249
290
  tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
250
291
  imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
@@ -252,7 +293,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
252
293
  texts_list = []
253
294
  interequations_list = []
254
295
  pdf_docs = fitz.open('pdf', pdf_bytes)
255
- magic_model = MagicModel(model_list, pdf_docs)
296
+ magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
256
297
  for i in range(len(model_list)):
257
298
  page_dropped_list = []
258
299
  tables_body, tables_caption, tables_footnote = [], [], []
@@ -278,8 +319,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
278
319
  imgs_body.append(bbox)
279
320
  elif layout_det['category_id'] == CategoryId.ImageCaption:
280
321
  imgs_caption.append(bbox)
281
- elif layout_det[
282
- 'category_id'] == CategoryId.InterlineEquation_YOLO:
322
+ elif layout_det['category_id'] == CategoryId.InterlineEquation_YOLO:
283
323
  interequations.append(bbox)
284
324
  elif layout_det['category_id'] == CategoryId.Abandon:
285
325
  page_dropped_list.append(bbox)
@@ -298,21 +338,66 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
298
338
  imgs_footnote_list.append(imgs_footnote)
299
339
 
300
340
  for i, page in enumerate(pdf_docs):
301
- draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
302
- True) # color !
341
+ draw_bbox_with_number(
342
+ i, dropped_bbox_list, page, [158, 158, 158], True
343
+ ) # color !
303
344
  draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
304
- draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
305
- True)
306
- draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
307
- True)
345
+ draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
346
+ draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
308
347
  draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
309
- draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
310
- True)
311
- draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
312
- True)
348
+ draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
349
+ draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102], True)
313
350
  draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
314
351
  draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
315
352
  draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
316
353
 
317
354
  # Save the PDF
318
355
  pdf_docs.save(f'{out_path}/{filename}_model.pdf')
356
+
357
+
358
+ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
359
+ layout_bbox_list = []
360
+
361
+ for page in pdf_info:
362
+ page_line_list = []
363
+ for block in page['preproc_blocks']:
364
+ if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
365
+ for line in block['lines']:
366
+ bbox = line['bbox']
367
+ index = line['index']
368
+ page_line_list.append({'index': index, 'bbox': bbox})
369
+ if block['type'] in [BlockType.Image, BlockType.Table]:
370
+ for sub_block in block['blocks']:
371
+ if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
372
+ for line in sub_block['virtual_lines']:
373
+ bbox = line['bbox']
374
+ index = line['index']
375
+ page_line_list.append({'index': index, 'bbox': bbox})
376
+ elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
377
+ for line in sub_block['lines']:
378
+ bbox = line['bbox']
379
+ index = line['index']
380
+ page_line_list.append({'index': index, 'bbox': bbox})
381
+ sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
382
+ layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
383
+ pdf_docs = fitz.open('pdf', pdf_bytes)
384
+ for i, page in enumerate(pdf_docs):
385
+ draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
386
+
387
+ pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
388
+
389
+
390
+ def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
391
+ layout_bbox_list = []
392
+
393
+ for page in pdf_info:
394
+ page_block_list = []
395
+ for block in page['para_blocks']:
396
+ bbox = block['bbox']
397
+ page_block_list.append(bbox)
398
+ layout_bbox_list.append(page_block_list)
399
+ pdf_docs = fitz.open('pdf', pdf_bytes)
400
+ for i, page in enumerate(pdf_docs):
401
+ draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
402
+
403
+ pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')
@@ -20,6 +20,8 @@ class BlockType:
20
20
  InterlineEquation = 'interline_equation'
21
21
  Footnote = 'footnote'
22
22
  Discarded = 'discarded'
23
+ List = 'list'
24
+ Index = 'index'
23
25
 
24
26
 
25
27
  class CategoryId:
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.8.0"
1
+ __version__ = "0.9.0"
@@ -4,7 +4,9 @@ import fitz
4
4
  import numpy as np
5
5
  from loguru import logger
6
6
 
7
- from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
7
+ from magic_pdf.libs.clean_memory import clean_memory
8
+ from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config, get_layout_config, \
9
+ get_formula_config
8
10
  from magic_pdf.model.model_list import MODEL
9
11
  import magic_pdf.model as model_config
10
12
 
@@ -23,7 +25,7 @@ def remove_duplicates_dicts(lst):
23
25
  return unique_dicts
24
26
 
25
27
 
26
- def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
28
+ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
27
29
  try:
28
30
  from PIL import Image
29
31
  except ImportError:
@@ -32,18 +34,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
32
34
 
33
35
  images = []
34
36
  with fitz.open("pdf", pdf_bytes) as doc:
37
+ pdf_page_num = doc.page_count
38
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
39
+ if end_page_id > pdf_page_num - 1:
40
+ logger.warning("end_page_id is out of range, use images length")
41
+ end_page_id = pdf_page_num - 1
42
+
35
43
  for index in range(0, doc.page_count):
36
- page = doc[index]
37
- mat = fitz.Matrix(dpi / 72, dpi / 72)
38
- pm = page.get_pixmap(matrix=mat, alpha=False)
44
+ if start_page_id <= index <= end_page_id:
45
+ page = doc[index]
46
+ mat = fitz.Matrix(dpi / 72, dpi / 72)
47
+ pm = page.get_pixmap(matrix=mat, alpha=False)
48
+
49
+ # If the width or height exceeds 9000 after scaling, do not scale further.
50
+ if pm.width > 9000 or pm.height > 9000:
51
+ pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
39
52
 
40
- # If the width or height exceeds 9000 after scaling, do not scale further.
41
- if pm.width > 9000 or pm.height > 9000:
42
- pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
53
+ img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
54
+ img = np.array(img)
55
+ img_dict = {"img": img, "width": pm.width, "height": pm.height}
56
+ else:
57
+ img_dict = {"img": [], "width": 0, "height": 0}
43
58
 
44
- img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
45
- img = np.array(img)
46
- img_dict = {"img": img, "width": pm.width, "height": pm.height}
47
59
  images.append(img_dict)
48
60
  return images
49
61
 
@@ -57,14 +69,17 @@ class ModelSingleton:
57
69
  cls._instance = super().__new__(cls)
58
70
  return cls._instance
59
71
 
60
- def get_model(self, ocr: bool, show_log: bool):
61
- key = (ocr, show_log)
72
+ def get_model(self, ocr: bool, show_log: bool, lang=None, layout_model=None, formula_enable=None, table_enable=None):
73
+ key = (ocr, show_log, lang, layout_model, formula_enable, table_enable)
62
74
  if key not in self._models:
63
- self._models[key] = custom_model_init(ocr=ocr, show_log=show_log)
75
+ self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang, layout_model=layout_model,
76
+ formula_enable=formula_enable, table_enable=table_enable)
64
77
  return self._models[key]
65
78
 
66
79
 
67
- def custom_model_init(ocr: bool = False, show_log: bool = False):
80
+ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None,
81
+ layout_model=None, formula_enable=None, table_enable=None):
82
+
68
83
  model = None
69
84
 
70
85
  if model_config.__model_mode__ == "lite":
@@ -78,18 +93,36 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
78
93
  model_init_start = time.time()
79
94
  if model == MODEL.Paddle:
80
95
  from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
81
- custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
96
+ custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log, lang=lang)
82
97
  elif model == MODEL.PEK:
83
98
  from magic_pdf.model.pdf_extract_kit import CustomPEKModel
84
99
  # 从配置文件读取model-dir和device
85
100
  local_models_dir = get_local_models_dir()
86
101
  device = get_device()
102
+
103
+ layout_config = get_layout_config()
104
+ if layout_model is not None:
105
+ layout_config["model"] = layout_model
106
+
107
+ formula_config = get_formula_config()
108
+ if formula_enable is not None:
109
+ formula_config["enable"] = formula_enable
110
+
87
111
  table_config = get_table_recog_config()
88
- model_input = {"ocr": ocr,
89
- "show_log": show_log,
90
- "models_dir": local_models_dir,
91
- "device": device,
92
- "table_config": table_config}
112
+ if table_enable is not None:
113
+ table_config["enable"] = table_enable
114
+
115
+ model_input = {
116
+ "ocr": ocr,
117
+ "show_log": show_log,
118
+ "models_dir": local_models_dir,
119
+ "device": device,
120
+ "table_config": table_config,
121
+ "layout_config": layout_config,
122
+ "formula_config": formula_config,
123
+ "lang": lang,
124
+ }
125
+
93
126
  custom_model = CustomPEKModel(**model_input)
94
127
  else:
95
128
  logger.error("Not allow model_name!")
@@ -104,19 +137,23 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
104
137
 
105
138
 
106
139
  def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
107
- start_page_id=0, end_page_id=None):
140
+ start_page_id=0, end_page_id=None, lang=None,
141
+ layout_model=None, formula_enable=None, table_enable=None):
108
142
 
109
- model_manager = ModelSingleton()
110
- custom_model = model_manager.get_model(ocr, show_log)
143
+ if lang == "":
144
+ lang = None
111
145
 
112
- images = load_images_from_pdf(pdf_bytes)
146
+ model_manager = ModelSingleton()
147
+ custom_model = model_manager.get_model(ocr, show_log, lang, layout_model, formula_enable, table_enable)
113
148
 
114
- # end_page_id = end_page_id if end_page_id else len(images) - 1
115
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(images) - 1
149
+ with fitz.open("pdf", pdf_bytes) as doc:
150
+ pdf_page_num = doc.page_count
151
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
152
+ if end_page_id > pdf_page_num - 1:
153
+ logger.warning("end_page_id is out of range, use images length")
154
+ end_page_id = pdf_page_num - 1
116
155
 
117
- if end_page_id > len(images) - 1:
118
- logger.warning("end_page_id is out of range, use images length")
119
- end_page_id = len(images) - 1
156
+ images = load_images_from_pdf(pdf_bytes, start_page_id=start_page_id, end_page_id=end_page_id)
120
157
 
121
158
  model_json = []
122
159
  doc_analyze_start = time.time()
@@ -132,7 +169,15 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
132
169
  page_info = {"page_no": index, "height": page_height, "width": page_width}
133
170
  page_dict = {"layout_dets": result, "page_info": page_info}
134
171
  model_json.append(page_dict)
135
- doc_analyze_cost = time.time() - doc_analyze_start
136
- logger.info(f"doc analyze cost: {doc_analyze_cost}")
172
+
173
+ gc_start = time.time()
174
+ clean_memory()
175
+ gc_time = round(time.time() - gc_start, 2)
176
+ logger.info(f"gc time: {gc_time}")
177
+
178
+ doc_analyze_time = round(time.time() - doc_analyze_start, 2)
179
+ doc_analyze_speed = round( (end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
180
+ logger.info(f"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
181
+ f" speed: {doc_analyze_speed} pages/second")
137
182
 
138
183
  return model_json