magic_pdf-0.7.1-py3-none-any.whl → magic_pdf-0.8.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34):
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +188 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +283 -166
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
- from magic_pdf.libs.Constants import CROSS_PAGE
2
1
  from magic_pdf.libs.commons import fitz # PyMuPDF
3
- from magic_pdf.libs.ocr_content_type import ContentType, BlockType, CategoryId
2
+ from magic_pdf.libs.Constants import CROSS_PAGE
3
+ from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
4
4
  from magic_pdf.model.magic_model import MagicModel
5
5
 
6
6
 
@@ -65,11 +65,13 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
65
65
  ) # Insert the index in the top left corner of the rectangle
66
66
 
67
67
 
68
- def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
68
+ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
69
69
  layout_bbox_list = []
70
70
  dropped_bbox_list = []
71
- tables_list, tables_body_list, tables_caption_list, tables_footnote_list = [], [], [], []
71
+ tables_list, tables_body_list = [], []
72
+ tables_caption_list, tables_footnote_list = [], []
72
73
  imgs_list, imgs_body_list, imgs_caption_list = [], [], []
74
+ imgs_footnote_list = []
73
75
  titles_list = []
74
76
  texts_list = []
75
77
  interequations_list = []
@@ -77,41 +79,43 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
77
79
  page_layout_list = []
78
80
  page_dropped_list = []
79
81
  tables, tables_body, tables_caption, tables_footnote = [], [], [], []
80
- imgs, imgs_body, imgs_caption = [], [], []
82
+ imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
81
83
  titles = []
82
84
  texts = []
83
85
  interequations = []
84
- for layout in page["layout_bboxes"]:
85
- page_layout_list.append(layout["layout_bbox"])
86
+ for layout in page['layout_bboxes']:
87
+ page_layout_list.append(layout['layout_bbox'])
86
88
  layout_bbox_list.append(page_layout_list)
87
- for dropped_bbox in page["discarded_blocks"]:
88
- page_dropped_list.append(dropped_bbox["bbox"])
89
+ for dropped_bbox in page['discarded_blocks']:
90
+ page_dropped_list.append(dropped_bbox['bbox'])
89
91
  dropped_bbox_list.append(page_dropped_list)
90
- for block in page["para_blocks"]:
91
- bbox = block["bbox"]
92
- if block["type"] == BlockType.Table:
92
+ for block in page['para_blocks']:
93
+ bbox = block['bbox']
94
+ if block['type'] == BlockType.Table:
93
95
  tables.append(bbox)
94
- for nested_block in block["blocks"]:
95
- bbox = nested_block["bbox"]
96
- if nested_block["type"] == BlockType.TableBody:
96
+ for nested_block in block['blocks']:
97
+ bbox = nested_block['bbox']
98
+ if nested_block['type'] == BlockType.TableBody:
97
99
  tables_body.append(bbox)
98
- elif nested_block["type"] == BlockType.TableCaption:
100
+ elif nested_block['type'] == BlockType.TableCaption:
99
101
  tables_caption.append(bbox)
100
- elif nested_block["type"] == BlockType.TableFootnote:
102
+ elif nested_block['type'] == BlockType.TableFootnote:
101
103
  tables_footnote.append(bbox)
102
- elif block["type"] == BlockType.Image:
104
+ elif block['type'] == BlockType.Image:
103
105
  imgs.append(bbox)
104
- for nested_block in block["blocks"]:
105
- bbox = nested_block["bbox"]
106
- if nested_block["type"] == BlockType.ImageBody:
106
+ for nested_block in block['blocks']:
107
+ bbox = nested_block['bbox']
108
+ if nested_block['type'] == BlockType.ImageBody:
107
109
  imgs_body.append(bbox)
108
- elif nested_block["type"] == BlockType.ImageCaption:
110
+ elif nested_block['type'] == BlockType.ImageCaption:
109
111
  imgs_caption.append(bbox)
110
- elif block["type"] == BlockType.Title:
112
+ elif nested_block['type'] == BlockType.ImageFootnote:
113
+ imgs_footnote.append(bbox)
114
+ elif block['type'] == BlockType.Title:
111
115
  titles.append(bbox)
112
- elif block["type"] == BlockType.Text:
116
+ elif block['type'] == BlockType.Text:
113
117
  texts.append(bbox)
114
- elif block["type"] == BlockType.InterlineEquation:
118
+ elif block['type'] == BlockType.InterlineEquation:
115
119
  interequations.append(bbox)
116
120
  tables_list.append(tables)
117
121
  tables_body_list.append(tables_body)
@@ -120,30 +124,40 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
120
124
  imgs_list.append(imgs)
121
125
  imgs_body_list.append(imgs_body)
122
126
  imgs_caption_list.append(imgs_caption)
127
+ imgs_footnote_list.append(imgs_footnote)
123
128
  titles_list.append(titles)
124
129
  texts_list.append(texts)
125
130
  interequations_list.append(interequations)
126
131
 
127
- pdf_docs = fitz.open("pdf", pdf_bytes)
132
+ pdf_docs = fitz.open('pdf', pdf_bytes)
128
133
  for i, page in enumerate(pdf_docs):
129
134
  draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
130
- draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
131
- draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
132
- draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
133
- draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
134
- draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
135
+ draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
136
+ True)
137
+ draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
138
+ True) # color !
139
+ draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
140
+ True)
141
+ draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
142
+ True)
143
+ draw_bbox_without_number(i, tables_footnote_list, page,
144
+ [229, 255, 204], True)
135
145
  draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
136
146
  draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
137
- draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
147
+ draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
148
+ True)
149
+ draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
150
+ True),
138
151
  draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
139
152
  draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
140
- draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
153
+ draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
154
+ True)
141
155
 
142
156
  # Save the PDF
143
- pdf_docs.save(f"{out_path}/layout.pdf")
157
+ pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
144
158
 
145
159
 
146
- def draw_span_bbox(pdf_info, pdf_bytes, out_path):
160
+ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
147
161
  text_list = []
148
162
  inline_equation_list = []
149
163
  interline_equation_list = []
@@ -154,22 +168,22 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
154
168
  next_page_inline_equation_list = []
155
169
 
156
170
  def get_span_info(span):
157
- if span["type"] == ContentType.Text:
171
+ if span['type'] == ContentType.Text:
158
172
  if span.get(CROSS_PAGE, False):
159
- next_page_text_list.append(span["bbox"])
173
+ next_page_text_list.append(span['bbox'])
160
174
  else:
161
- page_text_list.append(span["bbox"])
162
- elif span["type"] == ContentType.InlineEquation:
175
+ page_text_list.append(span['bbox'])
176
+ elif span['type'] == ContentType.InlineEquation:
163
177
  if span.get(CROSS_PAGE, False):
164
- next_page_inline_equation_list.append(span["bbox"])
178
+ next_page_inline_equation_list.append(span['bbox'])
165
179
  else:
166
- page_inline_equation_list.append(span["bbox"])
167
- elif span["type"] == ContentType.InterlineEquation:
168
- page_interline_equation_list.append(span["bbox"])
169
- elif span["type"] == ContentType.Image:
170
- page_image_list.append(span["bbox"])
171
- elif span["type"] == ContentType.Table:
172
- page_table_list.append(span["bbox"])
180
+ page_inline_equation_list.append(span['bbox'])
181
+ elif span['type'] == ContentType.InterlineEquation:
182
+ page_interline_equation_list.append(span['bbox'])
183
+ elif span['type'] == ContentType.Image:
184
+ page_image_list.append(span['bbox'])
185
+ elif span['type'] == ContentType.Table:
186
+ page_table_list.append(span['bbox'])
173
187
 
174
188
  for page in pdf_info:
175
189
  page_text_list = []
@@ -188,84 +202,89 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
188
202
  next_page_inline_equation_list.clear()
189
203
 
190
204
  # 构造dropped_list
191
- for block in page["discarded_blocks"]:
192
- if block["type"] == BlockType.Discarded:
193
- for line in block["lines"]:
194
- for span in line["spans"]:
195
- page_dropped_list.append(span["bbox"])
205
+ for block in page['discarded_blocks']:
206
+ if block['type'] == BlockType.Discarded:
207
+ for line in block['lines']:
208
+ for span in line['spans']:
209
+ page_dropped_list.append(span['bbox'])
196
210
  dropped_list.append(page_dropped_list)
197
211
  # 构造其余useful_list
198
- for block in page["para_blocks"]:
199
- if block["type"] in [
200
- BlockType.Text,
201
- BlockType.Title,
202
- BlockType.InterlineEquation,
212
+ for block in page['para_blocks']:
213
+ if block['type'] in [
214
+ BlockType.Text,
215
+ BlockType.Title,
216
+ BlockType.InterlineEquation,
203
217
  ]:
204
- for line in block["lines"]:
205
- for span in line["spans"]:
218
+ for line in block['lines']:
219
+ for span in line['spans']:
206
220
  get_span_info(span)
207
- elif block["type"] in [BlockType.Image, BlockType.Table]:
208
- for sub_block in block["blocks"]:
209
- for line in sub_block["lines"]:
210
- for span in line["spans"]:
221
+ elif block['type'] in [BlockType.Image, BlockType.Table]:
222
+ for sub_block in block['blocks']:
223
+ for line in sub_block['lines']:
224
+ for span in line['spans']:
211
225
  get_span_info(span)
212
226
  text_list.append(page_text_list)
213
227
  inline_equation_list.append(page_inline_equation_list)
214
228
  interline_equation_list.append(page_interline_equation_list)
215
229
  image_list.append(page_image_list)
216
230
  table_list.append(page_table_list)
217
- pdf_docs = fitz.open("pdf", pdf_bytes)
231
+ pdf_docs = fitz.open('pdf', pdf_bytes)
218
232
  for i, page in enumerate(pdf_docs):
219
233
  # 获取当前页面的数据
220
234
  draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
221
- draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
222
- draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
235
+ draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0],
236
+ False)
237
+ draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255],
238
+ False)
223
239
  draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
224
240
  draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
225
241
  draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
226
242
 
227
243
  # Save the PDF
228
- pdf_docs.save(f"{out_path}/spans.pdf")
244
+ pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
229
245
 
230
246
 
231
- def drow_model_bbox(model_list: list, pdf_bytes, out_path):
247
+ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
232
248
  dropped_bbox_list = []
233
249
  tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
234
- imgs_body_list, imgs_caption_list = [], []
250
+ imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
235
251
  titles_list = []
236
252
  texts_list = []
237
253
  interequations_list = []
238
- pdf_docs = fitz.open("pdf", pdf_bytes)
254
+ pdf_docs = fitz.open('pdf', pdf_bytes)
239
255
  magic_model = MagicModel(model_list, pdf_docs)
240
256
  for i in range(len(model_list)):
241
257
  page_dropped_list = []
242
258
  tables_body, tables_caption, tables_footnote = [], [], []
243
- imgs_body, imgs_caption = [], []
259
+ imgs_body, imgs_caption, imgs_footnote = [], [], []
244
260
  titles = []
245
261
  texts = []
246
262
  interequations = []
247
263
  page_info = magic_model.get_model_list(i)
248
- layout_dets = page_info["layout_dets"]
264
+ layout_dets = page_info['layout_dets']
249
265
  for layout_det in layout_dets:
250
- bbox = layout_det["bbox"]
251
- if layout_det["category_id"] == CategoryId.Text:
266
+ bbox = layout_det['bbox']
267
+ if layout_det['category_id'] == CategoryId.Text:
252
268
  texts.append(bbox)
253
- elif layout_det["category_id"] == CategoryId.Title:
269
+ elif layout_det['category_id'] == CategoryId.Title:
254
270
  titles.append(bbox)
255
- elif layout_det["category_id"] == CategoryId.TableBody:
271
+ elif layout_det['category_id'] == CategoryId.TableBody:
256
272
  tables_body.append(bbox)
257
- elif layout_det["category_id"] == CategoryId.TableCaption:
273
+ elif layout_det['category_id'] == CategoryId.TableCaption:
258
274
  tables_caption.append(bbox)
259
- elif layout_det["category_id"] == CategoryId.TableFootnote:
275
+ elif layout_det['category_id'] == CategoryId.TableFootnote:
260
276
  tables_footnote.append(bbox)
261
- elif layout_det["category_id"] == CategoryId.ImageBody:
277
+ elif layout_det['category_id'] == CategoryId.ImageBody:
262
278
  imgs_body.append(bbox)
263
- elif layout_det["category_id"] == CategoryId.ImageCaption:
279
+ elif layout_det['category_id'] == CategoryId.ImageCaption:
264
280
  imgs_caption.append(bbox)
265
- elif layout_det["category_id"] == CategoryId.InterlineEquation_YOLO:
281
+ elif layout_det[
282
+ 'category_id'] == CategoryId.InterlineEquation_YOLO:
266
283
  interequations.append(bbox)
267
- elif layout_det["category_id"] == CategoryId.Abandon:
284
+ elif layout_det['category_id'] == CategoryId.Abandon:
268
285
  page_dropped_list.append(bbox)
286
+ elif layout_det['category_id'] == CategoryId.ImageFootnote:
287
+ imgs_footnote.append(bbox)
269
288
 
270
289
  tables_body_list.append(tables_body)
271
290
  tables_caption_list.append(tables_caption)
@@ -276,17 +295,24 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path):
276
295
  texts_list.append(texts)
277
296
  interequations_list.append(interequations)
278
297
  dropped_bbox_list.append(page_dropped_list)
298
+ imgs_footnote_list.append(imgs_footnote)
279
299
 
280
300
  for i, page in enumerate(pdf_docs):
281
- draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158], True) # color !
301
+ draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
302
+ True) # color !
282
303
  draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
283
- draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
284
- draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
304
+ draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
305
+ True)
306
+ draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
307
+ True)
285
308
  draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
286
- draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
309
+ draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
310
+ True)
311
+ draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
312
+ True)
287
313
  draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
288
314
  draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
289
315
  draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
290
316
 
291
317
  # Save the PDF
292
- pdf_docs.save(f"{out_path}/model.pdf")
318
+ pdf_docs.save(f'{out_path}/{filename}_model.pdf')
@@ -1,23 +1,25 @@
1
1
  class ContentType:
2
- Image = "image"
3
- Table = "table"
4
- Text = "text"
5
- InlineEquation = "inline_equation"
6
- InterlineEquation = "interline_equation"
7
-
2
+ Image = 'image'
3
+ Table = 'table'
4
+ Text = 'text'
5
+ InlineEquation = 'inline_equation'
6
+ InterlineEquation = 'interline_equation'
7
+
8
+
8
9
  class BlockType:
9
- Image = "image"
10
- ImageBody = "image_body"
11
- ImageCaption = "image_caption"
12
- Table = "table"
13
- TableBody = "table_body"
14
- TableCaption = "table_caption"
15
- TableFootnote = "table_footnote"
16
- Text = "text"
17
- Title = "title"
18
- InterlineEquation = "interline_equation"
19
- Footnote = "footnote"
20
- Discarded = "discarded"
10
+ Image = 'image'
11
+ ImageBody = 'image_body'
12
+ ImageCaption = 'image_caption'
13
+ ImageFootnote = 'image_footnote'
14
+ Table = 'table'
15
+ TableBody = 'table_body'
16
+ TableCaption = 'table_caption'
17
+ TableFootnote = 'table_footnote'
18
+ Text = 'text'
19
+ Title = 'title'
20
+ InterlineEquation = 'interline_equation'
21
+ Footnote = 'footnote'
22
+ Discarded = 'discarded'
21
23
 
22
24
 
23
25
  class CategoryId:
@@ -33,3 +35,4 @@ class CategoryId:
33
35
  InlineEquation = 13
34
36
  InterlineEquation_YOLO = 14
35
37
  OcrText = 15
38
+ ImageFootnote = 101
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.7.1"
1
+ __version__ = "0.8.1"
@@ -103,20 +103,32 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
103
103
  return custom_model
104
104
 
105
105
 
106
- def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
106
+ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
107
+ start_page_id=0, end_page_id=None):
107
108
 
108
109
  model_manager = ModelSingleton()
109
110
  custom_model = model_manager.get_model(ocr, show_log)
110
111
 
111
112
  images = load_images_from_pdf(pdf_bytes)
112
113
 
114
+ # end_page_id = end_page_id if end_page_id else len(images) - 1
115
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(images) - 1
116
+
117
+ if end_page_id > len(images) - 1:
118
+ logger.warning("end_page_id is out of range, use images length")
119
+ end_page_id = len(images) - 1
120
+
113
121
  model_json = []
114
122
  doc_analyze_start = time.time()
123
+
115
124
  for index, img_dict in enumerate(images):
116
125
  img = img_dict["img"]
117
126
  page_width = img_dict["width"]
118
127
  page_height = img_dict["height"]
119
- result = custom_model(img)
128
+ if start_page_id <= index <= end_page_id:
129
+ result = custom_model(img)
130
+ else:
131
+ result = []
120
132
  page_info = {"page_no": index, "height": page_height, "width": page_width}
121
133
  page_dict = {"layout_dets": result, "page_info": page_info}
122
134
  model_json.append(page_dict)