magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +188 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +283 -166
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,285 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from loguru import logger
6
+
7
+ import magic_pdf.model as model_config
8
+ from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
9
+ from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
10
+ ElementRelation, ElementRelType,
11
+ LayoutElements,
12
+ LayoutElementsExtra, PageInfo)
13
+ from magic_pdf.libs.ocr_content_type import BlockType, ContentType
14
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
15
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
16
+ from magic_pdf.tools.common import do_parse, prepare_env
17
+
18
+
19
def _bbox_to_poly(bbox):
    """Expand an ``[x0, y0, x1, y1]`` box into a flat 8-value polygon.

    The corners are emitted in the order (x0, y0), (x1, y0), (x1, y1),
    (x0, y1).
    """
    x0, y0, x1, y1 = bbox
    return [x0, y0, x1, y0, x1, y1, x0, y1]


def _append_content(layout_dets, anno_id, category_type, bbox, **fields):
    """Build a ``ContentObject``, append it to ``layout_dets`` and return it.

    ``order`` is the element's index within ``layout_dets``, i.e. the
    per-page running position. Extra keyword arguments (``text``,
    ``image_path``, ...) are forwarded to ``ContentObject`` unchanged.
    """
    content = ContentObject(
        anno_id=anno_id,
        category_type=category_type,
        order=len(layout_dets),
        poly=_bbox_to_poly(bbox),
        **fields,
    )
    layout_dets.append(content)
    return content


def convert_middle_json_to_layout_elements(
    json_data: dict,
    output_dir: str,
) -> list[LayoutElements]:
    """Convert a parsed "middle JSON" document into per-page layout elements.

    Args:
        json_data: Parsed middle JSON. ``json_data['pdf_info']`` is iterated
            as the list of pages; each page must provide ``page_size``
            (``[width, height]``) and ``para_blocks``.
        output_dir: Directory that is joined in front of every span's
            ``image_path`` to build the image path stored on the element.

    Returns:
        One ``LayoutElements`` per page, in page order. ``anno_id`` values
        are unique across the whole document and start at 0; ``order``
        restarts at 0 on every page. Body/caption and body/footnote pairs of
        images and tables are linked through sibling ``ElementRelation``
        entries in ``extra.element_relation``.

    Block types other than text, title, interline equation, image and table
    are ignored.
    """
    # Block types that map one-to-one onto a text-bearing category.
    text_categories = {
        BlockType.Text: CategoryType.text,
        BlockType.Title: CategoryType.title,
        BlockType.InterlineEquation: CategoryType.interline_equation,
    }

    uniq_anno_id = 0

    res: list[LayoutElements] = []
    for page_no, page_data in enumerate(json_data['pdf_info']):
        page_info = PageInfo(
            height=int(page_data['page_size'][1]),
            width=int(page_data['page_size'][0]),
            page_no=page_no,
        )
        layout_dets: list[ContentObject] = []
        extra_element_relation: list[ElementRelation] = []

        for para_block in page_data['para_blocks']:
            para_type = para_block['type']

            if para_type in text_categories:
                para_text = merge_para_with_text(para_block)
                _append_content(
                    layout_dets,
                    uniq_anno_id,
                    text_categories[para_type],
                    para_block['bbox'],
                    text=para_text,
                )
                uniq_anno_id += 1

            elif para_type == BlockType.Image:
                body_anno_id, caption_anno_id = -1, -1

                for block in para_block['blocks']:
                    if block['type'] != BlockType.ImageBody:
                        continue
                    for line in block['lines']:
                        for span in line['spans']:
                            if span['type'] != ContentType.Image:
                                continue
                            _append_content(
                                layout_dets,
                                uniq_anno_id,
                                CategoryType.image_body,
                                block['bbox'],
                                image_path=os.path.join(
                                    output_dir, span['image_path']),
                            )
                            body_anno_id = uniq_anno_id
                            uniq_anno_id += 1

                for block in para_block['blocks']:
                    if block['type'] != BlockType.ImageCaption:
                        continue
                    # Each caption carries only its own text; it must not
                    # inherit text from previously visited sub-blocks.
                    _append_content(
                        layout_dets,
                        uniq_anno_id,
                        CategoryType.image_caption,
                        block['bbox'],
                        text=merge_para_with_text(block),
                    )
                    caption_anno_id = uniq_anno_id
                    uniq_anno_id += 1

                # -1 is the "not found" sentinel. 0 is a valid anno id (the
                # first element of the document), so compare against the
                # sentinel rather than testing for a positive value.
                if body_anno_id != -1 and caption_anno_id != -1:
                    extra_element_relation.append(
                        ElementRelation(
                            relation=ElementRelType.sibling,
                            source_anno_id=body_anno_id,
                            target_anno_id=caption_anno_id,
                        ))

            elif para_type == BlockType.Table:
                body_anno_id, caption_anno_id, footnote_anno_id = -1, -1, -1

                for block in para_block['blocks']:
                    if block['type'] != BlockType.TableCaption:
                        continue
                    _append_content(
                        layout_dets,
                        uniq_anno_id,
                        CategoryType.table_caption,
                        block['bbox'],
                        text=merge_para_with_text(block),
                    )
                    caption_anno_id = uniq_anno_id
                    uniq_anno_id += 1

                for block in para_block['blocks']:
                    if block['type'] != BlockType.TableBody:
                        continue
                    for line in block['lines']:
                        for span in line['spans']:
                            if span['type'] != ContentType.Table:
                                continue
                            # NOTE(review): the table body uses the bbox of
                            # the whole table para block, whereas the image
                            # body uses its own sub-block bbox. Kept as is;
                            # confirm whether block['bbox'] was intended.
                            content = _append_content(
                                layout_dets,
                                uniq_anno_id,
                                CategoryType.table_body,
                                para_block['bbox'],
                            )
                            body_anno_id = uniq_anno_id
                            uniq_anno_id += 1
                            # if processed by table model
                            if span.get('latex', ''):
                                content.latex = span['latex']
                            else:
                                content.image_path = os.path.join(
                                    output_dir, span['image_path'])

                for block in para_block['blocks']:
                    if block['type'] != BlockType.TableFootnote:
                        continue
                    # The footnote text is the footnote's own text only, not
                    # the caption text followed by the footnote text.
                    _append_content(
                        layout_dets,
                        uniq_anno_id,
                        CategoryType.table_footnote,
                        block['bbox'],
                        text=merge_para_with_text(block),
                    )
                    footnote_anno_id = uniq_anno_id
                    uniq_anno_id += 1

                if caption_anno_id != -1 and body_anno_id != -1:
                    extra_element_relation.append(
                        ElementRelation(
                            relation=ElementRelType.sibling,
                            source_anno_id=body_anno_id,
                            target_anno_id=caption_anno_id,
                        ))

                if footnote_anno_id != -1 and body_anno_id != -1:
                    extra_element_relation.append(
                        ElementRelation(
                            relation=ElementRelType.sibling,
                            source_anno_id=body_anno_id,
                            target_anno_id=footnote_anno_id,
                        ))

        res.append(
            LayoutElements(
                page_info=page_info,
                layout_dets=layout_dets,
                extra=LayoutElementsExtra(
                    element_relation=extra_element_relation),
            ))

    return res
212
+
213
+
214
def inference(path, output_dir, method):
    """Parse a single PDF and return its content as layout elements.

    Args:
        path: Path of the PDF file to parse.
        output_dir: Directory for parser artifacts. When empty, an
            ``output`` directory inside ``path`` (if ``path`` is a
            directory) or next to the file is used instead.
        method: Parse method forwarded to ``prepare_env`` and ``do_parse``
            (for example ``'ocr'``).

    Returns:
        The list produced by ``convert_middle_json_to_layout_elements`` for
        the parsed document, or ``None`` when reading, parsing or converting
        fails. Failures are logged with their traceback, not raised.

    Side effects:
        Sets ``__use_inside_model__`` and ``__model_mode__`` on
        ``magic_pdf.model`` and deletes the intermediate
        ``<stem>_middle.json`` after it has been loaded successfully.
    """
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    if not output_dir:
        if os.path.isdir(path):
            output_dir = os.path.join(path, 'output')
        else:
            output_dir = os.path.join(os.path.dirname(path), 'output')

    file_name = str(Path(path).stem)
    local_image_dir, local_md_dir = prepare_env(output_dir, file_name, method)

    try:
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        pdf_data = disk_rw.read(os.path.basename(path),
                                AbsReaderWriter.MODE_BIN)
        # Only the middle JSON is requested; every other dump/draw output is
        # switched off.
        do_parse(
            output_dir,
            file_name,
            pdf_data,
            [],
            method,
            False,
            f_draw_span_bbox=False,
            f_draw_layout_bbox=False,
            f_dump_md=False,
            f_dump_middle_json=True,
            f_dump_model_json=False,
            f_dump_orig_pdf=False,
            f_dump_content_list=False,
            f_draw_model_bbox=False,
        )

        middle_json_fn = os.path.join(local_md_dir,
                                      f'{file_name}_middle.json')
        # JSON text is UTF-8; without an explicit encoding open() falls back
        # to the locale default, which can fail on or mis-decode non-ASCII
        # document text on some platforms.
        with open(middle_json_fn, encoding='utf-8') as fd:
            jso = json.load(fd)
        os.remove(middle_json_fn)
        return convert_middle_json_to_layout_elements(jso, local_image_dir)

    except Exception as e:
        # Best effort by design: log the failure and signal it with None.
        logger.exception(e)
        return None
262
+
263
+
264
if __name__ == '__main__':
    import pprint

    # Manual smoke checks against locally available sample data.
    base_dir = '/opt/data/pdf/resources/samples/'

    # (enabled, file relative to base_dir): pre-generated middle JSON files
    # that can be converted without running the models. Both are disabled.
    middle_json_samples = (
        (False, 'json_outputs/middle.json'),
        (False, 'json_outputs/middle.3.json'),
    )
    for enabled, rel_path in middle_json_samples:
        if not enabled:
            continue
        with open(base_dir + rel_path) as fd:
            middle = json.load(fd)
        pprint.pp(convert_middle_json_to_layout_elements(middle, '/tmp'))

    # End-to-end run: parse a sample PDF with the OCR method and print the
    # resulting layout elements.
    run_inference_sample = True
    if run_inference_sample:
        pdf_path = base_dir + 'samples/pdf/one_page_with_table_image.pdf'
        layout = inference(
            pdf_path,
            '/tmp/output',
            'ocr',
        )
        pprint.pp(layout)