magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +5 -5
  12. magic_pdf/libs/draw_bbox.py +3 -2
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +122 -76
  18. magic_pdf/model/sub_modules/model_init.py +40 -35
  19. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  21. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  22. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  23. magic_pdf/para/para_split.py +411 -248
  24. magic_pdf/para/para_split_v2.py +352 -182
  25. magic_pdf/para/para_split_v3.py +110 -53
  26. magic_pdf/pdf_parse_by_ocr.py +2 -0
  27. magic_pdf/pdf_parse_by_txt.py +2 -0
  28. magic_pdf/pdf_parse_union_core.py +174 -100
  29. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  30. magic_pdf/pipe/AbsPipe.py +28 -44
  31. magic_pdf/pipe/OCRPipe.py +5 -5
  32. magic_pdf/pipe/TXTPipe.py +5 -6
  33. magic_pdf/pipe/UNIPipe.py +24 -25
  34. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  35. magic_pdf/pre_proc/cut_image.py +9 -11
  36. magic_pdf/pre_proc/equations_replace.py +203 -212
  37. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  38. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  39. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  40. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  41. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  42. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  43. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  44. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  45. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  46. magic_pdf/spark/spark_api.py +15 -17
  47. magic_pdf/tools/cli.py +3 -4
  48. magic_pdf/tools/cli_dev.py +6 -9
  49. magic_pdf/tools/common.py +26 -36
  50. magic_pdf/user_api.py +29 -38
  51. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
  53. magic_pdf/libs/Constants.py +0 -55
  54. magic_pdf/libs/MakeContentConfig.py +0 -11
  55. magic_pdf/libs/drop_reason.py +0 -27
  56. magic_pdf/libs/drop_tag.py +0 -19
  57. magic_pdf/para/para_pipeline.py +0 -297
  58. magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  59. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  60. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,11 @@
1
1
  import math
2
+
2
3
  from loguru import logger
3
4
 
4
- from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
5
+ from magic_pdf.config.ocr_content_type import ContentType
6
+ from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
7
+ find_top_nearest_text_bbox)
5
8
  from magic_pdf.libs.commons import join_path
6
- from magic_pdf.libs.ocr_content_type import ContentType
7
9
 
8
10
  TYPE_INLINE_EQUATION = ContentType.InlineEquation
9
11
  TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
@@ -12,33 +14,30 @@ UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
12
14
 
13
15
  @DeprecationWarning
14
16
  def mk_nlp_markdown_1(para_dict: dict):
15
- """
16
- 对排序后的bboxes拼接内容
17
- """
17
+ """对排序后的bboxes拼接内容."""
18
18
  content_lst = []
19
19
  for _, page_info in para_dict.items():
20
- para_blocks = page_info.get("para_blocks")
20
+ para_blocks = page_info.get('para_blocks')
21
21
  if not para_blocks:
22
22
  continue
23
23
 
24
24
  for block in para_blocks:
25
- item = block["paras"]
25
+ item = block['paras']
26
26
  for _, p in item.items():
27
- para_text = p["para_text"]
28
- is_title = p["is_para_title"]
27
+ para_text = p['para_text']
28
+ is_title = p['is_para_title']
29
29
  title_level = p['para_title_level']
30
- md_title_prefix = "#"*title_level
30
+ md_title_prefix = '#' * title_level
31
31
  if is_title:
32
- content_lst.append(f"{md_title_prefix} {para_text}")
32
+ content_lst.append(f'{md_title_prefix} {para_text}')
33
33
  else:
34
34
  content_lst.append(para_text)
35
35
 
36
- content_text = "\n\n".join(content_lst)
36
+ content_text = '\n\n'.join(content_lst)
37
37
 
38
38
  return content_text
39
39
 
40
40
 
41
-
42
41
  # 找到目标字符串在段落中的索引
43
42
  def __find_index(paragraph, target):
44
43
  index = paragraph.find(target)
@@ -48,69 +47,76 @@ def __find_index(paragraph, target):
48
47
  return None
49
48
 
50
49
 
51
- def __insert_string(paragraph, target, postion):
52
- new_paragraph = paragraph[:postion] + target + paragraph[postion:]
50
+ def __insert_string(paragraph, target, position):
51
+ new_paragraph = paragraph[:position] + target + paragraph[position:]
53
52
  return new_paragraph
54
53
 
55
54
 
56
55
  def __insert_after(content, image_content, target):
57
- """
58
- 在content中找到target,将image_content插入到target后面
59
- """
56
+ """在content中找到target,将image_content插入到target后面."""
60
57
  index = content.find(target)
61
58
  if index != -1:
62
- content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
59
+ content = (
60
+ content[: index + len(target)]
61
+ + '\n\n'
62
+ + image_content
63
+ + '\n\n'
64
+ + content[index + len(target) :]
65
+ )
63
66
  else:
64
- logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
67
+ logger.error(
68
+ f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
69
+ )
65
70
  return content
66
71
 
72
+
67
73
  def __insert_before(content, image_content, target):
68
- """
69
- 在content中找到target,将image_content插入到target前面
70
- """
74
+ """在content中找到target,将image_content插入到target前面."""
71
75
  index = content.find(target)
72
76
  if index != -1:
73
- content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
77
+ content = content[:index] + '\n\n' + image_content + '\n\n' + content[index:]
74
78
  else:
75
- logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
79
+ logger.error(
80
+ f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
81
+ )
76
82
  return content
77
83
 
78
84
 
79
85
  @DeprecationWarning
80
86
  def mk_mm_markdown_1(para_dict: dict):
81
- """拼装多模态markdown"""
87
+ """拼装多模态markdown."""
82
88
  content_lst = []
83
89
  for _, page_info in para_dict.items():
84
- page_lst = [] # 一个page内的段落列表
85
- para_blocks = page_info.get("para_blocks")
86
- pymu_raw_blocks = page_info.get("preproc_blocks")
87
-
90
+ page_lst = [] # 一个page内的段落列表
91
+ para_blocks = page_info.get('para_blocks')
92
+ pymu_raw_blocks = page_info.get('preproc_blocks')
93
+
88
94
  all_page_images = []
89
- all_page_images.extend(page_info.get("images",[]))
90
- all_page_images.extend(page_info.get("image_backup", []) )
91
- all_page_images.extend(page_info.get("tables",[]))
92
- all_page_images.extend(page_info.get("table_backup",[]) )
93
-
94
- if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
95
+ all_page_images.extend(page_info.get('images', []))
96
+ all_page_images.extend(page_info.get('image_backup', []))
97
+ all_page_images.extend(page_info.get('tables', []))
98
+ all_page_images.extend(page_info.get('table_backup', []))
99
+
100
+ if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
95
101
  for img in all_page_images:
96
- page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
97
- page_md = "\n\n".join(page_lst)
98
-
102
+ page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
103
+ page_md = '\n\n'.join(page_lst)
104
+
99
105
  else:
100
106
  for block in para_blocks:
101
- item = block["paras"]
107
+ item = block['paras']
102
108
  for _, p in item.items():
103
- para_text = p["para_text"]
104
- is_title = p["is_para_title"]
109
+ para_text = p['para_text']
110
+ is_title = p['is_para_title']
105
111
  title_level = p['para_title_level']
106
- md_title_prefix = "#"*title_level
112
+ md_title_prefix = '#' * title_level
107
113
  if is_title:
108
- page_lst.append(f"{md_title_prefix} {para_text}")
114
+ page_lst.append(f'{md_title_prefix} {para_text}')
109
115
  else:
110
116
  page_lst.append(para_text)
111
-
117
+
112
118
  """拼装成一个页面的文本"""
113
- page_md = "\n\n".join(page_lst)
119
+ page_md = '\n\n'.join(page_lst)
114
120
  """插入图片"""
115
121
  for img in all_page_images:
116
122
  imgbox = img['bbox']
@@ -118,192 +124,215 @@ def mk_mm_markdown_1(para_dict: dict):
118
124
  # 先看在哪个block内
119
125
  for block in pymu_raw_blocks:
120
126
  bbox = block['bbox']
121
- if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内
122
- for l in block['lines']:
127
+ if (
128
+ bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
129
+ and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
130
+ ): # 确定在block内
131
+ for l in block['lines']: # noqa: E741
123
132
  line_box = l['bbox']
124
- if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的,插入line前面
125
- line_txt = "".join([s['text'] for s in l['spans']])
126
- page_md = __insert_before(page_md, img_content, line_txt)
133
+ if (
134
+ line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
135
+ and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
136
+ ): # 在line内的,插入line前面
137
+ line_txt = ''.join([s['text'] for s in l['spans']])
138
+ page_md = __insert_before(
139
+ page_md, img_content, line_txt
140
+ )
127
141
  break
128
142
  break
129
- else:# 在行与行之间
143
+ else: # 在行与行之间
130
144
  # 找到图片x0,y0与line的x0,y0最近的line
131
145
  min_distance = 100000
132
146
  min_line = None
133
- for l in block['lines']:
147
+ for l in block['lines']: # noqa: E741
134
148
  line_box = l['bbox']
135
- distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
149
+ distance = math.sqrt(
150
+ (line_box[0] - imgbox[0]) ** 2
151
+ + (line_box[1] - imgbox[1]) ** 2
152
+ )
136
153
  if distance < min_distance:
137
154
  min_distance = distance
138
155
  min_line = l
139
156
  if min_line:
140
- line_txt = "".join([s['text'] for s in min_line['spans']])
157
+ line_txt = ''.join(
158
+ [s['text'] for s in min_line['spans']]
159
+ )
141
160
  img_h = imgbox[3] - imgbox[1]
142
- if min_distance<img_h: # 文字在图片前面
143
- page_md = __insert_after(page_md, img_content, line_txt)
161
+ if min_distance < img_h: # 文字在图片前面
162
+ page_md = __insert_after(
163
+ page_md, img_content, line_txt
164
+ )
144
165
  else:
145
- page_md = __insert_before(page_md, img_content, line_txt)
166
+ page_md = __insert_before(
167
+ page_md, img_content, line_txt
168
+ )
146
169
  else:
147
- logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
148
- else:# 应当在两个block之间
170
+ logger.error(
171
+ f"Can't find the location of image {img['image_path']} in the markdown file #1"
172
+ )
173
+ else: # 应当在两个block之间
149
174
  # 找到上方最近的block,如果上方没有就找大下方最近的block
150
175
  top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
151
176
  if top_txt_block:
152
- line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
177
+ line_txt = ''.join(
178
+ [s['text'] for s in top_txt_block['lines'][-1]['spans']]
179
+ )
153
180
  page_md = __insert_after(page_md, img_content, line_txt)
154
181
  else:
155
- bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
182
+ bottom_txt_block = find_bottom_nearest_text_bbox(
183
+ pymu_raw_blocks, imgbox
184
+ )
156
185
  if bottom_txt_block:
157
- line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
186
+ line_txt = ''.join(
187
+ [
188
+ s['text']
189
+ for s in bottom_txt_block['lines'][0]['spans']
190
+ ]
191
+ )
158
192
  page_md = __insert_before(page_md, img_content, line_txt)
159
193
  else:
160
- logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")
161
-
194
+ logger.error(
195
+ f"Can't find the location of image {img['image_path']} in the markdown file #2"
196
+ )
197
+
162
198
  content_lst.append(page_md)
163
-
199
+
164
200
  """拼装成全部页面的文本"""
165
- content_text = "\n\n".join(content_lst)
201
+ content_text = '\n\n'.join(content_lst)
166
202
 
167
203
  return content_text
168
204
 
169
205
 
170
206
  def __insert_after_para(text, type, element, content_list):
171
- """
172
- 在content_list中找到text,将image_path作为一个新的node插入到text后面
173
- """
207
+ """在content_list中找到text,将image_path作为一个新的node插入到text后面."""
174
208
  for i, c in enumerate(content_list):
175
- content_type = c.get("type")
176
- if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
177
- if type == "image":
209
+ content_type = c.get('type')
210
+ if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
211
+ if type == 'image':
178
212
  content_node = {
179
- "type": "image",
180
- "img_path": element.get("image_path"),
181
- "img_alt": "",
182
- "img_title": "",
183
- "img_caption": "",
213
+ 'type': 'image',
214
+ 'img_path': element.get('image_path'),
215
+ 'img_alt': '',
216
+ 'img_title': '',
217
+ 'img_caption': '',
184
218
  }
185
- elif type == "table":
219
+ elif type == 'table':
186
220
  content_node = {
187
- "type": "table",
188
- "img_path": element.get("image_path"),
189
- "table_latex": element.get("text"),
190
- "table_title": "",
191
- "table_caption": "",
192
- "table_quality": element.get("quality"),
221
+ 'type': 'table',
222
+ 'img_path': element.get('image_path'),
223
+ 'table_latex': element.get('text'),
224
+ 'table_title': '',
225
+ 'table_caption': '',
226
+ 'table_quality': element.get('quality'),
193
227
  }
194
- content_list.insert(i+1, content_node)
228
+ content_list.insert(i + 1, content_node)
195
229
  break
196
230
  else:
197
- logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
198
-
231
+ logger.error(
232
+ f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
233
+ )
199
234
 
200
235
 
201
236
  def __insert_before_para(text, type, element, content_list):
202
- """
203
- 在content_list中找到text,将image_path作为一个新的node插入到text前面
204
- """
237
+ """在content_list中找到text,将image_path作为一个新的node插入到text前面."""
205
238
  for i, c in enumerate(content_list):
206
- content_type = c.get("type")
207
- if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
208
- if type == "image":
239
+ content_type = c.get('type')
240
+ if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
241
+ if type == 'image':
209
242
  content_node = {
210
- "type": "image",
211
- "img_path": element.get("image_path"),
212
- "img_alt": "",
213
- "img_title": "",
214
- "img_caption": "",
243
+ 'type': 'image',
244
+ 'img_path': element.get('image_path'),
245
+ 'img_alt': '',
246
+ 'img_title': '',
247
+ 'img_caption': '',
215
248
  }
216
- elif type == "table":
249
+ elif type == 'table':
217
250
  content_node = {
218
- "type": "table",
219
- "img_path": element.get("image_path"),
220
- "table_latex": element.get("text"),
221
- "table_title": "",
222
- "table_caption": "",
223
- "table_quality": element.get("quality"),
251
+ 'type': 'table',
252
+ 'img_path': element.get('image_path'),
253
+ 'table_latex': element.get('text'),
254
+ 'table_title': '',
255
+ 'table_caption': '',
256
+ 'table_quality': element.get('quality'),
224
257
  }
225
258
  content_list.insert(i, content_node)
226
259
  break
227
260
  else:
228
- logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
229
-
261
+ logger.error(
262
+ f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
263
+ )
264
+
230
265
 
231
266
  def mk_universal_format(pdf_info_list: list, img_buket_path):
232
- """
233
- 构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
234
- """
267
+ """构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY."""
235
268
  content_lst = []
236
269
  for page_info in pdf_info_list:
237
- page_lst = [] # 一个page内的段落列表
238
- para_blocks = page_info.get("para_blocks")
239
- pymu_raw_blocks = page_info.get("preproc_blocks")
240
-
270
+ page_lst = [] # 一个page内的段落列表
271
+ para_blocks = page_info.get('para_blocks')
272
+ pymu_raw_blocks = page_info.get('preproc_blocks')
273
+
241
274
  all_page_images = []
242
- all_page_images.extend(page_info.get("images",[]))
243
- all_page_images.extend(page_info.get("image_backup", []) )
275
+ all_page_images.extend(page_info.get('images', []))
276
+ all_page_images.extend(page_info.get('image_backup', []))
244
277
  # all_page_images.extend(page_info.get("tables",[]))
245
278
  # all_page_images.extend(page_info.get("table_backup",[]) )
246
279
  all_page_tables = []
247
- all_page_tables.extend(page_info.get("tables", []))
280
+ all_page_tables.extend(page_info.get('tables', []))
248
281
 
249
- if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
282
+ if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
250
283
  for img in all_page_images:
251
284
  content_node = {
252
- "type": "image",
253
- "img_path": join_path(img_buket_path, img['image_path']),
254
- "img_alt":"",
255
- "img_title":"",
256
- "img_caption":""
285
+ 'type': 'image',
286
+ 'img_path': join_path(img_buket_path, img['image_path']),
287
+ 'img_alt': '',
288
+ 'img_title': '',
289
+ 'img_caption': '',
257
290
  }
258
- page_lst.append(content_node) # TODO 图片顺序
291
+ page_lst.append(content_node) # TODO 图片顺序
259
292
  for table in all_page_tables:
260
293
  content_node = {
261
- "type": "table",
262
- "img_path": join_path(img_buket_path, table['image_path']),
263
- "table_latex": table.get("text"),
264
- "table_title": "",
265
- "table_caption": "",
266
- "table_quality": table.get("quality"),
294
+ 'type': 'table',
295
+ 'img_path': join_path(img_buket_path, table['image_path']),
296
+ 'table_latex': table.get('text'),
297
+ 'table_title': '',
298
+ 'table_caption': '',
299
+ 'table_quality': table.get('quality'),
267
300
  }
268
- page_lst.append(content_node) # TODO 图片顺序
301
+ page_lst.append(content_node) # TODO 图片顺序
269
302
  else:
270
303
  for block in para_blocks:
271
- item = block["paras"]
304
+ item = block['paras']
272
305
  for _, p in item.items():
273
- font_type = p['para_font_type']# 对于文本来说,要么是普通文本,要么是个行间公式
306
+ font_type = p[
307
+ 'para_font_type'
308
+ ] # 对于文本来说,要么是普通文本,要么是个行间公式
274
309
  if font_type == TYPE_INTERLINE_EQUATION:
275
- content_node = {
276
- "type": "equation",
277
- "latex": p["para_text"]
278
- }
310
+ content_node = {'type': 'equation', 'latex': p['para_text']}
279
311
  page_lst.append(content_node)
280
312
  else:
281
- para_text = p["para_text"]
282
- is_title = p["is_para_title"]
313
+ para_text = p['para_text']
314
+ is_title = p['is_para_title']
283
315
  title_level = p['para_title_level']
284
-
316
+
285
317
  if is_title:
286
318
  content_node = {
287
- "type": f"h{title_level}",
288
- "text": para_text
319
+ 'type': f'h{title_level}',
320
+ 'text': para_text,
289
321
  }
290
322
  page_lst.append(content_node)
291
323
  else:
292
- content_node = {
293
- "type": "text",
294
- "text": para_text
295
- }
324
+ content_node = {'type': 'text', 'text': para_text}
296
325
  page_lst.append(content_node)
297
-
326
+
298
327
  content_lst.extend(page_lst)
299
-
328
+
300
329
  """插入图片"""
301
330
  for img in all_page_images:
302
- insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
331
+ insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
303
332
 
304
333
  """插入表格"""
305
334
  for table in all_page_tables:
306
- insert_img_or_table("table", table, pymu_raw_blocks, content_lst)
335
+ insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
307
336
  # end for
308
337
  return content_lst
309
338
 
@@ -313,13 +342,17 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
313
342
  # 先看在哪个block内
314
343
  for block in pymu_raw_blocks:
315
344
  bbox = block['bbox']
316
- if bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1 and bbox[1] - 1 <= element_bbox[1] < bbox[
317
- 3] + 1: # 确定在这个大的block内,然后进入逐行比较距离
318
- for l in block['lines']:
345
+ if (
346
+ bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
347
+ and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
348
+ ): # 确定在这个大的block内,然后进入逐行比较距离
349
+ for l in block['lines']: # noqa: E741
319
350
  line_box = l['bbox']
320
- if line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1 and line_box[1] - 1 <= element_bbox[1] < line_box[
321
- 3] + 1: # 在line内的,插入line前面
322
- line_txt = "".join([s['text'] for s in l['spans']])
351
+ if (
352
+ line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
353
+ and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
354
+ ): # 在line内的,插入line前面
355
+ line_txt = ''.join([s['text'] for s in l['spans']])
323
356
  __insert_before_para(line_txt, type, element, content_lst)
324
357
  break
325
358
  break
@@ -327,14 +360,17 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
327
360
  # 找到图片x0,y0与line的x0,y0最近的line
328
361
  min_distance = 100000
329
362
  min_line = None
330
- for l in block['lines']:
363
+ for l in block['lines']: # noqa: E741
331
364
  line_box = l['bbox']
332
- distance = math.sqrt((line_box[0] - element_bbox[0]) ** 2 + (line_box[1] - element_bbox[1]) ** 2)
365
+ distance = math.sqrt(
366
+ (line_box[0] - element_bbox[0]) ** 2
367
+ + (line_box[1] - element_bbox[1]) ** 2
368
+ )
333
369
  if distance < min_distance:
334
370
  min_distance = distance
335
371
  min_line = l
336
372
  if min_line:
337
- line_txt = "".join([s['text'] for s in min_line['spans']])
373
+ line_txt = ''.join([s['text'] for s in min_line['spans']])
338
374
  img_h = element_bbox[3] - element_bbox[1]
339
375
  if min_distance < img_h: # 文字在图片前面
340
376
  __insert_after_para(line_txt, type, element, content_lst)
@@ -342,56 +378,61 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
342
378
  __insert_before_para(line_txt, type, element, content_lst)
343
379
  break
344
380
  else:
345
- logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
381
+ logger.error(
382
+ f"Can't find the location of image {element.get('image_path')} in the markdown file #1"
383
+ )
346
384
  else: # 应当在两个block之间
347
385
  # 找到上方最近的block,如果上方没有就找大下方最近的block
348
386
  top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
349
387
  if top_txt_block:
350
- line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
388
+ line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
351
389
  __insert_after_para(line_txt, type, element, content_lst)
352
390
  else:
353
- bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, element_bbox)
391
+ bottom_txt_block = find_bottom_nearest_text_bbox(
392
+ pymu_raw_blocks, element_bbox
393
+ )
354
394
  if bottom_txt_block:
355
- line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
395
+ line_txt = ''.join(
396
+ [s['text'] for s in bottom_txt_block['lines'][0]['spans']]
397
+ )
356
398
  __insert_before_para(line_txt, type, element, content_lst)
357
399
  else: # TODO ,图片可能独占一列,这种情况上下是没有图片的
358
- logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")
400
+ logger.error(
401
+ f"Can't find the location of image {element.get('image_path')} in the markdown file #2"
402
+ )
359
403
 
360
404
 
361
405
  def mk_mm_markdown(content_list):
362
- """
363
- 基于同一格式的内容列表,构造markdown,含图片
364
- """
406
+ """基于同一格式的内容列表,构造markdown,含图片."""
365
407
  content_md = []
366
408
  for c in content_list:
367
- content_type = c.get("type")
368
- if content_type == "text":
369
- content_md.append(c.get("text"))
370
- elif content_type == "equation":
371
- content = c.get("latex")
372
- if content.startswith("$$") and content.endswith("$$"):
409
+ content_type = c.get('type')
410
+ if content_type == 'text':
411
+ content_md.append(c.get('text'))
412
+ elif content_type == 'equation':
413
+ content = c.get('latex')
414
+ if content.startswith('$$') and content.endswith('$$'):
373
415
  content_md.append(content)
374
416
  else:
375
417
  content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
376
418
  elif content_type in UNI_FORMAT_TEXT_TYPE:
377
419
  content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
378
- elif content_type == "image":
420
+ elif content_type == 'image':
379
421
  content_md.append(f"![]({c.get('img_path')})")
380
- return "\n\n".join(content_md)
422
+ return '\n\n'.join(content_md)
423
+
381
424
 
382
425
  def mk_nlp_markdown(content_list):
383
- """
384
- 基于同一格式的内容列表,构造markdown,不含图片
385
- """
426
+ """基于同一格式的内容列表,构造markdown,不含图片."""
386
427
  content_md = []
387
428
  for c in content_list:
388
- content_type = c.get("type")
389
- if content_type == "text":
390
- content_md.append(c.get("text"))
391
- elif content_type == "equation":
429
+ content_type = c.get('type')
430
+ if content_type == 'text':
431
+ content_md.append(c.get('text'))
432
+ elif content_type == 'equation':
392
433
  content_md.append(f"$$\n{c.get('latex')}\n$$")
393
- elif content_type == "table":
434
+ elif content_type == 'table':
394
435
  content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
395
436
  elif content_type in UNI_FORMAT_TEXT_TYPE:
396
437
  content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
397
- return "\n\n".join(content_md)
438
+ return '\n\n'.join(content_md)