magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,16 @@
1
1
  import copy
2
+ import re
2
3
 
3
- from sklearn.cluster import DBSCAN
4
4
  import numpy as np
5
5
  from loguru import logger
6
- import re
7
- from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
8
- from magic_pdf.libs.ocr_content_type import ContentType, BlockType
9
- from magic_pdf.model.magic_model import MagicModel
10
- from magic_pdf.libs.Constants import *
6
+ from sklearn.cluster import DBSCAN
7
+
8
+ from magic_pdf.config.constants import * # noqa: F403
9
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
10
+ from magic_pdf.libs.boxbase import \
11
+ _is_in_or_part_overlap_with_area_ratio as is_in_layout
11
12
 
12
- LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?', "", ":", ")", "", ";"]
13
+ LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?', '', ':', ')', '', ';']
13
14
  INLINE_EQUATION = ContentType.InlineEquation
14
15
  INTERLINE_EQUATION = ContentType.InterlineEquation
15
16
  TEXT = ContentType.Text
@@ -36,7 +37,9 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
36
37
  ones_indices = []
37
38
  i = 0
38
39
  while i < len(lst): # Loop through the entire list
39
- if lst[i] == 1: # If we encounter a '1', we might be at the start of a pattern
40
+ if (
41
+ lst[i] == 1
42
+ ): # If we encounter a '1', we might be at the start of a pattern
40
43
  start = i
41
44
  ones_in_this_interval = [i]
42
45
  i += 1
@@ -46,7 +49,10 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
46
49
  ones_in_this_interval.append(i)
47
50
  i += 1
48
51
  if len(ones_in_this_interval) > 1 or (
49
- start < len(lst) - 1 and ones_in_this_interval and lst[start + 1] in [2, 3]):
52
+ start < len(lst) - 1
53
+ and ones_in_this_interval
54
+ and lst[start + 1] in [2, 3]
55
+ ):
50
56
  indices.append((start, i - 1))
51
57
  ones_indices.append(ones_in_this_interval)
52
58
  else:
@@ -65,7 +71,12 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
65
71
  while i < len(lst) and lst[i] in [2, 3]:
66
72
  i += 1
67
73
  # 验证下一个序列是否符合条件
68
- if i < len(lst) - 1 and lst[i] == 1 and lst[i + 1] in [2, 3] and lst[i - 1] in [2, 3]:
74
+ if (
75
+ i < len(lst) - 1
76
+ and lst[i] == 1
77
+ and lst[i + 1] in [2, 3]
78
+ and lst[i - 1] in [2, 3]
79
+ ):
69
80
  while i < len(lst) and lst[i] in [1, 2, 3]:
70
81
  if lst[i] == 1:
71
82
  ones_in_this_interval.append(i)
@@ -114,7 +125,7 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
114
125
  """
115
126
  if len(lines) > 0:
116
127
  x_map_tag_dict, min_x_tag = cluster_line_x(lines)
117
- for l in lines:
128
+ for l in lines: # noqa: E741
118
129
  span_text = __get_span_text(l['spans'][0])
119
130
  if not span_text:
120
131
  line_fea_encode.append(0)
@@ -142,28 +153,26 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
142
153
  list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
143
154
  if len(list_indice) > 0:
144
155
  if debug_able:
145
- logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
156
+ logger.info(f'发现了列表,列表行数:{list_indice}, {list_start_idx}')
146
157
 
147
158
  # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
148
- segments = []
159
+
149
160
  for start, end in list_indice:
150
161
  for i in range(start, end + 1):
151
162
  if i > 0:
152
163
  if line_fea_encode[i] == 4:
153
164
  if debug_able:
154
- logger.info(f"列表行的第{i}行不是顶格的")
165
+ logger.info(f'列表行的第{i}行不是顶格的')
155
166
  break
156
167
  else:
157
168
  if debug_able:
158
- logger.info(f"列表行的第{start}到第{end}行是列表")
169
+ logger.info(f'列表行的第{start}到第{end}行是列表')
159
170
 
160
171
  return split_indices(total_lines, list_indice), list_start_idx
161
172
 
162
173
 
163
174
  def cluster_line_x(lines: list) -> dict:
164
- """
165
- 对一个block内所有lines的bbox的x0聚类
166
- """
175
+ """对一个block内所有lines的bbox的x0聚类."""
167
176
  min_distance = 5
168
177
  min_sample = 1
169
178
  x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
@@ -171,14 +180,16 @@ def cluster_line_x(lines: list) -> dict:
171
180
  x0_uniq_label = np.unique(x0_clusters.labels_)
172
181
  # x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
173
182
  x0_2_new_val = {} # 存储旧值对应的新值映射
174
- min_x0 = round(lines[0]["bbox"][0])
183
+ min_x0 = round(lines[0]['bbox'][0])
175
184
  for label in x0_uniq_label:
176
185
  if label == -1:
177
186
  continue
178
187
  x0_index_of_label = np.where(x0_clusters.labels_ == label)
179
188
  x0_raw_val = x0_lst[x0_index_of_label][:, 0]
180
189
  x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
181
- x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in x0_raw_val})
190
+ x0_2_new_val.update(
191
+ {round(raw_val): round(x0_new_val) for raw_val in x0_raw_val}
192
+ )
182
193
  if x0_new_val < min_x0:
183
194
  min_x0 = x0_new_val
184
195
  return x0_2_new_val, min_x0
@@ -193,27 +204,41 @@ def if_match_reference_list(text: str) -> bool:
193
204
 
194
205
 
195
206
  def __valign_lines(blocks, layout_bboxes):
196
- """
197
- 在一个layoutbox内对齐行的左侧和右侧。
198
- 扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
199
- 3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
200
- """
207
+ """在一个layoutbox内对齐行的左侧和右侧。 扫描行的左侧和右侧,如果x0,
208
+ x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
209
+ 3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。"""
201
210
 
202
211
  min_distance = 3
203
212
  min_sample = 2
204
213
  new_layout_bboxes = []
205
214
  # add bbox_fs for para split calculation
206
215
  for block in blocks:
207
- block["bbox_fs"] = copy.deepcopy(block["bbox"])
216
+ block['bbox_fs'] = copy.deepcopy(block['bbox'])
208
217
  for layout_box in layout_bboxes:
209
- blocks_in_layoutbox = [b for b in blocks if
210
- b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
211
- if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0:
218
+ blocks_in_layoutbox = [
219
+ b
220
+ for b in blocks
221
+ if b['type'] == BlockType.Text
222
+ and is_in_layout(b['bbox'], layout_box['layout_bbox'])
223
+ ]
224
+ if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]['lines']) == 0:
212
225
  new_layout_bboxes.append(layout_box['layout_bbox'])
213
226
  continue
214
227
 
215
- x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']])
216
- x1_lst = np.array([[line['bbox'][2], 0] for block in blocks_in_layoutbox for line in block['lines']])
228
+ x0_lst = np.array(
229
+ [
230
+ [line['bbox'][0], 0]
231
+ for block in blocks_in_layoutbox
232
+ for line in block['lines']
233
+ ]
234
+ )
235
+ x1_lst = np.array(
236
+ [
237
+ [line['bbox'][2], 0]
238
+ for block in blocks_in_layoutbox
239
+ for line in block['lines']
240
+ ]
241
+ )
217
242
  x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
218
243
  x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst)
219
244
  x0_uniq_label = np.unique(x0_clusters.labels_)
@@ -248,11 +273,13 @@ def __valign_lines(blocks, layout_bboxes):
248
273
 
249
274
  # 由于修改了block里的line长度,现在需要重新计算block的bbox
250
275
  for block in blocks_in_layoutbox:
251
- if len(block["lines"]) > 0:
252
- block['bbox_fs'] = [min([line['bbox'][0] for line in block['lines']]),
253
- min([line['bbox'][1] for line in block['lines']]),
254
- max([line['bbox'][2] for line in block['lines']]),
255
- max([line['bbox'][3] for line in block['lines']])]
276
+ if len(block['lines']) > 0:
277
+ block['bbox_fs'] = [
278
+ min([line['bbox'][0] for line in block['lines']]),
279
+ min([line['bbox'][1] for line in block['lines']]),
280
+ max([line['bbox'][2] for line in block['lines']]),
281
+ max([line['bbox'][3] for line in block['lines']]),
282
+ ]
256
283
  """新计算layout的bbox,因为block的bbox变了。"""
257
284
  layout_x0 = min([block['bbox_fs'][0] for block in blocks_in_layoutbox])
258
285
  layout_y0 = min([block['bbox_fs'][1] for block in blocks_in_layoutbox])
@@ -264,18 +291,19 @@ def __valign_lines(blocks, layout_bboxes):
264
291
 
265
292
 
266
293
  def __align_text_in_layout(blocks, layout_bboxes):
267
- """
268
- 由于ocr出来的line,有时候会在前后有一段空白,这个时候需要对文本进行对齐,超出的部分被layout左右侧截断。
269
- """
294
+ """由于ocr出来的line,有时候会在前后有一段空白,这个时候需要对文本进行对齐,超出的部分被layout左右侧截断。"""
270
295
  for layout in layout_bboxes:
271
296
  lb = layout['layout_bbox']
272
- blocks_in_layoutbox = [block for block in blocks if
273
- block["type"] == BlockType.Text and is_in_layout(block['bbox'], lb)]
297
+ blocks_in_layoutbox = [
298
+ block
299
+ for block in blocks
300
+ if block['type'] == BlockType.Text and is_in_layout(block['bbox'], lb)
301
+ ]
274
302
  if len(blocks_in_layoutbox) == 0:
275
303
  continue
276
304
 
277
305
  for block in blocks_in_layoutbox:
278
- for line in block.get("lines", []):
306
+ for line in block.get('lines', []):
279
307
  x0, x1 = line['bbox'][0], line['bbox'][2]
280
308
  if x0 < lb[0]:
281
309
  line['bbox'][0] = lb[0]
@@ -284,9 +312,7 @@ def __align_text_in_layout(blocks, layout_bboxes):
284
312
 
285
313
 
286
314
  def __common_pre_proc(blocks, layout_bboxes):
287
- """
288
- 不分语言的,对文本进行预处理
289
- """
315
+ """不分语言的,对文本进行预处理."""
290
316
  # __add_line_period(blocks, layout_bboxes)
291
317
  __align_text_in_layout(blocks, layout_bboxes)
292
318
  aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
@@ -295,32 +321,30 @@ def __common_pre_proc(blocks, layout_bboxes):
295
321
 
296
322
 
297
323
  def __pre_proc_zh_blocks(blocks, layout_bboxes):
298
- """
299
- 对中文文本进行分段预处理
300
- """
324
+ """对中文文本进行分段预处理."""
301
325
  pass
302
326
 
303
327
 
304
328
  def __pre_proc_en_blocks(blocks, layout_bboxes):
305
- """
306
- 对英文文本进行分段预处理
307
- """
329
+ """对英文文本进行分段预处理."""
308
330
  pass
309
331
 
310
332
 
311
333
  def __group_line_by_layout(blocks, layout_bboxes):
312
- """
313
- 每个layout内的行进行聚合
314
- """
334
+ """每个layout内的行进行聚合."""
315
335
  # 因为只是一个block一行目前, 一个block就是一个段落
316
336
  blocks_group = []
317
337
  for lyout in layout_bboxes:
318
- blocks_in_layout = [block for block in blocks if is_in_layout(block.get('bbox_fs', None), lyout['layout_bbox'])]
338
+ blocks_in_layout = [
339
+ block
340
+ for block in blocks
341
+ if is_in_layout(block.get('bbox_fs', None), lyout['layout_bbox'])
342
+ ]
319
343
  blocks_group.append(blocks_in_layout)
320
344
  return blocks_group
321
345
 
322
346
 
323
- def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
347
+ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang='en'):
324
348
  """
325
349
  lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
326
350
  1. 先计算每个group的左右边界。
@@ -336,17 +360,20 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
336
360
  if len(blocks) == 0:
337
361
  list_info.append([False, False])
338
362
  continue
339
- if blocks[0]["type"] != BlockType.Text and blocks[-1]["type"] != BlockType.Text:
363
+ if blocks[0]['type'] != BlockType.Text and blocks[-1]['type'] != BlockType.Text:
340
364
  list_info.append([False, False])
341
365
  continue
342
- if blocks[0]["type"] != BlockType.Text:
366
+ if blocks[0]['type'] != BlockType.Text:
343
367
  is_start_list = False
344
- if blocks[-1]["type"] != BlockType.Text:
368
+ if blocks[-1]['type'] != BlockType.Text:
345
369
  is_end_list = False
346
370
 
347
- lines = [line for block in blocks if
348
- block["type"] == BlockType.Text for line in
349
- block['lines']]
371
+ lines = [
372
+ line
373
+ for block in blocks
374
+ if block['type'] == BlockType.Text
375
+ for line in block['lines']
376
+ ]
350
377
  total_lines = len(lines)
351
378
  if total_lines == 1 or total_lines == 0:
352
379
  list_info.append([False, False])
@@ -359,7 +386,9 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
359
386
  2. 左对齐的列表块(其特点是左侧顶格的行数小于等于非顶格的行数,非定格首字母会有小写,顶格90%是大写。并且左侧顶格行数大于1,大于1是为了这种模式连续出现才能称之为列表)
360
387
  这样的文本块,顶格的为一个段落开头,紧随其后非顶格的行属于这个段落。
361
388
  """
362
- text_segments, list_start_line = __detect_list_lines(lines, new_layout_bbox, lang)
389
+ text_segments, list_start_line = __detect_list_lines(
390
+ lines, new_layout_bbox, lang
391
+ )
363
392
  """根据list_range,把lines分成几个部分
364
393
 
365
394
  """
@@ -368,10 +397,17 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
368
397
  for i in range(0, len(list_start)):
369
398
  index = list_start[i] - 1
370
399
  if index >= 0:
371
- if "content" in lines[index]["spans"][-1] and lines[index]["spans"][-1].get('type', '') not in [
372
- ContentType.InlineEquation, ContentType.InterlineEquation]:
373
- lines[index]["spans"][-1]["content"] += '\n\n'
374
- layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
400
+ if 'content' in lines[index]['spans'][-1] and lines[index][
401
+ 'spans'
402
+ ][-1].get('type', '') not in [
403
+ ContentType.InlineEquation,
404
+ ContentType.InterlineEquation,
405
+ ]:
406
+ lines[index]['spans'][-1]['content'] += '\n\n'
407
+ layout_list_info = [
408
+ False,
409
+ False,
410
+ ] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
375
411
  for content_type, start, end in text_segments:
376
412
  if content_type == 'list':
377
413
  if start == 0 and is_start_list is None:
@@ -388,8 +424,7 @@ def __split_para_lines(lines: list, text_blocks: list) -> list:
388
424
  other_paras = []
389
425
  text_lines = []
390
426
  for line in lines:
391
-
392
- spans_types = [span["type"] for span in line]
427
+ spans_types = [span['type'] for span in line]
393
428
  if ContentType.Table in spans_types:
394
429
  other_paras.append([line])
395
430
  continue
@@ -402,20 +437,22 @@ def __split_para_lines(lines: list, text_blocks: list) -> list:
402
437
  text_lines.append(line)
403
438
 
404
439
  for block in text_blocks:
405
- block_bbox = block["bbox"]
440
+ block_bbox = block['bbox']
406
441
  para = []
407
442
  for line in text_lines:
408
- bbox = line["bbox"]
443
+ bbox = line['bbox']
409
444
  if is_in_layout(bbox, block_bbox):
410
445
  para.append(line)
411
446
  if len(para) > 0:
412
447
  text_paras.append(para)
413
448
  paras = other_paras.extend(text_paras)
414
- paras_sorted = sorted(paras, key=lambda x: x[0]["bbox"][1])
449
+ paras_sorted = sorted(paras, key=lambda x: x[0]['bbox'][1])
415
450
  return paras_sorted
416
451
 
417
452
 
418
- def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, page_num, lang):
453
+ def __connect_list_inter_layout(
454
+ blocks_group, new_layout_bbox, layout_list_info, page_num, lang
455
+ ):
419
456
  global debug_able
420
457
  """
421
458
  如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
@@ -429,74 +466,108 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
429
466
  continue
430
467
  pre_layout_list_info = layout_list_info[i - 1]
431
468
  next_layout_list_info = layout_list_info[i]
432
- pre_last_para = blocks_group[i - 1][-1].get("lines", [])
469
+ pre_last_para = blocks_group[i - 1][-1].get('lines', [])
433
470
  next_paras = blocks_group[i]
434
471
  next_first_para = next_paras[0]
435
472
 
436
- if pre_layout_list_info[1] and not next_layout_list_info[0] and next_first_para[
437
- "type"] == BlockType.Text: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
473
+ if (
474
+ pre_layout_list_info[1]
475
+ and not next_layout_list_info[0]
476
+ and next_first_para['type'] == BlockType.Text
477
+ ): # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
438
478
  if debug_able:
439
- logger.info(f"连接page {page_num} 内的list")
479
+ logger.info(f'连接page {page_num} 内的list')
440
480
  # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
441
481
  may_list_lines = []
442
- lines = next_first_para.get("lines", [])
482
+ lines = next_first_para.get('lines', [])
443
483
 
444
484
  for line in lines:
445
- if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]:
485
+ if (
486
+ line['bbox'][0]
487
+ > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]
488
+ ):
446
489
  may_list_lines.append(line)
447
490
  else:
448
491
  break
449
492
  # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
450
- if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
493
+ if (
494
+ len(may_list_lines) > 0
495
+ and len(set([x['bbox'][0] for x in may_list_lines])) == 1
496
+ ):
451
497
  pre_last_para.extend(may_list_lines)
452
- next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]
453
-
454
- return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息
455
-
456
-
457
- def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox,
458
- pre_page_list_info, next_page_list_info, page_num, lang):
459
- """
460
- 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
461
- 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
462
- """
463
- if len(pre_page_paras) == 0 or len(next_page_paras) == 0: # 0的时候最后的return 会出错
498
+ next_first_para['lines'] = next_first_para['lines'][
499
+ len(may_list_lines) :
500
+ ]
501
+
502
+ return blocks_group, [
503
+ layout_list_info[0][0],
504
+ layout_list_info[-1][1],
505
+ ] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息
506
+
507
+
508
+ def __connect_list_inter_page(
509
+ pre_page_paras,
510
+ next_page_paras,
511
+ pre_page_layout_bbox,
512
+ next_page_layout_bbox,
513
+ pre_page_list_info,
514
+ next_page_list_info,
515
+ page_num,
516
+ lang,
517
+ ):
518
+ """如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO
519
+ 因为没有区分列表和段落,所以这个方法暂时不实现。
520
+ 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。"""
521
+ if (
522
+ len(pre_page_paras) == 0 or len(next_page_paras) == 0
523
+ ): # 0的时候最后的return 会出错
464
524
  return False
465
525
  if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
466
526
  return False
467
- if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text:
527
+ if (
528
+ pre_page_paras[-1][-1]['type'] != BlockType.Text
529
+ or next_page_paras[0][0]['type'] != BlockType.Text
530
+ ):
468
531
  return False
469
- if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
532
+ if (
533
+ pre_page_list_info[1] and not next_page_list_info[0]
534
+ ): # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
470
535
  if debug_able:
471
- logger.info(f"连接page {page_num} 内的list")
536
+ logger.info(f'连接page {page_num} 内的list')
472
537
  # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
473
538
  may_list_lines = []
474
539
  next_page_first_para = next_page_paras[0][0]
475
- if next_page_first_para["type"] == BlockType.Text:
476
- lines = next_page_first_para["lines"]
540
+ if next_page_first_para['type'] == BlockType.Text:
541
+ lines = next_page_first_para['lines']
477
542
  for line in lines:
478
- if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]:
543
+ if (
544
+ line['bbox'][0]
545
+ > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]
546
+ ):
479
547
  may_list_lines.append(line)
480
548
  else:
481
549
  break
482
550
  # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
483
- if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
551
+ if (
552
+ len(may_list_lines) > 0
553
+ and len(set([x['bbox'][0] for x in may_list_lines])) == 1
554
+ ):
484
555
  # pre_page_paras[-1].append(may_list_lines)
485
556
  # 下一页合并到上一页最后一段,打一个cross_page的标签
486
557
  for line in may_list_lines:
487
- for span in line["spans"]:
488
- span[CROSS_PAGE] = True
489
- pre_page_paras[-1][-1]["lines"].extend(may_list_lines)
490
- next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
558
+ for span in line['spans']:
559
+ span[CROSS_PAGE] = True # noqa: F405
560
+ pre_page_paras[-1][-1]['lines'].extend(may_list_lines)
561
+ next_page_first_para['lines'] = next_page_first_para['lines'][
562
+ len(may_list_lines) :
563
+ ]
491
564
  return True
492
565
 
493
566
  return False
494
567
 
495
568
 
496
569
  def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
497
- """
498
- 根据line找到所在的layout
499
- """
570
+ """根据line找到所在的layout."""
500
571
  for layout in layout_bboxes:
501
572
  if is_in_layout(line_bbox, layout):
502
573
  return layout
@@ -525,39 +596,59 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
525
596
  connected_layout_blocks.append(blocks_group[i])
526
597
  continue
527
598
  # text类型的段才需要考虑layout间的合并
528
- if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
599
+ if (
600
+ blocks_group[i - 1][-1]['type'] != BlockType.Text
601
+ or blocks_group[i][0]['type'] != BlockType.Text
602
+ ):
529
603
  connected_layout_blocks.append(blocks_group[i])
530
604
  continue
531
- if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0:
605
+ if (
606
+ len(blocks_group[i - 1][-1]['lines']) == 0
607
+ or len(blocks_group[i][0]['lines']) == 0
608
+ ):
532
609
  connected_layout_blocks.append(blocks_group[i])
533
610
  continue
534
- pre_last_line = blocks_group[i - 1][-1]["lines"][-1]
535
- next_first_line = blocks_group[i][0]["lines"][0]
536
- except Exception as e:
537
- logger.error(f"page layout {i} has no line")
611
+ pre_last_line = blocks_group[i - 1][-1]['lines'][-1]
612
+ next_first_line = blocks_group[i][0]['lines'][0]
613
+ except Exception:
614
+ logger.error(f'page layout {i} has no line')
538
615
  continue
539
- pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
616
+ pre_last_line_text = ''.join(
617
+ [__get_span_text(span) for span in pre_last_line['spans']]
618
+ )
540
619
  pre_last_line_type = pre_last_line['spans'][-1]['type']
541
- next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
620
+ next_first_line_text = ''.join(
621
+ [__get_span_text(span) for span in next_first_line['spans']]
622
+ )
542
623
  next_first_line_type = next_first_line['spans'][0]['type']
543
- if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
624
+ if pre_last_line_type not in [
625
+ TEXT,
626
+ INLINE_EQUATION,
627
+ ] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
544
628
  connected_layout_blocks.append(blocks_group[i])
545
629
  continue
546
630
  pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)
547
- next_layout = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)
631
+ next_layout = __find_layout_bbox_by_line(
632
+ next_first_line['bbox'], new_layout_bbox
633
+ )
548
634
 
549
635
  pre_x2_max = pre_layout[2] if pre_layout else -1
550
636
  next_x0_min = next_layout[0] if next_layout else -1
551
637
 
552
638
  pre_last_line_text = pre_last_line_text.strip()
553
639
  next_first_line_text = next_first_line_text.strip()
554
- if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[
555
- -1] not in LINE_STOP_FLAG and \
556
- next_first_line['bbox'][0] == next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
640
+ if (
641
+ pre_last_line['bbox'][2] == pre_x2_max
642
+ and pre_last_line_text
643
+ and pre_last_line_text[-1] not in LINE_STOP_FLAG
644
+ and next_first_line['bbox'][0] == next_x0_min
645
+ ): # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
557
646
  """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
558
- connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"])
559
- blocks_group[i][0]["lines"] = [] # 删除后一个layout第一个段落中的lines,因为他已经被合并到前一个layout的最后一个段落了
560
- blocks_group[i][0][LINES_DELETED] = True
647
+ connected_layout_blocks[-1][-1]['lines'].extend(blocks_group[i][0]['lines'])
648
+ blocks_group[i][0][
649
+ 'lines'
650
+ ] = [] # 删除后一个layout第一个段落中的lines,因为他已经被合并到前一个layout的最后一个段落了
651
+ blocks_group[i][0][LINES_DELETED] = True # noqa: F405
561
652
  # if len(layout_paras[i]) == 0:
562
653
  # layout_paras.pop(i)
563
654
  # else:
@@ -569,8 +660,14 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
569
660
  return connected_layout_blocks
570
661
 
571
662
 
572
- def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num,
573
- lang):
663
+ def __connect_para_inter_page(
664
+ pre_page_paras,
665
+ next_page_paras,
666
+ pre_page_layout_bbox,
667
+ next_page_layout_bbox,
668
+ page_num,
669
+ lang,
670
+ ):
574
671
  """
575
672
  连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。
576
673
  是否可以连接的条件:
@@ -578,33 +675,53 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
578
675
  2. 后一个页面的第一个段落第一行没有空白开头。
579
676
  """
580
677
  # 有的页面可能压根没有文字
581
- if len(pre_page_paras) == 0 or len(next_page_paras) == 0 or len(pre_page_paras[0]) == 0 or len(
582
- next_page_paras[0]) == 0: # TODO [[]]为什么出现在pre_page_paras里?
678
+ if (
679
+ len(pre_page_paras) == 0
680
+ or len(next_page_paras) == 0
681
+ or len(pre_page_paras[0]) == 0
682
+ or len(next_page_paras[0]) == 0
683
+ ): # TODO [[]]为什么出现在pre_page_paras里?
583
684
  return False
584
685
  pre_last_block = pre_page_paras[-1][-1]
585
686
  next_first_block = next_page_paras[0][0]
586
- if pre_last_block["type"] != BlockType.Text or next_first_block["type"] != BlockType.Text:
687
+ if (
688
+ pre_last_block['type'] != BlockType.Text
689
+ or next_first_block['type'] != BlockType.Text
690
+ ):
587
691
  return False
588
- if len(pre_last_block["lines"]) == 0 or len(next_first_block["lines"]) == 0:
692
+ if len(pre_last_block['lines']) == 0 or len(next_first_block['lines']) == 0:
589
693
  return False
590
- pre_last_para = pre_last_block["lines"]
591
- next_first_para = next_first_block["lines"]
694
+ pre_last_para = pre_last_block['lines']
695
+ next_first_para = next_first_block['lines']
592
696
  pre_last_line = pre_last_para[-1]
593
697
  next_first_line = next_first_para[0]
594
- pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
698
+ pre_last_line_text = ''.join(
699
+ [__get_span_text(span) for span in pre_last_line['spans']]
700
+ )
595
701
  pre_last_line_type = pre_last_line['spans'][-1]['type']
596
- next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
702
+ next_first_line_text = ''.join(
703
+ [__get_span_text(span) for span in next_first_line['spans']]
704
+ )
597
705
  next_first_line_type = next_first_line['spans'][0]['type']
598
706
 
599
- if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT,
600
- INLINE_EQUATION]: # TODO,真的要做好,要考虑跨table, image, 行间的情况
707
+ if pre_last_line_type not in [
708
+ TEXT,
709
+ INLINE_EQUATION,
710
+ ] or next_first_line_type not in [
711
+ TEXT,
712
+ INLINE_EQUATION,
713
+ ]: # TODO,真的要做好,要考虑跨table, image, 行间的情况
601
714
  # 不是文本,不连接
602
715
  return False
603
716
 
604
- pre_x2_max_bbox = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)
717
+ pre_x2_max_bbox = __find_layout_bbox_by_line(
718
+ pre_last_line['bbox'], pre_page_layout_bbox
719
+ )
605
720
  if not pre_x2_max_bbox:
606
721
  return False
607
- next_x0_min_bbox = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)
722
+ next_x0_min_bbox = __find_layout_bbox_by_line(
723
+ next_first_line['bbox'], next_page_layout_bbox
724
+ )
608
725
  if not next_x0_min_bbox:
609
726
  return False
610
727
 
@@ -613,18 +730,21 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
613
730
 
614
731
  pre_last_line_text = pre_last_line_text.strip()
615
732
  next_first_line_text = next_first_line_text.strip()
616
- if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
617
- next_first_line['bbox'][0] == next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
733
+ if (
734
+ pre_last_line['bbox'][2] == pre_x2_max
735
+ and pre_last_line_text[-1] not in LINE_STOP_FLAG
736
+ and next_first_line['bbox'][0] == next_x0_min
737
+ ): # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
618
738
  """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
619
739
  # 下一页合并到上一页最后一段,打一个cross_page的标签
620
740
  for line in next_first_para:
621
- for span in line["spans"]:
622
- span[CROSS_PAGE] = True
741
+ for span in line['spans']:
742
+ span[CROSS_PAGE] = True # noqa: F405
623
743
  pre_last_para.extend(next_first_para)
624
744
 
625
745
  # next_page_paras[0].pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。
626
- next_page_paras[0][0]["lines"] = []
627
- next_page_paras[0][0][LINES_DELETED] = True
746
+ next_page_paras[0][0]['lines'] = []
747
+ next_page_paras[0][0][LINES_DELETED] = True # noqa: F405
628
748
  return True
629
749
  else:
630
750
  return False
@@ -667,38 +787,73 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
667
787
  single_line_paras_tag = []
668
788
  for i in range(len(layout_para)):
669
789
  # single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
670
- single_line_paras_tag.append(layout_para[i]['type'] == BlockType.Text and len(layout_para[i]["lines"]) == 1)
790
+ single_line_paras_tag.append(
791
+ layout_para[i]['type'] == BlockType.Text
792
+ and len(layout_para[i]['lines']) == 1
793
+ )
671
794
  """找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。"""
672
- consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag)
795
+ consecutive_single_line_indices = find_consecutive_true_regions(
796
+ single_line_paras_tag
797
+ )
673
798
  if len(consecutive_single_line_indices) > 0:
674
- """检查这些行是否是高度相同的,居中的"""
799
+ """检查这些行是否是高度相同的,居中的."""
675
800
  for start, end in consecutive_single_line_indices:
676
801
  # start += index_offset
677
802
  # end += index_offset
678
- line_hi = np.array([block["lines"][0]['bbox'][3] - block["lines"][0]['bbox'][1] for block in
679
- layout_para[start:end + 1]])
680
- first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']])
681
- if "Table" in first_line_text or "Figure" in first_line_text:
803
+ line_hi = np.array(
804
+ [
805
+ block['lines'][0]['bbox'][3] - block['lines'][0]['bbox'][1]
806
+ for block in layout_para[start : end + 1]
807
+ ]
808
+ )
809
+ first_line_text = ''.join(
810
+ [
811
+ __get_span_text(span)
812
+ for span in layout_para[start]['lines'][0]['spans']
813
+ ]
814
+ )
815
+ if 'Table' in first_line_text or 'Figure' in first_line_text:
682
816
  pass
683
817
  if debug_able:
684
818
  logger.info(line_hi.std())
685
819
 
686
820
  if line_hi.std() < 2:
687
- """行高度相同,那么判断是否居中"""
688
- all_left_x0 = [block["lines"][0]['bbox'][0] for block in layout_para[start:end + 1]]
689
- all_right_x1 = [block["lines"][0]['bbox'][2] for block in layout_para[start:end + 1]]
821
+ """行高度相同,那么判断是否居中."""
822
+ all_left_x0 = [
823
+ block['lines'][0]['bbox'][0]
824
+ for block in layout_para[start : end + 1]
825
+ ]
826
+ all_right_x1 = [
827
+ block['lines'][0]['bbox'][2]
828
+ for block in layout_para[start : end + 1]
829
+ ]
690
830
  layout_center = (layout_box[0] + layout_box[2]) / 2
691
- if all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \
692
- and not all([x0 == layout_box[0] for x0 in all_left_x0]) \
693
- and not all([x1 == layout_box[2] for x1 in all_right_x1]):
694
- merge_para = [block["lines"][0] for block in layout_para[start:end + 1]]
695
- para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
831
+ if (
832
+ all(
833
+ [
834
+ x0 < layout_center < x1
835
+ for x0, x1 in zip(all_left_x0, all_right_x1)
836
+ ]
837
+ )
838
+ and not all([x0 == layout_box[0] for x0 in all_left_x0])
839
+ and not all([x1 == layout_box[2] for x1 in all_right_x1])
840
+ ):
841
+ merge_para = [
842
+ block['lines'][0] for block in layout_para[start : end + 1]
843
+ ]
844
+ para_text = ''.join(
845
+ [
846
+ __get_span_text(span)
847
+ for line in merge_para
848
+ for span in line['spans']
849
+ ]
850
+ )
696
851
  if debug_able:
697
852
  logger.info(para_text)
698
- layout_para[start]["lines"] = merge_para
853
+ layout_para[start]['lines'] = merge_para
699
854
  for i_para in range(start + 1, end + 1):
700
- layout_para[i_para]["lines"] = []
701
- layout_para[i_para][LINES_DELETED] = True
855
+ layout_para[i_para]['lines'] = []
856
+ layout_para[i_para][LINES_DELETED] = True # noqa: F405
702
857
  # layout_para[start:end + 1] = [merge_para]
703
858
 
704
859
  # index_offset -= end - start
@@ -707,18 +862,13 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
707
862
 
708
863
 
709
864
  def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
710
- """
711
- 找出来连续的单行文本,如果首行顶格,接下来的几个单行段落缩进对齐,那么合并为一个段落。
712
- """
865
+ """找出来连续的单行文本,如果首行顶格,接下来的几个单行段落缩进对齐,那么合并为一个段落。"""
713
866
 
714
867
  pass
715
868
 
716
869
 
717
870
  def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
718
- """
719
- 根据line和layout情况进行分段
720
- 先实现一个根据行末尾特征分段的简单方法。
721
- """
871
+ """根据line和layout情况进行分段 先实现一个根据行末尾特征分段的简单方法。"""
722
872
  """
723
873
  算法思路:
724
874
  1. 扫描layout里每一行,找出来行尾距离layout有边界有一定距离的行。
@@ -727,15 +877,20 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
727
877
  4. 图、表,目前独占一行,不考虑分段。
728
878
  """
729
879
  blocks_group = __group_line_by_layout(blocks, layout_bboxes) # block内分段
730
- layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang) # layout内分段
731
- blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
732
- page_num, lang) # layout之间连接列表段落
733
- connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox) # layout间链接段落
880
+ layout_list_info = __split_para_in_layoutbox(
881
+ blocks_group, new_layout_bbox, lang
882
+ ) # layout内分段
883
+ blocks_group, page_list_info = __connect_list_inter_layout(
884
+ blocks_group, new_layout_bbox, layout_list_info, page_num, lang
885
+ ) # layout之间连接列表段落
886
+ connected_layout_blocks = __connect_para_inter_layoutbox(
887
+ blocks_group, new_layout_bbox
888
+ ) # layout间链接段落
734
889
 
735
890
  return connected_layout_blocks, page_list_info
736
891
 
737
892
 
738
- def para_split(pdf_info_dict, debug_mode, lang="en"):
893
+ def para_split(pdf_info_dict, debug_mode, lang='en'):
739
894
  global debug_able
740
895
  debug_able = debug_mode
741
896
  new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS
@@ -745,7 +900,9 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
745
900
  layout_bboxes = page['layout_bboxes']
746
901
  new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
747
902
  new_layout_of_pages.append(new_layout_bbox)
748
- splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang)
903
+ splited_blocks, page_list_info = __do_split_page(
904
+ blocks, layout_bboxes, new_layout_bbox, page_num, lang
905
+ )
749
906
  all_page_list_info.append(page_list_info)
750
907
  page['para_blocks'] = splited_blocks
751
908
 
@@ -759,18 +916,31 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
759
916
  pre_page_layout_bbox = new_layout_of_pages[page_num - 1]
760
917
  next_page_layout_bbox = new_layout_of_pages[page_num]
761
918
 
762
- is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox,
763
- next_page_layout_bbox, page_num, lang)
919
+ is_conn = __connect_para_inter_page(
920
+ pre_page_paras,
921
+ next_page_paras,
922
+ pre_page_layout_bbox,
923
+ next_page_layout_bbox,
924
+ page_num,
925
+ lang,
926
+ )
764
927
  if debug_able:
765
928
  if is_conn:
766
- logger.info(f"连接了第{page_num - 1}页和第{page_num}页的段落")
767
-
768
- is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox,
769
- next_page_layout_bbox, all_page_list_info[page_num - 1],
770
- all_page_list_info[page_num], page_num, lang)
929
+ logger.info(f'连接了第{page_num - 1}页和第{page_num}页的段落')
930
+
931
+ is_list_conn = __connect_list_inter_page(
932
+ pre_page_paras,
933
+ next_page_paras,
934
+ pre_page_layout_bbox,
935
+ next_page_layout_bbox,
936
+ all_page_list_info[page_num - 1],
937
+ all_page_list_info[page_num],
938
+ page_num,
939
+ lang,
940
+ )
771
941
  if debug_able:
772
942
  if is_list_conn:
773
- logger.info(f"连接了第{page_num - 1}页和第{page_num}页的列表段落")
943
+ logger.info(f'连接了第{page_num - 1}页和第{page_num}页的列表段落')
774
944
 
775
945
  """接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
776
946
  1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
@@ -786,4 +956,4 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
786
956
  for page_num, page in enumerate(pdf_info_dict.values()):
787
957
  page_paras = page['para_blocks']
788
958
  page_blocks = [block for layout in page_paras for block in layout]
789
- page["para_blocks"] = page_blocks
959
+ page['para_blocks'] = page_blocks