magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,49 +1,45 @@
1
- """
2
- 对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
3
- """
1
+ """对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果."""
4
2
 
5
- from magic_pdf.libs.commons import fitz
6
3
  import json
7
4
  import os
8
5
  from pathlib import Path
6
+
9
7
  from loguru import logger
10
- from magic_pdf.libs.ocr_content_type import ContentType
8
+
9
+ from magic_pdf.config.ocr_content_type import ContentType
10
+ from magic_pdf.libs.commons import fitz
11
11
 
12
12
  TYPE_INLINE_EQUATION = ContentType.InlineEquation
13
13
  TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
14
14
 
15
15
 
16
16
  def combine_chars_to_pymudict(block_dict, char_dict):
17
- """
18
- 把block级别的pymupdf 结构里加入char结构
19
- """
17
+ """把block级别的pymupdf 结构里加入char结构."""
20
18
  # 因为block_dict 被裁剪过,因此先把他和char_dict文字块对齐,才能进行补充
21
- char_map = {tuple(item["bbox"]): item for item in char_dict}
19
+ char_map = {tuple(item['bbox']): item for item in char_dict}
22
20
 
23
- for i in range(len(block_dict)): # blcok
21
+ for i in range(len(block_dict)): # block
24
22
  block = block_dict[i]
25
- key = block["bbox"]
23
+ key = block['bbox']
26
24
  char_dict_item = char_map[tuple(key)]
27
- char_dict_map = {tuple(item["bbox"]): item for item in char_dict_item["lines"]}
28
- for j in range(len(block["lines"])):
29
- lines = block["lines"][j]
30
- with_char_lines = char_dict_map[lines["bbox"]]
31
- for k in range(len(lines["spans"])):
32
- spans = lines["spans"][k]
25
+ char_dict_map = {tuple(item['bbox']): item for item in char_dict_item['lines']}
26
+ for j in range(len(block['lines'])):
27
+ lines = block['lines'][j]
28
+ with_char_lines = char_dict_map[lines['bbox']]
29
+ for k in range(len(lines['spans'])):
30
+ spans = lines['spans'][k]
33
31
  try:
34
- chars = with_char_lines["spans"][k]["chars"]
35
- except Exception as e:
36
- logger.error(char_dict[i]["lines"][j])
32
+ chars = with_char_lines['spans'][k]['chars']
33
+ except Exception:
34
+ logger.error(char_dict[i]['lines'][j])
37
35
 
38
- spans["chars"] = chars
36
+ spans['chars'] = chars
39
37
 
40
38
  return block_dict
41
39
 
42
40
 
43
41
  def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
44
- """
45
- 计算box1和box2的重叠面积占最小面积的box的比例
46
- """
42
+ """计算box1和box2的重叠面积占最小面积的box的比例."""
47
43
  # Determine the coordinates of the intersection rectangle
48
44
  x_left = max(bbox1[0], min_bbox[0])
49
45
  y_top = max(bbox1[1], min_bbox[1])
@@ -74,13 +70,13 @@ def _is_xin(bbox1, bbox2):
74
70
 
75
71
 
76
72
  def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
77
- """消除掉整个块都在行间公式块内部的文本块"""
73
+ """消除掉整个块都在行间公式块内部的文本块."""
78
74
  for eq_bbox in interline_bboxes:
79
75
  removed_txt_blk = []
80
76
  for text_blk in text_blocks:
81
- text_bbox = text_blk["bbox"]
77
+ text_bbox = text_blk['bbox']
82
78
  if (
83
- calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], text_bbox)
79
+ calculate_overlap_area_2_minbox_area_ratio(eq_bbox['bbox'], text_bbox)
84
80
  >= 0.7
85
81
  ):
86
82
  removed_txt_blk.append(text_blk)
@@ -91,9 +87,7 @@ def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
91
87
 
92
88
 
93
89
  def _is_in_or_part_overlap(box1, box2) -> bool:
94
- """
95
- 两个bbox是否有部分重叠或者包含
96
- """
90
+ """两个bbox是否有部分重叠或者包含."""
97
91
  if box1 is None or box2 is None:
98
92
  return False
99
93
 
@@ -111,62 +105,65 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
111
105
  def remove_text_block_overlap_interline_equation_bbox(
112
106
  interline_eq_bboxes, pymu_block_list
113
107
  ):
114
-
115
- """消除掉行行内公式有部分重叠的文本块的内容。
116
- 同时重新计算消除重叠之后文本块的大小"""
108
+ """消除掉行行内公式有部分重叠的文本块的内容。 同时重新计算消除重叠之后文本块的大小."""
117
109
  deleted_block = []
118
110
  for text_block in pymu_block_list:
119
111
  deleted_line = []
120
- for line in text_block["lines"]:
112
+ for line in text_block['lines']:
121
113
  deleted_span = []
122
- for span in line["spans"]:
114
+ for span in line['spans']:
123
115
  deleted_chars = []
124
- for char in span["chars"]:
116
+ for char in span['chars']:
125
117
  if any(
126
- [
127
- (calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5)
128
- for eq_bbox in interline_eq_bboxes
129
- ]
118
+ [
119
+ (
120
+ calculate_overlap_area_2_minbox_area_ratio(
121
+ eq_bbox['bbox'], char['bbox']
122
+ )
123
+ > 0.5
124
+ )
125
+ for eq_bbox in interline_eq_bboxes
126
+ ]
130
127
  ):
131
128
  deleted_chars.append(char)
132
129
  # 检查span里没有char则删除这个span
133
130
  for char in deleted_chars:
134
- span["chars"].remove(char)
131
+ span['chars'].remove(char)
135
132
  # 重新计算这个span的大小
136
- if len(span["chars"]) == 0: # 删除这个span
133
+ if len(span['chars']) == 0: # 删除这个span
137
134
  deleted_span.append(span)
138
135
  else:
139
- span["bbox"] = (
140
- min([b["bbox"][0] for b in span["chars"]]),
141
- min([b["bbox"][1] for b in span["chars"]]),
142
- max([b["bbox"][2] for b in span["chars"]]),
143
- max([b["bbox"][3] for b in span["chars"]]),
136
+ span['bbox'] = (
137
+ min([b['bbox'][0] for b in span['chars']]),
138
+ min([b['bbox'][1] for b in span['chars']]),
139
+ max([b['bbox'][2] for b in span['chars']]),
140
+ max([b['bbox'][3] for b in span['chars']]),
144
141
  )
145
142
 
146
143
  # 检查这个span
147
144
  for span in deleted_span:
148
- line["spans"].remove(span)
149
- if len(line["spans"]) == 0: # 删除这个line
145
+ line['spans'].remove(span)
146
+ if len(line['spans']) == 0: # 删除这个line
150
147
  deleted_line.append(line)
151
148
  else:
152
- line["bbox"] = (
153
- min([b["bbox"][0] for b in line["spans"]]),
154
- min([b["bbox"][1] for b in line["spans"]]),
155
- max([b["bbox"][2] for b in line["spans"]]),
156
- max([b["bbox"][3] for b in line["spans"]]),
149
+ line['bbox'] = (
150
+ min([b['bbox'][0] for b in line['spans']]),
151
+ min([b['bbox'][1] for b in line['spans']]),
152
+ max([b['bbox'][2] for b in line['spans']]),
153
+ max([b['bbox'][3] for b in line['spans']]),
157
154
  )
158
155
 
159
156
  # 检查这个block是否可以删除
160
157
  for line in deleted_line:
161
- text_block["lines"].remove(line)
162
- if len(text_block["lines"]) == 0: # 删除block
158
+ text_block['lines'].remove(line)
159
+ if len(text_block['lines']) == 0: # 删除block
163
160
  deleted_block.append(text_block)
164
161
  else:
165
- text_block["bbox"] = (
166
- min([b["bbox"][0] for b in text_block["lines"]]),
167
- min([b["bbox"][1] for b in text_block["lines"]]),
168
- max([b["bbox"][2] for b in text_block["lines"]]),
169
- max([b["bbox"][3] for b in text_block["lines"]]),
162
+ text_block['bbox'] = (
163
+ min([b['bbox'][0] for b in text_block['lines']]),
164
+ min([b['bbox'][1] for b in text_block['lines']]),
165
+ max([b['bbox'][2] for b in text_block['lines']]),
166
+ max([b['bbox'][3] for b in text_block['lines']]),
170
167
  )
171
168
 
172
169
  # 检查text block删除
@@ -179,33 +176,33 @@ def remove_text_block_overlap_interline_equation_bbox(
179
176
 
180
177
 
181
178
  def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
182
- """在行间公式对应的地方插上一个伪造的block"""
179
+ """在行间公式对应的地方插上一个伪造的block."""
183
180
  for eq in interline_eq_bboxes:
184
- bbox = eq["bbox"]
185
- latex_content = eq["latex"]
181
+ bbox = eq['bbox']
182
+ latex_content = eq['latex']
186
183
  text_block = {
187
- "number": len(pymu_block_list),
188
- "type": 0,
189
- "bbox": bbox,
190
- "lines": [
184
+ 'number': len(pymu_block_list),
185
+ 'type': 0,
186
+ 'bbox': bbox,
187
+ 'lines': [
191
188
  {
192
- "spans": [
189
+ 'spans': [
193
190
  {
194
- "size": 9.962599754333496,
195
- "type": TYPE_INTERLINE_EQUATION,
196
- "flags": 4,
197
- "font": TYPE_INTERLINE_EQUATION,
198
- "color": 0,
199
- "ascender": 0.9409999847412109,
200
- "descender": -0.3050000071525574,
201
- "latex": latex_content,
202
- "origin": [bbox[0], bbox[1]],
203
- "bbox": bbox,
191
+ 'size': 9.962599754333496,
192
+ 'type': TYPE_INTERLINE_EQUATION,
193
+ 'flags': 4,
194
+ 'font': TYPE_INTERLINE_EQUATION,
195
+ 'color': 0,
196
+ 'ascender': 0.9409999847412109,
197
+ 'descender': -0.3050000071525574,
198
+ 'latex': latex_content,
199
+ 'origin': [bbox[0], bbox[1]],
200
+ 'bbox': bbox,
204
201
  }
205
202
  ],
206
- "wmode": 0,
207
- "dir": [1.0, 0.0],
208
- "bbox": bbox,
203
+ 'wmode': 0,
204
+ 'dir': [1.0, 0.0],
205
+ 'bbox': bbox,
209
206
  }
210
207
  ],
211
208
  }
@@ -250,53 +247,52 @@ def __y_overlap_ratio(box1, box2):
250
247
 
251
248
 
252
249
  def replace_line_v2(eqinfo, line):
253
- """
254
- 扫描这一行所有的和公式框X方向重叠的char,然后计算char的左、右x0, x1,位于这个区间内的span删除掉。
255
- 最后与这个x0,x1有相交的span0, span1内部进行分割。
256
- """
250
+ """扫描这一行所有的和公式框X方向重叠的char,然后计算char的左、右x0, x1,位于这个区间内的span删除掉。
251
+ 最后与这个x0,x1有相交的span0, span1内部进行分割。"""
257
252
  first_overlap_span = -1
258
253
  first_overlap_span_idx = -1
259
254
  last_overlap_span = -1
260
255
  delete_chars = []
261
- for i in range(0, len(line["spans"])):
262
- if "chars" not in line["spans"][i]:
256
+ for i in range(0, len(line['spans'])):
257
+ if 'chars' not in line['spans'][i]:
263
258
  continue
264
259
 
265
- if line["spans"][i].get("_type", None) is not None:
260
+ if line['spans'][i].get('_type', None) is not None:
266
261
  continue # 忽略,因为已经是插入的伪造span公式了
267
262
 
268
- for char in line["spans"][i]["chars"]:
269
- if __is_x_dir_overlap(eqinfo["bbox"], char["bbox"]):
270
- line_txt = ""
271
- for span in line["spans"]:
272
- span_txt = "<span>"
273
- for ch in span["chars"]:
274
- span_txt = span_txt + ch["c"]
263
+ for char in line['spans'][i]['chars']:
264
+ if __is_x_dir_overlap(eqinfo['bbox'], char['bbox']):
265
+ line_txt = ''
266
+ for span in line['spans']:
267
+ span_txt = '<span>'
268
+ for ch in span['chars']:
269
+ span_txt = span_txt + ch['c']
275
270
 
276
- span_txt = span_txt + "</span>"
271
+ span_txt = span_txt + '</span>'
277
272
 
278
273
  line_txt = line_txt + span_txt
279
274
 
280
275
  if first_overlap_span_idx == -1:
281
- first_overlap_span = line["spans"][i]
276
+ first_overlap_span = line['spans'][i]
282
277
  first_overlap_span_idx = i
283
- last_overlap_span = line["spans"][i]
278
+ last_overlap_span = line['spans'][i]
284
279
  delete_chars.append(char)
285
280
 
286
281
  # 第一个和最后一个char要进行检查,到底属于公式多还是属于正常span多
287
282
  if len(delete_chars) > 0:
288
- ch0_bbox = delete_chars[0]["bbox"]
289
- if x_overlap_ratio(eqinfo["bbox"], ch0_bbox) < 0.51:
283
+ ch0_bbox = delete_chars[0]['bbox']
284
+ if x_overlap_ratio(eqinfo['bbox'], ch0_bbox) < 0.51:
290
285
  delete_chars.remove(delete_chars[0])
291
286
  if len(delete_chars) > 0:
292
- ch0_bbox = delete_chars[-1]["bbox"]
293
- if x_overlap_ratio(eqinfo["bbox"], ch0_bbox) < 0.51:
287
+ ch0_bbox = delete_chars[-1]['bbox']
288
+ if x_overlap_ratio(eqinfo['bbox'], ch0_bbox) < 0.51:
294
289
  delete_chars.remove(delete_chars[-1])
295
290
 
296
291
  # 计算x方向上被删除区间内的char的真实x0, x1
297
292
  if len(delete_chars):
298
- x0, x1 = min([b["bbox"][0] for b in delete_chars]), max(
299
- [b["bbox"][2] for b in delete_chars]
293
+ x0, x1 = (
294
+ min([b['bbox'][0] for b in delete_chars]),
295
+ max([b['bbox'][2] for b in delete_chars]),
300
296
  )
301
297
  else:
302
298
  # logger.debug(f"行内公式替换没有发生,尝试下一行匹配, eqinfo={eqinfo}")
@@ -304,101 +300,101 @@ def replace_line_v2(eqinfo, line):
304
300
 
305
301
  # 删除位于x0, x1这两个中间的span
306
302
  delete_span = []
307
- for span in line["spans"]:
308
- span_box = span["bbox"]
303
+ for span in line['spans']:
304
+ span_box = span['bbox']
309
305
  if x0 <= span_box[0] and span_box[2] <= x1:
310
306
  delete_span.append(span)
311
307
  for span in delete_span:
312
- line["spans"].remove(span)
308
+ line['spans'].remove(span)
313
309
 
314
310
  equation_span = {
315
- "size": 9.962599754333496,
316
- "type": TYPE_INLINE_EQUATION,
317
- "flags": 4,
318
- "font": TYPE_INLINE_EQUATION,
319
- "color": 0,
320
- "ascender": 0.9409999847412109,
321
- "descender": -0.3050000071525574,
322
- "latex": "",
323
- "origin": [337.1410153102337, 216.0205245153934],
324
- "bbox": eqinfo["bbox"]
311
+ 'size': 9.962599754333496,
312
+ 'type': TYPE_INLINE_EQUATION,
313
+ 'flags': 4,
314
+ 'font': TYPE_INLINE_EQUATION,
315
+ 'color': 0,
316
+ 'ascender': 0.9409999847412109,
317
+ 'descender': -0.3050000071525574,
318
+ 'latex': '',
319
+ 'origin': [337.1410153102337, 216.0205245153934],
320
+ 'bbox': eqinfo['bbox'],
325
321
  }
326
322
  # equation_span = line['spans'][0].copy()
327
- equation_span["latex"] = eqinfo['latex']
328
- equation_span["bbox"] = [x0, equation_span["bbox"][1], x1, equation_span["bbox"][3]]
329
- equation_span["origin"] = [equation_span["bbox"][0], equation_span["bbox"][1]]
330
- equation_span["chars"] = delete_chars
331
- equation_span["type"] = TYPE_INLINE_EQUATION
332
- equation_span["_eq_bbox"] = eqinfo["bbox"]
333
- line["spans"].insert(first_overlap_span_idx + 1, equation_span) # 放入公式
323
+ equation_span['latex'] = eqinfo['latex']
324
+ equation_span['bbox'] = [x0, equation_span['bbox'][1], x1, equation_span['bbox'][3]]
325
+ equation_span['origin'] = [equation_span['bbox'][0], equation_span['bbox'][1]]
326
+ equation_span['chars'] = delete_chars
327
+ equation_span['type'] = TYPE_INLINE_EQUATION
328
+ equation_span['_eq_bbox'] = eqinfo['bbox']
329
+ line['spans'].insert(first_overlap_span_idx + 1, equation_span) # 放入公式
334
330
 
335
331
  # logger.info(f"==>text is 【{line_txt}】, equation is 【{eqinfo['latex_text']}】")
336
332
 
337
333
  # 第一个、和最后一个有overlap的span进行分割,然后插入对应的位置
338
334
  first_span_chars = [
339
335
  char
340
- for char in first_overlap_span["chars"]
341
- if (char["bbox"][2] + char["bbox"][0]) / 2 < x0
336
+ for char in first_overlap_span['chars']
337
+ if (char['bbox'][2] + char['bbox'][0]) / 2 < x0
342
338
  ]
343
339
  tail_span_chars = [
344
340
  char
345
- for char in last_overlap_span["chars"]
346
- if (char["bbox"][0] + char["bbox"][2]) / 2 > x1
341
+ for char in last_overlap_span['chars']
342
+ if (char['bbox'][0] + char['bbox'][2]) / 2 > x1
347
343
  ]
348
344
 
349
345
  if len(first_span_chars) > 0:
350
- first_overlap_span["chars"] = first_span_chars
351
- first_overlap_span["text"] = "".join([char["c"] for char in first_span_chars])
352
- first_overlap_span["bbox"] = (
353
- first_overlap_span["bbox"][0],
354
- first_overlap_span["bbox"][1],
355
- max([chr["bbox"][2] for chr in first_span_chars]),
356
- first_overlap_span["bbox"][3],
346
+ first_overlap_span['chars'] = first_span_chars
347
+ first_overlap_span['text'] = ''.join([char['c'] for char in first_span_chars])
348
+ first_overlap_span['bbox'] = (
349
+ first_overlap_span['bbox'][0],
350
+ first_overlap_span['bbox'][1],
351
+ max([chr['bbox'][2] for chr in first_span_chars]),
352
+ first_overlap_span['bbox'][3],
357
353
  )
358
354
  # first_overlap_span['_type'] = "first"
359
355
  else:
360
356
  # 删掉
361
357
  if first_overlap_span not in delete_span:
362
- line["spans"].remove(first_overlap_span)
358
+ line['spans'].remove(first_overlap_span)
363
359
 
364
360
  if len(tail_span_chars) > 0:
365
- min_of_tail_span_x0 = min([chr["bbox"][0] for chr in tail_span_chars])
366
- min_of_tail_span_y0 = min([chr["bbox"][1] for chr in tail_span_chars])
367
- max_of_tail_span_x1 = max([chr["bbox"][2] for chr in tail_span_chars])
368
- max_of_tail_span_y1 = max([chr["bbox"][3] for chr in tail_span_chars])
361
+ min_of_tail_span_x0 = min([chr['bbox'][0] for chr in tail_span_chars])
362
+ min_of_tail_span_y0 = min([chr['bbox'][1] for chr in tail_span_chars])
363
+ max_of_tail_span_x1 = max([chr['bbox'][2] for chr in tail_span_chars])
364
+ max_of_tail_span_y1 = max([chr['bbox'][3] for chr in tail_span_chars])
369
365
 
370
366
  if last_overlap_span == first_overlap_span: # 这个时候应该插入一个新的
371
- tail_span_txt = "".join([char["c"] for char in tail_span_chars])
367
+ tail_span_txt = ''.join([char['c'] for char in tail_span_chars]) # noqa: F841
372
368
  last_span_to_insert = last_overlap_span.copy()
373
- last_span_to_insert["chars"] = tail_span_chars
374
- last_span_to_insert["text"] = "".join(
375
- [char["c"] for char in tail_span_chars]
369
+ last_span_to_insert['chars'] = tail_span_chars
370
+ last_span_to_insert['text'] = ''.join(
371
+ [char['c'] for char in tail_span_chars]
376
372
  )
377
- if equation_span["bbox"][2] >= last_overlap_span["bbox"][2]:
378
- last_span_to_insert["bbox"] = (
373
+ if equation_span['bbox'][2] >= last_overlap_span['bbox'][2]:
374
+ last_span_to_insert['bbox'] = (
379
375
  min_of_tail_span_x0,
380
376
  min_of_tail_span_y0,
381
377
  max_of_tail_span_x1,
382
- max_of_tail_span_y1
378
+ max_of_tail_span_y1,
383
379
  )
384
380
  else:
385
- last_span_to_insert["bbox"] = (
386
- min([chr["bbox"][0] for chr in tail_span_chars]),
387
- last_overlap_span["bbox"][1],
388
- last_overlap_span["bbox"][2],
389
- last_overlap_span["bbox"][3],
381
+ last_span_to_insert['bbox'] = (
382
+ min([chr['bbox'][0] for chr in tail_span_chars]),
383
+ last_overlap_span['bbox'][1],
384
+ last_overlap_span['bbox'][2],
385
+ last_overlap_span['bbox'][3],
390
386
  )
391
387
  # 插入到公式对象之后
392
- equation_idx = line["spans"].index(equation_span)
393
- line["spans"].insert(equation_idx + 1, last_span_to_insert) # 放入公式
388
+ equation_idx = line['spans'].index(equation_span)
389
+ line['spans'].insert(equation_idx + 1, last_span_to_insert) # 放入公式
394
390
  else: # 直接修改原来的span
395
- last_overlap_span["chars"] = tail_span_chars
396
- last_overlap_span["text"] = "".join([char["c"] for char in tail_span_chars])
397
- last_overlap_span["bbox"] = (
398
- min([chr["bbox"][0] for chr in tail_span_chars]),
399
- last_overlap_span["bbox"][1],
400
- last_overlap_span["bbox"][2],
401
- last_overlap_span["bbox"][3],
391
+ last_overlap_span['chars'] = tail_span_chars
392
+ last_overlap_span['text'] = ''.join([char['c'] for char in tail_span_chars])
393
+ last_overlap_span['bbox'] = (
394
+ min([chr['bbox'][0] for chr in tail_span_chars]),
395
+ last_overlap_span['bbox'][1],
396
+ last_overlap_span['bbox'][2],
397
+ last_overlap_span['bbox'][3],
402
398
  )
403
399
  else:
404
400
  # 删掉
@@ -406,15 +402,15 @@ def replace_line_v2(eqinfo, line):
406
402
  last_overlap_span not in delete_span
407
403
  and last_overlap_span != first_overlap_span
408
404
  ):
409
- line["spans"].remove(last_overlap_span)
405
+ line['spans'].remove(last_overlap_span)
410
406
 
411
- remain_txt = ""
412
- for span in line["spans"]:
413
- span_txt = "<span>"
414
- for char in span["chars"]:
415
- span_txt = span_txt + char["c"]
407
+ remain_txt = ''
408
+ for span in line['spans']:
409
+ span_txt = '<span>'
410
+ for char in span['chars']:
411
+ span_txt = span_txt + char['c']
416
412
 
417
- span_txt = span_txt + "</span>"
413
+ span_txt = span_txt + '</span>'
418
414
 
419
415
  remain_txt = remain_txt + span_txt
420
416
 
@@ -424,17 +420,15 @@ def replace_line_v2(eqinfo, line):
424
420
 
425
421
 
426
422
  def replace_eq_blk(eqinfo, text_block):
427
- """替换行内公式"""
428
- for line in text_block["lines"]:
429
- line_bbox = line["bbox"]
423
+ """替换行内公式."""
424
+ for line in text_block['lines']:
425
+ line_bbox = line['bbox']
430
426
  if (
431
- _is_xin(eqinfo["bbox"], line_bbox)
432
- or __y_overlap_ratio(eqinfo["bbox"], line_bbox) > 0.6
427
+ _is_xin(eqinfo['bbox'], line_bbox)
428
+ or __y_overlap_ratio(eqinfo['bbox'], line_bbox) > 0.6
433
429
  ): # 定位到行, 使用y方向重合率是因为有的时候,一个行的宽度会小于公式位置宽度:行很高,公式很窄,
434
430
  replace_succ = replace_line_v2(eqinfo, line)
435
- if (
436
- not replace_succ
437
- ): # 有的时候,一个pdf的line高度从API里会计算的有问题,因此在行内span级别会替换不成功,这就需要继续重试下一行
431
+ if not replace_succ: # 有的时候,一个pdf的line高度从API里会计算的有问题,因此在行内span级别会替换不成功,这就需要继续重试下一行
438
432
  continue
439
433
  else:
440
434
  break
@@ -444,13 +438,13 @@ def replace_eq_blk(eqinfo, text_block):
444
438
 
445
439
 
446
440
  def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
447
- """替换行内公式"""
441
+ """替换行内公式."""
448
442
  for eqinfo in inline_equation_bboxes:
449
- eqbox = eqinfo["bbox"]
443
+ eqbox = eqinfo['bbox']
450
444
  for blk in raw_text_blocks:
451
- if _is_xin(eqbox, blk["bbox"]):
445
+ if _is_xin(eqbox, blk['bbox']):
452
446
  if not replace_eq_blk(eqinfo, blk):
453
- logger.warning(f"行内公式没有替换成功:{eqinfo} ")
447
+ logger.warning(f'行内公式没有替换成功:{eqinfo} ')
454
448
  else:
455
449
  break
456
450
 
@@ -458,20 +452,18 @@ def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
458
452
 
459
453
 
460
454
  def remove_chars_in_text_blocks(text_blocks):
461
- """删除text_blocks里的char"""
455
+ """删除text_blocks里的char."""
462
456
  for blk in text_blocks:
463
- for line in blk["lines"]:
464
- for span in line["spans"]:
465
- _ = span.pop("chars", "no such key")
457
+ for line in blk['lines']:
458
+ for span in line['spans']:
459
+ _ = span.pop('chars', 'no such key')
466
460
  return text_blocks
467
461
 
468
462
 
469
463
  def replace_equations_in_textblock(
470
464
  raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes
471
465
  ):
472
- """
473
- 替换行间和和行内公式为latex
474
- """
466
+ """替换行间和和行内公式为latex."""
475
467
  raw_text_blocks = remove_text_block_in_interline_equation_bbox(
476
468
  interline_equation_bboxes, raw_text_blocks
477
469
  ) # 消除重叠:第一步,在公式内部的
@@ -486,22 +478,22 @@ def replace_equations_in_textblock(
486
478
 
487
479
 
488
480
  def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
489
- """ """
490
- new_pdf = f"{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf"
491
- with open(json_path, "r", encoding="utf-8") as f:
481
+ """"""
482
+ new_pdf = f'{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf'
483
+ with open(json_path, 'r', encoding='utf-8') as f:
492
484
  obj = json.loads(f.read())
493
485
 
494
486
  if os.path.exists(new_pdf):
495
487
  os.remove(new_pdf)
496
- new_doc = fitz.open("")
488
+ new_doc = fitz.open('')
497
489
 
498
- doc = fitz.open(pdf_path)
490
+ doc = fitz.open(pdf_path) # noqa: F841
499
491
  new_doc = fitz.open(pdf_path)
500
492
  for i in range(len(new_doc)):
501
493
  page = new_doc[i]
502
- inline_equation_bboxes = obj[f"page_{i}"]["inline_equations"]
503
- interline_equation_bboxes = obj[f"page_{i}"]["interline_equations"]
504
- raw_text_blocks = obj[f"page_{i}"]["preproc_blocks"]
494
+ inline_equation_bboxes = obj[f'page_{i}']['inline_equations']
495
+ interline_equation_bboxes = obj[f'page_{i}']['interline_equations']
496
+ raw_text_blocks = obj[f'page_{i}']['preproc_blocks']
505
497
  raw_text_blocks = remove_text_block_in_interline_equation_bbox(
506
498
  interline_equation_bboxes, raw_text_blocks
507
499
  ) # 消除重叠:第一步,在公式内部的
@@ -514,11 +506,10 @@ def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
514
506
  )
515
507
 
516
508
  # 为了检验公式是否重复,把每一行里,含有公式的span背景改成黄色的
517
- color_map = [fitz.pdfcolor["blue"], fitz.pdfcolor["green"]]
518
- j = 0
509
+ color_map = [fitz.pdfcolor['blue'], fitz.pdfcolor['green']] # noqa: F841
510
+ j = 0 # noqa: F841
519
511
  for blk in raw_text_blocks:
520
- for i, line in enumerate(blk["lines"]):
521
-
512
+ for i, line in enumerate(blk['lines']):
522
513
  # line_box = line['bbox']
523
514
  # shape = page.new_shape()
524
515
  # shape.draw_rect(line_box)
@@ -526,34 +517,34 @@ def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
526
517
  # shape.commit()
527
518
  # j = j+1
528
519
 
529
- for i, span in enumerate(line["spans"]):
520
+ for i, span in enumerate(line['spans']):
530
521
  shape_page = page.new_shape()
531
- span_type = span.get("_type")
532
- color = fitz.pdfcolor["blue"]
533
- if span_type == "first":
534
- color = fitz.pdfcolor["blue"]
535
- elif span_type == "tail":
536
- color = fitz.pdfcolor["green"]
522
+ span_type = span.get('_type')
523
+ color = fitz.pdfcolor['blue']
524
+ if span_type == 'first':
525
+ color = fitz.pdfcolor['blue']
526
+ elif span_type == 'tail':
527
+ color = fitz.pdfcolor['green']
537
528
  elif span_type == TYPE_INLINE_EQUATION:
538
- color = fitz.pdfcolor["black"]
529
+ color = fitz.pdfcolor['black']
539
530
  else:
540
531
  color = None
541
532
 
542
- b = span["bbox"]
533
+ b = span['bbox']
543
534
  shape_page.draw_rect(b)
544
535
 
545
536
  shape_page.finish(color=None, fill=color, fill_opacity=0.3)
546
537
  shape_page.commit()
547
538
 
548
539
  new_doc.save(new_pdf)
549
- logger.info(f"save ok {new_pdf}")
540
+ logger.info(f'save ok {new_pdf}')
550
541
  final_json = json.dumps(obj, ensure_ascii=False, indent=2)
551
- with open("equations_test/final_json.json", "w") as f:
542
+ with open('equations_test/final_json.json', 'w') as f:
552
543
  f.write(final_json)
553
544
 
554
545
  return new_pdf
555
546
 
556
547
 
557
- if __name__ == "__main__":
548
+ if __name__ == '__main__':
558
549
  # draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf)
559
550
  pass