magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,562 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import unicodedata
|
3
|
-
|
4
|
-
from magic_pdf.para.commons import *
|
5
|
-
|
6
|
-
|
7
|
-
# Force UTF-8 on stdout so that the debug prints in this module can emit
# non-ASCII (e.g. CJK) text regardless of the console's default encoding.
# ``reconfigure`` only exists on real ``io.TextIOWrapper`` streams; stdout may
# have been replaced (notebook kernels, capture wrappers, ``StringIO``) or be
# ``None``, in which case the call is skipped instead of failing at import time.
# NOTE(review): ``sys`` is not imported explicitly in this file; it is presumably
# re-exported by ``from magic_pdf.para.commons import *`` -- confirm.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
|
9
|
-
|
10
|
-
|
11
|
-
class BlockContinuationProcessor:
    """
    Detect paragraphs that continue across block / page boundaries and merge them.

    The public entry points are ``batch_tag_paras`` (marks, for the last
    paragraph of every block, whether the first paragraph of the following
    block continues it) and ``batch_merge_paras`` (follows those marks and
    concatenates the paragraph texts).

    All methods operate on plain dicts; ``safe_get`` and ``is_nested_list``
    come from the module's star import and are used as opaque helpers here.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _most_common(values, default):
        """Return the most frequent item of ``values`` (first seen wins ties), or ``default`` if empty."""
        if not values:
            return default
        # dict.fromkeys keeps first-occurrence order, so ties are resolved
        # deterministically (iterating a set of str is hash-order dependent).
        return max(dict.fromkeys(values), key=values.count)

    @staticmethod
    def _mean(values, default):
        """Return the arithmetic mean of ``values``, or ``default`` if empty."""
        if not values:
            return default
        return sum(values) / len(values)

    @staticmethod
    def _boundary_para(block, last):
        """Return ``(key, para)`` for the last (``last=True``) or first paragraph of ``block``.

        Returns ``(None, None)`` when the block is falsy or has no paragraphs.
        """
        paras = block.get("paras") if block else None
        if not paras:
            return None, None
        key = list(paras.keys())[-1 if last else 0]
        return key, paras[key]

    def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
        """
        Check whether two font types are similar.

        Two font types are similar when they are equal, or when their common
        prefix is at least ``prefix_length_ratio`` times the length of the
        shorter name. The threshold is truncated with ``int``, so for very
        short names it can be 0, in which case any two names compare similar.

        Parameters
        ----------
        font_type1 : str or list
            font type 1; for a list the first element is used ("" if empty)
        font_type2 : str or list
            font type 2; for a list the first element is used ("" if empty)
        prefix_length_ratio : float
            minimum ratio of the common prefix length to the length of the shorter font type

        Returns
        -------
        bool
            True if the two font types are similar, False otherwise.
        """
        if isinstance(font_type1, list):
            font_type1 = font_type1[0] if font_type1 else ""
        if isinstance(font_type2, list):
            font_type2 = font_type2[0] if font_type2 else ""

        if font_type1 == font_type2:
            return True

        # os.path.commonprefix works character-wise, so it is usable on any strings.
        common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))
        min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)

        return common_prefix_length >= min_prefix_length

    def __is_same_block_font(self, block1, block2):
        """
        Compare the font of block1 and block2.

        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2

        Returns
        -------
        is_same : bool
            True if both blocks have a similar font type, a font size differing
            by less than 1 and a comparable average character width, else False.
            False is also returned when either block has no text or a zero
            average character width.
        """
        block_1_font_type = safe_get(block1, "block_font_type", "")
        block_1_font_size = safe_get(block1, "block_font_size", 0)
        block_1_avg_char_width = safe_get(block1, "avg_char_width", 0)

        block_2_font_type = safe_get(block2, "block_font_type", "")
        block_2_font_size = safe_get(block2, "block_font_size", 0)
        block_2_avg_char_width = safe_get(block2, "avg_char_width", 0)

        if isinstance(block_1_font_size, list):
            block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
        if isinstance(block_2_font_size, list):
            block_2_font_size = block_2_font_size[0] if block_2_font_size else 0

        block_1_text = safe_get(block1, "text", "")
        block_2_text = safe_get(block2, "text", "")

        if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
            return False
        if not block_1_text or not block_2_text:
            return False

        # A much shorter second block gets a looser width tolerance (0.5 vs 0.2).
        text_len_ratio = len(block_2_text) / len(block_1_text)
        width_tolerance = 0.5 if text_len_ratio < 0.2 else 0.2
        relative_width_diff = abs(block_1_avg_char_width - block_2_avg_char_width) / min(
            block_1_avg_char_width, block_2_avg_char_width
        )
        avg_char_width_condition = relative_width_diff < width_tolerance
        block_font_size_condition = abs(block_1_font_size - block_2_font_size) < 1

        return (
            self.__is_similar_font_type(block_1_font_type, block_2_font_type)
            and avg_char_width_condition
            and block_font_size_condition
        )

    def _is_alphabet_char(self, char):
        """Return True if ``char`` is an ASCII letter (A-Z or a-z)."""
        return ("\u0041" <= char <= "\u005a") or ("\u0061" <= char <= "\u007a")

    def _is_chinese_char(self, char):
        """Return True if ``char`` lies in the range U+4E00..U+9FA5."""
        return "\u4e00" <= char <= "\u9fa5"

    def _is_other_letter_char(self, char):
        """
        Return True if ``char`` is a cased letter (Unicode category Lu or Ll)
        that is neither an ASCII letter nor in the Chinese range.

        A message is printed and False is returned when ``char`` is not a
        single character.
        """
        try:
            cat = unicodedata.category(char)
        except TypeError:
            print("The input to the function must be a single character.")
            return False
        if cat == "Lu" or cat == "Ll":
            return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
        return False

    def _is_year(self, s: str):
        """Return True if ``s`` parses as an integer between 1900 and 2099 inclusive."""
        try:
            number = int(s)
        except ValueError:
            return False
        return 1900 <= number <= 2099

    def __is_para_font_consistent(self, para_1, para_2):
        """
        Compare the font of para_1 and para_2.

        Parameters
        ----------
        para_1 : dict
            para1
        para_2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if both paragraphs have a similar font type and their font
            sizes differ by less than 1.5, else False. False if either is None.
        """
        if para_1 is None or para_2 is None:
            return False

        para_1_font_type = safe_get(para_1, "para_font_type", "")
        para_1_font_size = safe_get(para_1, "para_font_size", 0)

        para_2_font_type = safe_get(para_2, "para_font_type", "")
        para_2_font_size = safe_get(para_2, "para_font_size", 0)

        # Lists are reduced to one value: most common font type, average font size.
        # Empty lists fall back to the same defaults used for missing keys.
        if isinstance(para_1_font_type, list):
            para_1_font_type = self._most_common(para_1_font_type, "")
        if isinstance(para_2_font_type, list):
            para_2_font_type = self._most_common(para_2_font_type, "")
        if isinstance(para_1_font_size, list):
            para_1_font_size = self._mean(para_1_font_size, 0)
        if isinstance(para_2_font_size, list):
            para_2_font_size = self._mean(para_2_font_size, 0)

        return (
            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
            and abs(para_1_font_size - para_2_font_size) < 1.5
        )

    def _is_para_puncs_consistent(self, para_1, para_2):
        """
        Decide from punctuation and horizontal position whether para_2 (latter)
        continues para_1 (former).

        Parameters
        ----------
        para_1 : dict
            para1
        para_2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if the end of para_1's text and the start of para_2's text look
            like one paragraph, else False. False is returned when either text
            is empty or when a bounding box cannot be unpacked into 4 values.
        """
        para_1_text = safe_get(para_1, "para_text", "").strip()
        para_2_text = safe_get(para_2, "para_text", "").strip()

        # Nothing to compare; checked first so that missing geometry on an
        # empty paragraph cannot raise below.
        if not para_1_text or not para_2_text:
            return False

        para_1_bboxes = safe_get(para_1, "para_bbox", [])
        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)

        para_2_bboxes = safe_get(para_2, "para_bbox", [])
        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)

        # Compare the last box of the former paragraph with the first box of the latter.
        bbox_1 = para_1_bboxes[-1] if is_nested_list(para_1_bboxes) else para_1_bboxes
        bbox_2 = para_2_bboxes[0] if is_nested_list(para_2_bboxes) else para_2_bboxes
        try:
            x0_1, _, x1_1, _ = bbox_1
            x0_2, _, x1_2, _ = bbox_2
        except (TypeError, ValueError):
            # Missing or malformed bbox: no basis for a decision, do not merge.
            return False

        # Font sizes may be per-line lists; pick the entry matching the box used above.
        # NOTE(review): assumes list entries are ordered like the bboxes -- confirm.
        if isinstance(para_1_font_sizes, list):
            para_1_font_sizes = para_1_font_sizes[-1] if para_1_font_sizes else 0
        if isinstance(para_2_font_sizes, list):
            para_2_font_sizes = para_2_font_sizes[0] if para_2_font_sizes else 0

        # Tolerance is 80% of the mean font size of the two paragraphs.
        position_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
        are_two_paras_right_aligned = abs(x1_1 - x1_2) < position_threshold
        is_para1_left_indent_than_para2 = x0_1 - x0_2 > position_threshold
        is_para2_left_indent_than_para1 = x0_2 - x0_1 > position_threshold

        # Sentence-ending punctuation and hyphens.
        # NOTE(review): the list previously held ASCII "?" and "!" twice; the
        # duplicates are presumably the full-width forms (U+FF1F, U+FF01) that
        # sit next to the ideographic full stop -- confirm against upstream.
        end_puncs = [".", "?", "!", "。", "\uff1f", "\uff01", "…"]
        hyphen = ["-", "—"]

        last_char = para_1_text[-1]
        first_char = para_2_text[0]
        starts_with_ascii_letter = self._is_alphabet_char(first_char)
        starts_with_non_ascii_letter = self._is_chinese_char(first_char) or self._is_other_letter_char(first_char)

        # Both texts are stripped, so neither can end/start with whitespace;
        # the former whitespace checks were unreachable and are omitted.
        if last_char in hyphen:
            # A hyphenated break continues with a hyphen or a lowercase / non-ASCII letter.
            return (
                first_char in hyphen
                or (starts_with_ascii_letter and first_char.islower())
                or starts_with_non_ascii_letter
            )

        if last_char in end_puncs:
            # A finished sentence may still belong to the same paragraph when the
            # next text starts a new sentence and is not indented further right.
            return (
                (starts_with_ascii_letter and first_char.isupper()) or starts_with_non_ascii_letter
            ) and not is_para2_left_indent_than_para1

        # The former paragraph stops mid-sentence.
        return (
            starts_with_ascii_letter
            or self._is_year(para_2_text[0:4])
            or are_two_paras_right_aligned
            or is_para1_left_indent_than_para2
            or starts_with_non_ascii_letter
        )

    def _is_block_consistent(self, block1, block2):
        """
        Determine whether block1 and block2 are originally from the same block.

        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2

        Returns
        -------
        is_same : bool
            True if block1 and block2 have the same font, else False
        """
        return self.__is_same_block_font(block1, block2)

    def _is_para_continued(self, para1, para2):
        """
        Determine whether para1 and para2 are originally from the same paragraph.

        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if the fonts are consistent and the punctuation check passes, else False
        """
        # Short-circuit: the punctuation check is skipped when the fonts already
        # disagree (this also covers None paragraphs, which the font check rejects).
        return self.__is_para_font_consistent(para1, para2) and self._is_para_puncs_consistent(para1, para2)

    def _are_boundaries_of_block_consistent(self, block1, block2):
        """
        Check whether the last line of block1 and the first line of block2 use a consistent font.

        Only the first span of each of the two lines is inspected.

        Parameters
        ----------
        block1 : dict
            block1

        block2 : dict
            block2

        Returns
        -------
        is_consistent : bool
            True if font type is similar, font size differs by less than 1 and
            the font flags are equal; False otherwise or when either block has
            no lines / the boundary line has no spans.
        """
        if not block1["lines"] or not block2["lines"]:
            return False

        spans_of_last_line_of_block1 = block1["lines"][-1]["spans"]
        spans_of_first_line_of_block2 = block2["lines"][0]["spans"]
        if not spans_of_last_line_of_block1 or not spans_of_first_line_of_block2:
            return False

        span_1 = spans_of_last_line_of_block1[0]
        span_2 = spans_of_first_line_of_block2[0]

        # Font color is deliberately not part of the comparison.
        return (
            self.__is_similar_font_type(span_1["font"].lower(), span_2["font"].lower())
            and abs(span_1["size"] - span_2["size"]) < 1
            and span_1["flags"] == span_2["flags"]
        )

    def _get_last_paragraph(self, block):
        """
        Retrieve the last paragraph from a block.

        Parameters
        ----------
        block : dict
            The block from which to retrieve the paragraph.

        Returns
        -------
        dict
            The last paragraph of the block, or None if the block has no paragraphs.
        """
        paras = block["paras"]
        if not paras:
            return None
        return paras[list(paras.keys())[-1]]

    def _get_first_paragraph(self, block):
        """
        Retrieve the first paragraph from a block.

        Parameters
        ----------
        block : dict
            The block from which to retrieve the paragraph.

        Returns
        -------
        dict
            The first paragraph of the block, or None if the block has no paragraphs.
        """
        paras = block["paras"]
        if not paras:
            return None
        return paras[list(paras.keys())[0]]

    def should_merge_next_para(self, curr_para, next_para):
        """Return True if ``next_para`` continues ``curr_para``, else False."""
        return bool(self._is_para_continued(curr_para, next_para))

    def batch_tag_paras(self, pdf_dict):
        """
        Tag every paragraph with its location and with a link to the paragraph that continues it.

        For each paragraph the keys ``curr_para_location``
        (``[page_idx, block_id, para_number]``), ``next_para_location`` (None)
        and ``merge_next_para`` (False) are written. Then, for the last
        paragraph of each block, the first paragraph of the following block
        (on the same page, or of the first block of the next page that has
        ``para_blocks``) is examined; if it is a continuation,
        ``next_para_location`` / ``merge_next_para`` are set accordingly.
        Blocks without paragraphs are skipped.

        NOTE(review): page indices are the enumeration index over all items of
        ``pdf_dict`` and are used to build ``page_<idx>`` keys; this presumably
        relies on pages being stored first and in order -- confirm with callers.

        Parameters
        ----------
        pdf_dict : dict
            mapping of ``page_<n>`` ids to page contents; modified in place

        Returns
        -------
        dict
            the same ``pdf_dict`` object
        """
        the_last_page_id = len(pdf_dict) - 1

        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
            if not (curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", [])):
                continue

            para_blocks_of_curr_page = curr_page_content["para_blocks"]
            next_page_idx = curr_page_idx + 1
            next_page_content = pdf_dict.get(f"page_{next_page_idx}", {})

            for i, current_block in enumerate(para_blocks_of_curr_page):
                for para_id, curr_para in current_block["paras"].items():
                    curr_para["curr_para_location"] = [
                        curr_page_idx,
                        current_block["block_id"],
                        int(para_id.split("_")[-1]),
                    ]
                    curr_para["next_para_location"] = None  # defaults to None
                    curr_para["merge_next_para"] = False  # defaults to False

                _, curr_blk_last_para = self._boundary_para(current_block, last=True)
                if curr_blk_last_para is None:
                    # Block without paragraphs: nothing to link from.
                    continue

                next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None

                if next_block:
                    target_page_idx = curr_page_idx
                else:
                    # The next block is on a later page: skip pages without para_blocks.
                    while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
                        next_page_idx += 1
                        next_page_content = pdf_dict.get(f"page_{next_page_idx}", {})

                    next_page_blocks = next_page_content.get("para_blocks", [])
                    if not next_page_blocks:
                        continue
                    next_block = next_page_blocks[0]
                    target_page_idx = next_page_idx

                next_para_key, next_blk_first_para = self._boundary_para(next_block, last=False)
                if next_blk_first_para is None:
                    # Following block has no paragraphs: nothing to link to.
                    continue

                if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                    curr_blk_last_para["next_para_location"] = [
                        target_page_idx,
                        next_block["block_id"],
                        int(next_para_key.split("_")[-1]),
                    ]
                    curr_blk_last_para["merge_next_para"] = True

        return pdf_dict

    def find_block_by_id(self, para_blocks, block_id):
        """Return the first block in ``para_blocks`` whose ``block_id`` equals ``block_id``, or None."""
        for block in para_blocks:
            if block.get("block_id") == block_id:
                return block
        return None

    def batch_merge_paras(self, pdf_dict):
        """
        Merge paragraph texts along the links written by ``batch_tag_paras``.

        For every non-title paragraph with ``merge_next_para`` set, the linked
        paragraph's ``para_text`` is appended (separated by one space) and the
        link of the absorbed paragraph is taken over, so chains are followed to
        their end. An absorbed paragraph gets ``para_text == ""`` and its own
        link is cleared, so it is not processed again as a merge source.
        Merging stops at a title paragraph or when the link cannot be resolved.

        Parameters
        ----------
        pdf_dict : dict
            mapping of ``page_<n>`` ids to page contents; modified in place

        Returns
        -------
        dict
            the same ``pdf_dict`` object
        """
        for page_id, page_content in pdf_dict.items():
            if not (page_id.startswith("page_") and page_content.get("para_blocks", [])):
                continue

            for current_block in page_content["para_blocks"]:
                for curr_para in list(current_block["paras"].values()):
                    # Skip title paragraphs.
                    if curr_para.get("is_para_title"):
                        continue

                    while curr_para.get("merge_next_para"):
                        next_para_location = curr_para.get("next_para_location")
                        if not next_para_location:
                            break

                        next_page_idx, next_block_id, next_para_id = next_para_location
                        next_page_content = pdf_dict.get(f"page_{next_page_idx}")
                        if not next_page_content:
                            break

                        next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
                        if not next_block:
                            break

                        next_para = next_block["paras"].get(f"para_{next_para_id}")
                        if not next_para or next_para.get("is_para_title"):
                            break
                        if next_para is curr_para:
                            # A self-referencing link would never terminate.
                            break

                        # Merge the paragraph texts.
                        curr_para_text = curr_para.get("para_text", "")
                        next_para_text = next_para.get("para_text", "")
                        curr_para["para_text"] = curr_para_text + " " + next_para_text

                        # Take over the absorbed paragraph's link and merge flag.
                        curr_para["next_para_location"] = next_para.get("next_para_location")
                        curr_para["merge_next_para"] = next_para.get("merge_next_para", False)

                        # Empty text marks the paragraph as merged; clearing its link
                        # keeps it from later re-merging already-absorbed text
                        # (which used to turn its text into " ").
                        next_para["para_text"] = ""
                        next_para["next_para_location"] = None
                        next_para["merge_next_para"] = False

        return pdf_dict
|