magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,1014 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import re
|
3
|
-
import numpy as np
|
4
|
-
|
5
|
-
from magic_pdf.libs.nlp_utils import NLPModels
|
6
|
-
|
7
|
-
from magic_pdf.para.commons import *
|
8
|
-
|
9
|
-
import sys  # `sys` is not imported explicitly anywhere above in this file

# Switch stdout to UTF-8 so non-ASCII text can be printed regardless of the
# locale.  The hasattr guard keeps the import from failing when stdout has
# been replaced by a stream object that offers no reconfigure() method.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
|
11
|
-
|
12
|
-
|
13
|
-
class TitleProcessor:
|
14
|
-
def __init__(self, *doc_statistics) -> None:
    """
    Set up the title detector.

    Parameters
    ----------
    *doc_statistics
        Optional positional arguments. Only the first one is kept, as
        ``self.doc_statistics``; it is later looked up through ``safe_get``
        for the document's most common font types and sizes.

    Notes
    -----
    ``self.numbered_title_pattern`` is a verbose-mode regular expression
    (compile it with ``re.VERBOSE``) matching lines that start with a
    numbering prefix: bracketed or right-bracketed digits, upper/lower-case
    letters and Roman numerals, dotted numbers such as ``3.2.1``, Chinese
    ordinals, ``A.1`` style labels, ``A-``, ``word:`` and ``第…章/节``.
    """
    # Always define the attribute so later reads cannot raise AttributeError.
    # NOTE(review): an empty dict assumes safe_get() falls back to its default
    # for missing keys - confirm against magic_pdf.para.commons.safe_get.
    self.doc_statistics = doc_statistics[0] if len(doc_statistics) > 0 else {}

    self.nlp_model = NLPModels()
    self.MAX_TITLE_LEVEL = 3

    # \uFF08 / \uFF09 are the full-width (Chinese) parentheses.  The bracket
    # classes previously listed the ASCII bracket twice, and one of them also
    # contained a literal "|", which inside [...] is matched as a character.
    self.numbered_title_pattern = r"""
        ^                                 # 行首
        (                                 # 开始捕获组
            [\(\uFF08]\d+[\)\uFF09]                 # 括号内数字,支持中文和英文括号,例如:(1) 或 (1)
            |\d+[\)\uFF09]\s                   # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2)
            |[\(\uFF08][A-Z][\)\uFF09]              # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A)
            |[A-Z][\)\uFF09]\s                 # 大写字母后跟右括号和空格,例如:A) 或 A)
            |[\(\uFF08][IVXLCDM]+[\)\uFF09]         # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I)
            |[IVXLCDM]+[\)\uFF09]\s            # 罗马数字后跟右括号和空格,例如:I) 或 I)
            |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1
            |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格,例如:一、
            |[\(\uFF08][一二三四五六七八九十百千]+[\)\uFF09]\s*  # 中文括号内中文序号后跟空格,例如:(一)
            |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字,例如:A.1 或 A.1.1
            |[\(\uFF08][a-z][\)\uFF09]              # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a)
            |[a-z]\)\s                    # 小写字母后跟右括号和空格,例如:a)
            |[A-Z]-\s                     # 大写字母后跟短横线和空格,例如:A-
            |\w+:\s                       # 英文序号词后跟冒号和空格,例如:First:
            |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
            |[IVXLCDM]+\.                 # 罗马数字后跟点,例如:I.
            |\d+\.\s                      # 单个数字后跟点和空格,例如:1.
        )                                 # 结束捕获组
        .+                                # 标题的其余部分
    """
|
43
|
-
|
44
|
-
def _is_potential_title(
|
45
|
-
self,
|
46
|
-
curr_line,
|
47
|
-
prev_line,
|
48
|
-
prev_line_is_title,
|
49
|
-
next_line,
|
50
|
-
avg_char_width,
|
51
|
-
avg_char_height,
|
52
|
-
median_font_size,
|
53
|
-
):
|
54
|
-
"""
|
55
|
-
This function checks if the line is a potential title.
|
56
|
-
|
57
|
-
Parameters
|
58
|
-
----------
|
59
|
-
curr_line : dict
|
60
|
-
current line
|
61
|
-
prev_line : dict
|
62
|
-
previous line
|
63
|
-
next_line : dict
|
64
|
-
next line
|
65
|
-
avg_char_width : float
|
66
|
-
average of char widths
|
67
|
-
avg_char_height : float
|
68
|
-
average of line heights
|
69
|
-
|
70
|
-
Returns
|
71
|
-
-------
|
72
|
-
bool
|
73
|
-
True if the line is a potential title, False otherwise.
|
74
|
-
"""
|
75
|
-
|
76
|
-
def __is_line_centered(line_bbox, page_bbox, avg_char_width):
|
77
|
-
"""
|
78
|
-
This function checks if the line is centered on the page
|
79
|
-
|
80
|
-
Parameters
|
81
|
-
----------
|
82
|
-
line_bbox : list
|
83
|
-
bbox of the line
|
84
|
-
page_bbox : list
|
85
|
-
bbox of the page
|
86
|
-
avg_char_width : float
|
87
|
-
average of char widths
|
88
|
-
|
89
|
-
Returns
|
90
|
-
-------
|
91
|
-
bool
|
92
|
-
True if the line is centered on the page, False otherwise.
|
93
|
-
"""
|
94
|
-
horizontal_ratio = 0.5
|
95
|
-
horizontal_thres = horizontal_ratio * avg_char_width
|
96
|
-
|
97
|
-
x0, _, x1, _ = line_bbox
|
98
|
-
_, _, page_x1, _ = page_bbox
|
99
|
-
|
100
|
-
return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
|
101
|
-
|
102
|
-
def __is_bold_font_line(line):
|
103
|
-
"""
|
104
|
-
Check if a line contains any bold font style.
|
105
|
-
"""
|
106
|
-
|
107
|
-
def _is_bold_span(span):
|
108
|
-
# if span text is empty or only contains space, return False
|
109
|
-
if not span["text"].strip():
|
110
|
-
return False
|
111
|
-
|
112
|
-
return bool(span["flags"] & 2**4) # Check if the font is bold
|
113
|
-
|
114
|
-
for span in line["spans"]:
|
115
|
-
if not _is_bold_span(span):
|
116
|
-
return False
|
117
|
-
|
118
|
-
return True
|
119
|
-
|
120
|
-
def __is_italic_font_line(line):
|
121
|
-
"""
|
122
|
-
Check if a line contains any italic font style.
|
123
|
-
"""
|
124
|
-
|
125
|
-
def __is_italic_span(span):
|
126
|
-
return bool(span["flags"] & 2**1) # Check if the font is italic
|
127
|
-
|
128
|
-
for span in line["spans"]:
|
129
|
-
if not __is_italic_span(span):
|
130
|
-
return False
|
131
|
-
|
132
|
-
return True
|
133
|
-
|
134
|
-
def __is_punctuation_heavy(line_text):
|
135
|
-
"""
|
136
|
-
Check if the line contains a high ratio of punctuation marks, which may indicate
|
137
|
-
that the line is not a title.
|
138
|
-
|
139
|
-
Parameters:
|
140
|
-
line_text (str): Text of the line.
|
141
|
-
|
142
|
-
Returns:
|
143
|
-
bool: True if the line is heavy with punctuation, False otherwise.
|
144
|
-
"""
|
145
|
-
# Pattern for common title format like "X.Y. Title"
|
146
|
-
pattern = r"\b\d+\.\d+\..*\b"
|
147
|
-
|
148
|
-
# If the line matches the title format, return False
|
149
|
-
if re.match(pattern, line_text.strip()):
|
150
|
-
return False
|
151
|
-
|
152
|
-
# Find all punctuation marks in the line
|
153
|
-
punctuation_marks = re.findall(r"[^\w\s]", line_text)
|
154
|
-
number_of_punctuation_marks = len(punctuation_marks)
|
155
|
-
|
156
|
-
text_length = len(line_text)
|
157
|
-
|
158
|
-
if text_length == 0:
|
159
|
-
return False
|
160
|
-
|
161
|
-
punctuation_ratio = number_of_punctuation_marks / text_length
|
162
|
-
if punctuation_ratio >= 0.1:
|
163
|
-
return True
|
164
|
-
|
165
|
-
return False
|
166
|
-
|
167
|
-
def __has_mixed_font_styles(spans, strict_mode=False):
|
168
|
-
"""
|
169
|
-
This function checks if the line has mixed font styles, the strict mode will compare the font types
|
170
|
-
|
171
|
-
Parameters
|
172
|
-
----------
|
173
|
-
spans : list
|
174
|
-
spans of the line
|
175
|
-
strict_mode : bool
|
176
|
-
True for strict mode, the font types will be fully compared
|
177
|
-
False for non-strict mode, the font types will be compared by the most longest common prefix
|
178
|
-
|
179
|
-
Returns
|
180
|
-
-------
|
181
|
-
bool
|
182
|
-
True if the line has mixed font styles, False otherwise.
|
183
|
-
"""
|
184
|
-
if strict_mode:
|
185
|
-
font_styles = set()
|
186
|
-
for span in spans:
|
187
|
-
font_style = span["font"].lower()
|
188
|
-
font_styles.add(font_style)
|
189
|
-
|
190
|
-
return len(font_styles) > 1
|
191
|
-
|
192
|
-
else: # non-strict mode
|
193
|
-
font_styles = []
|
194
|
-
for span in spans:
|
195
|
-
font_style = span["font"].lower()
|
196
|
-
font_styles.append(font_style)
|
197
|
-
|
198
|
-
if len(font_styles) > 1:
|
199
|
-
longest_common_prefix = os.path.commonprefix(font_styles)
|
200
|
-
if len(longest_common_prefix) > 0:
|
201
|
-
return False
|
202
|
-
else:
|
203
|
-
return True
|
204
|
-
else:
|
205
|
-
return False
|
206
|
-
|
207
|
-
def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
|
208
|
-
"""
|
209
|
-
This function checks if the current line has a different font type from the previous and next lines
|
210
|
-
|
211
|
-
Parameters
|
212
|
-
----------
|
213
|
-
curr_line_font_type : str
|
214
|
-
font type of the current line
|
215
|
-
prev_line_font_type : str
|
216
|
-
font type of the previous line
|
217
|
-
next_line_font_type : str
|
218
|
-
font type of the next line
|
219
|
-
|
220
|
-
Returns
|
221
|
-
-------
|
222
|
-
bool
|
223
|
-
True if the current line has a different font type from the previous and next lines, False otherwise.
|
224
|
-
"""
|
225
|
-
return all(
|
226
|
-
curr_line_font_type != other_font_type.lower()
|
227
|
-
for other_font_type in [prev_line_font_type, next_line_font_type]
|
228
|
-
if other_font_type is not None
|
229
|
-
)
|
230
|
-
|
231
|
-
def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
|
232
|
-
"""
|
233
|
-
This function checks if the current line has a larger font size than the previous and next lines
|
234
|
-
|
235
|
-
Parameters
|
236
|
-
----------
|
237
|
-
curr_line_font_size : float
|
238
|
-
font size of the current line
|
239
|
-
prev_line_font_size : float
|
240
|
-
font size of the previous line
|
241
|
-
next_line_font_size : float
|
242
|
-
font size of the next line
|
243
|
-
|
244
|
-
Returns
|
245
|
-
-------
|
246
|
-
bool
|
247
|
-
True if the current line has a larger font size than the previous and next lines, False otherwise.
|
248
|
-
"""
|
249
|
-
return all(
|
250
|
-
curr_line_font_size > other_font_size * 1.2
|
251
|
-
for other_font_size in [prev_line_font_size, next_line_font_size]
|
252
|
-
if other_font_size is not None
|
253
|
-
)
|
254
|
-
|
255
|
-
def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
|
256
|
-
"""
|
257
|
-
This function checks if the current line is similar to the previous line
|
258
|
-
|
259
|
-
Parameters
|
260
|
-
----------
|
261
|
-
curr_line : dict
|
262
|
-
current line
|
263
|
-
prev_line : dict
|
264
|
-
previous line
|
265
|
-
|
266
|
-
Returns
|
267
|
-
-------
|
268
|
-
bool
|
269
|
-
True if the current line is similar to the previous line, False otherwise.
|
270
|
-
"""
|
271
|
-
|
272
|
-
if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
|
273
|
-
return True
|
274
|
-
else:
|
275
|
-
return False
|
276
|
-
|
277
|
-
def __is_same_font_type_of_docAvg(curr_line_font_type):
|
278
|
-
"""
|
279
|
-
This function checks if the current line has the same font type as the document average font type
|
280
|
-
|
281
|
-
Parameters
|
282
|
-
----------
|
283
|
-
curr_line_font_type : str
|
284
|
-
font type of the current line
|
285
|
-
|
286
|
-
Returns
|
287
|
-
-------
|
288
|
-
bool
|
289
|
-
True if the current line has the same font type as the document average font type, False otherwise.
|
290
|
-
"""
|
291
|
-
doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
|
292
|
-
doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
|
293
|
-
|
294
|
-
return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
|
295
|
-
|
296
|
-
def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
|
297
|
-
"""
|
298
|
-
This function checks if the current line has a large enough font size
|
299
|
-
|
300
|
-
Parameters
|
301
|
-
----------
|
302
|
-
curr_line_font_size : float
|
303
|
-
font size of the current line
|
304
|
-
ratio : float
|
305
|
-
ratio of the current line font size to the document average font size
|
306
|
-
|
307
|
-
Returns
|
308
|
-
-------
|
309
|
-
bool
|
310
|
-
True if the current line has a large enough font size, False otherwise.
|
311
|
-
"""
|
312
|
-
doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
|
313
|
-
doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
|
314
|
-
doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
|
315
|
-
|
316
|
-
return curr_line_font_size >= doc_avg_font_size * ratio
|
317
|
-
|
318
|
-
def __is_sufficient_spacing_above_and_below(
|
319
|
-
curr_line_bbox,
|
320
|
-
prev_line_bbox,
|
321
|
-
next_line_bbox,
|
322
|
-
avg_char_height,
|
323
|
-
median_font_size,
|
324
|
-
):
|
325
|
-
"""
|
326
|
-
This function checks if the current line has sufficient spacing above and below
|
327
|
-
|
328
|
-
Parameters
|
329
|
-
----------
|
330
|
-
curr_line_bbox : list
|
331
|
-
bbox of the current line
|
332
|
-
prev_line_bbox : list
|
333
|
-
bbox of the previous line
|
334
|
-
next_line_bbox : list
|
335
|
-
bbox of the next line
|
336
|
-
avg_char_width : float
|
337
|
-
average of char widths
|
338
|
-
avg_char_height : float
|
339
|
-
average of line heights
|
340
|
-
|
341
|
-
Returns
|
342
|
-
-------
|
343
|
-
bool
|
344
|
-
True if the current line has sufficient spacing above and below, False otherwise.
|
345
|
-
"""
|
346
|
-
vertical_ratio = 1.25
|
347
|
-
vertical_thres = vertical_ratio * median_font_size
|
348
|
-
|
349
|
-
_, y0, _, y1 = curr_line_bbox
|
350
|
-
|
351
|
-
sufficient_spacing_above = False
|
352
|
-
if prev_line_bbox:
|
353
|
-
vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
|
354
|
-
sufficient_spacing_above = vertical_spacing_above > vertical_thres
|
355
|
-
else:
|
356
|
-
sufficient_spacing_above = True
|
357
|
-
|
358
|
-
sufficient_spacing_below = False
|
359
|
-
if next_line_bbox:
|
360
|
-
vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
|
361
|
-
sufficient_spacing_below = vertical_spacing_below > vertical_thres
|
362
|
-
else:
|
363
|
-
sufficient_spacing_below = True
|
364
|
-
|
365
|
-
return (sufficient_spacing_above, sufficient_spacing_below)
|
366
|
-
|
367
|
-
def __is_word_list_line_by_rules(curr_line_text):
|
368
|
-
"""
|
369
|
-
This function checks if the current line is a word list
|
370
|
-
|
371
|
-
Parameters
|
372
|
-
----------
|
373
|
-
curr_line_text : str
|
374
|
-
text of the current line
|
375
|
-
|
376
|
-
Returns
|
377
|
-
-------
|
378
|
-
bool
|
379
|
-
True if the current line is a name list, False otherwise.
|
380
|
-
"""
|
381
|
-
# name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[,,;;\s]|$)"
|
382
|
-
name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[,,;;\s]|$)"
|
383
|
-
|
384
|
-
compiled_pattern = re.compile(name_list_pattern)
|
385
|
-
|
386
|
-
if compiled_pattern.search(curr_line_text):
|
387
|
-
return True
|
388
|
-
else:
|
389
|
-
return False
|
390
|
-
|
391
|
-
# """
|
392
|
-
def __get_text_catgr_by_nlp(curr_line_text):
|
393
|
-
"""
|
394
|
-
This function checks if the current line is a name list using nlp model, such as spacy
|
395
|
-
|
396
|
-
Parameters
|
397
|
-
----------
|
398
|
-
curr_line_text : str
|
399
|
-
text of the current line
|
400
|
-
|
401
|
-
Returns
|
402
|
-
-------
|
403
|
-
bool
|
404
|
-
True if the current line is a name list, False otherwise.
|
405
|
-
"""
|
406
|
-
|
407
|
-
result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
|
408
|
-
|
409
|
-
return result
|
410
|
-
|
411
|
-
# """
|
412
|
-
|
413
|
-
def __is_numbered_title(curr_line_text):
|
414
|
-
"""
|
415
|
-
This function checks if the current line is a numbered list
|
416
|
-
|
417
|
-
Parameters
|
418
|
-
----------
|
419
|
-
curr_line_text : str
|
420
|
-
text of the current line
|
421
|
-
|
422
|
-
Returns
|
423
|
-
-------
|
424
|
-
bool
|
425
|
-
True if the current line is a numbered list, False otherwise.
|
426
|
-
"""
|
427
|
-
|
428
|
-
compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)
|
429
|
-
|
430
|
-
if compiled_pattern.search(curr_line_text):
|
431
|
-
return True
|
432
|
-
else:
|
433
|
-
return False
|
434
|
-
|
435
|
-
def __is_end_with_ending_puncs(line_text):
|
436
|
-
"""
|
437
|
-
This function checks if the current line ends with a ending punctuation mark
|
438
|
-
|
439
|
-
Parameters
|
440
|
-
----------
|
441
|
-
line_text : str
|
442
|
-
text of the current line
|
443
|
-
|
444
|
-
Returns
|
445
|
-
-------
|
446
|
-
bool
|
447
|
-
True if the current line ends with a punctuation mark, False otherwise.
|
448
|
-
"""
|
449
|
-
end_puncs = [".", "?", "!", "。", "?", "!", "…"]
|
450
|
-
|
451
|
-
line_text = line_text.rstrip()
|
452
|
-
if line_text[-1] in end_puncs:
|
453
|
-
return True
|
454
|
-
|
455
|
-
return False
|
456
|
-
|
457
|
-
def __contains_only_no_meaning_symbols(line_text):
|
458
|
-
"""
|
459
|
-
This function checks if the current line contains only symbols that have no meaning, if so, it is not a title.
|
460
|
-
Situation contains:
|
461
|
-
1. Only have punctuation marks
|
462
|
-
2. Only have other non-meaning symbols
|
463
|
-
|
464
|
-
Parameters
|
465
|
-
----------
|
466
|
-
line_text : str
|
467
|
-
text of the current line
|
468
|
-
|
469
|
-
Returns
|
470
|
-
-------
|
471
|
-
bool
|
472
|
-
True if the current line contains only symbols that have no meaning, False otherwise.
|
473
|
-
"""
|
474
|
-
|
475
|
-
punctuation_marks = re.findall(r"[^\w\s]", line_text) # find all punctuation marks
|
476
|
-
number_of_punctuation_marks = len(punctuation_marks)
|
477
|
-
|
478
|
-
text_length = len(line_text)
|
479
|
-
|
480
|
-
if text_length == 0:
|
481
|
-
return False
|
482
|
-
|
483
|
-
punctuation_ratio = number_of_punctuation_marks / text_length
|
484
|
-
if punctuation_ratio >= 0.9:
|
485
|
-
return True
|
486
|
-
|
487
|
-
return False
|
488
|
-
|
489
|
-
def __is_equation(line_text):
|
490
|
-
"""
|
491
|
-
This function checks if the current line is an equation.
|
492
|
-
|
493
|
-
Parameters
|
494
|
-
----------
|
495
|
-
line_text : str
|
496
|
-
|
497
|
-
Returns
|
498
|
-
-------
|
499
|
-
bool
|
500
|
-
True if the current line is an equation, False otherwise.
|
501
|
-
"""
|
502
|
-
equation_reg = r"\$.*?\\overline.*?\$" # to match interline equations
|
503
|
-
|
504
|
-
if re.search(equation_reg, line_text):
|
505
|
-
return True
|
506
|
-
else:
|
507
|
-
return False
|
508
|
-
|
509
|
-
def __is_title_by_len(text, max_length=200):
|
510
|
-
"""
|
511
|
-
This function checks if the current line is a title by length.
|
512
|
-
|
513
|
-
Parameters
|
514
|
-
----------
|
515
|
-
text : str
|
516
|
-
text of the current line
|
517
|
-
|
518
|
-
max_length : int
|
519
|
-
max length of the title
|
520
|
-
|
521
|
-
Returns
|
522
|
-
-------
|
523
|
-
bool
|
524
|
-
True if the current line is a title, False otherwise.
|
525
|
-
|
526
|
-
"""
|
527
|
-
text = text.strip()
|
528
|
-
return len(text) <= max_length
|
529
|
-
|
530
|
-
def __compute_line_font_type_and_size(curr_line):
|
531
|
-
"""
|
532
|
-
This function computes the font type and font size of the line.
|
533
|
-
|
534
|
-
Parameters
|
535
|
-
----------
|
536
|
-
line : dict
|
537
|
-
line
|
538
|
-
|
539
|
-
Returns
|
540
|
-
-------
|
541
|
-
font_type : str
|
542
|
-
font type of the line
|
543
|
-
font_size : float
|
544
|
-
font size of the line
|
545
|
-
"""
|
546
|
-
spans = curr_line["spans"]
|
547
|
-
max_accumulated_length = 0
|
548
|
-
max_span_font_size = curr_line["spans"][0]["size"] # default value, float type
|
549
|
-
max_span_font_type = curr_line["spans"][0]["font"].lower() # default value, string type
|
550
|
-
for span in spans:
|
551
|
-
if span["text"].isspace():
|
552
|
-
continue
|
553
|
-
span_length = span["bbox"][2] - span["bbox"][0]
|
554
|
-
if span_length > max_accumulated_length:
|
555
|
-
max_accumulated_length = span_length
|
556
|
-
max_span_font_size = span["size"]
|
557
|
-
max_span_font_type = span["font"].lower()
|
558
|
-
|
559
|
-
return max_span_font_type, max_span_font_size
|
560
|
-
|
561
|
-
"""
|
562
|
-
Title detecting main Process.
|
563
|
-
"""
|
564
|
-
|
565
|
-
"""
|
566
|
-
Basic features about the current line.
|
567
|
-
"""
|
568
|
-
curr_line_bbox = curr_line["bbox"]
|
569
|
-
curr_line_text = curr_line["text"]
|
570
|
-
curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
|
571
|
-
|
572
|
-
if len(curr_line_text.strip()) == 0: # skip empty lines
|
573
|
-
return False
|
574
|
-
|
575
|
-
prev_line_bbox = prev_line["bbox"] if prev_line else None
|
576
|
-
if prev_line:
|
577
|
-
prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
|
578
|
-
else:
|
579
|
-
prev_line_font_type, prev_line_font_size = None, None
|
580
|
-
|
581
|
-
next_line_bbox = next_line["bbox"] if next_line else None
|
582
|
-
if next_line:
|
583
|
-
next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
|
584
|
-
else:
|
585
|
-
next_line_font_type, next_line_font_size = None, None
|
586
|
-
|
587
|
-
"""
|
588
|
-
Aggregated features about the current line.
|
589
|
-
"""
|
590
|
-
is_italc_font = __is_italic_font_line(curr_line)
|
591
|
-
is_bold_font = __is_bold_font_line(curr_line)
|
592
|
-
|
593
|
-
is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
|
594
|
-
is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
|
595
|
-
is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
|
596
|
-
|
597
|
-
is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
|
598
|
-
|
599
|
-
is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
|
600
|
-
|
601
|
-
is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
|
602
|
-
is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
|
603
|
-
|
604
|
-
is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
|
605
|
-
|
606
|
-
is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
|
607
|
-
is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
|
608
|
-
|
609
|
-
is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
|
610
|
-
curr_line_font_size, prev_line_font_size, next_line_font_size
|
611
|
-
)
|
612
|
-
|
613
|
-
is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
|
614
|
-
curr_line_font_type, prev_line_font_type, next_line_font_type
|
615
|
-
)
|
616
|
-
|
617
|
-
has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
|
618
|
-
curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
|
619
|
-
)
|
620
|
-
|
621
|
-
is_similar_to_pre_line = __is_similar_to_pre_line(
|
622
|
-
curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
|
623
|
-
)
|
624
|
-
|
625
|
-
"""
|
626
|
-
Further aggregated features about the current line.
|
627
|
-
|
628
|
-
Attention:
|
629
|
-
Features that start with __ are for internal use.
|
630
|
-
"""
|
631
|
-
|
632
|
-
__is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
|
633
|
-
curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
|
634
|
-
)
|
635
|
-
__is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
|
636
|
-
is_a_left_inline_title = (
|
637
|
-
is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
|
638
|
-
)
|
639
|
-
|
640
|
-
is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
|
641
|
-
is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
|
642
|
-
|
643
|
-
is_title_by_check_pre_and_next_line = (
|
644
|
-
(prev_line is not None or next_line is not None)
|
645
|
-
and has_sufficient_spaces_above
|
646
|
-
and has_sufficient_spaces_below
|
647
|
-
and is_potential_title_font
|
648
|
-
)
|
649
|
-
|
650
|
-
is_numbered_title = __is_numbered_title(curr_line_text) and (
|
651
|
-
(has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
|
652
|
-
)
|
653
|
-
|
654
|
-
is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
|
655
|
-
|
656
|
-
is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
|
657
|
-
|
658
|
-
is_equation = __is_equation(curr_line_text)
|
659
|
-
|
660
|
-
is_title_by_len = __is_title_by_len(curr_line_text)
|
661
|
-
|
662
|
-
"""
|
663
|
-
Decide if the line is a title.
|
664
|
-
"""
|
665
|
-
# is_title = False
|
666
|
-
# if prev_line_is_title:
|
667
|
-
|
668
|
-
is_title = (
|
669
|
-
is_not_end_with_ending_puncs # not end with ending punctuation marks
|
670
|
-
and is_not_only_no_meaning_symbols # not only have no meaning symbols
|
671
|
-
and is_title_by_len # is a title by length, default max length is 200
|
672
|
-
and not is_equation # an interline equation should never be a title
|
673
|
-
and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
|
674
|
-
and (
|
675
|
-
(is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
|
676
|
-
or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
|
677
|
-
or (
|
678
|
-
is_much_larger_font_than_doc_avg
|
679
|
-
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
|
680
|
-
)
|
681
|
-
or (
|
682
|
-
is_font_size_little_less_than_doc_avg
|
683
|
-
and is_bold_font
|
684
|
-
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
|
685
|
-
)
|
686
|
-
) # not the same font type as the document average font type, which includes the most common font type and the second most common font type
|
687
|
-
and (
|
688
|
-
(
|
689
|
-
not is_person_or_org_list_line_by_nlp
|
690
|
-
and (
|
691
|
-
is_much_larger_font_than_doc_avg
|
692
|
-
or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
|
693
|
-
)
|
694
|
-
)
|
695
|
-
or (
|
696
|
-
not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
|
697
|
-
and not is_a_left_inline_title
|
698
|
-
and not is_punctuation_heavy
|
699
|
-
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
|
700
|
-
)
|
701
|
-
or (
|
702
|
-
is_person_or_org_list_line_by_nlp
|
703
|
-
and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
|
704
|
-
and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
|
705
|
-
)
|
706
|
-
or (is_numbered_title and not is_a_left_inline_title)
|
707
|
-
)
|
708
|
-
)
|
709
|
-
# ) or (is_similar_to_pre_line and prev_line_is_title)
|
710
|
-
|
711
|
-
is_name_or_org_list_to_be_removed = (
|
712
|
-
(is_person_or_org_list_line_by_nlp)
|
713
|
-
and is_punctuation_heavy
|
714
|
-
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
|
715
|
-
) and not is_title
|
716
|
-
|
717
|
-
if is_name_or_org_list_to_be_removed:
|
718
|
-
is_author_or_org_list = True
|
719
|
-
# print curr_line_text to check
|
720
|
-
# print_yellow(f"Text of is_author_or_org_list: {curr_line_text}")
|
721
|
-
else:
|
722
|
-
is_author_or_org_list = False
|
723
|
-
"""
|
724
|
-
# print reason why the line is a title
|
725
|
-
if is_title:
|
726
|
-
print_green("This line is a title.")
|
727
|
-
print_green("↓" * 10)
|
728
|
-
print()
|
729
|
-
print("curr_line_text: ", curr_line_text)
|
730
|
-
print()
|
731
|
-
|
732
|
-
# print reason why the line is not a title
|
733
|
-
line_text = curr_line_text.strip()
|
734
|
-
test_text = "Career/Personal Life"
|
735
|
-
text_content_condition = line_text == test_text
|
736
|
-
|
737
|
-
if not is_title and text_content_condition: # Print specific line
|
738
|
-
# if not is_title: # Print each line
|
739
|
-
print_red("This line is not a title.")
|
740
|
-
print_red("↓" * 10)
|
741
|
-
|
742
|
-
print()
|
743
|
-
print("curr_line_text: ", curr_line_text)
|
744
|
-
print()
|
745
|
-
|
746
|
-
if is_not_end_with_ending_puncs:
|
747
|
-
print_green(f"is_not_end_with_ending_puncs")
|
748
|
-
else:
|
749
|
-
print_red(f"is_end_with_ending_puncs")
|
750
|
-
|
751
|
-
if is_not_only_no_meaning_symbols:
|
752
|
-
print_green(f"is_not_only_no_meaning_symbols")
|
753
|
-
else:
|
754
|
-
print_red(f"is_only_no_meaning_symbols")
|
755
|
-
|
756
|
-
if is_title_by_len:
|
757
|
-
print_green(f"is_title_by_len: {is_title_by_len}")
|
758
|
-
else:
|
759
|
-
print_red(f"is_not_title_by_len: {is_title_by_len}")
|
760
|
-
|
761
|
-
if is_equation:
|
762
|
-
print_red(f"is_equation")
|
763
|
-
else:
|
764
|
-
print_green(f"is_not_equation")
|
765
|
-
|
766
|
-
if is_potential_title_font:
|
767
|
-
print_green(f"is_potential_title_font")
|
768
|
-
else:
|
769
|
-
print_red(f"is_not_potential_title_font")
|
770
|
-
|
771
|
-
if is_punctuation_heavy:
|
772
|
-
print_red("is_punctuation_heavy")
|
773
|
-
else:
|
774
|
-
print_green("is_not_punctuation_heavy")
|
775
|
-
|
776
|
-
if is_bold_font:
|
777
|
-
print_green(f"is_bold_font")
|
778
|
-
else:
|
779
|
-
print_red(f"is_not_bold_font")
|
780
|
-
|
781
|
-
if is_font_size_not_less_than_doc_avg:
|
782
|
-
print_green(f"is_larger_font_than_doc_avg")
|
783
|
-
else:
|
784
|
-
print_red(f"is_not_larger_font_than_doc_avg")
|
785
|
-
|
786
|
-
if is_much_larger_font_than_doc_avg:
|
787
|
-
print_green(f"is_much_larger_font_than_doc_avg")
|
788
|
-
else:
|
789
|
-
print_red(f"is_not_much_larger_font_than_doc_avg")
|
790
|
-
|
791
|
-
if is_not_same_font_type_of_docAvg:
|
792
|
-
print_green(f"is_not_same_font_type_of_docAvg")
|
793
|
-
else:
|
794
|
-
print_red(f"is_same_font_type_of_docAvg")
|
795
|
-
|
796
|
-
if is_word_list_line_by_rules:
|
797
|
-
print_red("is_word_list_line_by_rules")
|
798
|
-
else:
|
799
|
-
print_green("is_not_name_list_by_rules")
|
800
|
-
|
801
|
-
if is_person_or_org_list_line_by_nlp:
|
802
|
-
print_red("is_person_or_org_list_line_by_nlp")
|
803
|
-
else:
|
804
|
-
print_green("is_not_person_or_org_list_line_by_nlp")
|
805
|
-
|
806
|
-
if not is_numbered_title:
|
807
|
-
print_red("is_not_numbered_title")
|
808
|
-
else:
|
809
|
-
print_green("is_numbered_title")
|
810
|
-
|
811
|
-
if is_a_left_inline_title:
|
812
|
-
print_red("is_a_left_inline_title")
|
813
|
-
else:
|
814
|
-
print_green("is_not_a_left_inline_title")
|
815
|
-
|
816
|
-
if not is_title_by_check_prev_line:
|
817
|
-
print_red("is_not_title_by_check_prev_line")
|
818
|
-
else:
|
819
|
-
print_green("is_title_by_check_prev_line")
|
820
|
-
|
821
|
-
if not is_title_by_check_next_line:
|
822
|
-
print_red("is_not_title_by_check_next_line")
|
823
|
-
else:
|
824
|
-
print_green("is_title_by_check_next_line")
|
825
|
-
|
826
|
-
if not is_title_by_check_pre_and_next_line:
|
827
|
-
print_red("is_not_title_by_check_pre_and_next_line")
|
828
|
-
else:
|
829
|
-
print_green("is_title_by_check_pre_and_next_line")
|
830
|
-
|
831
|
-
# print_green("Common features:")
|
832
|
-
# print_green("↓" * 10)
|
833
|
-
|
834
|
-
# print(f" curr_line_font_type: {curr_line_font_type}")
|
835
|
-
# print(f" curr_line_font_size: {curr_line_font_size}")
|
836
|
-
# print()
|
837
|
-
|
838
|
-
"""
|
839
|
-
|
840
|
-
return is_title, is_author_or_org_list
|
841
|
-
|
842
|
-
def _detect_block_title(self, input_block):
|
843
|
-
"""
|
844
|
-
Use the functions 'is_potential_title' to detect titles of each paragraph block.
|
845
|
-
If a line is a title, then the value of key 'is_title' of the line will be set to True.
|
846
|
-
"""
|
847
|
-
|
848
|
-
raw_lines = input_block["lines"]
|
849
|
-
|
850
|
-
prev_line_is_title_flag = False
|
851
|
-
|
852
|
-
for i, curr_line in enumerate(raw_lines):
|
853
|
-
prev_line = raw_lines[i - 1] if i > 0 else None
|
854
|
-
next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None
|
855
|
-
|
856
|
-
blk_avg_char_width = input_block["avg_char_width"]
|
857
|
-
blk_avg_char_height = input_block["avg_char_height"]
|
858
|
-
blk_media_font_size = input_block["median_font_size"]
|
859
|
-
|
860
|
-
is_title, is_author_or_org_list = self._is_potential_title(
|
861
|
-
curr_line,
|
862
|
-
prev_line,
|
863
|
-
prev_line_is_title_flag,
|
864
|
-
next_line,
|
865
|
-
blk_avg_char_width,
|
866
|
-
blk_avg_char_height,
|
867
|
-
blk_media_font_size,
|
868
|
-
)
|
869
|
-
|
870
|
-
if is_title:
|
871
|
-
curr_line["is_title"] = is_title
|
872
|
-
prev_line_is_title_flag = True
|
873
|
-
else:
|
874
|
-
curr_line["is_title"] = False
|
875
|
-
prev_line_is_title_flag = False
|
876
|
-
|
877
|
-
if is_author_or_org_list:
|
878
|
-
curr_line["is_author_or_org_list"] = is_author_or_org_list
|
879
|
-
else:
|
880
|
-
curr_line["is_author_or_org_list"] = False
|
881
|
-
|
882
|
-
return input_block
|
883
|
-
|
884
|
-
def batch_process_blocks_detect_titles(self, pdf_dic):
    """
    Run title detection over the paragraph blocks of every page.

    Only entries of ``pdf_dic`` whose key starts with ``"page_"`` are
    visited. On a page where at least one block does not have exactly one
    line, every block goes through ``self._detect_block_title``; on a page
    made only of single-line blocks the blocks are kept untouched. Each block
    then receives ``"is_block_title"`` and
    ``"is_block_an_author_or_org_list"`` (1 or 0), and the number of title
    lines seen is stored in ``pdf_dic["statistics"]["num_titles"]``.

    Parameters
    ----------
    pdf_dic : dict
        result dictionary

    Returns
    -------
    pdf_dic : dict
        the same result dictionary, updated in place
    """
    title_count = 0

    for page_id, page in pdf_dic.items():
        if not page_id.startswith("page_"):
            continue

        page_blocks = page["para_blocks"] if "para_blocks" in page else []

        # Detection is skipped only when every block on the page is a single line.
        single_line_count = sum(1 for blk in page_blocks if len(blk["lines"]) == 1)
        run_detection = single_line_count != len(page_blocks)

        processed_blocks = []
        for blk in page_blocks:
            if run_detection:
                blk = self._detect_block_title(blk)
            processed_blocks.append(blk)
            title_count += sum(ln.get("is_title", 0) for ln in blk["lines"])

        page["para_blocks"] = processed_blocks

        for blk in processed_blocks:
            block_lines = blk["lines"]

            every_line_is_title = all(safe_get(ln, "is_title", False) for ln in block_lines)
            text_length = sum(len(ln["text"]) for ln in block_lines)
            # A paragraph of 200 characters or more is not treated as a title.
            blk["is_block_title"] = 1 if every_line_is_title and text_length < 200 else 0

            every_line_is_author_or_org = all(
                safe_get(ln, "is_author_or_org_list", False) for ln in block_lines
            )
            # Author / organisation lists are only flagged on the first page.
            blk["is_block_an_author_or_org_list"] = (
                1 if every_line_is_author_or_org and page_id == "page_0" else 0
            )

    pdf_dic["statistics"]["num_titles"] = title_count

    return pdf_dic
|
946
|
-
|
947
|
-
def __determine_size_based_level(self, title_blocks):
|
948
|
-
"""
|
949
|
-
This function determines the title level based on the font size of the title.
|
950
|
-
|
951
|
-
Parameters
|
952
|
-
----------
|
953
|
-
title_blocks : list
|
954
|
-
|
955
|
-
Returns
|
956
|
-
-------
|
957
|
-
title_blocks : list
|
958
|
-
"""
|
959
|
-
|
960
|
-
font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
|
961
|
-
|
962
|
-
# Use the mean and std of font sizes to remove extreme values
|
963
|
-
mean_font_size = np.mean(font_sizes)
|
964
|
-
std_font_size = np.std(font_sizes)
|
965
|
-
min_extreme_font_size = mean_font_size - std_font_size # type: ignore
|
966
|
-
max_extreme_font_size = mean_font_size + std_font_size # type: ignore
|
967
|
-
|
968
|
-
# Compute the threshold for title level
|
969
|
-
middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
|
970
|
-
if middle_font_sizes.size > 0:
|
971
|
-
middle_mean_font_size = np.mean(middle_font_sizes)
|
972
|
-
level_threshold = middle_mean_font_size
|
973
|
-
else:
|
974
|
-
level_threshold = mean_font_size
|
975
|
-
|
976
|
-
for tb in title_blocks:
|
977
|
-
title_block = tb["block"]
|
978
|
-
title_font_size = safe_get(title_block, "block_font_size", 0)
|
979
|
-
|
980
|
-
current_level = 1 # Initialize title level, the biggest level is 1
|
981
|
-
|
982
|
-
# print(f"Before adjustment by font size, {current_level}")
|
983
|
-
if title_font_size >= max_extreme_font_size:
|
984
|
-
current_level = 1
|
985
|
-
elif title_font_size <= min_extreme_font_size:
|
986
|
-
current_level = 3
|
987
|
-
elif float(title_font_size) >= float(level_threshold):
|
988
|
-
current_level = 2
|
989
|
-
else:
|
990
|
-
current_level = 3
|
991
|
-
# print(f"After adjustment by font size, {current_level}")
|
992
|
-
|
993
|
-
title_block["block_title_level"] = current_level
|
994
|
-
|
995
|
-
return title_blocks
|
996
|
-
|
997
|
-
def batch_process_blocks_recog_title_level(self, pdf_dic):
    """
    Collect all paragraph blocks flagged as block titles and assign them a level.

    Every entry of ``pdf_dic`` whose key starts with ``"page_"`` is scanned;
    blocks in its ``"para_blocks"`` list with a truthy ``"is_block_title"``
    are gathered together with their page id. When at least one was found,
    the collection is passed to the font-size based levelling step.

    Parameters
    ----------
    pdf_dic : dict
        result dictionary

    Returns
    -------
    pdf_dic : dict
        the same result dictionary
    """
    collected_titles = []

    # Collect all titles
    for page_key, page in pdf_dic.items():
        if not page_key.startswith("page_"):
            continue
        collected_titles.extend(
            {"page_id": page_key, "block": candidate}
            for candidate in page.get("para_blocks", [])
            if candidate.get("is_block_title")
        )

    # Determine title level based on font size
    if collected_titles:
        self.__determine_size_based_level(collected_titles)

    return pdf_dic
|