magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,480 +0,0 @@
|
|
1
|
-
from magic_pdf.para.commons import *
|
2
|
-
|
3
|
-
|
4
|
-
# Switch stdout to UTF-8 at import time.
# The hasattr guard keeps the import from failing when sys.stdout is None or
# has been replaced by a stream object that does not implement reconfigure().
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
class BlockTerminationProcessor:
    """
    Cut the lines of every block into paragraphs using layout heuristics.

    The entry point is ``batch_process_blocks``; the remaining methods are
    private helpers that score a single line against its neighbours.
    """

    def __init__(self) -> None:
        pass

    def _is_consistent_lines(
        self,
        curr_line,
        prev_line,
        next_line,
        consistent_direction,  # 0 for prev, 1 for next, 2 for both
    ):
        """
        Check whether a line shares font size and font name with its neighbours.

        Only the first span of each line is inspected; font names are compared
        case-insensitively.

        Parameters
        ----------
        curr_line : dict
            current line
        prev_line : dict
            previous line (may be None)
        next_line : dict
            next line (may be None)
        consistent_direction : int
            0 for prev, 1 for next, 2 for both

        Returns
        -------
        bool
            True if the line is consistent with the requested neighbours.
            False when a requested neighbour is missing or the direction is
            not 0, 1 or 2.
        """

        def _style(line):
            first_span = line["spans"][0]
            return first_span["size"], first_span["font"].lower()

        curr_size, curr_font = _style(curr_line)

        def _matches(neighbor):
            if not neighbor:
                return False
            size, font = _style(neighbor)
            return curr_size == size and curr_font == font

        if consistent_direction == 0:
            return _matches(prev_line)
        if consistent_direction == 1:
            return _matches(next_line)
        if consistent_direction == 2:
            return _matches(prev_line) and _matches(next_line)
        return False

    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
        """
        Check whether the line is a "regular" line.

        Parameters
        ----------
        curr_line_bbox : list
            bbox of the current line
        prev_line_bbox : list
            bbox of the previous line (may be None)
        next_line_bbox : list
            bbox of the next line (may be None)
        avg_char_width : float
            average of char widths
        X0 : float
            median of x0 values, which represents the left average boundary of the page
        X1 : float
            median of x1 values, which represents the right average boundary of the page
        avg_line_height : float
            average of line heights

        Returns
        -------
        bool
            True if any of these holds: the vertical distance to a neighbour
            exceeds half the average line height, neither horizontal edge is
            within half a character of X0 / X1, or the previous line stops
            more than one character width away from X1.
        """
        horizontal_thres = 0.5 * avg_char_width
        vertical_thres = 0.5 * avg_line_height

        x0, y0, x1, y1 = curr_line_bbox

        x0_near_X0 = abs(x0 - X0) < horizontal_thres
        x1_near_X1 = abs(x1 - X1) < horizontal_thres

        has_prev = bool(prev_line_bbox)
        has_next = bool(next_line_bbox)

        prev_line_is_end_of_para = has_prev and abs(prev_line_bbox[2] - X1) > avg_char_width

        # Distances are taken between matching edges of the two boxes
        # (y1 to y1 above, y0 to y0 below), not the gap between the boxes.
        sufficient_spacing_above = has_prev and (y1 - prev_line_bbox[3]) > vertical_thres
        sufficient_spacing_below = has_next and (next_line_bbox[1] - y0) > vertical_thres

        # bool() so the documented return type holds even without a previous line.
        return bool(
            sufficient_spacing_above
            or sufficient_spacing_below
            or (not x0_near_X0 and not x1_near_X1)
            or prev_line_is_end_of_para
        )

    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
        """
        Check whether the line is a possible start of a paragraph.

        Parameters
        ----------
        curr_line : dict
            current line
        prev_line : dict
            previous line (None for the first line)
        next_line : dict
            next line (None for the last line)
        X0 : float
            median of x0 values, which represents the left average boundary of the page
        X1 : float
            median of x1 values, which represents the right average boundary of the page
        avg_char_width : float
            average of char widths
        avg_font_size : float
            average font size; 1.5 times this value is the vertical spacing threshold

        Returns
        -------
        tuple
            (is_start_of_para, start_confidence, decision_path) where
            is_start_of_para is a bool, start_confidence is a float that
            starts at 0.5 and grows with every satisfied condition, and
            decision_path lists the names of the satisfied conditions.
        """
        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
        decision_path = []  # Record the decision path

        curr_line_bbox = curr_line["bbox"]
        prev_line_bbox = prev_line["bbox"] if prev_line else None
        next_line_bbox = next_line["bbox"] if next_line else None

        indent_thres = 1 * avg_char_width
        vertical_thres = 1.5 * avg_font_size
        left_horizontal_thres = 0.5 * avg_char_width
        right_horizontal_thres = 2.5 * avg_char_width

        x0, y0, x1, y1 = curr_line_bbox

        indent_condition = x0 > X0 + indent_thres
        if indent_condition:
            start_confidence += 0.2
            decision_path.append("indent_condition_met")

        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
        if x0_near_X0:
            start_confidence += 0.1
            decision_path.append("x0_near_X0")

        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
        if x1_near_X1:
            start_confidence += 0.1
            decision_path.append("x1_near_X1")

        if prev_line is None:
            prev_line_is_end_of_para = True
            start_confidence += 0.2
            decision_path.append("no_prev_line")
        else:
            # The line that follows prev_line is curr_line, so curr_line is
            # the successor to hand over (the code used to pass next_line).
            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(
                prev_line, curr_line, X0, X1, avg_char_width
            )
            if prev_line_is_end_of_para:
                start_confidence += 0.1
                decision_path.append("prev_line_is_end_of_para")

        sufficient_spacing_above = False
        if prev_line_bbox:
            sufficient_spacing_above = (y1 - prev_line_bbox[3]) > vertical_thres
            if sufficient_spacing_above:
                start_confidence += 0.2
                decision_path.append("sufficient_spacing_above")

        sufficient_spacing_below = False
        if next_line_bbox:
            sufficient_spacing_below = (next_line_bbox[1] - y0) > vertical_thres
            if sufficient_spacing_below:
                start_confidence += 0.2
                decision_path.append("sufficient_spacing_below")

        # avg_font_size is passed where _is_regular_line expects an average line height.
        is_regular_line = self._is_regular_line(
            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
        )
        if is_regular_line:
            start_confidence += 0.1
            decision_path.append("is_regular_line")

        is_start_of_para = bool(
            sufficient_spacing_above
            or sufficient_spacing_below
            or indent_condition
            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
            or prev_line_is_end_of_para
        )
        return (is_start_of_para, start_confidence, decision_path)

    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
        """
        Check whether the line is a possible end of a paragraph.

        Parameters
        ----------
        curr_line : dict
            current line; its "bbox" and "text" entries are read
        next_line : dict
            next line (None for the last line)
        X0 : float
            median of x0 values, which represents the left average boundary of the page
        X1 : float
            median of x1 values, which represents the right average boundary of the page
        avg_char_width : float
            average of char widths

        Returns
        -------
        tuple
            (is_end_of_para, end_confidence, decision_path) where
            is_end_of_para is truthy only when the text ends with sentence
            punctuation and the line geometry also suggests an ending,
            end_confidence is a float that starts at 0.5, and decision_path
            lists the names of the satisfied conditions.
        """
        end_confidence = 0.5  # Initial confidence of the line being a end of a paragraph
        decision_path = []  # Record the decision path

        curr_line_bbox = curr_line["bbox"]
        next_line_bbox = next_line["bbox"] if next_line else None

        left_horizontal_thres = 0.5 * avg_char_width
        right_horizontal_thres = 0.5 * avg_char_width

        x0, _, x1, _ = curr_line_bbox
        next_x0 = next_line_bbox[0] if next_line_bbox else 0

        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
        if x0_near_X0:
            end_confidence += 0.1
            decision_path.append("x0_near_X0")

        x1_smaller_than_X1 = x1 < X1 - right_horizontal_thres
        if x1_smaller_than_X1:
            end_confidence += 0.1
            decision_path.append("x1_smaller_than_X1")

        next_line_is_start_of_para = bool(
            next_line_bbox
            and (next_x0 > X0 + left_horizontal_thres)
            and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
        )
        if next_line_is_start_of_para:
            end_confidence += 0.2
            decision_path.append("next_line_is_start_of_para")

        # NOTE(review): both helpers below are called without a previous bbox and
        # without `direction`, so they run in their default "both neighbours"
        # mode -- confirm whether direction=1 was intended here.
        is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
            curr_line_bbox, None, next_line_bbox, avg_char_width
        )
        if is_line_left_aligned_from_neighbors_bool:
            end_confidence += 0.1
            decision_path.append("line_is_left_aligned_from_neighbors")

        is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
            curr_line_bbox, None, next_line_bbox, avg_char_width
        )
        if not is_line_right_aligned_from_neighbors_bool:
            end_confidence += 0.1
            decision_path.append("line_is_not_right_aligned_from_neighbors")

        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
            (x0_near_X0 and x1_smaller_than_X1)
            or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
        )

        return (is_end_of_para, end_confidence, decision_path)

    def _cut_paras_per_block(
        self,
        block,
    ):
        """
        Segment the lines of one block into paragraphs.

        Parameters
        ----------
        block : dict
            A block with a "lines" list. The optional entries "X0", "X1",
            "avg_char_width", "avg_font_size", "is_block_title" and
            "block_title_level" are read with defaults of 0 / False.

        Returns
        -------
        block : dict
            The same dict, with a new "paras" entry mapping "para_<n>" keys
            to paragraph property dicts.
        """

        def _construct_para(lines, is_block_title, para_title_level):
            """
            Construct a paragraph from given lines.
            """
            spans = [span for line in lines for span in line["spans"]]

            font_sizes = [span["size"] for span in spans]
            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0

            font_colors = [span["color"] for span in spans]
            most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None

            # Weight each font by the total bbox width of the spans using it.
            font_type_lengths = {}
            for span in spans:
                bbox_width = span["bbox"][2] - span["bbox"][0]
                font_type_lengths[span["font"]] = font_type_lengths.get(span["font"], 0) + bbox_width

            # get the font type with the longest bbox width
            most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore

            return {
                "para_bbox": calculate_para_bbox(lines),
                "para_text": " ".join(line["text"] for line in lines),
                "para_font_type": most_common_font_type,
                "para_font_size": avg_font_size,
                "para_font_color": most_common_font_color,
                "is_para_title": is_block_title,
                "para_title_level": para_title_level,
            }

        block_lines = block["lines"]

        X0 = safe_get(block, "X0", 0)
        X1 = safe_get(block, "X1", 0)
        avg_char_width = safe_get(block, "avg_char_width", 0)
        avg_font_size = safe_get(block, "avg_font_size", 0)

        is_block_title = safe_get(block, "is_block_title", False)
        para_title_level = safe_get(block, "block_title_level", 0)

        # Segment into paragraphs: collect inclusive (start, end) line index pairs.
        para_ranges = []
        start_idx_of_para = None  # None means "not inside a paragraph"
        last_line_index = len(block_lines) - 1

        for line_index, curr_line in enumerate(block_lines):
            prev_line = block_lines[line_index - 1] if line_index > 0 else None
            next_line = block_lines[line_index + 1] if line_index < last_line_index else None

            # Check if the line is the start of a paragraph
            if start_idx_of_para is None:
                is_start_of_para, _, _ = self._is_possible_start_of_para(
                    curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
                )
                if is_start_of_para:
                    start_idx_of_para = line_index

            # Check if the line is the end of a paragraph. The last line always
            # closes an open paragraph, so nothing is left open after the loop.
            if start_idx_of_para is not None:
                is_end_of_para, _, _ = self._is_possible_end_of_para(curr_line, next_line, X0, X1, avg_char_width)
                if is_end_of_para or not next_line:
                    para_ranges.append((start_idx_of_para, line_index))
                    start_idx_of_para = None

        # NOTE(review): lines that lie between two detected paragraphs and were
        # not recognised as a paragraph start are not part of any range and are
        # therefore missing from "paras"; only trailing lines are recovered below.
        processed_paras = {}
        end_idx_of_para = 0

        # Process the matched paragraphs
        for start_idx, end_idx in para_ranges:
            matched_lines = block_lines[start_idx : end_idx + 1]
            processed_paras[f"para_{len(processed_paras)}"] = _construct_para(
                matched_lines, is_block_title, para_title_level
            )
            end_idx_of_para = end_idx + 1

        # Deal with the remaining lines
        if end_idx_of_para < len(block_lines):
            unmatched_lines = block_lines[end_idx_of_para:]
            processed_paras[f"para_{len(processed_paras)}"] = _construct_para(
                unmatched_lines, is_block_title, para_title_level
            )

        block["paras"] = processed_paras

        return block

    def batch_process_blocks(self, pdf_dict):
        """
        Cut the para blocks of all pages into paragraphs.

        Parameters
        ----------
        pdf_dict : dict
            PDF dictionary. Entries whose key starts with "page_" are
            treated as pages; their "para_blocks" list is replaced by the
            processed blocks (an empty list when the page had none).

        Returns
        -------
        pdf_dict : dict
            The same dictionary, with the total paragraph count stored in
            pdf_dict["statistics"]["num_paras"]. The "statistics" entry is
            created when it does not exist yet.
        """
        num_paras = 0

        for page_id, page in pdf_dict.items():
            if not page_id.startswith("page_"):
                continue

            para_blocks = []
            if "para_blocks" in page:
                for input_block in page["para_blocks"]:
                    new_block = self._cut_paras_per_block(input_block)
                    para_blocks.append(new_block)
                    num_paras += len(new_block["paras"])

            page["para_blocks"] = para_blocks

        # setdefault avoids a KeyError for dictionaries without statistics.
        pdf_dict.setdefault("statistics", {})["num_paras"] = num_paras
        return pdf_dict
|
magic_pdf/para/commons.py
DELETED
@@ -1,222 +0,0 @@
|
|
1
|
-
import sys
|
2
|
-
|
3
|
-
from magic_pdf.libs.commons import fitz
|
4
|
-
from termcolor import cprint
|
5
|
-
|
6
|
-
|
7
|
-
# Switch stdout to UTF-8 at import time.
# The hasattr guard keeps the import from failing when sys.stdout is None or
# has been replaced by a stream object that does not implement reconfigure().
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
|
9
|
-
|
10
|
-
|
11
|
-
def open_pdf(pdf_path):
    """
    Open the PDF at ``pdf_path`` with fitz and return the document.

    Any exception raised while opening is printed to stdout together with
    the path and then re-raised to the caller.
    """
    try:
        return fitz.open(pdf_path)  # type: ignore
    except Exception as e:
        print(f"无法打开PDF文件:{pdf_path}。原因是:{e}")
        raise e
|
18
|
-
|
19
|
-
|
20
|
-
def print_green_on_red(text):
    """Print ``text`` in bold green on a red background, followed by an empty line."""
    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
|
22
|
-
|
23
|
-
|
24
|
-
def print_green(text):
    """Print an empty line, then ``text`` in bold green followed by an empty line."""
    print()
    cprint(text, color="green", attrs=["bold"], end="\n\n")
|
27
|
-
|
28
|
-
|
29
|
-
def print_red(text):
    """Print an empty line, then ``text`` in bold red followed by an empty line."""
    print()
    cprint(text, color="red", attrs=["bold"], end="\n\n")
|
32
|
-
|
33
|
-
|
34
|
-
def print_yellow(text):
    """Print an empty line, then ``text`` in bold yellow followed by an empty line."""
    print()
    cprint(text, color="yellow", attrs=["bold"], end="\n\n")
|
37
|
-
|
38
|
-
|
39
|
-
def safe_get(dict_obj, key, default):
    """
    Look up ``key`` in ``dict_obj``, falling back to ``default``.

    The fallback is used both when the key is absent and when it is present
    with the value ``None``; other falsy values (0, "", False) are returned
    unchanged.
    """
    found = dict_obj.get(key)
    return default if found is None else found
|
45
|
-
|
46
|
-
|
47
|
-
def is_bbox_overlap(bbox1, bbox2):
    """
    Tell whether two boxes overlap.

    Boxes that merely share an edge or a corner count as overlapping,
    because only strict ``>`` comparisons separate them.

    Parameters
    ----------
    bbox1 : list
        first box as [x0, y0, x1, y1]
    bbox2 : list
        second box as [x0, y0, x1, y1]

    Returns
    -------
    bool
        True if bbox1 and bbox2 overlap, else False
    """
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2

    # One box lies entirely to the side of the other.
    apart_in_x = a_x0 > b_x1 or b_x0 > a_x1
    if apart_in_x:
        return False

    # Otherwise they overlap unless the y ranges are disjoint.
    apart_in_y = a_y0 > b_y1 or b_y0 > a_y1
    return not apart_in_y
|
72
|
-
|
73
|
-
|
74
|
-
def is_in_bbox(bbox1, bbox2):
    """
    Tell whether bbox1 lies inside bbox2.

    The comparison is inclusive, so a box is inside an identical box.

    Parameters
    ----------
    bbox1 : list
        candidate inner box as [x0, y0, x1, y1]
    bbox2 : list
        candidate outer box as [x0, y0, x1, y1]

    Returns
    -------
    bool
        True if bbox1 is in bbox2, else False
    """
    inner_x0, inner_y0, inner_x1, inner_y1 = bbox1
    outer_x0, outer_y0, outer_x1, outer_y1 = bbox2

    return bool(
        inner_x0 >= outer_x0
        and inner_y0 >= outer_y0
        and inner_x1 <= outer_x1
        and inner_y1 <= outer_y1
    )
|
97
|
-
|
98
|
-
|
99
|
-
def calculate_para_bbox(lines):
    """
    Compute the smallest box that encloses every line's "bbox".

    Parameters
    ----------
    lines : list
        line dicts, each with a "bbox" entry of the form [x0, y0, x1, y1]

    Returns
    -------
    para_bbox : list
        [min x0, min y0, max x1, max y1] over all lines
    """

    def extreme(pick, position):
        # pick is min or max; position selects the bbox coordinate.
        return pick(line["bbox"][position] for line in lines)

    return [extreme(min, 0), extreme(min, 1), extreme(max, 2), extreme(max, 3)]
|
118
|
-
|
119
|
-
|
120
|
-
def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """
    Check whether the right edge (x1) of a line matches that of its neighbours.

    Two edges match when they differ by less than half of ``avg_char_width``.

    NOTE(review): a missing (falsy) neighbour bbox is replaced by
    (0, 0, 0, 0), so the current line is then compared against x1 = 0 --
    confirm that callers really want this for absent neighbours.

    Parameters
    ----------
    curr_line_bbox : list
        bbox of the current line
    prev_line_bbox : list
        bbox of the previous line
    next_line_bbox : list
        bbox of the next line
    avg_char_width : float
        average of char widths
    direction : int
        0 for prev, 1 for next, 2 for both

    Returns
    -------
    bool
        True if the line is right aligned with the requested neighbours,
        False otherwise (including any other ``direction`` value).
    """
    tolerance = 0.5 * avg_char_width
    placeholder = (0, 0, 0, 0)

    _, _, curr_right, _ = curr_line_bbox
    _, _, prev_right, _ = prev_line_bbox or placeholder
    _, _, next_right, _ = next_line_bbox or placeholder

    def lines_up_with(other_right):
        return abs(curr_right - other_right) < tolerance

    if direction == 0:
        return lines_up_with(prev_right)
    if direction == 1:
        return lines_up_with(next_right)
    if direction == 2:
        return lines_up_with(prev_right) and lines_up_with(next_right)
    return False
|
157
|
-
|
158
|
-
|
159
|
-
def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """
    Check whether the left edge (x0) of a line matches that of its neighbours.

    Two edges match when they differ by less than half of ``avg_char_width``.

    NOTE(review): a missing (falsy) neighbour bbox is replaced by
    (0, 0, 0, 0), so the current line is then compared against x0 = 0 --
    confirm that callers really want this for absent neighbours.

    Parameters
    ----------
    curr_line_bbox : list
        bbox of the current line
    prev_line_bbox : list
        bbox of the previous line
    next_line_bbox : list
        bbox of the next line
    avg_char_width : float
        average of char widths
    direction : int
        0 for prev, 1 for next, 2 for both

    Returns
    -------
    bool
        True if the line is left aligned with the requested neighbours,
        False otherwise (including any other ``direction`` value).
    """
    tolerance = 0.5 * avg_char_width
    placeholder = (0, 0, 0, 0)

    curr_left, _, _, _ = curr_line_bbox
    prev_left, _, _, _ = prev_line_bbox or placeholder
    next_left, _, _, _ = next_line_bbox or placeholder

    def lines_up_with(other_left):
        return abs(curr_left - other_left) < tolerance

    if direction == 0:
        return lines_up_with(prev_left)
    if direction == 1:
        return lines_up_with(next_left)
    if direction == 2:
        return lines_up_with(prev_left) and lines_up_with(next_left)
    return False
|
196
|
-
|
197
|
-
|
198
|
-
def end_with_punctuation(line_text):
    """
    Check whether the line ends with a sentence-ending punctuation mark.

    Trailing whitespace is ignored. Both ASCII marks and their CJK
    counterparts are recognised.

    Parameters
    ----------
    line_text : str
        text of the line

    Returns
    -------
    bool
        True if the last non-whitespace character is one of the end marks,
        False otherwise (including empty or whitespace-only text).
    """
    end_puncs = (
        ".",
        "?",
        "!",
        "\u3002",  # ideographic full stop
        "\uff1f",  # fullwidth question mark
        "\uff01",  # fullwidth exclamation mark
    )

    # str.endswith accepts a tuple of suffixes, so one call covers every mark.
    return line_text.rstrip().endswith(end_puncs)
|
217
|
-
|
218
|
-
|
219
|
-
def is_nested_list(lst):
    """
    Tell whether ``lst`` is a list that directly contains at least one list.

    Anything that is not a ``list`` instance (tuples, strings, None, ...)
    yields False, as does an empty list.
    """
    return isinstance(lst, list) and any(isinstance(item, list) for item in lst)
|