magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
magic_pdf/para/denoise.py
DELETED
@@ -1,246 +0,0 @@
|
|
1
|
-
import math
|
2
|
-
|
3
|
-
from collections import defaultdict
|
4
|
-
from magic_pdf.para.commons import *
|
5
|
-
|
6
|
-
# Switch stdout to UTF-8. ``reconfigure`` only exists on real text streams, so
# skip the call when stdout has been replaced by an object that lacks it (or is
# None) instead of failing at import time.
# NOTE(review): ``sys`` is not imported explicitly in this module; presumably it
# arrives through the star import from magic_pdf.para.commons -- confirm.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
|
8
|
-
|
9
|
-
|
10
|
-
class HeaderFooterProcessor:
    """Mark blocks whose bounding box repeats at the top or bottom of pages."""

    def __init__(self) -> None:
        pass

    def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
        """
        Return the most frequently repeated bboxes in the top or bottom band.

        Parameters
        ----------
        bboxes : list
            bboxes; index 1 and index 3 are compared against the page height
            (presumably the layout is (x0, y0, x1, y1) -- confirm with caller)
        page_height : float
            height of the page
        position : str, optional
            "top" selects bboxes whose index-1 value is below
            ``page_height * threshold``; any other value selects bboxes whose
            index-3 value is above ``page_height * (1 - threshold)``,
            by default "top"
        threshold : float, optional
            fraction of the page height that forms the band, by default 0.25
        num_bboxes : int, optional
            maximum number of bboxes to return, by default 3
        min_frequency : int, optional
            minimum number of occurrences for a bbox to qualify, by default 2

        Returns
        -------
        common_bboxes : list
            bboxes as tuples, most frequent first
        """
        if position == "top":
            band = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
        else:
            band = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]

        # Lists are not hashable, so count on the tuple form of each bbox.
        bbox_count = defaultdict(int)
        for bbox in band:
            bbox_count[tuple(bbox)] += 1

        # sorted() is stable: equally frequent bboxes keep first-seen order.
        ranked = sorted(bbox_count.items(), key=lambda item: item[1], reverse=True)
        return [bbox for bbox, count in ranked if count >= min_frequency][:num_bboxes]

    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
        """
        Set ``is_header`` / ``is_footer`` (0 or 1) on every block of the document.

        Only entries whose key starts with "page_" are treated as pages, and
        only entries inside a page whose key starts with "block_" are treated
        as blocks. Blocks are modified in place. Nothing is marked when the
        document has no blocks or when more than half of the blocks look like
        single lines.

        Parameters
        ----------
        result_dict : dict
            result dictionary
        similarity_threshold : float, optional
            not used by the current implementation; kept so existing callers
            keep working, by default 0.5

        Returns
        -------
        result_dict : dict
            the same dictionary that was passed in
        """

        def iter_blocks():
            # Single place that defines which entries count as blocks.
            for page_id, blocks in result_dict.items():
                if page_id.startswith("page_"):
                    for block_key, block in blocks.items():
                        if block_key.startswith("block_"):
                            yield block

        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
            # True when every coordinate is within `tolerance` of some common bbox.
            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)

        def is_single_line_block(block):
            # Determine based on the width and height of the block
            block_width = block["X1"] - block["X0"]
            block_height = block["bbox"][3] - block["bbox"][1]

            # A block no taller than three average characters but wider than
            # three average characters is considered a single line.
            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3

        all_blocks = list(iter_blocks())

        # If there are no blocks, skip the header and footer detection
        if not all_blocks:
            print("No blocks found. Skipping header/footer detection.")
            return result_dict

        # If most of the blocks are single-line, skip the header and footer detection
        single_line_blocks = sum(1 for block in all_blocks if is_single_line_block(block))
        if single_line_blocks / len(all_blocks) > 0.5:  # 50% of the blocks are single-line
            return result_dict

        all_bboxes = [block["bbox"] for block in all_blocks]

        # The page height is approximated by the lowest bottom edge of any block.
        page_height = max(bbox[3] for bbox in all_bboxes)

        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top")
        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom")

        # Only the bbox is needed here; blocks are not required to carry text.
        for block in all_blocks:
            bbox = block["bbox"]
            block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
            block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))

        return result_dict
|
135
|
-
|
136
|
-
|
137
|
-
class NonHorizontalTextProcessor:
    """Mark repeated slanted blocks as watermarks and repeated vertical blocks as margin notes."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _block_signature(block_data):
        """Return a hashable (bbox, text) key used to match a block across pages."""
        # bbox may be a list, which cannot be part of a dict key; freeze it.
        return (tuple(block_data["bbox"]), block_data["text"])

    def detect_non_horizontal_texts(self, result_dict):
        """
        Set ``is_watermark`` and ``is_vertical_margin_note`` (0 or 1) on every block.

        Blocks that carry a "dir" vector are grouped by identical bbox and
        text. The absolute angle of the vector decides the group: strictly
        between 5 and 85 degrees counts as a potential watermark, strictly
        between 85 and 105 degrees counts as a potential vertical margin note.
        A group is accepted when it occurs more often than half the number of
        pages. Every block whose bbox and text match an accepted group is
        flagged, whether or not it has a "dir" entry itself.

        Only keys starting with "page_" are treated as pages and only keys
        starting with "block_" are treated as blocks; other entries are left
        untouched. Blocks are modified in place.

        Parameters
        ----------
        result_dict : dict
            The result dictionary.

        Returns
        -------
        result_dict : dict
            The same dictionary that was passed in.
        """
        potential_watermarks = {}
        potential_margin_notes = {}
        page_count = 0

        for page_id, page_content in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            page_count += 1
            for block_id, block_data in page_content.items():
                if not block_id.startswith("block_") or "dir" not in block_data:
                    continue

                signature = self._block_signature(block_data)
                angle = abs(math.degrees(math.atan2(block_data["dir"][1], block_data["dir"][0])))

                if 5 < angle < 85:  # slanted text: watermark candidate
                    potential_watermarks[signature] = potential_watermarks.get(signature, 0) + 1

                if 85 < angle < 105:  # vertical text: margin note candidate
                    potential_margin_notes[signature] = potential_margin_notes.get(signature, 0) + 1

        # "More than half of the pages": count pages only, not other top-level
        # entries of the dictionary.
        threshold = page_count // 2
        watermarks = {key for key, count in potential_watermarks.items() if count > threshold}
        margin_notes = {key for key, count in potential_margin_notes.items() if count > threshold}

        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_id, block_data in blocks.items():
                if not block_id.startswith("block_"):
                    continue
                signature = self._block_signature(block_data)
                block_data["is_watermark"] = 1 if signature in watermarks else 0
                block_data["is_vertical_margin_note"] = 1 if signature in margin_notes else 0

        return result_dict
|
214
|
-
|
215
|
-
|
216
|
-
class NoiseRemover:
    """Drop blocks that earlier steps have flagged as noise."""

    # Flags checked by default; a truthy value for any of them removes the block.
    NOISE_KEYS = (
        "is_overlap",
        "is_header",
        "is_footer",
        "is_watermark",
        "is_vertical_margin_note",
        "is_block_title",
    )

    def __init__(self) -> None:
        pass

    def skip_data_noises(self, result_dict, noise_keys=None):
        """
        Return a new dictionary that contains only the blocks not flagged as noise.

        Only keys starting with "page_" are treated as pages and only keys
        starting with "block_" are treated as blocks; everything else is left
        out of the result. Pages with no remaining block are left out as well.
        The block dictionaries themselves are reused, not copied.

        Parameters
        ----------
        result_dict : dict
            result dictionary
        noise_keys : iterable of str, optional
            flag names to test on each block; a missing flag counts as 0.
            By default ``NOISE_KEYS`` (overlap, header, footer, watermark,
            vertical margin note, block title)

        Returns
        -------
        filtered_result_dict : dict
            page id -> {block id -> block} for the blocks that were kept
        """
        keys = self.NOISE_KEYS if noise_keys is None else tuple(noise_keys)

        filtered_result_dict = {}
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue

            kept = {
                block_id: block
                for block_id, block in blocks.items()
                if block_id.startswith("block_") and not any(block.get(key, 0) for key in keys)
            }
            if kept:
                filtered_result_dict[page_id] = kept

        return filtered_result_dict
|
magic_pdf/para/draw.py
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
from magic_pdf.libs.commons import fitz
|
2
|
-
|
3
|
-
from magic_pdf.para.commons import *
|
4
|
-
|
5
|
-
|
6
|
-
# Switch stdout to UTF-8. ``reconfigure`` only exists on real text streams, so
# skip the call when stdout has been replaced by an object that lacks it (or is
# None) instead of failing at import time.
# NOTE(review): ``sys`` is not imported explicitly in this module; presumably it
# arrives through the star import from magic_pdf.para.commons -- confirm.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
|
8
|
-
|
9
|
-
|
10
|
-
class DrawAnnos:
    """
    This class draws annotations on the pdf file

    ----------------------------------------
                Color Code
    ----------------------------------------
        Red: (1, 0, 0)
        Green: (0, 1, 0)
        Blue: (0, 0, 1)
        Yellow: (1, 1, 0) - mix of red and green
        Cyan: (0, 1, 1) - mix of green and blue
        Magenta: (1, 0, 1) - mix of red and blue
        White: (1, 1, 1) - red, green and blue full intensity
        Black: (0, 0, 0) - no color component whatsoever
        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _default_output_path(input_pdf_path):
        """
        Build the default output path by inserting "_anno" before the extension.

        Only the final extension is touched, so a ".pdf" elsewhere in the path
        is left alone, and the result never equals the input path. A path
        without an extension gets "_anno.pdf" appended.
        """
        import os

        root, ext = os.path.splitext(input_pdf_path)
        return f"{root}_anno{ext or '.pdf'}"

    def __is_nested_list(self, lst):
        """
        This function returns True if the given list contains at least one list.
        """
        return isinstance(lst, list) and any(isinstance(i, list) for i in lst)

    def __valid_rect(self, bbox):
        """
        Return True when bbox is a flat box with x0 < x1 and y0 < y1.

        Nested lists and sequences with fewer than four entries are rejected
        instead of raising.
        """
        if len(bbox) < 4 or isinstance(bbox[0], list):
            return False
        return bbox[0] < bbox[2] and bbox[1] < bbox[3]

    def __draw_rect(self, page, bbox, color, width):
        """Add one rectangle annotation with the given stroke color and border width."""
        anno = page.add_rect_annot(fitz.Rect(bbox))
        anno.set_colors(stroke=color)
        anno.set_border(width=width)
        anno.update()

    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
        """
        This function draws the nested boxes

        Parameters
        ----------
        page : fitz.Page
            page
        nested_bbox : list
            nested bbox
        color : tuple
            color, by default (0, 1, 1)    # cyan, used for combined paragraphs
        """
        if self.__is_nested_list(nested_bbox):
            for bbox in nested_bbox:
                self.__draw_nested_boxes(page, bbox, color)  # descend one level
        elif self.__valid_rect(nested_bbox):
            self.__draw_rect(page, nested_bbox, color, width=1)

    def __draw_para_bbox(self, page, para_bbox, combined_color, single_color):
        """Draw a paragraph bbox: several boxes count as combined, one flat box as single."""
        if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
            self.__draw_nested_boxes(page, para_bbox, combined_color)
        elif self.__valid_rect(para_bbox):
            self.__draw_rect(page, para_bbox, single_color, width=0.5)

    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
        """
        Draw paragraph and title boxes on a copy of the pdf.

        Combined paragraphs are drawn in cyan and single-box paragraphs in
        green. Paragraphs flagged ``is_para_title`` are drawn again in blue on
        top of that.

        Parameters
        ----------
        input_pdf_path : str
            path of the pdf to annotate
        pdf_dic : dict or None
            maps "page_<index>" to page data holding "para_blocks"; pages
            that are missing from it are left without annotations
        output_pdf_path : str or None
            where to save the result; when None the input path with "_anno"
            inserted before the extension is used
        """
        if pdf_dic is None:
            pdf_dic = {}

        if output_pdf_path is None:
            output_pdf_path = self._default_output_path(input_pdf_path)

        pdf_doc = open_pdf(input_pdf_path)
        try:
            for page_id, page in enumerate(pdf_doc):  # type: ignore
                # Pages without an entry simply get no annotations.
                page_data = pdf_dic.get(f"page_{page_id}") or {}
                for para_block in page_data.get("para_blocks") or []:
                    if "paras" not in para_block:
                        continue
                    for para_content in para_block["paras"].values():
                        para_bbox = para_content["para_bbox"]

                        # cyan for combined paragraph, green for normal paragraph
                        self.__draw_para_bbox(page, para_bbox, (0, 1, 1), (0, 1, 0))

                        # blue for both combined and normal titles
                        if para_content.get("is_para_title"):
                            self.__draw_para_bbox(page, para_bbox, (0, 0, 1), (0, 0, 1))

            pdf_doc.save(output_pdf_path)
        finally:
            # Release the document even when drawing or saving fails.
            pdf_doc.close()
|
magic_pdf/para/exceptions.py
DELETED
@@ -1,198 +0,0 @@
|
|
1
|
-
class ParaException(Exception):
    """
    Common base of the paragraph-processing exceptions.

    Stores the message on ``self.message`` and uses it verbatim for both
    ``str()`` and ``repr()``.
    """

    def __init__(self, message="ParaException"):
        self.message = message
        super().__init__(self.message)

    def __str__(self):
        return f"{self.message}"

    def __repr__(self):
        return f"{self.message}"


class DenseSingleLineBlockException(ParaException):
    """
    This class defines the exception type for dense single line-block.
    """

    def __init__(self, message="DenseSingleLineBlockException"):
        super().__init__(message)


class TitleDetectionException(ParaException):
    """
    This class defines the exception type for title detection.
    """

    def __init__(self, message="TitleDetectionException"):
        super().__init__(message)


class TitleLevelException(ParaException):
    """
    This class defines the exception type for title level.
    """

    def __init__(self, message="TitleLevelException"):
        super().__init__(message)


class ParaSplitException(ParaException):
    """
    This class defines the exception type for paragraph splitting.
    """

    def __init__(self, message="ParaSplitException"):
        super().__init__(message)


class ParaMergeException(ParaException):
    """
    This class defines the exception type for paragraph merging.
    """

    def __init__(self, message="ParaMergeException"):
        super().__init__(message)
|
79
|
-
|
80
|
-
|
81
|
-
class DiscardByException:
    """
    This class discards pdf files by exception

    Each ``discard_by_*`` method returns the message of the given exception
    when the pdf should be discarded and None otherwise. Only the single line
    block check is active; the other checks always return None.
    """

    def __init__(self) -> None:
        pass

    def discard_by_single_line_block(
        self,
        pdf_dic,
        exception: "DenseSingleLineBlockException",
        block_ratio_threshold=0.9,
        page_ratio_threshold=0.1,
    ):
        """
        This function discards pdf files by single line block exception

        A page counts as dense when more than ``block_ratio_threshold`` of its
        "preproc_blocks" have exactly one line. The pdf is discarded when more
        than ``page_ratio_threshold`` of its pages are dense. Pages are the
        entries whose key starts with "page_"; pages without blocks still
        count towards the page total.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : DenseSingleLineBlockException
            exception whose ``message`` is returned on discard
        block_ratio_threshold : float, optional
            share of single-line blocks above which a page is dense, by default 0.9
        page_ratio_threshold : float, optional
            share of dense pages above which the pdf is discarded, by default 0.1

        Returns
        -------
        error_message : str or None
            ``exception.message`` when the pdf is discarded, otherwise None
        """
        page_num = 0
        exception_page_nums = 0

        for page_id, page in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue
            page_num += 1

            # Missing, None and empty block lists all mean "nothing to judge".
            preproc_blocks = page.get("preproc_blocks")
            if not preproc_blocks:
                continue

            single_line_count = sum(1 for block in preproc_blocks if len(block["lines"]) == 1)
            if single_line_count / len(preproc_blocks) > block_ratio_threshold:
                exception_page_nums += 1

        if page_num == 0:
            return None

        # The default ratio is low on purpose: a few dense pages are enough to discard.
        if exception_page_nums / page_num > page_ratio_threshold:
            return exception.message

        return None

    def discard_by_title_detection(self, pdf_dic, exception: "TitleDetectionException"):
        """
        This function discards pdf files by title detection exception

        The check is currently disabled, so nothing is ever discarded here.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary (unused)
        exception : TitleDetectionException
            exception carrying the message (unused)

        Returns
        -------
        error_message : None
        """
        return None

    def discard_by_title_level(self, pdf_dic, exception: "TitleLevelException"):
        """
        This function discards pdf files by title level exception

        The check is currently disabled, so nothing is ever discarded here.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary (unused)
        exception : TitleLevelException
            exception carrying the message (unused)

        Returns
        -------
        error_message : None
        """
        return None

    def discard_by_split_para(self, pdf_dic, exception: "ParaSplitException"):
        """
        This function discards pdf files by split para exception

        The check is currently disabled, so nothing is ever discarded here.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary (unused)
        exception : ParaSplitException
            exception carrying the message (unused)

        Returns
        -------
        error_message : None
        """
        return None

    def discard_by_merge_para(self, pdf_dic, exception: "ParaMergeException"):
        """
        This function discards pdf files by merge para exception

        The check is currently disabled, so nothing is ever discarded here.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary (unused)
        exception : ParaMergeException
            exception carrying the message (unused)

        Returns
        -------
        error_message : None
        """
        return None
|
magic_pdf/para/layout_match_processor.py
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
import math
|
2
|
-
from magic_pdf.para.commons import *
|
3
|
-
|
4
|
-
|
5
|
-
# Switch stdout to UTF-8. ``reconfigure`` only exists on real text streams, so
# skip the call when stdout has been replaced by an object that lacks it (or is
# None) instead of failing at import time.
# NOTE(review): ``sys`` is not imported explicitly in this module; presumably it
# arrives through the star import from magic_pdf.para.commons -- confirm.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
|
7
|
-
|
8
|
-
|
9
|
-
class LayoutFilterProcessor:
    """Flag each paragraph block with whether it lies inside one of the page layout boxes."""

    def __init__(self) -> None:
        pass

    def batch_process_blocks(self, pdf_dict):
        """
        Set ``in_layout`` (0 or 1) on the paragraph blocks of every page.

        A block gets 1 when ``is_in_bbox`` accepts it for at least one layout
        box of its page and 0 otherwise, including when the page has no layout
        boxes at all. Pages are the entries whose key starts with "page_".
        Pages that lack "layout_bboxes" or "para_blocks", or hold None for
        either, are left untouched. Blocks are modified in place.

        Parameters
        ----------
        pdf_dict : dict
            pdf dictionary

        Returns
        -------
        pdf_dict : dict
            the same dictionary that was passed in
        """
        for page_id, blocks in pdf_dict.items():
            if not page_id.startswith("page_"):
                continue
            if "layout_bboxes" not in blocks or "para_blocks" not in blocks:
                continue

            layout_bbox_objs = blocks["layout_bboxes"]
            para_blocks = blocks["para_blocks"]
            if layout_bbox_objs is None or para_blocks is None:
                continue

            # Round every coordinate up to the next integer.
            # NOTE(review): rounding x0/y0 up moves the top-left corner inwards,
            # so this does not strictly enlarge the box -- confirm the intent.
            layout_bboxes = [
                [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)]
                for x0, y0, x1, y1 in (bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs)
            ]

            # Decide once per block across all layout boxes; resetting the flag
            # for every layout box would let only the last box count.
            for para_block in para_blocks:
                para_bbox = para_block["bbox"]
                inside = any(is_in_bbox(para_bbox, lb_bbox) for lb_bbox in layout_bboxes)
                para_block["in_layout"] = 1 if inside else 0

        return pdf_dict
|