magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. magic_pdf/filter/pdf_meta_scan.py +3 -17
  2. magic_pdf/libs/commons.py +0 -161
  3. magic_pdf/libs/draw_bbox.py +2 -3
  4. magic_pdf/libs/markdown_utils.py +0 -21
  5. magic_pdf/libs/pdf_image_tools.py +2 -1
  6. magic_pdf/libs/version.py +1 -1
  7. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  8. magic_pdf/model/magic_model.py +0 -30
  9. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  11. magic_pdf/para/para_split_v3.py +7 -2
  12. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  13. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  14. magic_pdf/pre_proc/cut_image.py +0 -37
  15. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  16. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  17. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  18. magic_pdf/rw/S3ReaderWriter.py +1 -1
  19. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  20. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
  21. magic_pdf/dict2md/mkcontent.py +0 -438
  22. magic_pdf/layout/__init__.py +0 -0
  23. magic_pdf/layout/bbox_sort.py +0 -681
  24. magic_pdf/layout/layout_det_utils.py +0 -182
  25. magic_pdf/layout/layout_sort.py +0 -921
  26. magic_pdf/layout/layout_spiler_recog.py +0 -101
  27. magic_pdf/layout/mcol_sort.py +0 -336
  28. magic_pdf/libs/calc_span_stats.py +0 -239
  29. magic_pdf/libs/detect_language_from_model.py +0 -21
  30. magic_pdf/libs/nlp_utils.py +0 -203
  31. magic_pdf/libs/textbase.py +0 -33
  32. magic_pdf/libs/vis_utils.py +0 -308
  33. magic_pdf/para/block_continuation_processor.py +0 -562
  34. magic_pdf/para/block_termination_processor.py +0 -480
  35. magic_pdf/para/commons.py +0 -222
  36. magic_pdf/para/denoise.py +0 -246
  37. magic_pdf/para/draw.py +0 -121
  38. magic_pdf/para/exceptions.py +0 -198
  39. magic_pdf/para/layout_match_processor.py +0 -40
  40. magic_pdf/para/para_split.py +0 -807
  41. magic_pdf/para/para_split_v2.py +0 -959
  42. magic_pdf/para/raw_processor.py +0 -207
  43. magic_pdf/para/stats.py +0 -268
  44. magic_pdf/para/title_processor.py +0 -1014
  45. magic_pdf/pdf_parse_union_core.py +0 -345
  46. magic_pdf/post_proc/__init__.py +0 -0
  47. magic_pdf/post_proc/detect_para.py +0 -3472
  48. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  49. magic_pdf/post_proc/remove_footnote.py +0 -153
  50. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  51. magic_pdf/pre_proc/detect_equation.py +0 -134
  52. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  53. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  54. magic_pdf/pre_proc/detect_footnote.py +0 -170
  55. magic_pdf/pre_proc/detect_header.py +0 -64
  56. magic_pdf/pre_proc/detect_images.py +0 -647
  57. magic_pdf/pre_proc/detect_page_number.py +0 -64
  58. magic_pdf/pre_proc/detect_tables.py +0 -62
  59. magic_pdf/pre_proc/equations_replace.py +0 -550
  60. magic_pdf/pre_proc/fix_image.py +0 -244
  61. magic_pdf/pre_proc/fix_table.py +0 -270
  62. magic_pdf/pre_proc/main_text_font.py +0 -23
  63. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  64. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  65. magic_pdf/pre_proc/post_layout_split.py +0 -0
  66. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  67. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  68. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  69. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  70. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  71. magic_pdf/pre_proc/statistics.py +0 -12
  72. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  73. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
  74. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  75. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
magic_pdf/para/denoise.py DELETED
@@ -1,246 +0,0 @@
1
- import math
2
-
3
- from collections import defaultdict
4
- from magic_pdf.para.commons import *
5
-
6
# Best-effort: make stdout emit UTF-8. ``reconfigure`` only exists on
# ``io.TextIOWrapper`` (Python 3.7+); when stdout has been replaced by another
# stream object the attribute is missing, so guard the call instead of
# failing at import time.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
8
-
9
-
10
class HeaderFooterProcessor:
    """Flag blocks whose bounding box repeats near the top or bottom of pages."""

    def __init__(self) -> None:
        pass

    def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
        """
        Return the most frequently repeated bboxes inside the top or bottom band of the page.

        Parameters
        ----------
        bboxes : list
            bboxes, each indexable as (x0, y0, x1, y1)
        page_height : float
            height of the page
        position : str, optional
            "top" selects bboxes whose y0 lies above ``page_height * threshold``;
            any other value selects bboxes whose y1 lies below
            ``page_height * (1 - threshold)``, by default "top"
        threshold : float, optional
            fraction of the page height that forms the band, by default 0.25
        num_bboxes : int, optional
            maximum number of bboxes to return, by default 3
        min_frequency : int, optional
            minimum number of occurrences for a bbox to qualify, by default 2

        Returns
        -------
        common_bboxes : list
            bboxes (as tuples) ordered by decreasing frequency
        """
        # Filter bbox by position
        if position == "top":
            filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
        else:
            filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]

        # Count identical bboxes; tuples are used because lists are not hashable
        bbox_count = defaultdict(int)
        for bbox in filtered_bboxes:
            bbox_count[tuple(bbox)] += 1

        # Stable sort: bboxes with equal counts keep their first-seen order
        ranked = sorted(bbox_count.items(), key=lambda item: item[1], reverse=True)
        return [bbox for bbox, count in ranked if count >= min_frequency][:num_bboxes]

    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
        """
        Mark every block with ``is_header`` / ``is_footer`` flags (0 or 1), in place.

        Detection is skipped (no flags written) when the document has no
        blocks or when more than half of the blocks look like single lines.

        Parameters
        ----------
        result_dict : dict
            result dictionary; only keys starting with "page_" and, inside
            them, keys starting with "block_" are inspected
        similarity_threshold : float, optional
            currently unused; kept so existing callers keep working

        Returns
        -------
        result_dict : dict
            the same dictionary object that was passed in
        """

        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
            # True when every coordinate is within `tolerance` of some reference bbox
            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)

        def is_single_line_block(block):
            # Width comes from the "X0"/"X1" keys, height from "bbox"
            block_width = block["X1"] - block["X0"]
            block_height = block["bbox"][3] - block["bbox"][1]

            # At most three average character heights tall and wider than
            # three average character widths counts as a single line
            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3

        # Collect the blocks once; every later step works on this list
        doc_blocks = []
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_key, block in blocks.items():
                if block_key.startswith("block_"):
                    doc_blocks.append(block)

        # If there are no blocks, skip the header and footer detection
        if not doc_blocks:
            print("No blocks found. Skipping header/footer detection.")
            return result_dict

        # If most of the blocks are single-line, skip the header and footer detection
        single_line_blocks = sum(1 for block in doc_blocks if is_single_line_block(block))
        if single_line_blocks / len(doc_blocks) > 0.5:
            return result_dict

        all_bboxes = [block["bbox"] for block in doc_blocks]

        # The page height is approximated by the lowest bottom edge of any block
        page_height = max(bbox[3] for bbox in all_bboxes)

        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top")
        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom")

        # Detect and mark headers and footers
        for block in doc_blocks:
            bbox = block["bbox"]
            block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
            block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))

        return result_dict
135
-
136
-
137
class NonHorizontalTextProcessor:
    """Flag repeated slanted text as watermarks and repeated vertical text as margin notes."""

    def __init__(self) -> None:
        pass

    def detect_non_horizontal_texts(self, result_dict):
        """
        Mark every block with ``is_watermark`` and ``is_vertical_margin_note`` flags (0 or 1), in place.

        A block is a candidate when it has a "dir" vector. Candidates are
        grouped by (bbox, text); a group is accepted when it occurs more often
        than half the number of pages. Direction angles strictly between 5 and
        85 degrees count toward watermarks, angles strictly between 85 and 105
        degrees count toward vertical margin notes.

        Parameters
        ----------
        result_dict : dict
            The result dictionary; only keys starting with "page_" and, inside
            them, keys starting with "block_" are inspected.

        Returns
        -------
        result_dict : dict
            The same dictionary object, updated.
        """

        def identity_of(block_data):
            # bbox may be a list, which is unhashable; normalise it to a tuple
            # so the pair can be used as a dictionary key
            return (tuple(block_data["bbox"]), block_data["text"])

        potential_watermarks = {}
        potential_margin_notes = {}
        page_count = 0

        for page_id, page_content in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            page_count += 1
            for block_id, block_data in page_content.items():
                if not block_id.startswith("block_") or "dir" not in block_data:
                    continue

                coordinates_text = identity_of(block_data)
                angle = abs(math.degrees(math.atan2(block_data["dir"][1], block_data["dir"][0])))

                # NOTE(review): an angle of exactly 85 falls in neither bucket,
                # and slanted text above 105 degrees is ignored - confirm intended
                if 5 < angle < 85:  # slanted: watermark candidate
                    potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1

                if 85 < angle < 105:  # vertical: margin note candidate
                    potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1

        # "More than half of the pages": count pages only, not every
        # top-level key of the dictionary
        occurrence_threshold = page_count // 2
        watermarks = {k for k, v in potential_watermarks.items() if v > occurrence_threshold}
        margin_notes = {k for k, v in potential_margin_notes.items() if v > occurrence_threshold}

        # Write the flags onto every block, including blocks without "dir"
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_id, block_data in blocks.items():
                if not block_id.startswith("block_"):
                    continue
                coordinates_text = identity_of(block_data)
                block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
                block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0

        return result_dict
214
-
215
-
216
class NoiseRemover:
    """Drop blocks that earlier stages flagged as noise."""

    def __init__(self) -> None:
        pass

    def skip_data_noises(self, result_dict):
        """
        Build a new dictionary holding only the blocks that carry no truthy noise flag.

        Only keys starting with "page_" and, inside them, keys starting with
        "block_" are considered. A block is dropped when any of the overlap,
        header, footer, watermark, vertical margin note or block title flags
        is truthy. Pages left without blocks are omitted from the result.
        """
        noise_flags = (
            "is_overlap",
            "is_header",
            "is_footer",
            "is_watermark",
            "is_vertical_margin_note",
            "is_block_title",
        )

        kept_pages = {}
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue

            kept_blocks = {}
            for block_id, block in blocks.items():
                if not block_id.startswith("block_"):
                    continue
                is_noise = any(block.get(flag, 0) for flag in noise_flags)
                if not is_noise:
                    kept_blocks[block_id] = block

            if kept_blocks:
                kept_pages[page_id] = kept_blocks

        return kept_pages
magic_pdf/para/draw.py DELETED
@@ -1,121 +0,0 @@
1
- from magic_pdf.libs.commons import fitz
2
-
3
- from magic_pdf.para.commons import *
4
-
5
-
6
# Best-effort: make stdout emit UTF-8. ``reconfigure`` only exists on
# ``io.TextIOWrapper`` (Python 3.7+); when stdout has been replaced by another
# stream object the attribute is missing, so guard the call instead of
# failing at import time.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
8
-
9
-
10
class DrawAnnos:
    """
    This class draws annotations on the pdf file

    ----------------------------------------
                Color Code
    ----------------------------------------
        Red: (1, 0, 0)
        Green: (0, 1, 0)
        Blue: (0, 0, 1)
        Yellow: (1, 1, 0) - mix of red and green
        Cyan: (0, 1, 1) - mix of green and blue
        Magenta: (1, 0, 1) - mix of red and blue
        White: (1, 1, 1) - red, green and blue full intensity
        Black: (0, 0, 0) - no color component whatsoever
        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
    """

    def __init__(self) -> None:
        pass

    def __is_nested_list(self, lst):
        """
        This function returns True if the given list contains at least one list.
        """
        return isinstance(lst, list) and any(isinstance(item, list) for item in lst)

    def __valid_rect(self, bbox):
        """
        Return True when bbox is a flat (x0, y0, x1, y1) with positive width and height.
        """
        try:
            if len(bbox) != 4:
                return False  # empty or malformed bbox cannot be drawn
        except TypeError:
            return False  # not a sequence at all
        if isinstance(bbox[0], list):
            return False  # It's a nested list, hence it can't be valid rect
        return bool(bbox[0] < bbox[2] and bbox[1] < bbox[3])

    def __add_rect_annot(self, page, bbox, color, width):
        """
        Add one rectangle annotation with the given stroke color and border width.
        """
        anno = page.add_rect_annot(fitz.Rect(bbox))
        anno.set_colors(stroke=color)
        anno.set_border(width=width)
        anno.update()

    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
        """
        This function draws every valid rectangle found inside a nested bbox list

        Parameters
        ----------
        page : fitz.Page
            page
        nested_bbox : list
            nested bbox
        color : tuple
            color, by default (0, 1, 1)    # cyan, used for combined paragraphs
        """
        if self.__is_nested_list(nested_bbox):
            for bbox in nested_bbox:
                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
        elif self.__valid_rect(nested_bbox):
            self.__add_rect_annot(page, nested_bbox, color, 1)

    def __draw_para_bbox(self, page, para_bbox, nested_color, flat_color):
        """
        Draw a paragraph bbox: combined (nested, more than one entry) boxes with a
        border of width 1, a single valid rectangle with a border of width 0.5.
        """
        if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
            self.__draw_nested_boxes(page, para_bbox, nested_color)
        elif self.__valid_rect(para_bbox):
            self.__add_rect_annot(page, para_bbox, flat_color, 0.5)

    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
        """
        Draw paragraph and title boxes on every page and save the annotated pdf.

        Parameters
        ----------
        input_pdf_path : str
            path of the pdf to open
        pdf_dic : dict or None
            per-page data keyed by "page_<index>"; the "para_blocks" entries of a
            page are read. None, or a page missing from the dictionary, means
            that nothing is drawn for it
        output_pdf_path : str or None
            where to save; None derives the path from input_pdf_path by
            replacing ".pdf" with "_anno.pdf"
        """
        if pdf_dic is None:
            pdf_dic = {}

        if output_pdf_path is None:
            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")

        pdf_doc = open_pdf(input_pdf_path)
        try:
            for page_id, page in enumerate(pdf_doc):  # type: ignore
                page_data = pdf_dic.get(f"page_{page_id}") or {}
                for para_block in page_data.get("para_blocks") or []:
                    if "paras" not in para_block:
                        continue
                    for para_content in para_block["paras"].values():
                        para_bbox = para_content["para_bbox"]

                        # cyan for combined paragraphs, green for normal paragraphs
                        self.__draw_para_bbox(page, para_bbox, (0, 1, 1), (0, 1, 0))

                        # titles get an additional blue box on top
                        if para_content["is_para_title"]:
                            self.__draw_para_bbox(page, para_bbox, (0, 0, 1), (0, 0, 1))

            pdf_doc.save(output_pdf_path)
        finally:
            # Always release the document, even when drawing or saving fails
            pdf_doc.close()
@@ -1,198 +0,0 @@
1
class DenseSingleLineBlockException(Exception):
    """
    Exception type for dense single line-block.

    The text is kept on ``message``; both ``str()`` and ``repr()`` return it as is.
    """

    def __init__(self, message="DenseSingleLineBlockException"):
        super().__init__(message)
        self.message = message

    def __str__(self):
        return format(self.message)

    def __repr__(self):
        return format(self.message)
15
-
16
-
17
class TitleDetectionException(Exception):
    """
    Exception type for title detection.

    The text is kept on ``message``; both ``str()`` and ``repr()`` return it as is.
    """

    def __init__(self, message="TitleDetectionException"):
        super().__init__(message)
        self.message = message

    def __str__(self):
        return format(self.message)

    def __repr__(self):
        return format(self.message)
31
-
32
-
33
class TitleLevelException(Exception):
    """
    Exception type for title level.

    The text is kept on ``message``; both ``str()`` and ``repr()`` return it as is.
    """

    def __init__(self, message="TitleLevelException"):
        super().__init__(message)
        self.message = message

    def __str__(self):
        return format(self.message)

    def __repr__(self):
        return format(self.message)
47
-
48
-
49
class ParaSplitException(Exception):
    """
    Exception type for paragraph splitting.

    The text is kept on ``message``; both ``str()`` and ``repr()`` return it as is.
    """

    def __init__(self, message="ParaSplitException"):
        super().__init__(message)
        self.message = message

    def __str__(self):
        return format(self.message)

    def __repr__(self):
        return format(self.message)
63
-
64
-
65
class ParaMergeException(Exception):
    """
    Exception type for paragraph merging.

    The text is kept on ``message``; both ``str()`` and ``repr()`` return it as is.
    """

    def __init__(self, message="ParaMergeException"):
        super().__init__(message)
        self.message = message

    def __str__(self):
        return format(self.message)

    def __repr__(self):
        return format(self.message)
79
-
80
-
81
class DiscardByException:
    """
    This class decides whether a pdf should be discarded for a given exception type.

    Every method returns the exception's message when the pdf should be
    discarded and None otherwise. Annotations are written as strings so the
    class does not depend on the exception classes being defined first.
    """

    def __init__(self) -> None:
        pass

    def discard_by_single_line_block(self, pdf_dic, exception: "DenseSingleLineBlockException"):
        """
        This function discards pdf files that are dominated by single-line blocks

        A page counts as dense when more than 90% of its "preproc_blocks" have
        exactly one entry in "lines". The pdf is discarded when more than 10%
        of its pages are dense.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary; only keys starting with "page_" are inspected
        exception : DenseSingleLineBlockException
            exception instance whose ``message`` is returned on discard

        Returns
        -------
        error_message : str or None
            ``exception.message`` when the pdf is discarded, otherwise None
        """
        exception_page_nums = 0
        page_num = 0
        for page_id, page in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue
            page_num += 1

            # A missing or None "preproc_blocks" is treated as a page without blocks
            preproc_blocks = page.get("preproc_blocks") or []
            if not preproc_blocks:
                continue

            single_line_count = sum(1 for block in preproc_blocks if len(block["lines"]) == 1)
            if single_line_count / len(preproc_blocks) > 0.9:
                exception_page_nums += 1

        if page_num == 0:
            return None

        # Low ratio on purpose: a few dense pages are enough to discard the pdf
        if exception_page_nums / page_num > 0.1:
            return exception.message

        return None

    def discard_by_title_detection(self, pdf_dic, exception: "TitleDetectionException"):
        """
        This function discards pdf files by title detection exception

        Currently a no-op: it never discards.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : TitleDetectionException
            exception instance (unused)

        Returns
        -------
        error_message : None
        """
        return None

    def discard_by_title_level(self, pdf_dic, exception: "TitleLevelException"):
        """
        This function discards pdf files by title level exception

        Currently a no-op: it never discards.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : TitleLevelException
            exception instance (unused)

        Returns
        -------
        error_message : None
        """
        return None

    def discard_by_split_para(self, pdf_dic, exception: "ParaSplitException"):
        """
        This function discards pdf files by split para exception

        Currently a no-op: it never discards.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : ParaSplitException
            exception instance (unused)

        Returns
        -------
        error_message : None
        """
        return None

    def discard_by_merge_para(self, pdf_dic, exception: "ParaMergeException"):
        """
        This function discards pdf files by merge para exception

        Currently a no-op: it never discards.

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : ParaMergeException
            exception instance (unused)

        Returns
        -------
        error_message : None
        """
        return None
@@ -1,40 +0,0 @@
1
- import math
2
- from magic_pdf.para.commons import *
3
-
4
-
5
# Best-effort: make stdout emit UTF-8. ``reconfigure`` only exists on
# ``io.TextIOWrapper`` (Python 3.7+); when stdout has been replaced by another
# stream object the attribute is missing, so guard the call instead of
# failing at import time.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
7
-
8
-
9
class LayoutFilterProcessor:
    """Flag each paragraph block with whether it lies inside one of the page's layout boxes."""

    def __init__(self) -> None:
        pass

    def batch_process_blocks(self, pdf_dict):
        """
        Set ``in_layout`` (0 or 1) on every paragraph block of every page, in place.

        Pages are the keys starting with "page_". A page is skipped when it
        lacks "layout_bboxes" or "para_blocks", or when either of them is None.

        Parameters
        ----------
        pdf_dict : dict
            pdf dictionary

        Returns
        -------
        pdf_dict : dict
            the same dictionary object that was passed in
        """
        for page_id, blocks in pdf_dict.items():
            if not page_id.startswith("page_"):
                continue
            if "layout_bboxes" not in blocks or "para_blocks" not in blocks:
                continue

            layout_bbox_objs = blocks["layout_bboxes"]
            if layout_bbox_objs is None:
                continue

            # Round every coordinate of each layout bbox up to an integer
            layout_bboxes = [
                [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)]
                for x0, y0, x1, y1 in (bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs)
            ]

            para_blocks = blocks["para_blocks"]
            if para_blocks is None:
                continue

            # A block is in the layout when ANY layout bbox contains it. The
            # flag is computed once per block so a later, non-matching layout
            # bbox cannot overwrite an earlier match.
            for para_block in para_blocks:
                para_bbox = para_block["bbox"]
                para_block["in_layout"] = int(any(is_in_bbox(para_bbox, lb_bbox) for lb_bbox in layout_bboxes))

            blocks["para_blocks"] = para_blocks

        return pdf_dict