magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. magic_pdf/filter/pdf_meta_scan.py +3 -17
  2. magic_pdf/libs/commons.py +0 -161
  3. magic_pdf/libs/draw_bbox.py +2 -3
  4. magic_pdf/libs/markdown_utils.py +0 -21
  5. magic_pdf/libs/pdf_image_tools.py +2 -1
  6. magic_pdf/libs/version.py +1 -1
  7. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  8. magic_pdf/model/magic_model.py +0 -30
  9. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  11. magic_pdf/para/para_split_v3.py +7 -2
  12. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  13. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  14. magic_pdf/pre_proc/cut_image.py +0 -37
  15. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  16. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  17. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  18. magic_pdf/rw/S3ReaderWriter.py +1 -1
  19. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  20. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
  21. magic_pdf/dict2md/mkcontent.py +0 -438
  22. magic_pdf/layout/__init__.py +0 -0
  23. magic_pdf/layout/bbox_sort.py +0 -681
  24. magic_pdf/layout/layout_det_utils.py +0 -182
  25. magic_pdf/layout/layout_sort.py +0 -921
  26. magic_pdf/layout/layout_spiler_recog.py +0 -101
  27. magic_pdf/layout/mcol_sort.py +0 -336
  28. magic_pdf/libs/calc_span_stats.py +0 -239
  29. magic_pdf/libs/detect_language_from_model.py +0 -21
  30. magic_pdf/libs/nlp_utils.py +0 -203
  31. magic_pdf/libs/textbase.py +0 -33
  32. magic_pdf/libs/vis_utils.py +0 -308
  33. magic_pdf/para/block_continuation_processor.py +0 -562
  34. magic_pdf/para/block_termination_processor.py +0 -480
  35. magic_pdf/para/commons.py +0 -222
  36. magic_pdf/para/denoise.py +0 -246
  37. magic_pdf/para/draw.py +0 -121
  38. magic_pdf/para/exceptions.py +0 -198
  39. magic_pdf/para/layout_match_processor.py +0 -40
  40. magic_pdf/para/para_split.py +0 -807
  41. magic_pdf/para/para_split_v2.py +0 -959
  42. magic_pdf/para/raw_processor.py +0 -207
  43. magic_pdf/para/stats.py +0 -268
  44. magic_pdf/para/title_processor.py +0 -1014
  45. magic_pdf/pdf_parse_union_core.py +0 -345
  46. magic_pdf/post_proc/__init__.py +0 -0
  47. magic_pdf/post_proc/detect_para.py +0 -3472
  48. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  49. magic_pdf/post_proc/remove_footnote.py +0 -153
  50. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  51. magic_pdf/pre_proc/detect_equation.py +0 -134
  52. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  53. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  54. magic_pdf/pre_proc/detect_footnote.py +0 -170
  55. magic_pdf/pre_proc/detect_header.py +0 -64
  56. magic_pdf/pre_proc/detect_images.py +0 -647
  57. magic_pdf/pre_proc/detect_page_number.py +0 -64
  58. magic_pdf/pre_proc/detect_tables.py +0 -62
  59. magic_pdf/pre_proc/equations_replace.py +0 -550
  60. magic_pdf/pre_proc/fix_image.py +0 -244
  61. magic_pdf/pre_proc/fix_table.py +0 -270
  62. magic_pdf/pre_proc/main_text_font.py +0 -23
  63. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  64. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  65. magic_pdf/pre_proc/post_layout_split.py +0 -0
  66. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  67. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  68. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  69. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  70. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  71. magic_pdf/pre_proc/statistics.py +0 -12
  72. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  73. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
  74. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  75. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,207 +0,0 @@
1
class RawBlockProcessor:
    """Turn each page's raw ``preproc_blocks`` into ``para_blocks``.

    Every raw block is copied into a new dict holding ``block_id``, ``bbox``,
    ``text`` and ``lines``. Consecutive raw lines whose top and bottom edges
    agree within ``y_tolerance`` are merged into one line, and every span
    receives a ``decomposed_flags`` dict describing its font flags.
    """

    # Bit masks applied to the integer stored under a span's "flags" key.
    _FLAG_SUPERSCRIPT = 1 << 0
    _FLAG_ITALIC = 1 << 1
    _FLAG_SERIFED = 1 << 2
    _FLAG_MONOSPACED = 1 << 3
    _FLAG_BOLD = 1 << 4

    def __init__(self) -> None:
        # Largest top/bottom difference for which two consecutive raw lines
        # are still merged into a single line.
        self.y_tolerance = 2
        # Not read by any method of this class; kept so the attribute set
        # stays unchanged for existing users.
        self.pdf_dic = {}

    def __span_flags_decomposer(self, span_flags):
        """
        Make font flags human readable.

        Parameters
        ----------
        span_flags : int
            bit field of font flags taken from a span

        Returns
        -------
        dict
            one boolean per property; ``is_sans_serifed`` is the negation of
            ``is_serifed`` and ``is_proportional`` the negation of
            ``is_monospaced``
        """
        is_serifed = bool(span_flags & self._FLAG_SERIFED)
        is_monospaced = bool(span_flags & self._FLAG_MONOSPACED)

        return {
            "is_superscript": bool(span_flags & self._FLAG_SUPERSCRIPT),
            "is_italic": bool(span_flags & self._FLAG_ITALIC),
            "is_serifed": is_serifed,
            "is_sans_serifed": not is_serifed,
            "is_monospaced": is_monospaced,
            "is_proportional": not is_monospaced,
            "is_bold": bool(span_flags & self._FLAG_BOLD),
        }

    def __make_new_lines(self, raw_lines):
        """
        Merge raw lines that sit on the same row and annotate their spans.

        Parameters
        ----------
        raw_lines : list
            raw line dicts, each with "bbox", "spans" and optionally "dir";
            every span needs "text" and "flags"

        Returns
        -------
        new_lines : list
            line dicts with "bbox", "text", "dir" and "spans". For a merged
            line the texts are joined with a single space and "dir" is the
            component-wise sum of the merged directions.

        Notes
        -----
        The span dicts are annotated in place with "decomposed_flags". The
        span *lists* of the input are never modified: each output line owns
        its own list.
        """
        new_lines = []
        new_line = None

        for raw_line in raw_lines:
            raw_line_bbox = raw_line["bbox"]
            raw_line_spans = raw_line["spans"]
            raw_line_text = "".join(span["text"] for span in raw_line_spans)
            # A missing/empty direction counts as (0, 0) everywhere, so the
            # merge branch below can always index it.
            raw_line_dir = raw_line.get("dir") or (0, 0)

            for span in raw_line_spans:
                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])

            on_same_row = (
                new_line is not None
                and abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
                and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
            )

            if on_same_row:
                current_bbox = new_line["bbox"]
                new_line["bbox"] = (
                    min(current_bbox[0], raw_line_bbox[0]),  # left
                    current_bbox[1],  # top
                    max(current_bbox[2], raw_line_bbox[2]),  # right
                    raw_line_bbox[3],  # bottom
                )
                new_line["text"] += " " + raw_line_text
                new_line["spans"].extend(raw_line_spans)
                new_line["dir"] = (
                    new_line["dir"][0] + raw_line_dir[0],
                    new_line["dir"][1] + raw_line_dir[1],
                )
                continue

            # The raw line starts a new row: flush the one being built.
            if new_line is not None:
                new_lines.append(new_line)
            new_line = {
                "bbox": raw_line_bbox,
                "text": raw_line_text,
                "dir": raw_line_dir,
                # Copy, so later merges never grow the caller's span list.
                "spans": list(raw_line_spans),
            }

        if new_line is not None:
            new_lines.append(new_line)

        return new_lines

    def __make_new_block(self, raw_block):
        """
        Build a new block dict from a raw block.

        Parameters
        ----------
        raw_block : dict
            a raw block with "number", "bbox" and "lines"

        Returns
        -------
        new_block : dict

        Schema of new_block:
        {
            "block_id": <raw_block["number"]>,
            "bbox": <raw_block["bbox"]>,
            "text": "<all span texts of the block joined with spaces>",
            "lines": [
                {
                    "bbox": (x0, y0, x1, y1),
                    "text": "This is a line.",
                    "dir": (dx, dy),
                    "spans": [<raw span dicts plus "decomposed_flags">],
                }
            ],
        }
        """
        raw_lines = raw_block["lines"]
        # Compute the text before merging; it only depends on the span texts.
        block_text = " ".join(span["text"] for line in raw_lines for span in line["spans"])

        return {
            "block_id": raw_block["number"],
            "bbox": raw_block["bbox"],
            "text": block_text,
            "lines": self.__make_new_lines(raw_lines),
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Process the blocks of every page in batch.

        Parameters
        ----------
        pdf_dic : dict
            Entries whose key starts with "page_" are treated as pages; their
            raw blocks are read from the key "preproc_blocks". Other entries
            are left untouched.

        Returns
        -------
        pdf_dic : dict
            the same dict, where every page now has a "para_blocks" list
            (empty when the page has no "preproc_blocks")
        """
        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue

            para_blocks = []
            if "preproc_blocks" in blocks:
                for raw_block in blocks["preproc_blocks"]:
                    para_blocks.append(self.__make_new_block(raw_block))

            blocks["para_blocks"] = para_blocks

        return pdf_dic
207
-
magic_pdf/para/stats.py DELETED
@@ -1,268 +0,0 @@
1
- from collections import Counter
2
- import numpy as np
3
-
4
- from magic_pdf.para.commons import *
5
-
6
-
7
# Force UTF-8 on stdout so non-ASCII text can be printed regardless of the
# console's default encoding.
import sys  # explicit import: ``sys`` was otherwise only reachable via the star import

# sys.stdout can be None or a replacement stream that has no reconfigure()
# method; skip the tweak in that case instead of failing at import time.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
9
-
10
-
11
class BlockStatisticsCalculator:
    """Attach per-block layout and font statistics to every page's ``para_blocks``."""

    def __init__(self) -> None:
        pass

    def __calc_stats_of_new_lines(self, new_lines):
        """
        Calculate the metrics of one block from its lines.

        Parameters
        ----------
        new_lines : list
            line dicts with "bbox", "text", "spans" and optionally "dir";
            every span needs "font", "size" and "bbox"

        Returns
        -------
        tuple
            X0 : median of the lines' left edges (0 without lines)
            X1 : median of the lines' right edges (0 without lines)
            avg_char_width : mean over lines of line width divided by the
                number of non-whitespace characters (0 if no line has any)
            avg_char_height : mean span size over all spans of the block
                (0 without spans)
            max_freq_font_type : font of the widest span of the block
                (None without a span of positive width)
            avg_font_size : mean span size (None without spans)
            (avg_dir_horizontal, avg_dir_vertical) : mean of the lines' "dir"
                components ((0, 0) if no line has a "dir")
            median_font_size : median span size as float (None without spans)
        """
        x0_values = []
        x1_values = []
        char_widths = []
        span_sizes = []
        block_directions = []

        widest_span_length = 0
        widest_span_font = None

        for line in new_lines:
            line_bbox = line["bbox"]
            x0_values.append(line_bbox[0])
            x1_values.append(line_bbox[2])

            num_chars = sum(1 for ch in line["text"] if not ch.isspace())
            if num_chars > 0:
                char_widths.append((line_bbox[2] - line_bbox[0]) / num_chars)

            if "dir" in line:
                block_directions.append(line["dir"])

            for span in line["spans"]:
                # Sizes are collected across ALL lines; keeping only the last
                # line's sizes would make avg_char_height depend on line order.
                span_sizes.append(span["size"])

                span_length = span["bbox"][2] - span["bbox"][0]
                # Strict comparison: the first of several equally wide spans wins.
                if span_length > widest_span_length:
                    widest_span_length = span_length
                    widest_span_font = span["font"]

        X0 = np.median(x0_values) if x0_values else 0
        X1 = np.median(x1_values) if x1_values else 0
        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0

        if span_sizes:
            avg_font_size = sum(span_sizes) / len(span_sizes)
            avg_char_height = avg_font_size
            median_font_size = float(np.median(span_sizes))
        else:
            avg_font_size = None
            avg_char_height = 0
            median_font_size = None

        if block_directions:
            num_dirs = len(block_directions)
            avg_dir_horizontal = sum(d[0] for d in block_directions) / num_dirs
            avg_dir_vertical = sum(d[1] for d in block_directions) / num_dirs
        else:
            avg_dir_horizontal = 0
            avg_dir_vertical = 0

        return (
            X0,
            X1,
            avg_char_width,
            avg_char_height,
            widest_span_font,
            avg_font_size,
            (avg_dir_horizontal, avg_dir_vertical),
            median_font_size,
        )

    def __make_new_block(self, input_block):
        """
        Build a new block dict: the input's id, bbox, text and lines plus
        the statistics computed from its lines.

        Keys of ``input_block`` other than "block_id", "bbox", "text" and
        "lines" are not carried over.
        """
        block_lines = input_block["lines"]
        (
            left_boundary,
            right_boundary,
            avg_char_width,
            avg_char_height,
            font_type,
            font_size,
            direction,
            median_font_size,
        ) = self.__calc_stats_of_new_lines(block_lines)

        return {
            "block_id": input_block["block_id"],
            "bbox": input_block["bbox"],
            "text": input_block["text"],
            "dir": direction,
            "X0": left_boundary,
            "X1": right_boundary,
            "avg_char_width": avg_char_width,
            "avg_char_height": avg_char_height,
            "block_font_type": font_type,
            "block_font_size": font_size,
            "lines": block_lines,
            "median_font_size": median_font_size,
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Process the blocks of every page in batch.

        Parameters
        ----------
        pdf_dic : dict
            Entries whose key starts with "page_" are treated as pages; their
            blocks are read from the key "para_blocks". Other entries are
            left untouched.

        Returns
        -------
        pdf_dic : dict
            the same dict, where every page's "para_blocks" has been replaced
            by the list of statistics-enriched blocks (empty when the page
            had no "para_blocks")
        """
        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue

            para_blocks = []
            if "para_blocks" in blocks:
                for input_block in blocks["para_blocks"]:
                    para_blocks.append(self.__make_new_block(input_block))

            blocks["para_blocks"] = para_blocks

        return pdf_dic
171
-
172
-
173
class DocStatisticsCalculator:
    """Compute document-wide counters and font statistics from ``para_blocks``."""

    def __init__(self) -> None:
        pass

    def calc_stats_of_doc(self, pdf_dict):
        """
        Compute the statistics of the document.

        Parameters
        ----------
        pdf_dict : dict
            Entries whose key starts with "page_" are treated as pages; their
            blocks are read from the key "para_blocks". A page without that
            key counts as a page with no blocks.

        Returns
        -------
        pdf_dict : dict
            the same dict with the result stored under the key "statistics".
            The font ranking only considers blocks whose text is at least
            half as long as the average block text; when fewer than one/two
            distinct (font type, font size) pairs exist, the corresponding
            entries are "" / 0 / 0.
        """
        # One list of blocks per page, gathered once and reused below.
        pages = [
            blocks.get("para_blocks", [])
            for page_id, blocks in pdf_dict.items()
            if page_id.startswith("page_")
        ]
        all_blocks = [block for page_blocks in pages for block in page_blocks]

        total_text_length = sum(len(block.get("text", "")) for block in all_blocks)
        total_num_blocks = len(all_blocks)
        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0

        # Short blocks are left out of the font ranking.
        font_list = [
            (
                safe_get(block, "block_font_type", ""),
                safe_get(block, "block_font_size", 0),
            )
            for block in all_blocks
            if len(block.get("text", "")) >= avg_text_length * 0.5
        ]

        # Each entry is ((font_type, font_size), count).
        ranked_fonts = Counter(font_list).most_common(2)
        no_font = (("", 0), 0)
        most_common_font = ranked_fonts[0] if ranked_fonts else no_font
        second_most_common_font = ranked_fonts[1] if len(ranked_fonts) > 1 else no_font

        statistics = {
            "num_pages": 0,
            "num_blocks": 0,
            "num_paras": 0,
            "num_titles": 0,
            "num_header_blocks": 0,
            "num_footer_blocks": 0,
            "num_watermark_blocks": 0,
            "num_vertical_margin_note_blocks": 0,
            "most_common_font_type": most_common_font[0][0],
            "most_common_font_size": most_common_font[0][1],
            "number_of_most_common_font": most_common_font[1],
            "second_most_common_font_type": second_most_common_font[0][0],
            "second_most_common_font_size": second_most_common_font[0][1],
            "number_of_second_most_common_font": second_most_common_font[1],
            "avg_text_length": avg_text_length,
        }

        # Maps a block-level marker key to the counter it feeds.
        block_marker_counters = (
            ("is_header", "num_header_blocks"),
            ("is_footer", "num_footer_blocks"),
            ("is_watermark", "num_watermark_blocks"),
            ("is_vertical_margin_note", "num_vertical_margin_note_blocks"),
        )

        for page_blocks in pages:
            statistics["num_pages"] += 1
            for block_data in page_blocks:
                statistics["num_blocks"] += 1

                if "paras" in block_data:
                    statistics["num_paras"] += len(block_data["paras"])

                for line in block_data.get("lines", []):
                    if line.get("is_title", 0):
                        statistics["num_titles"] += 1

                for marker_key, counter_key in block_marker_counters:
                    if block_data.get(marker_key, 0):
                        statistics[counter_key] += 1

        pdf_dict["statistics"] = statistics

        return pdf_dict
267
-
268
-