magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,207 +0,0 @@
1
class RawBlockProcessor:
    """Reshape the raw blocks of each page into ``para_blocks``.

    For every ``page_*`` entry of the document dictionary the blocks stored
    under ``"preproc_blocks"`` are converted into a simplified block layout
    (id, bbox, text, merged lines) and written back under ``"para_blocks"``.
    """

    def __init__(self) -> None:
        # Largest difference allowed between the top (and bottom) coordinates
        # of two consecutive raw lines for them to be merged into one line.
        self.y_tolerance = 2
        self.pdf_dic = {}

    def __span_flags_decomposer(self, span_flags):
        """
        Make font flags human readable.

        Parameters
        ----------
        span_flags : int
            Bit field describing the font of a span.

        Returns
        -------
        flags : dict
            One boolean per font property. ``is_serifed``/``is_sans_serifed``
            and ``is_monospaced``/``is_proportional`` are complementary pairs:
            exactly one member of each pair is True.
        """
        is_serifed = bool(span_flags & 2**2)
        is_monospaced = bool(span_flags & 2**3)

        return {
            "is_superscript": bool(span_flags & 2**0),  # bit 0: superscript
            "is_italic": bool(span_flags & 2**1),  # bit 1: italic
            "is_serifed": is_serifed,  # bit 2 set: serif font
            "is_sans_serifed": not is_serifed,  # bit 2 clear: sans-serif font
            "is_monospaced": is_monospaced,  # bit 3 set: monospaced font
            "is_proportional": not is_monospaced,  # bit 3 clear: proportional font
            "is_bold": bool(span_flags & 2**4),  # bit 4: bold
        }

    def __make_new_lines(self, raw_lines):
        """
        Merge consecutive raw lines that sit on the same row.

        Two consecutive lines are merged when both their top and bottom
        coordinates differ by at most ``self.y_tolerance``. Every span gets a
        ``"decomposed_flags"`` entry added in place.

        Parameters
        ----------
        raw_lines : list
            Raw line dicts; each needs ``"bbox"`` and ``"spans"`` (spans need
            ``"text"`` and ``"flags"``). ``"dir"`` is optional.

        Returns
        -------
        new_lines : list
            Line dicts with the keys ``"bbox"``, ``"text"``, ``"dir"`` and
            ``"spans"``.
        """
        new_lines = []
        current = None

        for raw_line in raw_lines:
            line_bbox = raw_line["bbox"]
            line_spans = raw_line["spans"]
            line_text = "".join(span["text"] for span in line_spans)
            # Normalise the direction once so that the merge branch below never
            # indexes into None when a raw line carries no "dir".
            line_dir = raw_line.get("dir") or (0, 0)

            for span in line_spans:
                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])

            on_same_row = (
                current is not None
                and abs(line_bbox[1] - current["bbox"][1]) <= self.y_tolerance
                and abs(line_bbox[3] - current["bbox"][3]) <= self.y_tolerance
            )

            if on_same_row:
                current["bbox"] = (
                    min(current["bbox"][0], line_bbox[0]),  # left
                    current["bbox"][1],  # top
                    max(current["bbox"][2], line_bbox[2]),  # right
                    line_bbox[3],  # bottom
                )
                current["text"] += " " + line_text
                current["spans"].extend(line_spans)
                # Directions of merged lines are summed component-wise.
                current["dir"] = (
                    current["dir"][0] + line_dir[0],
                    current["dir"][1] + line_dir[1],
                )
                continue

            if current is not None:
                new_lines.append(current)
            current = {
                "bbox": line_bbox,
                "text": line_text,
                "dir": line_dir,
                # Copy the list so that a later merge does not extend the
                # span list of the caller's raw line.
                "spans": list(line_spans),
            }

        if current is not None:
            new_lines.append(current)

        return new_lines

    def __make_new_block(self, raw_block):
        """
        Build a new block from a raw block.

        Parameters
        ----------
        raw_block : dict
            A raw block with the keys ``"number"``, ``"bbox"`` and ``"lines"``.

        Returns
        -------
        new_block : dict

            Schema of new_block:
            {
                "block_id": "block_1",
                "bbox": [0, 0, 100, 100],
                "text": "This is a block.",
                "lines": [
                    {
                        "bbox": [0, 0, 100, 100],
                        "text": "This is a line.",
                        "spans": [
                            {
                                "text": "This is a span.",
                                "font": "Times New Roman",
                                "size": 12,
                                "color": "#000000",
                            }
                        ],
                    }
                ],
            }
        """
        raw_lines = raw_block["lines"]

        return {
            "block_id": raw_block["number"],
            "bbox": raw_block["bbox"],
            # Block text joins every span with a single space, independent of
            # how the lines end up being merged.
            "text": " ".join(span["text"] for line in raw_lines for span in line["spans"]),
            "lines": self.__make_new_lines(raw_lines),
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Process the raw blocks of every page in batch.

        Parameters
        ----------
        pdf_dic : dict
            Document dictionary. Only entries whose key starts with
            ``"page_"`` are processed; their raw blocks are read from the key
            ``"preproc_blocks"`` (demo file:
            app/pdf_toolbox/tests/preproc_2_parasplit_example.json).

        Returns
        -------
        pdf_dic : dict
            The same dictionary, updated in place: every page gets a
            ``"para_blocks"`` list (empty when the page has no
            ``"preproc_blocks"``).
        """
        for page_id, page in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue
            page["para_blocks"] = [
                self.__make_new_block(raw_block)
                for raw_block in page.get("preproc_blocks", [])
            ]

        return pdf_dic
207
-
magic_pdf/para/stats.py DELETED
@@ -1,268 +0,0 @@
1
- from collections import Counter
2
- import numpy as np
3
-
4
- from magic_pdf.para.commons import *
5
-
6
-
7
import sys  # imported explicitly; this module has no other explicit `import sys`

# Switch stdout to UTF-8 at import time. The call is skipped when stdout does
# not offer `reconfigure` (for example when it has been replaced by another
# file-like object or is None), so importing this module cannot fail here.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if callable(_reconfigure_stdout):
    _reconfigure_stdout(encoding="utf-8")  # type: ignore
9
-
10
-
11
class BlockStatisticsCalculator:
    """Attach layout and font statistics to every block in ``para_blocks``."""

    def __init__(self) -> None:
        pass

    def __calc_stats_of_new_lines(self, new_lines):
        """
        Calculate the layout and font metrics of a block from its lines.

        Parameters
        ----------
        new_lines : list
            Line dicts with the keys ``"bbox"``, ``"text"`` and ``"spans"``
            (each span needs ``"font"``, ``"size"`` and ``"bbox"``); ``"dir"``
            is optional.

        Returns
        -------
        tuple
            ``(X0, X1, avg_char_width, avg_char_height, font_type,
            avg_font_size, (avg_dir_horizontal, avg_dir_vertical),
            median_font_size)`` where

            X0 : median of the lines' left coordinates (0 without lines)
            X1 : median of the lines' right coordinates (0 without lines)
            avg_char_width : mean over lines of line width divided by the
                number of non-whitespace characters (0 when unavailable)
            avg_char_height : mean span size over all lines of the block
                (0 without spans)
            font_type : font of the widest span (None without spans)
            avg_font_size : mean span size (None without spans)
            median_font_size : median span size as float (None without spans)
        """
        x0_values = []
        x1_values = []
        char_widths = []
        span_sizes = []
        directions = []

        widest_span_width = 0
        widest_span_font = None

        for line in new_lines:
            line_bbox = line["bbox"]
            x0_values.append(line_bbox[0])
            x1_values.append(line_bbox[2])

            num_chars = sum(1 for ch in line["text"] if not ch.isspace())
            if num_chars > 0:
                char_widths.append((line_bbox[2] - line_bbox[0]) / num_chars)

            if "dir" in line:
                directions.append(line["dir"])

            for span in line["spans"]:
                # Sizes are collected across ALL lines; keeping only the last
                # line's spans would make the block average depend on line order.
                span_sizes.append(span["size"])

                # The block font is the font of the widest span; on ties the
                # first span encountered wins.
                span_width = span["bbox"][2] - span["bbox"][0]
                if span_width > widest_span_width:
                    widest_span_width = span_width
                    widest_span_font = span["font"]

        X0 = np.median(x0_values) if x0_values else 0
        X1 = np.median(x1_values) if x1_values else 0
        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0

        # Character height and font size are both derived from the span sizes;
        # they only differ in their "no data" value (0 versus None).
        avg_char_height = sum(span_sizes) / len(span_sizes) if span_sizes else 0
        avg_font_size = sum(span_sizes) / len(span_sizes) if span_sizes else None
        median_font_size = float(np.median(span_sizes)) if span_sizes else None

        if directions:
            avg_dir_horizontal = sum(d[0] for d in directions) / len(directions)
            avg_dir_vertical = sum(d[1] for d in directions) / len(directions)
        else:
            avg_dir_horizontal = 0
            avg_dir_vertical = 0

        return (
            X0,
            X1,
            avg_char_width,
            avg_char_height,
            widest_span_font,
            avg_font_size,
            (avg_dir_horizontal, avg_dir_vertical),
            median_font_size,
        )

    def __make_new_block(self, input_block):
        """
        Copy a block and add the statistics computed from its lines.

        Parameters
        ----------
        input_block : dict
            Block with the keys ``"block_id"``, ``"bbox"``, ``"text"`` and
            ``"lines"``.

        Returns
        -------
        new_block : dict
            New dict holding the copied keys plus ``"dir"``, ``"X0"``,
            ``"X1"``, ``"avg_char_width"``, ``"avg_char_height"``,
            ``"block_font_type"``, ``"block_font_size"`` and
            ``"median_font_size"``. The ``"lines"`` list is shared with the
            input block, not copied.
        """
        lines = input_block["lines"]
        (
            left_boundary,
            right_boundary,
            avg_char_width,
            avg_char_height,
            font_type,
            font_size,
            direction,
            median_font_size,
        ) = self.__calc_stats_of_new_lines(lines)

        return {
            "block_id": input_block["block_id"],
            "bbox": input_block["bbox"],
            "text": input_block["text"],
            "dir": direction,
            "X0": left_boundary,
            "X1": right_boundary,
            "avg_char_width": avg_char_width,
            "avg_char_height": avg_char_height,
            "block_font_type": font_type,
            "block_font_size": font_size,
            "lines": lines,
            "median_font_size": median_font_size,
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Compute the block statistics for every page in batch.

        Parameters
        ----------
        pdf_dic : dict
            Document dictionary. Only entries whose key starts with
            ``"page_"`` are processed; their blocks are read from the key
            ``"para_blocks"``.

        Returns
        -------
        pdf_dic : dict
            The same dictionary, updated in place: ``"para_blocks"`` of every
            page is replaced by the list of enriched blocks (an empty list
            when the page had no ``"para_blocks"``).
        """
        for page_id, page in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue
            page["para_blocks"] = [
                self.__make_new_block(input_block)
                for input_block in page.get("para_blocks", [])
            ]

        return pdf_dic
171
-
172
-
173
class DocStatisticsCalculator:
    """Compute document-level statistics over the ``para_blocks`` of all pages."""

    def __init__(self) -> None:
        pass

    def calc_stats_of_doc(self, pdf_dict):
        """
        Compute the statistics of the document.

        Parameters
        ----------
        pdf_dict : dict
            Document dictionary. Only entries whose key starts with
            ``"page_"`` are inspected; a page without ``"para_blocks"`` counts
            as a page with zero blocks.

        Returns
        -------
        pdf_dict : dict
            The same dictionary with the computed statistics stored under the
            key ``"statistics"``.
        """
        # Collect the block list of every page once; all passes below share it.
        blocks_per_page = [
            page.get("para_blocks", [])
            for page_id, page in pdf_dict.items()
            if page_id.startswith("page_")
        ]

        total_num_blocks = sum(len(page_blocks) for page_blocks in blocks_per_page)
        total_text_length = sum(
            len(block.get("text", ""))
            for page_blocks in blocks_per_page
            for block in page_blocks
        )
        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0

        # Only blocks holding at least half of the average text length take
        # part in the font ranking.
        # NOTE(review): `safe_get` is not defined in this module; presumably it
        # comes from the star import of magic_pdf.para.commons — confirm.
        font_list = []
        for page_blocks in blocks_per_page:
            for block in page_blocks:
                if len(block.get("text", "")) < avg_text_length * 0.5:
                    continue
                font_type = safe_get(block, "block_font_type", "")
                font_size = safe_get(block, "block_font_size", 0)
                font_list.append((font_type, font_size))

        # Each ranking entry is ((font_type, font_size), count); the placeholder
        # keeps the indexing below valid when fewer than two fonts were seen.
        placeholder = (("", 0), 0)
        ranking = Counter(font_list).most_common(2)
        most_common_font = ranking[0] if ranking else placeholder
        second_most_common_font = ranking[1] if len(ranking) > 1 else placeholder

        statistics = {
            "num_pages": 0,
            "num_blocks": 0,
            "num_paras": 0,
            "num_titles": 0,
            "num_header_blocks": 0,
            "num_footer_blocks": 0,
            "num_watermark_blocks": 0,
            "num_vertical_margin_note_blocks": 0,
            "most_common_font_type": most_common_font[0][0],
            "most_common_font_size": most_common_font[0][1],
            "number_of_most_common_font": most_common_font[1],
            "second_most_common_font_type": second_most_common_font[0][0],
            "second_most_common_font_size": second_most_common_font[0][1],
            "number_of_second_most_common_font": second_most_common_font[1],
            "avg_text_length": avg_text_length,
        }

        # Maps the per-block marker key to the counter it increments.
        block_flag_counters = (
            ("is_header", "num_header_blocks"),
            ("is_footer", "num_footer_blocks"),
            ("is_watermark", "num_watermark_blocks"),
            ("is_vertical_margin_note", "num_vertical_margin_note_blocks"),
        )

        for page_blocks in blocks_per_page:
            statistics["num_pages"] += 1
            for block_data in page_blocks:
                statistics["num_blocks"] += 1

                if "paras" in block_data:
                    statistics["num_paras"] += len(block_data["paras"])

                for line in block_data["lines"]:
                    if line.get("is_title", 0):
                        statistics["num_titles"] += 1

                for flag_key, counter_key in block_flag_counters:
                    if block_data.get(flag_key, 0):
                        statistics[counter_key] += 1

        pdf_dict["statistics"] = statistics

        return pdf_dict
267
-
268
-