magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
magic_pdf/para/raw_processor.py
DELETED
@@ -1,207 +0,0 @@
|
|
1
|
-
class RawBlockProcessor:
    """Convert the raw ``preproc_blocks`` of every page into ``para_blocks``.

    Each raw block is rewritten into a dict with the keys ``block_id``,
    ``bbox``, ``text`` and ``lines``; raw lines that sit on the same visual
    row (within ``y_tolerance``) are merged into a single line.
    """

    def __init__(self) -> None:
        # Maximum difference allowed between the top (and bottom) coordinates
        # of two consecutive raw lines for them to be merged into one line.
        self.y_tolerance = 2
        self.pdf_dic = {}

    def __span_flags_decomposer(self, span_flags):
        """
        Make font flags human readable.

        Parameters
        ----------
        span_flags : int
            Bit field of span flags; bits 0..4 are inspected.

        Returns
        -------
        dict
            One boolean per decoded property. ``is_serifed`` /
            ``is_sans_serifed`` are complementary (bit 2), and so are
            ``is_monospaced`` / ``is_proportional`` (bit 3).
        """
        serifed = bool(span_flags & 2**2)
        monospaced = bool(span_flags & 2**3)

        return {
            "is_superscript": bool(span_flags & 2**0),  # superscript
            "is_italic": bool(span_flags & 2**1),  # italic
            "is_serifed": serifed,  # serif font
            "is_sans_serifed": not serifed,  # sans-serif font
            "is_monospaced": monospaced,  # monospaced font
            "is_proportional": not monospaced,  # proportional font
            "is_bold": bool(span_flags & 2**4),  # bold
        }

    def __make_new_lines(self, raw_lines):
        """
        Merge consecutive raw lines that share the same row into new lines.

        Every span dict of every raw line is annotated in place with a
        ``"decomposed_flags"`` entry. The span *lists* of the input lines are
        never modified: each new line owns its own list.

        Parameters
        ----------
        raw_lines : list
            Raw line dicts with the keys ``"bbox"`` and ``"spans"`` and an
            optional ``"dir"``; every span needs ``"text"`` and ``"flags"``.

        Returns
        -------
        new_lines : list
            Line dicts with the keys ``"bbox"``, ``"text"``, ``"dir"`` and
            ``"spans"``.
        """
        new_lines = []
        new_line = None

        for raw_line in raw_lines:
            raw_line_bbox = raw_line["bbox"]
            raw_line_spans = raw_line["spans"]
            raw_line_text = "".join(span["text"] for span in raw_line_spans)
            # A missing or empty direction counts as (0, 0) everywhere, so the
            # component-wise sum in the merge branch can never hit ``None``.
            raw_line_dir = raw_line.get("dir") or (0, 0)

            for span in raw_line_spans:
                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])
            # Fresh list: extending a merged line must not grow the caller's
            # ``raw_line["spans"]`` list.
            line_spans = list(raw_line_spans)

            same_row = (
                new_line is not None
                and abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
                and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
            )

            if same_row:
                new_line["bbox"] = (
                    min(new_line["bbox"][0], raw_line_bbox[0]),  # left
                    new_line["bbox"][1],  # top
                    max(new_line["bbox"][2], raw_line_bbox[2]),  # right
                    raw_line_bbox[3],  # bottom
                )
                new_line["text"] += " " + raw_line_text
                new_line["spans"].extend(line_spans)
                new_line["dir"] = (
                    new_line["dir"][0] + raw_line_dir[0],
                    new_line["dir"][1] + raw_line_dir[1],
                )
                continue

            # The raw line starts a new row: flush the pending line first.
            if new_line is not None:
                new_lines.append(new_line)
            new_line = {
                "bbox": raw_line_bbox,
                "text": raw_line_text,
                "dir": raw_line_dir,
                "spans": line_spans,
            }

        if new_line is not None:
            new_lines.append(new_line)

        return new_lines

    def __make_new_block(self, raw_block):
        """
        Build a new block from a raw block.

        Parameters
        ----------
        raw_block : dict
            A raw block with the keys ``"number"``, ``"bbox"`` and ``"lines"``.

        Returns
        -------
        new_block : dict

            Schema of new_block:
            {
                "block_id": "block_1",
                "bbox": [0, 0, 100, 100],
                "text": "This is a block.",
                "lines": [
                    {
                        "bbox": [0, 0, 100, 100],
                        "text": "This is a line.",
                        "spans": [
                            {
                                "text": "This is a span.",
                                "font": "Times New Roman",
                                "size": 12,
                                "color": "#000000",
                            }
                        ],
                    }
                ],
            }
        """
        raw_lines = raw_block["lines"]
        # The block text joins every span with a single space; it is built
        # from the raw lines, before any line merging happens.
        block_text = " ".join(span["text"] for line in raw_lines for span in line["spans"])

        return {
            "block_id": raw_block["number"],
            "bbox": raw_block["bbox"],
            "text": block_text,
            "lines": self.__make_new_lines(raw_lines),
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Process the raw blocks of every page in batch.

        Parameters
        ----------
        pdf_dic : dict
            Document dictionary. Only entries whose key starts with
            ``"page_"`` are processed; their optional ``"preproc_blocks"``
            list holds the raw blocks.

        Returns
        -------
        pdf_dic : dict
            The same dictionary, where every processed page has gained a
            ``"para_blocks"`` list (empty when the page has no
            ``"preproc_blocks"``).
        """
        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue

            para_blocks = []
            if "preproc_blocks" in blocks:
                para_blocks = [
                    self.__make_new_block(raw_block)
                    for raw_block in blocks["preproc_blocks"]
                ]

            blocks["para_blocks"] = para_blocks

        return pdf_dic
|
207
|
-
|
magic_pdf/para/stats.py
DELETED
@@ -1,268 +0,0 @@
|
|
1
|
-
from collections import Counter
|
2
|
-
import numpy as np
|
3
|
-
|
4
|
-
from magic_pdf.para.commons import *
|
5
|
-
|
6
|
-
|
7
|
-
import sys

# Switch stdout to UTF-8 at import time. ``reconfigure`` is looked up
# defensively: stdout may have been replaced by an object that does not
# provide it (or may be None), and the lookup must not break the import.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if callable(_reconfigure_stdout):
    _reconfigure_stdout(encoding="utf-8")
|
9
|
-
|
10
|
-
|
11
|
-
class BlockStatisticsCalculator:
    """Attach layout and font statistics to every block in ``para_blocks``."""

    def __init__(self) -> None:
        pass

    def __calc_stats_of_new_lines(self, new_lines):
        """
        Calculate the layout and font metrics of a list of lines.

        Parameters
        ----------
        new_lines : list
            Line dicts with the keys ``"bbox"``, ``"text"`` and ``"spans"``
            and an optional ``"dir"``; every span needs ``"font"``,
            ``"size"`` and ``"bbox"``.

        Returns
        -------
        tuple
            ``(X0, X1, avg_char_width, avg_char_height, font_type,
            avg_font_size, (avg_dir_horizontal, avg_dir_vertical),
            median_font_size)`` where

            X0 / X1 : median of the left / right line boundaries (0 if no lines)
            avg_char_width : mean over lines of line width divided by the
                number of non-whitespace characters (0 if none)
            avg_char_height : mean of the span sizes of all lines (0 if none)
            font_type : font of the widest span (None if no span has a
                positive width)
            avg_font_size / median_font_size : mean / median span size
                (None if there are no spans)
        """
        x0_values = []
        x1_values = []
        char_widths = []
        char_heights = []

        block_font_sizes = []
        block_directions = []

        # The widest span decides the font type reported for the block.
        max_span_length = 0
        max_span_font_type = None

        for line in new_lines:
            line_bbox = line["bbox"]
            line_spans = line["spans"]

            x0_values.append(line_bbox[0])
            x1_values.append(line_bbox[2])

            num_chars = sum(1 for ch in line["text"] if not ch.isspace())
            if num_chars > 0:
                char_widths.append((line_bbox[2] - line_bbox[0]) / num_chars)

            if "dir" in line:
                block_directions.append(line["dir"])

            for span in line_spans:
                block_font_sizes.append(span["size"])
                # Accumulate over all lines so the average describes the whole
                # block, not just the spans of the last line.
                char_heights.append(span["size"])

                span_length = span["bbox"][2] - span["bbox"][0]
                if span_length > max_span_length:
                    max_span_length = span_length
                    max_span_font_type = span["font"]

        X0 = np.median(x0_values) if x0_values else 0
        X1 = np.median(x1_values) if x1_values else 0
        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0

        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None

        if block_directions:
            num_dirs = len(block_directions)
            avg_dir_horizontal = sum(direction[0] for direction in block_directions) / num_dirs
            avg_dir_vertical = sum(direction[1] for direction in block_directions) / num_dirs
        else:
            avg_dir_horizontal = 0
            avg_dir_vertical = 0

        return (
            X0,
            X1,
            avg_char_width,
            avg_char_height,
            max_span_font_type,
            avg_font_size,
            (avg_dir_horizontal, avg_dir_vertical),
            median_font_size,
        )

    def __make_new_block(self, input_block):
        """
        Copy a block and add the statistics computed from its lines.

        Parameters
        ----------
        input_block : dict
            Block with the keys ``"block_id"``, ``"bbox"``, ``"text"`` and
            ``"lines"``.

        Returns
        -------
        new_block : dict
            New dict with the input keys plus ``"dir"``, ``"X0"``, ``"X1"``,
            ``"avg_char_width"``, ``"avg_char_height"``,
            ``"block_font_type"``, ``"block_font_size"`` and
            ``"median_font_size"``. The ``"lines"`` list is shared with the
            input block, not copied.
        """
        raw_lines = input_block["lines"]
        (
            left_boundary,
            right_boundary,
            avg_char_width,
            avg_char_height,
            font_type,
            font_size,
            direction,
            median_font_size,
        ) = self.__calc_stats_of_new_lines(raw_lines)

        return {
            "block_id": input_block["block_id"],
            "bbox": input_block["bbox"],
            "text": input_block["text"],
            "dir": direction,
            "X0": left_boundary,
            "X1": right_boundary,
            "avg_char_width": avg_char_width,
            "avg_char_height": avg_char_height,
            "block_font_type": font_type,
            "block_font_size": font_size,
            "lines": raw_lines,
            "median_font_size": median_font_size,
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Compute the statistics of every block of every page in batch.

        Parameters
        ----------
        pdf_dic : dict
            Document dictionary. Only entries whose key starts with
            ``"page_"`` are processed; their optional ``"para_blocks"`` list
            holds the input blocks.

        Returns
        -------
        pdf_dic : dict
            The same dictionary, where ``"para_blocks"`` of every processed
            page has been replaced by the list of enriched blocks (empty when
            the page had no ``"para_blocks"``).
        """
        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue

            para_blocks = []
            if "para_blocks" in blocks:
                para_blocks = [
                    self.__make_new_block(input_block)
                    for input_block in blocks["para_blocks"]
                ]

            blocks["para_blocks"] = para_blocks

        return pdf_dic
|
171
|
-
|
172
|
-
|
173
|
-
class DocStatisticsCalculator:
    """Compute document-level statistics from the ``para_blocks`` of all pages."""

    def __init__(self) -> None:
        pass

    def calc_stats_of_doc(self, pdf_dict):
        """
        Compute the statistics of the document.

        Parameters
        ----------
        pdf_dict : dict
            Document dictionary. Only entries whose key starts with
            ``"page_"`` are inspected; a page without ``"para_blocks"`` is
            counted as a page with no blocks.

        Returns
        -------
        pdf_dict : dict
            The same dictionary with a ``"statistics"`` entry added. It holds
            page/block/paragraph/title counters, counters for header, footer,
            watermark and vertical-margin-note blocks, the two most common
            ``(font type, font size)`` pairs with their counts, and the
            average block text length.
        """
        # Collect the block list of every page once; the three passes below
        # all iterate over the same pages.
        page_block_lists = [
            blocks.get("para_blocks", [])
            for page_id, blocks in pdf_dict.items()
            if page_id.startswith("page_")
        ]

        # Pass 1: average text length over all blocks.
        total_text_length = 0
        total_num_blocks = 0
        for para_blocks in page_block_lists:
            for para_block in para_blocks:
                total_text_length += len(para_block["text"])
                total_num_blocks += 1

        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0

        # Pass 2: font census. Blocks shorter than half the average text
        # length are left out of the count.
        font_list = []
        for para_blocks in page_block_lists:
            for input_block in para_blocks:
                block_text_length = len(input_block.get("text", ""))
                if block_text_length < avg_text_length * 0.5:
                    continue
                block_font_type = safe_get(input_block, "block_font_type", "")
                block_font_size = safe_get(input_block, "block_font_size", 0)
                font_list.append((block_font_type, block_font_size))

        # Placeholder with the same ((font type, font size), count) shape as
        # a Counter.most_common() entry.
        no_font = (("", 0), 0)
        ranked_fonts = Counter(font_list).most_common(2)
        most_common_font = ranked_fonts[0] if ranked_fonts else no_font
        second_most_common_font = ranked_fonts[1] if len(ranked_fonts) > 1 else no_font

        statistics = {
            "num_pages": 0,
            "num_blocks": 0,
            "num_paras": 0,
            "num_titles": 0,
            "num_header_blocks": 0,
            "num_footer_blocks": 0,
            "num_watermark_blocks": 0,
            "num_vertical_margin_note_blocks": 0,
            "most_common_font_type": most_common_font[0][0],
            "most_common_font_size": most_common_font[0][1],
            "number_of_most_common_font": most_common_font[1],
            "second_most_common_font_type": second_most_common_font[0][0],
            "second_most_common_font_size": second_most_common_font[0][1],
            "number_of_second_most_common_font": second_most_common_font[1],
            "avg_text_length": avg_text_length,
        }

        # Pass 3: counters per page and per block.
        for para_blocks in page_block_lists:
            statistics["num_pages"] += 1
            for block_data in para_blocks:
                statistics["num_blocks"] += 1

                if "paras" in block_data:
                    statistics["num_paras"] += len(block_data["paras"])

                for line in block_data["lines"]:
                    if line.get("is_title", 0):
                        statistics["num_titles"] += 1

                if block_data.get("is_header", 0):
                    statistics["num_header_blocks"] += 1
                if block_data.get("is_footer", 0):
                    statistics["num_footer_blocks"] += 1
                if block_data.get("is_watermark", 0):
                    statistics["num_watermark_blocks"] += 1
                if block_data.get("is_vertical_margin_note", 0):
                    statistics["num_vertical_margin_note_blocks"] += 1

        pdf_dict["statistics"] = statistics

        return pdf_dict
|
267
|
-
|
268
|
-
|