magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,101 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
找到能分割布局的水平的横线、色块
|
3
|
-
"""
|
4
|
-
|
5
|
-
import os
|
6
|
-
from magic_pdf.libs.commons import fitz
|
7
|
-
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
|
8
|
-
|
9
|
-
|
10
|
-
def __rect_filter_by_width(rect, page_w, page_h):
    """Tell whether ``rect`` straddles the vertical centre line of the page.

    Args:
        rect: Sequence whose items 0 and 2 are the left and right x coordinates.
        page_w: Page width; half of it is the centre line being tested.
        page_h: Page height; accepted but not used by this check.

    Returns:
        True only when the centre line lies strictly between the left and
        right edge of ``rect``; otherwise False.
    """
    centre_x = page_w / 2
    return rect[0] < centre_x < rect[2]
|
15
|
-
|
16
|
-
|
17
|
-
def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
    """Tell whether ``rect`` stays clear of every image and table box.

    Args:
        rect: Candidate rectangle.
        image_bboxes: Image boxes the rectangle must not touch.
        table_bboxes: Table boxes the rectangle must not touch.

    Returns:
        False as soon as ``_is_in_or_part_overlap`` reports a hit against an
        image box (checked first) or a table box; True otherwise.
    """
    for forbidden_boxes in (image_bboxes, table_bboxes):
        if any(_is_in_or_part_overlap(rect, box) for box in forbidden_boxes):
            return False
    return True
|
30
|
-
|
31
|
-
|
32
|
-
def __debug_show_page(page, bboxes1: list, bboxes2: list, bboxes3: list):
    """Render three groups of boxes onto a blank page and save it for debugging.

    A new one-page PDF with the same width and height as ``page`` is written
    to ``./tmp/debug.pdf``; a file already at that path is removed first.

    Args:
        page: Source page; only ``page.rect.width`` and ``page.rect.height``
            are read.
        bboxes1: Boxes drawn with a red outline and a blue fill at 0.2 opacity.
        bboxes2: Boxes drawn without outline and a yellow fill at 0.2 opacity.
        bboxes3: Boxes drawn with a red outline and no fill.

    Only the first four items of every box are used as rectangle coordinates.
    """
    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        # drop the file left over from a previous run
        os.remove(save_path)

    # one (boxes, finish-arguments) pair per drawing style
    styled_groups = (
        (bboxes1, {"color": fitz.pdfcolor['red'], "fill": fitz.pdfcolor['blue'], "fill_opacity": 0.2}),
        (bboxes2, {"color": None, "fill": fitz.pdfcolor['yellow'], "fill_opacity": 0.2}),
        (bboxes3, {"color": fitz.pdfcolor['red'], "fill": None}),
    )

    # create a new, empty PDF document
    doc = fitz.open('')
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        for bboxes, finish_args in styled_groups:
            for bbox in bboxes:
                # draw the original box
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                # A single finish() is enough; the original's second, bare
                # finish() ran after the draw commands were already consumed.
                shape.finish(**finish_args)
                shape.commit()

        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        doc.save(save_path)
    finally:
        # release the document even when drawing or saving fails
        doc.close()
|
78
|
-
|
79
|
-
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """Collect the colour blocks and horizontal rules that can split the layout.

    Every drawing returned by ``page.get_cdrawings()`` that has a truthy
    ``'fill'`` other than pure white ``(1.0, 1.0, 1.0)`` is a candidate. A
    candidate is kept when its rectangle passes ``__rect_filter_by_width``
    and ``__rect_filter_by_pos``.

    Args:
        page: Page object providing ``get_cdrawings()`` and ``rect``.
        image_bboxes: Image boxes that a splitter must not overlap.
        table_bboxes: Table boxes that a splitter must not overlap.

    Returns:
        list[list]: One mutable ``list(rect)`` per accepted drawing, with
        non-positive heights corrected to a height of 1.
    """
    spilter_bbox = []
    for block in page.get_cdrawings():
        fill = block.get('fill')
        # ignore drawings without a fill colour as well as white fills
        if not fill or fill == (1.0, 1.0, 1.0):
            continue
        rect = block['rect']
        if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
            spilter_bbox.append(list(rect))

    # Filter / repair the boxes: some rectangles have a height of zero or a
    # negative height, which makes the layout computation loop forever.
    # Such boxes are uniformly corrected to a height of 1.
    for box in spilter_bbox:
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1

    return spilter_bbox
|
magic_pdf/layout/mcol_sort.py
DELETED
@@ -1,336 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
This is an advanced PyMuPDF utility for detecting multi-column pages.
|
3
|
-
It can be used in a shell script, or its main function can be imported and
|
4
|
-
invoked as described below.
|
5
|
-
|
6
|
-
Features
|
7
|
-
---------
|
8
|
-
- Identify text belonging to (a variable number of) columns on the page.
|
9
|
-
- Text with different background color is handled separately, allowing for
|
10
|
-
easier treatment of side remarks, comment boxes, etc.
|
11
|
-
- Uses text block detection capability to identify text blocks and
|
12
|
-
uses the block bboxes as primary structuring principle.
|
13
|
-
- Supports ignoring footers via a footer margin parameter.
|
14
|
-
- Returns re-created text boundary boxes (integer coordinates), sorted ascending
|
15
|
-
by the top, then by the left coordinates.
|
16
|
-
|
17
|
-
Restrictions
|
18
|
-
-------------
|
19
|
-
- Only supporting horizontal, left-to-right text
|
20
|
-
- Returns a list of text boundary boxes - not the text itself. The caller is
|
21
|
-
expected to extract text from within the returned boxes.
|
22
|
-
- Text written above images is ignored altogether (option).
|
23
|
-
- This utility works as expected in most cases. The following situation cannot
|
24
|
-
be handled correctly:
|
25
|
-
* overlapping (non-disjoint) text blocks
|
26
|
-
* image captions are not recognized and are handled like normal text
|
27
|
-
|
28
|
-
Usage
|
29
|
-
------
|
30
|
-
- As a CLI shell command use
|
31
|
-
|
32
|
-
python multi_column.py input.pdf footer_margin
|
33
|
-
|
34
|
-
Where footer margin is the height of the bottom stripe to ignore on each page.
|
35
|
-
This code is intended to be modified according to your need.
|
36
|
-
|
37
|
-
- Use in a Python script as follows:
|
38
|
-
|
39
|
-
----------------------------------------------------------------------------------
|
40
|
-
from multi_column import column_boxes
|
41
|
-
|
42
|
-
# for each page execute
|
43
|
-
bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
|
44
|
-
|
45
|
-
# bboxes is a list of fitz.IRect objects, that are sort ascending by their y0,
|
46
|
-
# then x0 coordinates. Their text content can be extracted by all PyMuPDF
|
47
|
-
# get_text() variants, like for instance the following:
|
48
|
-
for rect in bboxes:
|
49
|
-
print(page.get_text(clip=rect, sort=True))
|
50
|
-
----------------------------------------------------------------------------------
|
51
|
-
"""
|
52
|
-
import sys
|
53
|
-
from magic_pdf.libs.commons import fitz
|
54
|
-
|
55
|
-
|
56
|
-
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column.

    Args:
        page: PyMuPDF page to analyse.
        footer_margin: Height of the bottom stripe of the page that is ignored.
        header_margin: Height of the top stripe of the page that is ignored.
        no_image_text: When True, text blocks lying inside an image rectangle
            are skipped.

    Returns:
        list: The joined text rectangles (``fitz.IRect`` objects) after the
        clean-up pass, or an empty list when no usable text was found.
    """
    paths = page.get_drawings()
    bboxes = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'.
        """
        for b in bboxlist:
            if not intersects_bboxes(temp, vert_bboxes) and (
                b is None or b == bb or (temp & b).is_empty
            ):
                continue
            return False

        return True

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            if can_extend(temp, bb, bboxes):
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b is not None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        # Walk backwards and stop at index 1: at index 0 the predecessor
        # nblocks[-1] would wrap around to the LAST block, which could delete
        # the first block or even empty the list.
        for i in range(blen - 1, 0, -1):
            if nblocks[i - 1] == nblocks[i]:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics (path rectangles)
    path_bboxes = [p["rect"].irect for p in paths]

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        # join the bboxes of all lines holding more than one character
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return if no text found
    if not bboxes:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb is None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if not check:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None  # mark this old bbox as consumed

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks
|
285
|
-
|
286
|
-
|
287
|
-
if __name__ == "__main__":
    # Only for debugging purposes, currently.
    #
    # Draw red borders around the returned text bboxes and insert the bbox
    # number, then save the file with ".pdf" replaced by "-blocks.pdf".

    cli_args = sys.argv

    # first argument: the file name
    filename = cli_args[1]

    # optional second / third argument: footer and header margin,
    # falling back to the default value 50 when not given
    footer_margin = int(cli_args[2]) if len(cli_args) > 2 else 50
    header_margin = int(cli_args[3]) if len(cli_args) > 3 else 50

    doc = fitz.open(filename)

    for page in doc:
        # remove any geometry issues
        page.wrap_contents()

        # text bboxes of this page
        bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)

        # canvas for the rectangles and their sequence numbers
        shape = page.new_shape()
        for seq_no, rect in enumerate(bboxes):
            shape.draw_rect(rect)  # border
            shape.insert_text(rect.tl + (5, 15), str(seq_no), color=fitz.pdfcolor["red"])

        # finish drawing / text with color red, then store to the page
        shape.finish(color=fitz.pdfcolor["red"])
        shape.commit()

    # save document with text bboxes
    doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
|
@@ -1,239 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import csv
|
3
|
-
import json
|
4
|
-
import pandas as pd
|
5
|
-
from pandas import DataFrame as df
|
6
|
-
from matplotlib import pyplot as plt
|
7
|
-
from termcolor import cprint
|
8
|
-
|
9
|
-
"""
|
10
|
-
Execute this script in the following way:
|
11
|
-
|
12
|
-
1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
|
13
|
-
|
14
|
-
code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
|
15
|
-
|
16
|
-
2. Under the directory code-clean, execute the following command:
|
17
|
-
|
18
|
-
$ python -m libs.calc_span_stats
|
19
|
-
|
20
|
-
"""
|
21
|
-
|
22
|
-
|
23
|
-
def print_green_on_red(text):
    """Print ``text`` in bold green on a red background, ending with two newlines."""
    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
|
25
|
-
|
26
|
-
|
27
|
-
def print_green(text):
    """Print an empty line, then ``text`` in bold green ending with two newlines."""
    print()
    cprint(text, color="green", attrs=["bold"], end="\n\n")
|
30
|
-
|
31
|
-
|
32
|
-
def print_red(text):
    """Print an empty line, then ``text`` in bold red ending with two newlines."""
    print()
    cprint(text, color="red", attrs=["bold"], end="\n\n")
|
35
|
-
|
36
|
-
|
37
|
-
def safe_get(dict_obj, key, default):
    """Look up ``key`` in ``dict_obj``, treating a stored ``None`` like a missing key.

    Args:
        dict_obj: Mapping offering a ``get`` method.
        key: Key to look up.
        default: Value handed back when the key is absent or maps to ``None``.

    Returns:
        The stored value, or ``default`` when that value is ``None``.
    """
    found = dict_obj.get(key)
    return default if found is None else found
|
43
|
-
|
44
|
-
|
45
|
-
class SpanStatsCalc:
    """Calculate statistics of span."""

    # Keys read from a span's "decomposed_flags" mapping. Each one becomes an
    # integer column named "span_<key>"; the order here is the column order.
    _FLAG_KEYS = (
        "is_superscript",
        "is_italic",
        "is_serifed",
        "is_sans_serifed",
        "is_monospaced",
        "is_proportional",
        "is_bold",
    )

    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
        """Draw multiple figures in one figure.

        Currently this only opens matplotlib figure ``fig_num`` with
        ``figsize=(20, 20)``; nothing is plotted or saved, so ``span_stats``
        and ``save_path`` are not used.
        """
        # make a canvas
        plt.figure(fig_num, figsize=(20, 20))

    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
        """Calculate statistics per pdf_dict.

        Args:
            pdf_dict: Mapping of page ids to page data. Only entries whose key
                starts with ``"page_"`` and that contain ``"para_blocks"``
                are read; every span found under
                ``para_blocks -> lines -> spans`` yields one row.

        Returns:
            pd.DataFrame: One row per span with a running ``span_id``, the
            ``page_id``, the span text, font name, size, color and flags, and
            one 0/1 column per entry of ``_FLAG_KEYS``. The frame is empty
            (without columns) when no span was found.
        """
        rows = []
        span_id = 0
        for page_id, blocks in pdf_dict.items():
            if not page_id.startswith("page_") or "para_blocks" not in blocks:
                continue
            for para_block in blocks["para_blocks"]:
                for line in para_block["lines"]:
                    for span in line["spans"]:
                        decoded_flags = safe_get(span, "decomposed_flags", {})
                        row = {
                            "span_id": span_id,  # id of span
                            "page_id": page_id,  # page number of pdf
                            "span_text": safe_get(span, "text", ""),  # text of span
                            "span_font_name": safe_get(span, "font", ""),  # font name of span
                            "span_font_size": safe_get(span, "size", 0),  # font size of span
                            "span_font_color": safe_get(span, "color", ""),  # font color of span
                            "span_font_flags": safe_get(span, "flags", 0),  # font flags of span
                        }
                        # one 0/1 indicator column per decoded font flag
                        for flag_key in self._FLAG_KEYS:
                            row[f"span_{flag_key}"] = int(safe_get(decoded_flags, flag_key, False))
                        rows.append(row)

                        span_id += 1

        return pd.DataFrame(rows)
|
115
|
-
|
116
|
-
|
117
|
-
def __find_pdf_dic_files(
    jf_name="pdf_dic.json",
    base_code_name="code-clean",
    tgt_base_dir_name="tmp",
    unittest_dir_name="unittest",
    md_dir_name="md",
    book_names=[
        "scihub",
    ],  # other possible values: "zlib", "arxiv" and so on
):
    """Collect the paths of all ``jf_name`` files below the unit-test output tree.

    The directory of this module is scanned for every occurrence of
    ``base_code_name``. The path prefix ending at an occurrence is used as
    base directory, and for each book name the tree
    ``<base>/<tgt_base_dir_name>/<unittest_dir_name>/<md_dir_name>/<book>``
    is walked.

    Args:
        jf_name: File name to look for.
        base_code_name: Marker text searched for in this module's directory.
        tgt_base_dir_name: First path component below the base directory.
        unittest_dir_name: Second path component.
        md_dir_name: Third path component.
        book_names: Sub-directories to search; only read, never modified.

    Returns:
        list[str]: Full paths of the files found (empty when the marker does
        not occur in the module directory).
    """
    matches = []
    module_dir = os.path.dirname(__file__)
    marker_len = len(base_code_name)

    for start in range(len(module_dir)):
        end = start + marker_len
        if module_dir[start:end] != base_code_name:
            continue

        base_code_dir_name = module_dir[:end]
        for book_name in book_names:
            relative_dir = os.path.join(tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name)
            if not os.path.exists(base_code_dir_name):
                continue
            search_root = os.path.join(base_code_dir_name, relative_dir)
            for root, _dirs, files in os.walk(search_root):
                # a directory holds the wanted file name at most once
                if jf_name in files:
                    matches.append(os.path.join(root, jf_name))

    return matches
|
145
|
-
|
146
|
-
|
147
|
-
def combine_span_texts(group_df, span_stats):
    """Render every span of a group together with its neighbouring spans.

    Args:
        group_df: Rows to render; the row label is used as the span id.
        span_stats: Full table with a ``"span_text"`` column, used to look up
            the spans labelled ``id - 1`` and ``id + 1``.

    Returns:
        str: One three-line snippet (previous, current, next span text, each
        prefixed with the arrow marker) per row, snippets separated by a
        blank line. A neighbour whose label is not in ``span_stats`` is
        rendered as empty text.
    """
    # the same marker is put in front of every rendered line
    pointer_sign = "→ → → "
    known_ids = span_stats.index

    def text_of(span_id):
        """Return the text stored for ``span_id``, or "" for an unknown id."""
        if span_id in known_ids:
            return span_stats.at[span_id, "span_text"]
        return ""

    snippets = []
    for span_id, row in group_df.iterrows():
        current_text = row["span_text"]
        context = (text_of(span_id - 1), current_text, text_of(span_id + 1))
        snippets.append("\n".join([pointer_sign + text for text in context]))

    return "\n\n".join(snippets)
|
165
|
-
|
166
|
-
|
167
|
-
# pd.set_option("display.max_colwidth", None)  # set to None to display the full text
|
168
|
-
pd.set_option("display.max_rows", None)  # set to None so that more rows are displayed
|
169
|
-
|
170
|
-
|
171
|
-
def main():
    """Compute and export span statistics for every discovered ``pdf_dic.json``.

    For each file the function writes, next to it:

    * ``span_stats_raw.csv`` - one row per span,
    * ``span_stats_final.csv`` - superscript spans grouped by font name, size
      and color, with a count and the surrounding span texts,
    * ``span_stats_combined.png`` - four bar charts of those counts.

    Files without any superscript span only get the raw CSV.
    """
    pdf_dict_files = __find_pdf_dic_files()

    span_stats_calc = SpanStatsCalc()

    for pdf_dict_file in pdf_dict_files:
        print("-" * 100)
        print_green_on_red(f"Processing {pdf_dict_file}")

        with open(pdf_dict_file, "r", encoding="utf-8") as f:
            pdf_dict = json.load(f)

        raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
        raw_df.to_csv(save_path, index=False)

        # A document without spans yields a frame without columns; indexing
        # "span_is_superscript" on it would raise KeyError, so skip the filter.
        if raw_df.empty:
            filtered_df = raw_df
        else:
            filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
        if filtered_df.empty:
            print("No superscript span found!")
            continue

        filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])

        combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore

        final_df = filtered_grouped_df.size().reset_index(name="count")
        final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)

        print(final_df)

        final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))

        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
        # use UTF-8 with BOM and make sure every field is wrapped in double quotes
        final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

        # create a 2x2 chart layout
        fig, axs = plt.subplots(2, 2, figsize=(15, 10))
        try:
            # chart grouped by span_font_name
            final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")

            # chart grouped by span_font_size
            final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")

            # chart grouped by span_font_color
            final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")

            # chart grouped by span_font_name, span_font_size and span_font_color together
            grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
            grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")

            # adjust the layout
            plt.tight_layout()

            # save the charts to a PNG file
            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
            plt.savefig(save_path)
        finally:
            # Close the figure instead of only clearing it: a new figure is
            # created for every file and would otherwise stay open.
            plt.close(fig)
|
236
|
-
|
237
|
-
|
238
|
-
if __name__ == "__main__":
|
239
|
-
main()
|
@@ -1,21 +0,0 @@
|
|
1
|
-
from collections import Counter
|
2
|
-
|
3
|
-
from magic_pdf.libs.language import detect_lang
|
4
|
-
|
5
|
-
def get_language_from_model(model_list: list):
    """Return the language detected most often across the pages of ``model_list``.

    For every page the ``"text"`` of all ``layout_dets`` entries whose
    ``"category_id"`` is allowed is concatenated and passed to
    ``detect_lang``; the most frequent result wins. On a tie the language
    that was seen first is returned.

    Args:
        model_list: Per-page dicts, each with a ``"layout_dets"`` list whose
            items carry ``"category_id"`` and, for allowed categories,
            ``"text"``.

    Returns:
        The most frequent ``detect_lang`` result, or ``None`` when
        ``model_list`` is empty (previously this raised ``ValueError``).
    """
    if not model_list:
        # nothing to vote on; max() over an empty Counter would raise
        return None

    # NOTE(review): 15 is presumably the OCR text category - confirm.
    allow_category_id_list = [15]

    language_lst = []
    for ocr_page_info in model_list:
        page_text = "".join(
            layout_det["text"]
            for layout_det in ocr_page_info["layout_dets"]
            if layout_det["category_id"] in allow_category_id_list
        )
        language_lst.append(detect_lang(page_text))

    # count how often every language occurs over all pages
    count_dict = Counter(language_lst)
    # pick the language with the highest page count
    return max(count_dict, key=count_dict.get)
|