magic-pdf 0.10.4__py3-none-any.whl → 0.10.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +5 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/dataset.py +175 -4
- magic_pdf/data/utils.py +2 -2
- magic_pdf/dict2md/ocr_mkcontent.py +2 -2
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/draw_bbox.py +11 -10
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +124 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +119 -60
- magic_pdf/model/operators.py +190 -0
- magic_pdf/model/pdf_extract_kit.py +20 -1
- magic_pdf/model/sub_modules/model_init.py +13 -3
- magic_pdf/model/sub_modules/model_utils.py +11 -5
- magic_pdf/para/para_split_v3.py +2 -2
- magic_pdf/pdf_parse_by_ocr.py +4 -5
- magic_pdf/pdf_parse_by_txt.py +4 -5
- magic_pdf/pdf_parse_union_core_v2.py +10 -11
- magic_pdf/pipe/AbsPipe.py +3 -2
- magic_pdf/pipe/OCRPipe.py +54 -15
- magic_pdf/pipe/TXTPipe.py +5 -4
- magic_pdf/pipe/UNIPipe.py +82 -30
- magic_pdf/pipe/operators.py +138 -0
- magic_pdf/pre_proc/cut_image.py +2 -2
- magic_pdf/tools/common.py +108 -59
- magic_pdf/user_api.py +47 -24
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/METADATA +7 -4
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/RECORD +34 -32
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/top_level.txt +0 -0
magic_pdf/config/constants.py
CHANGED
@@ -48,4 +48,16 @@ class DataWriter(ABC):
|
|
48
48
|
path (str): the target file where to write
|
49
49
|
data (str): the data want to write
|
50
50
|
"""
|
51
|
-
|
51
|
+
|
52
|
+
def safe_encode(data: str, method: str):
|
53
|
+
try:
|
54
|
+
bit_data = data.encode(encoding=method, errors='replace')
|
55
|
+
return bit_data, True
|
56
|
+
except: # noqa
|
57
|
+
return None, False
|
58
|
+
|
59
|
+
for method in ['utf-8', 'ascii']:
|
60
|
+
bit_data, flag = safe_encode(data, method)
|
61
|
+
if flag:
|
62
|
+
self.write(path, bit_data)
|
63
|
+
break
|
magic_pdf/data/dataset.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1
|
+
import os
|
1
2
|
from abc import ABC, abstractmethod
|
2
|
-
from typing import Iterator
|
3
|
+
from typing import Callable, Iterator
|
3
4
|
|
4
5
|
import fitz
|
5
6
|
|
6
7
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
7
8
|
from magic_pdf.data.schemas import PageInfo
|
8
9
|
from magic_pdf.data.utils import fitz_doc_to_image
|
10
|
+
from magic_pdf.filter import classify
|
9
11
|
|
10
12
|
|
11
13
|
class PageableData(ABC):
|
@@ -28,6 +30,32 @@ class PageableData(ABC):
|
|
28
30
|
"""
|
29
31
|
pass
|
30
32
|
|
33
|
+
@abstractmethod
|
34
|
+
def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
|
35
|
+
"""draw rectangle.
|
36
|
+
|
37
|
+
Args:
|
38
|
+
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
39
|
+
color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
|
40
|
+
fill (list[float] | None): fill the board with RGB, None means will not fill with color
|
41
|
+
fill_opacity (float): opacity of the fill, range from [0, 1]
|
42
|
+
width (float): the width of board
|
43
|
+
overlay (bool): fill the color in foreground or background. True means fill in background.
|
44
|
+
"""
|
45
|
+
pass
|
46
|
+
|
47
|
+
@abstractmethod
|
48
|
+
def insert_text(self, coord, content, fontsize, color):
|
49
|
+
"""insert text.
|
50
|
+
|
51
|
+
Args:
|
52
|
+
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
53
|
+
content (str): the text content
|
54
|
+
fontsize (int): font size of the text
|
55
|
+
color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
|
56
|
+
"""
|
57
|
+
pass
|
58
|
+
|
31
59
|
|
32
60
|
class Dataset(ABC):
|
33
61
|
@abstractmethod
|
@@ -66,6 +94,43 @@ class Dataset(ABC):
|
|
66
94
|
"""
|
67
95
|
pass
|
68
96
|
|
97
|
+
@abstractmethod
|
98
|
+
def dump_to_file(self, file_path: str):
|
99
|
+
"""Dump the file
|
100
|
+
|
101
|
+
Args:
|
102
|
+
file_path (str): the file path
|
103
|
+
"""
|
104
|
+
pass
|
105
|
+
|
106
|
+
@abstractmethod
|
107
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
108
|
+
"""Apply callable method which.
|
109
|
+
|
110
|
+
Args:
|
111
|
+
proc (Callable): invoke proc as follows:
|
112
|
+
proc(self, *args, **kwargs)
|
113
|
+
|
114
|
+
Returns:
|
115
|
+
Any: return the result generated by proc
|
116
|
+
"""
|
117
|
+
pass
|
118
|
+
|
119
|
+
@abstractmethod
|
120
|
+
def classify(self) -> SupportedPdfParseMethod:
|
121
|
+
"""classify the dataset
|
122
|
+
|
123
|
+
Returns:
|
124
|
+
SupportedPdfParseMethod: _description_
|
125
|
+
"""
|
126
|
+
pass
|
127
|
+
|
128
|
+
@abstractmethod
|
129
|
+
def clone(self):
|
130
|
+
"""clone this dataset
|
131
|
+
"""
|
132
|
+
pass
|
133
|
+
|
69
134
|
|
70
135
|
class PymuDocDataset(Dataset):
|
71
136
|
def __init__(self, bits: bytes):
|
@@ -74,7 +139,8 @@ class PymuDocDataset(Dataset):
|
|
74
139
|
Args:
|
75
140
|
bits (bytes): the bytes of the pdf
|
76
141
|
"""
|
77
|
-
self.
|
142
|
+
self._raw_fitz = fitz.open('pdf', bits)
|
143
|
+
self._records = [Doc(v) for v in self._raw_fitz]
|
78
144
|
self._data_bits = bits
|
79
145
|
self._raw_data = bits
|
80
146
|
|
@@ -109,6 +175,43 @@ class PymuDocDataset(Dataset):
|
|
109
175
|
"""
|
110
176
|
return self._records[page_id]
|
111
177
|
|
178
|
+
def dump_to_file(self, file_path: str):
|
179
|
+
"""Dump the file
|
180
|
+
|
181
|
+
Args:
|
182
|
+
file_path (str): the file path
|
183
|
+
"""
|
184
|
+
|
185
|
+
dir_name = os.path.dirname(file_path)
|
186
|
+
if dir_name not in ('', '.', '..'):
|
187
|
+
os.makedirs(dir_name, exist_ok=True)
|
188
|
+
self._raw_fitz.save(file_path)
|
189
|
+
|
190
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
191
|
+
"""Apply callable method which.
|
192
|
+
|
193
|
+
Args:
|
194
|
+
proc (Callable): invoke proc as follows:
|
195
|
+
proc(dataset, *args, **kwargs)
|
196
|
+
|
197
|
+
Returns:
|
198
|
+
Any: return the result generated by proc
|
199
|
+
"""
|
200
|
+
return proc(self, *args, **kwargs)
|
201
|
+
|
202
|
+
def classify(self) -> SupportedPdfParseMethod:
|
203
|
+
"""classify the dataset
|
204
|
+
|
205
|
+
Returns:
|
206
|
+
SupportedPdfParseMethod: _description_
|
207
|
+
"""
|
208
|
+
return classify(self._data_bits)
|
209
|
+
|
210
|
+
def clone(self):
|
211
|
+
"""clone this dataset
|
212
|
+
"""
|
213
|
+
return PymuDocDataset(self._raw_data)
|
214
|
+
|
112
215
|
|
113
216
|
class ImageDataset(Dataset):
|
114
217
|
def __init__(self, bits: bytes):
|
@@ -118,7 +221,8 @@ class ImageDataset(Dataset):
|
|
118
221
|
bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
|
119
222
|
"""
|
120
223
|
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
|
121
|
-
self.
|
224
|
+
self._raw_fitz = fitz.open('pdf', pdf_bytes)
|
225
|
+
self._records = [Doc(v) for v in self._raw_fitz]
|
122
226
|
self._raw_data = bits
|
123
227
|
self._data_bits = pdf_bytes
|
124
228
|
|
@@ -153,14 +257,50 @@ class ImageDataset(Dataset):
|
|
153
257
|
"""
|
154
258
|
return self._records[page_id]
|
155
259
|
|
260
|
+
def dump_to_file(self, file_path: str):
|
261
|
+
"""Dump the file
|
262
|
+
|
263
|
+
Args:
|
264
|
+
file_path (str): the file path
|
265
|
+
"""
|
266
|
+
dir_name = os.path.dirname(file_path)
|
267
|
+
if dir_name not in ('', '.', '..'):
|
268
|
+
os.makedirs(dir_name, exist_ok=True)
|
269
|
+
self._raw_fitz.save(file_path)
|
270
|
+
|
271
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
272
|
+
"""Apply callable method which.
|
273
|
+
|
274
|
+
Args:
|
275
|
+
proc (Callable): invoke proc as follows:
|
276
|
+
proc(dataset, *args, **kwargs)
|
277
|
+
|
278
|
+
Returns:
|
279
|
+
Any: return the result generated by proc
|
280
|
+
"""
|
281
|
+
return proc(self, *args, **kwargs)
|
282
|
+
|
283
|
+
def classify(self) -> SupportedPdfParseMethod:
|
284
|
+
"""classify the dataset
|
285
|
+
|
286
|
+
Returns:
|
287
|
+
SupportedPdfParseMethod: _description_
|
288
|
+
"""
|
289
|
+
return SupportedPdfParseMethod.OCR
|
290
|
+
|
291
|
+
def clone(self):
|
292
|
+
"""clone this dataset
|
293
|
+
"""
|
294
|
+
return ImageDataset(self._raw_data)
|
156
295
|
|
157
296
|
class Doc(PageableData):
|
158
297
|
"""Initialized with pymudoc object."""
|
298
|
+
|
159
299
|
def __init__(self, doc: fitz.Page):
|
160
300
|
self._doc = doc
|
161
301
|
|
162
302
|
def get_image(self):
|
163
|
-
"""Return the
|
303
|
+
"""Return the image info.
|
164
304
|
|
165
305
|
Returns:
|
166
306
|
dict: {
|
@@ -192,3 +332,34 @@ class Doc(PageableData):
|
|
192
332
|
def __getattr__(self, name):
|
193
333
|
if hasattr(self._doc, name):
|
194
334
|
return getattr(self._doc, name)
|
335
|
+
|
336
|
+
def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
|
337
|
+
"""draw rectangle.
|
338
|
+
|
339
|
+
Args:
|
340
|
+
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
341
|
+
color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
|
342
|
+
fill (list[float] | None): fill the board with RGB, None means will not fill with color
|
343
|
+
fill_opacity (float): opacity of the fill, range from [0, 1]
|
344
|
+
width (float): the width of board
|
345
|
+
overlay (bool): fill the color in foreground or background. True means fill in background.
|
346
|
+
"""
|
347
|
+
self._doc.draw_rect(
|
348
|
+
rect_coords,
|
349
|
+
color=color,
|
350
|
+
fill=fill,
|
351
|
+
fill_opacity=fill_opacity,
|
352
|
+
width=width,
|
353
|
+
overlay=overlay,
|
354
|
+
)
|
355
|
+
|
356
|
+
def insert_text(self, coord, content, fontsize, color):
|
357
|
+
"""insert text.
|
358
|
+
|
359
|
+
Args:
|
360
|
+
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
361
|
+
content (str): the text content
|
362
|
+
fontsize (int): font size of the text
|
363
|
+
color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
|
364
|
+
"""
|
365
|
+
self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
|
magic_pdf/data/utils.py
CHANGED
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
|
20
20
|
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
21
21
|
pm = doc.get_pixmap(matrix=mat, alpha=False)
|
22
22
|
|
23
|
-
# If the width or height exceeds
|
24
|
-
if pm.width >
|
23
|
+
# If the width or height exceeds 4500 after scaling, do not scale further.
|
24
|
+
if pm.width > 4500 or pm.height > 4500:
|
25
25
|
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
26
26
|
|
27
27
|
img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
|
@@ -165,8 +165,8 @@ def merge_para_with_text(para_block):
|
|
165
165
|
if content:
|
166
166
|
langs = ['zh', 'ja', 'ko']
|
167
167
|
# logger.info(f'block_lang: {block_lang}, content: {content}')
|
168
|
-
if block_lang in langs: #
|
169
|
-
if j == len(line['spans']) - 1:
|
168
|
+
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
|
169
|
+
if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]:
|
170
170
|
para_text += content
|
171
171
|
else:
|
172
172
|
para_text += f'{content} '
|
magic_pdf/filter/__init__.py
CHANGED
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
from magic_pdf.config.drop_reason import DropReason
|
3
|
+
from magic_pdf.config.enums import SupportedPdfParseMethod
|
4
|
+
from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
|
5
|
+
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
|
6
|
+
|
7
|
+
|
8
|
+
def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
|
9
|
+
"""根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
|
10
|
+
pdf_meta = pdf_meta_scan(pdf_bytes)
|
11
|
+
if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
|
12
|
+
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
|
13
|
+
else:
|
14
|
+
is_encrypted = pdf_meta['is_encrypted']
|
15
|
+
is_needs_password = pdf_meta['is_needs_password']
|
16
|
+
if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
|
17
|
+
raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
|
18
|
+
else:
|
19
|
+
is_text_pdf, results = do_classify(
|
20
|
+
pdf_meta['total_page'],
|
21
|
+
pdf_meta['page_width_pts'],
|
22
|
+
pdf_meta['page_height_pts'],
|
23
|
+
pdf_meta['image_info_per_page'],
|
24
|
+
pdf_meta['text_len_per_page'],
|
25
|
+
pdf_meta['imgs_per_page'],
|
26
|
+
pdf_meta['text_layout_per_page'],
|
27
|
+
pdf_meta['invalid_chars'],
|
28
|
+
)
|
29
|
+
if is_text_pdf:
|
30
|
+
return SupportedPdfParseMethod.TXT
|
31
|
+
else:
|
32
|
+
return SupportedPdfParseMethod.OCR
|
@@ -8,7 +8,7 @@ from loguru import logger
|
|
8
8
|
from magic_pdf.config.drop_reason import DropReason
|
9
9
|
from magic_pdf.libs.commons import get_top_percent_list, mymax
|
10
10
|
from magic_pdf.libs.language import detect_lang
|
11
|
-
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
|
11
|
+
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars
|
12
12
|
|
13
13
|
scan_max_page = 50
|
14
14
|
junk_limit_min = 10
|
@@ -323,7 +323,8 @@ def get_language(doc: fitz.Document):
|
|
323
323
|
|
324
324
|
def check_invalid_chars(pdf_bytes):
|
325
325
|
"""乱码检测."""
|
326
|
-
return detect_invalid_chars_by_pymupdf(pdf_bytes)
|
326
|
+
# return detect_invalid_chars_by_pymupdf(pdf_bytes)
|
327
|
+
return detect_invalid_chars(pdf_bytes)
|
327
328
|
|
328
329
|
|
329
330
|
def pdf_meta_scan(pdf_bytes: bytes):
|
magic_pdf/libs/draw_bbox.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
import fitz
|
2
2
|
from magic_pdf.config.constants import CROSS_PAGE
|
3
|
-
from magic_pdf.config.ocr_content_type import BlockType, CategoryId,
|
4
|
-
|
3
|
+
from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
|
4
|
+
ContentType)
|
5
|
+
from magic_pdf.data.dataset import Dataset
|
5
6
|
from magic_pdf.model.magic_model import MagicModel
|
6
7
|
|
7
8
|
|
@@ -194,7 +195,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
194
195
|
)
|
195
196
|
|
196
197
|
# Save the PDF
|
197
|
-
pdf_docs.save(f'{out_path}/{filename}
|
198
|
+
pdf_docs.save(f'{out_path}/{filename}')
|
198
199
|
|
199
200
|
|
200
201
|
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
|
@@ -282,18 +283,17 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
282
283
|
draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
|
283
284
|
|
284
285
|
# Save the PDF
|
285
|
-
pdf_docs.save(f'{out_path}/{filename}
|
286
|
+
pdf_docs.save(f'{out_path}/{filename}')
|
286
287
|
|
287
288
|
|
288
|
-
def draw_model_bbox(model_list:
|
289
|
+
def draw_model_bbox(model_list, dataset: Dataset, out_path, filename):
|
289
290
|
dropped_bbox_list = []
|
290
291
|
tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
|
291
292
|
imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
|
292
293
|
titles_list = []
|
293
294
|
texts_list = []
|
294
295
|
interequations_list = []
|
295
|
-
|
296
|
-
magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
|
296
|
+
magic_model = MagicModel(model_list, dataset)
|
297
297
|
for i in range(len(model_list)):
|
298
298
|
page_dropped_list = []
|
299
299
|
tables_body, tables_caption, tables_footnote = [], [], []
|
@@ -337,7 +337,8 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
|
|
337
337
|
dropped_bbox_list.append(page_dropped_list)
|
338
338
|
imgs_footnote_list.append(imgs_footnote)
|
339
339
|
|
340
|
-
for i
|
340
|
+
for i in range(len(dataset)):
|
341
|
+
page = dataset.get_page(i)
|
341
342
|
draw_bbox_with_number(
|
342
343
|
i, dropped_bbox_list, page, [158, 158, 158], True
|
343
344
|
) # color !
|
@@ -352,7 +353,7 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
|
|
352
353
|
draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
|
353
354
|
|
354
355
|
# Save the PDF
|
355
|
-
|
356
|
+
dataset.dump_to_file(f'{out_path}/{filename}')
|
356
357
|
|
357
358
|
|
358
359
|
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
|
@@ -390,7 +391,7 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
390
391
|
for i, page in enumerate(pdf_docs):
|
391
392
|
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
|
392
393
|
|
393
|
-
pdf_docs.save(f'{out_path}/{filename}
|
394
|
+
pdf_docs.save(f'{out_path}/{filename}')
|
394
395
|
|
395
396
|
|
396
397
|
def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
|
magic_pdf/libs/pdf_check.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
import fitz
|
2
2
|
import numpy as np
|
3
3
|
from loguru import logger
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
import re
|
5
|
+
from io import BytesIO
|
6
|
+
from pdfminer.high_level import extract_text
|
7
7
|
|
8
8
|
|
9
9
|
def calculate_sample_count(total_page: int):
|
@@ -33,33 +33,33 @@ def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
|
|
33
33
|
return sample_docs
|
34
34
|
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
#
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
36
|
+
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
37
|
+
""""
|
38
|
+
检测PDF中是否包含非法字符
|
39
|
+
"""
|
40
|
+
'''pdfminer比较慢,需要先随机抽取10页左右的sample'''
|
41
|
+
sample_docs = extract_pages(src_pdf_bytes)
|
42
|
+
sample_pdf_bytes = sample_docs.tobytes()
|
43
|
+
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
44
|
+
text = extract_text(sample_pdf_file_like_object)
|
45
|
+
text = text.replace("\n", "")
|
46
|
+
# logger.info(text)
|
47
|
+
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
48
|
+
cid_pattern = re.compile(r'\(cid:\d+\)')
|
49
|
+
matches = cid_pattern.findall(text)
|
50
|
+
cid_count = len(matches)
|
51
|
+
cid_len = sum(len(match) for match in matches)
|
52
|
+
text_len = len(text)
|
53
|
+
if text_len == 0:
|
54
|
+
cid_chars_radio = 0
|
55
|
+
else:
|
56
|
+
cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
|
57
|
+
logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
|
58
|
+
'''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
|
59
|
+
if cid_chars_radio > 0.05:
|
60
|
+
return False # 乱码文档
|
61
|
+
else:
|
62
|
+
return True # 正常文档
|
63
63
|
|
64
64
|
|
65
65
|
def count_replacement_characters(text: str) -> int:
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.10.
|
1
|
+
__version__ = "0.10.6"
|
magic_pdf/model/__init__.py
CHANGED
@@ -1,2 +1,126 @@
|
|
1
|
+
from typing import Callable
|
2
|
+
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
|
5
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
6
|
+
from magic_pdf.data.dataset import Dataset
|
7
|
+
from magic_pdf.pipe.operators import PipeResult
|
8
|
+
|
9
|
+
|
1
10
|
__use_inside_model__ = True
|
2
11
|
__model_mode__ = "full"
|
12
|
+
|
13
|
+
|
14
|
+
class InferenceResultBase(ABC):
|
15
|
+
|
16
|
+
@abstractmethod
|
17
|
+
def __init__(self, inference_results: list, dataset: Dataset):
|
18
|
+
"""Initialized method.
|
19
|
+
|
20
|
+
Args:
|
21
|
+
inference_results (list): the inference result generated by model
|
22
|
+
dataset (Dataset): the dataset related with model inference result
|
23
|
+
"""
|
24
|
+
self._infer_res = inference_results
|
25
|
+
self._dataset = dataset
|
26
|
+
|
27
|
+
@abstractmethod
|
28
|
+
def draw_model(self, file_path: str) -> None:
|
29
|
+
"""Draw model inference result.
|
30
|
+
|
31
|
+
Args:
|
32
|
+
file_path (str): the output file path
|
33
|
+
"""
|
34
|
+
pass
|
35
|
+
|
36
|
+
@abstractmethod
|
37
|
+
def dump_model(self, writer: DataWriter, file_path: str):
|
38
|
+
"""Dump model inference result to file.
|
39
|
+
|
40
|
+
Args:
|
41
|
+
writer (DataWriter): writer handle
|
42
|
+
file_path (str): the location of target file
|
43
|
+
"""
|
44
|
+
pass
|
45
|
+
|
46
|
+
@abstractmethod
|
47
|
+
def get_infer_res(self):
|
48
|
+
"""Get the inference result.
|
49
|
+
|
50
|
+
Returns:
|
51
|
+
list: the inference result generated by model
|
52
|
+
"""
|
53
|
+
pass
|
54
|
+
|
55
|
+
@abstractmethod
|
56
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
57
|
+
"""Apply callable method which.
|
58
|
+
|
59
|
+
Args:
|
60
|
+
proc (Callable): invoke proc as follows:
|
61
|
+
proc(inference_result, *args, **kwargs)
|
62
|
+
|
63
|
+
Returns:
|
64
|
+
Any: return the result generated by proc
|
65
|
+
"""
|
66
|
+
pass
|
67
|
+
|
68
|
+
@abstractmethod
|
69
|
+
def pipe_auto_mode(
|
70
|
+
self,
|
71
|
+
imageWriter: DataWriter,
|
72
|
+
start_page_id=0,
|
73
|
+
end_page_id=None,
|
74
|
+
debug_mode=False,
|
75
|
+
lang=None,
|
76
|
+
) -> PipeResult:
|
77
|
+
"""Post-proc the model inference result.
|
78
|
+
step1: classify the dataset type
|
79
|
+
step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
|
80
|
+
|
81
|
+
Args:
|
82
|
+
imageWriter (DataWriter): the image writer handle
|
83
|
+
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
|
84
|
+
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
|
85
|
+
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
|
86
|
+
lang (str, optional): Defaults to None.
|
87
|
+
|
88
|
+
Returns:
|
89
|
+
PipeResult: the result
|
90
|
+
"""
|
91
|
+
pass
|
92
|
+
|
93
|
+
@abstractmethod
|
94
|
+
def pipe_txt_mode(
|
95
|
+
self,
|
96
|
+
imageWriter: DataWriter,
|
97
|
+
start_page_id=0,
|
98
|
+
end_page_id=None,
|
99
|
+
debug_mode=False,
|
100
|
+
lang=None,
|
101
|
+
) -> PipeResult:
|
102
|
+
"""Post-proc the model inference result, Extract the text using the
|
103
|
+
third library, such as `pymupdf`
|
104
|
+
|
105
|
+
Args:
|
106
|
+
imageWriter (DataWriter): the image writer handle
|
107
|
+
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
|
108
|
+
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
|
109
|
+
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
|
110
|
+
lang (str, optional): Defaults to None.
|
111
|
+
|
112
|
+
Returns:
|
113
|
+
PipeResult: the result
|
114
|
+
"""
|
115
|
+
pass
|
116
|
+
|
117
|
+
@abstractmethod
|
118
|
+
def pipe_ocr_mode(
|
119
|
+
self,
|
120
|
+
imageWriter: DataWriter,
|
121
|
+
start_page_id=0,
|
122
|
+
end_page_id=None,
|
123
|
+
debug_mode=False,
|
124
|
+
lang=None,
|
125
|
+
) -> PipeResult:
|
126
|
+
pass
|