magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +7 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +188 -5
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +16 -15
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +19 -22
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +35 -5
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +43 -7
- magic_pdf/model/sub_modules/model_utils.py +17 -5
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/operators/models.py +154 -0
- magic_pdf/operators/pipes.py +191 -0
- magic_pdf/pdf_parse_union_core_v2.py +77 -27
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +120 -61
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -23
- magic_pdf/pdf_parse_by_txt.py +0 -24
- magic_pdf/pipe/AbsPipe.py +0 -98
- magic_pdf/pipe/OCRPipe.py +0 -41
- magic_pdf/pipe/TXTPipe.py +0 -41
- magic_pdf/pipe/UNIPipe.py +0 -98
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -121
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
magic_pdf/config/constants.py
CHANGED
magic_pdf/config/exceptions.py
CHANGED
@@ -48,4 +48,16 @@ class DataWriter(ABC):
|
|
48
48
|
path (str): the target file where to write
|
49
49
|
data (str): the data want to write
|
50
50
|
"""
|
51
|
-
|
51
|
+
|
52
|
+
def safe_encode(data: str, method: str):
|
53
|
+
try:
|
54
|
+
bit_data = data.encode(encoding=method, errors='replace')
|
55
|
+
return bit_data, True
|
56
|
+
except: # noqa
|
57
|
+
return None, False
|
58
|
+
|
59
|
+
for method in ['utf-8', 'ascii']:
|
60
|
+
bit_data, flag = safe_encode(data, method)
|
61
|
+
if flag:
|
62
|
+
self.write(path, bit_data)
|
63
|
+
break
|
@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
|
|
55
55
|
if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
|
56
56
|
fn_path = os.path.join(self._parent_dir, path)
|
57
57
|
|
58
|
-
if not os.path.exists(os.path.dirname(fn_path)):
|
58
|
+
if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
|
59
59
|
os.makedirs(os.path.dirname(fn_path), exist_ok=True)
|
60
60
|
|
61
61
|
with open(fn_path, 'wb') as f:
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
|
2
2
|
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
|
3
3
|
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
|
4
4
|
from magic_pdf.data.io.s3 import S3Reader, S3Writer
|
@@ -22,10 +22,10 @@ class MultiS3Mixin:
|
|
22
22
|
"""
|
23
23
|
if len(default_prefix) == 0:
|
24
24
|
raise InvalidConfig('default_prefix must be provided')
|
25
|
-
|
26
|
-
arr = default_prefix.strip(
|
25
|
+
|
26
|
+
arr = default_prefix.strip('/').split('/')
|
27
27
|
self.default_bucket = arr[0]
|
28
|
-
self.default_prefix =
|
28
|
+
self.default_prefix = '/'.join(arr[1:])
|
29
29
|
|
30
30
|
found_default_bucket_config = False
|
31
31
|
for conf in s3_configs:
|
@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
|
|
103
103
|
s3_reader = self.__get_s3_client(bucket_name)
|
104
104
|
else:
|
105
105
|
s3_reader = self.__get_s3_client(self.default_bucket)
|
106
|
-
|
106
|
+
if self.default_prefix:
|
107
|
+
path = self.default_prefix + '/' + path
|
107
108
|
return s3_reader.read_at(path, offset, limit)
|
108
109
|
|
109
110
|
|
@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
|
|
139
140
|
s3_writer = self.__get_s3_client(bucket_name)
|
140
141
|
else:
|
141
142
|
s3_writer = self.__get_s3_client(self.default_bucket)
|
142
|
-
|
143
|
+
if self.default_prefix:
|
144
|
+
path = self.default_prefix + '/' + path
|
143
145
|
return s3_writer.write(path, data)
|
magic_pdf/data/dataset.py
CHANGED
@@ -1,11 +1,14 @@
|
|
1
|
+
import os
|
1
2
|
from abc import ABC, abstractmethod
|
2
|
-
from typing import Iterator
|
3
|
+
from typing import Callable, Iterator
|
3
4
|
|
4
5
|
import fitz
|
6
|
+
from loguru import logger
|
5
7
|
|
6
8
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
7
9
|
from magic_pdf.data.schemas import PageInfo
|
8
10
|
from magic_pdf.data.utils import fitz_doc_to_image
|
11
|
+
from magic_pdf.filter import classify
|
9
12
|
|
10
13
|
|
11
14
|
class PageableData(ABC):
|
@@ -28,6 +31,32 @@ class PageableData(ABC):
|
|
28
31
|
"""
|
29
32
|
pass
|
30
33
|
|
34
|
+
@abstractmethod
|
35
|
+
def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
|
36
|
+
"""draw rectangle.
|
37
|
+
|
38
|
+
Args:
|
39
|
+
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
40
|
+
color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
|
41
|
+
fill (list[float] | None): fill the board with RGB, None means will not fill with color
|
42
|
+
fill_opacity (float): opacity of the fill, range from [0, 1]
|
43
|
+
width (float): the width of board
|
44
|
+
overlay (bool): fill the color in foreground or background. True means fill in background.
|
45
|
+
"""
|
46
|
+
pass
|
47
|
+
|
48
|
+
@abstractmethod
|
49
|
+
def insert_text(self, coord, content, fontsize, color):
|
50
|
+
"""insert text.
|
51
|
+
|
52
|
+
Args:
|
53
|
+
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
54
|
+
content (str): the text content
|
55
|
+
fontsize (int): font size of the text
|
56
|
+
color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
|
57
|
+
"""
|
58
|
+
pass
|
59
|
+
|
31
60
|
|
32
61
|
class Dataset(ABC):
|
33
62
|
@abstractmethod
|
@@ -66,18 +95,65 @@ class Dataset(ABC):
|
|
66
95
|
"""
|
67
96
|
pass
|
68
97
|
|
98
|
+
@abstractmethod
|
99
|
+
def dump_to_file(self, file_path: str):
|
100
|
+
"""Dump the file
|
101
|
+
|
102
|
+
Args:
|
103
|
+
file_path (str): the file path
|
104
|
+
"""
|
105
|
+
pass
|
106
|
+
|
107
|
+
@abstractmethod
|
108
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
109
|
+
"""Apply callable method which.
|
110
|
+
|
111
|
+
Args:
|
112
|
+
proc (Callable): invoke proc as follows:
|
113
|
+
proc(self, *args, **kwargs)
|
114
|
+
|
115
|
+
Returns:
|
116
|
+
Any: return the result generated by proc
|
117
|
+
"""
|
118
|
+
pass
|
119
|
+
|
120
|
+
@abstractmethod
|
121
|
+
def classify(self) -> SupportedPdfParseMethod:
|
122
|
+
"""classify the dataset
|
123
|
+
|
124
|
+
Returns:
|
125
|
+
SupportedPdfParseMethod: _description_
|
126
|
+
"""
|
127
|
+
pass
|
128
|
+
|
129
|
+
@abstractmethod
|
130
|
+
def clone(self):
|
131
|
+
"""clone this dataset
|
132
|
+
"""
|
133
|
+
pass
|
134
|
+
|
69
135
|
|
70
136
|
class PymuDocDataset(Dataset):
|
71
|
-
def __init__(self, bits: bytes):
|
137
|
+
def __init__(self, bits: bytes, lang=None):
    """Wrap a pdf, given as raw bytes, into a dataset of per-page ``Doc`` records.

    Args:
        bits (bytes): the bytes of the pdf
        lang: language setting kept on the dataset. ``''`` is stored as ``None``,
            ``'auto'`` is replaced by the result of ``auto_detect_lang(bits)``,
            any other value (including ``None``) is stored unchanged.
    """
    opened = fitz.open('pdf', bits)
    self._raw_fitz = opened
    self._records = [Doc(page) for page in opened]
    self._data_bits = bits
    self._raw_data = bits

    if lang == 'auto':
        # The detector is imported only on this branch, not at module level.
        from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
        self._lang = auto_detect_lang(bits)
        logger.info(f"lang: {lang}, detect_lang: {self._lang}")
        return

    if lang == '':
        # An empty string means "no language configured"; nothing is logged.
        self._lang = None
        return

    self._lang = lang
    logger.info(f"lang: {lang}")
|
81
157
|
def __len__(self) -> int:
|
82
158
|
"""The page number of the pdf."""
|
83
159
|
return len(self._records)
|
@@ -109,6 +185,45 @@ class PymuDocDataset(Dataset):
|
|
109
185
|
"""
|
110
186
|
return self._records[page_id]
|
111
187
|
|
188
|
+
def dump_to_file(self, file_path: str):
    """Save the wrapped document to ``file_path``.

    The parent directory is created first, unless ``file_path`` has no
    directory part or that part is exactly ``.`` or ``..``.

    Args:
        file_path (str): the file path
    """
    parent = os.path.dirname(file_path)
    needs_mkdir = parent != '' and parent != '.' and parent != '..'
    if needs_mkdir:
        os.makedirs(parent, exist_ok=True)
    self._raw_fitz.save(file_path)
|
199
|
+
|
200
|
+
def apply(self, proc: Callable, *args, **kwargs):
    """Call ``proc`` with this dataset as its first argument.

    When the caller passes a ``lang`` keyword and the dataset holds a language
    of its own, the dataset's language replaces the caller's value.

    Args:
        proc (Callable): invoke proc as follows:
            proc(dataset, *args, **kwargs)

    Returns:
        Any: return the result generated by proc
    """
    if 'lang' not in kwargs or self._lang is None:
        return proc(self, *args, **kwargs)

    # The dataset's own language wins over the one supplied by the caller.
    forwarded = {**kwargs, 'lang': self._lang}
    return proc(self, *args, **forwarded)
|
213
|
+
|
214
|
+
def classify(self) -> SupportedPdfParseMethod:
|
215
|
+
"""classify the dataset
|
216
|
+
|
217
|
+
Returns:
|
218
|
+
SupportedPdfParseMethod: _description_
|
219
|
+
"""
|
220
|
+
return classify(self._data_bits)
|
221
|
+
|
222
|
+
def clone(self):
    """Create a new ``PymuDocDataset`` from the same raw pdf bytes.

    The language stored on this dataset is handed to the copy. Without it the
    copy would fall back to ``lang=None`` and lose a language that was passed
    explicitly or already resolved from ``'auto'``.

    Returns:
        PymuDocDataset: a separate dataset built from ``self._raw_data``
    """
    return PymuDocDataset(self._raw_data, lang=self._lang)
|
226
|
+
|
112
227
|
|
113
228
|
class ImageDataset(Dataset):
|
114
229
|
def __init__(self, bits: bytes):
|
@@ -118,7 +233,8 @@ class ImageDataset(Dataset):
|
|
118
233
|
bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
|
119
234
|
"""
|
120
235
|
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
|
121
|
-
self.
|
236
|
+
self._raw_fitz = fitz.open('pdf', pdf_bytes)
|
237
|
+
self._records = [Doc(v) for v in self._raw_fitz]
|
122
238
|
self._raw_data = bits
|
123
239
|
self._data_bits = pdf_bytes
|
124
240
|
|
@@ -153,14 +269,50 @@ class ImageDataset(Dataset):
|
|
153
269
|
"""
|
154
270
|
return self._records[page_id]
|
155
271
|
|
272
|
+
def dump_to_file(self, file_path: str):
|
273
|
+
"""Dump the file
|
274
|
+
|
275
|
+
Args:
|
276
|
+
file_path (str): the file path
|
277
|
+
"""
|
278
|
+
dir_name = os.path.dirname(file_path)
|
279
|
+
if dir_name not in ('', '.', '..'):
|
280
|
+
os.makedirs(dir_name, exist_ok=True)
|
281
|
+
self._raw_fitz.save(file_path)
|
282
|
+
|
283
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
284
|
+
"""Apply callable method which.
|
285
|
+
|
286
|
+
Args:
|
287
|
+
proc (Callable): invoke proc as follows:
|
288
|
+
proc(dataset, *args, **kwargs)
|
289
|
+
|
290
|
+
Returns:
|
291
|
+
Any: return the result generated by proc
|
292
|
+
"""
|
293
|
+
return proc(self, *args, **kwargs)
|
294
|
+
|
295
|
+
def classify(self) -> SupportedPdfParseMethod:
|
296
|
+
"""classify the dataset
|
297
|
+
|
298
|
+
Returns:
|
299
|
+
SupportedPdfParseMethod: _description_
|
300
|
+
"""
|
301
|
+
return SupportedPdfParseMethod.OCR
|
302
|
+
|
303
|
+
def clone(self):
|
304
|
+
"""clone this dataset
|
305
|
+
"""
|
306
|
+
return ImageDataset(self._raw_data)
|
156
307
|
|
157
308
|
class Doc(PageableData):
|
158
309
|
"""Initialized with pymudoc object."""
|
310
|
+
|
159
311
|
def __init__(self, doc: fitz.Page):
|
160
312
|
self._doc = doc
|
161
313
|
|
162
314
|
def get_image(self):
|
163
|
-
"""Return the
|
315
|
+
"""Return the image info.
|
164
316
|
|
165
317
|
Returns:
|
166
318
|
dict: {
|
@@ -192,3 +344,34 @@ class Doc(PageableData):
|
|
192
344
|
def __getattr__(self, name):
|
193
345
|
if hasattr(self._doc, name):
|
194
346
|
return getattr(self._doc, name)
|
347
|
+
|
348
|
+
def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
    """Draw a rectangle on the wrapped page.

    All arguments are forwarded unchanged to the wrapped page's ``draw_rect``.

    Args:
        rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
        color (list[float] | None): RGB of the border line, None means no border line
        fill (list[float] | None): RGB used to fill the rectangle, None means no fill
        fill_opacity (float): opacity of the fill, range from [0, 1]
        width (float): the width of the border line
        overlay (bool): draw in the foreground or in the background.
            NOTE(review): PyMuPDF documents True as foreground - confirm against the installed version.
    """
    drawing_options = {
        'color': color,
        'fill': fill,
        'fill_opacity': fill_opacity,
        'width': width,
        'overlay': overlay,
    }
    self._doc.draw_rect(rect_coords, **drawing_options)
|
367
|
+
|
368
|
+
def insert_text(self, coord, content, fontsize, color):
    """Insert text on the wrapped page.

    All arguments are forwarded unchanged to the wrapped page's ``insert_text``.

    Args:
        coord (list[float]): position handed to the page as its first argument.
            NOTE(review): PyMuPDF documents this as a point (x, y), not a four
            element rectangle - confirm against callers.
        content (str): the text content
        fontsize (int): font size of the text
        color (list[float] | None): RGB of the text, None will use the default font color
    """
    page = self._doc
    page.insert_text(coord, content, fontsize=fontsize, color=color)
|
magic_pdf/data/read_api.py
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
|
+
import tempfile
|
4
|
+
import shutil
|
3
5
|
from pathlib import Path
|
4
6
|
|
5
7
|
from magic_pdf.config.exceptions import EmptyData, InvalidParams
|
6
8
|
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
|
7
9
|
MultiBucketS3DataReader)
|
8
10
|
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
|
9
|
-
|
11
|
+
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
|
10
12
|
|
11
13
|
def read_jsonl(
|
12
14
|
s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
|
@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
|
|
58
60
|
list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
|
59
61
|
"""
|
60
62
|
if os.path.isdir(path):
|
61
|
-
reader = FileBasedDataReader(
|
62
|
-
|
63
|
-
|
64
|
-
for
|
65
|
-
|
63
|
+
reader = FileBasedDataReader()
|
64
|
+
ret = []
|
65
|
+
for root, _, files in os.walk(path):
|
66
|
+
for file in files:
|
67
|
+
suffix = file.split('.')
|
68
|
+
if suffix[-1] == 'pdf':
|
69
|
+
ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
|
70
|
+
return ret
|
66
71
|
else:
|
67
72
|
reader = FileBasedDataReader()
|
68
73
|
bits = reader.read(path)
|
69
74
|
return [PymuDocDataset(bits)]
|
70
75
|
|
76
|
+
def read_local_office(path: str) -> list[PymuDocDataset]:
    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.

    Every file is converted to pdf inside a temporary directory and the
    resulting pdf is loaded into a ``PymuDocDataset``. The temporary directory
    is removed before the function exits, including when conversion or
    reading raises.

    Args:
        path (str): ms-office file or directory that contains ms-office files.
            A directory is walked recursively and filtered by suffix; a path
            that is not a directory is converted as-is, without a suffix check.

    Returns:
        list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset

    Raises:
        ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
        FileNotFoundError: File not Found
        Exception: any other error raised while converting or reading is propagated unchanged
    """
    suffixes = {'.ppt', '.pptx', '.doc', '.docx'}
    fns = []
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            for file in files:
                if Path(file).suffix in suffixes:
                    fns.append(os.path.join(root, file))
    else:
        fns.append(path)

    reader = FileBasedDataReader()
    ret = []
    temp_dir = tempfile.mkdtemp()
    try:
        for fn in fns:
            # Errors are not caught here: callers receive the original exception.
            convert_file_to_pdf(fn, temp_dir)
            pdf_fn = os.path.join(temp_dir, f'{Path(fn).stem}.pdf')
            ret.append(PymuDocDataset(reader.read(pdf_fn)))
    finally:
        # Runs on success and on failure, so converted files never pile up on disk.
        shutil.rmtree(temp_dir)
    return ret
|
118
|
+
|
119
|
+
def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
|
73
120
|
"""Read images from path or directory.
|
74
121
|
|
75
122
|
Args:
|
76
123
|
path (str): image file path or directory that contains image files
|
77
|
-
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
|
124
|
+
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
|
78
125
|
|
79
126
|
Returns:
|
80
127
|
list[ImageDataset]: each image file will converted to a ImageDataset
|
@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
|
|
82
129
|
if os.path.isdir(path):
|
83
130
|
imgs_bits = []
|
84
131
|
s_suffixes = set(suffixes)
|
85
|
-
reader = FileBasedDataReader(
|
132
|
+
reader = FileBasedDataReader()
|
86
133
|
for root, _, files in os.walk(path):
|
87
134
|
for file in files:
|
88
|
-
suffix = file.
|
89
|
-
if suffix
|
90
|
-
imgs_bits.append(reader.read(file))
|
135
|
+
suffix = Path(file).suffix
|
136
|
+
if suffix in s_suffixes:
|
137
|
+
imgs_bits.append(reader.read(os.path.join(root, file)))
|
91
138
|
return [ImageDataset(bits) for bits in imgs_bits]
|
92
139
|
else:
|
93
140
|
reader = FileBasedDataReader()
|
magic_pdf/data/utils.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
|
2
2
|
import fitz
|
3
3
|
import numpy as np
|
4
|
+
from loguru import logger
|
4
5
|
|
5
6
|
from magic_pdf.utils.annotations import ImportPIL
|
6
7
|
|
@@ -30,3 +31,37 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
|
30
31
|
img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
|
31
32
|
|
32
33
|
return img_dict
|
34
|
+
|
35
|
+
@ImportPIL
def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
    """Render the pages of a pdf into image dictionaries.

    The returned list has one entry per page of the pdf. Pages inside
    ``[start_page_id, end_page_id]`` carry the rendered image; pages outside
    that range carry an empty placeholder.

    Args:
        pdf_bytes (bytes): the bytes of the pdf
        dpi (int): rendering resolution, applied as a ``dpi / 72`` zoom on both axes
        start_page_id (int): index of the first page to render
        end_page_id (int | None): index of the last page to render. ``None`` or a
            negative value selects the last page; a value past the last page is
            lowered to it and a warning is logged.

    Returns:
        list: dictionaries with the keys ``img``, ``width`` and ``height``
    """
    from PIL import Image

    rendered = []
    with fitz.open('pdf', pdf_bytes) as pdf:
        last_page = pdf.page_count - 1
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page
        if end_page_id > last_page:
            logger.warning('end_page_id is out of range, use images length')
            end_page_id = last_page

        for page_no in range(pdf.page_count):
            if page_no < start_page_id or page_no > end_page_id:
                # Placeholder keeps list positions aligned with page numbers.
                rendered.append({'img': [], 'width': 0, 'height': 0})
                continue

            page = pdf[page_no]
            zoom = dpi / 72
            pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

            # If either side exceeds 4500 after scaling, render again without scaling.
            if pixmap.width > 4500 or pixmap.height > 4500:
                pixmap = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

            width, height = pixmap.width, pixmap.height
            pil_image = Image.frombytes('RGB', (width, height), pixmap.samples)
            rendered.append({'img': np.array(pil_image), 'width': width, 'height': height})
    return rendered
|
@@ -7,7 +7,7 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
|
7
7
|
from magic_pdf.libs.commons import join_path
|
8
8
|
from magic_pdf.libs.language import detect_lang
|
9
9
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
10
|
-
from magic_pdf.
|
10
|
+
from magic_pdf.post_proc.para_split_v3 import ListLineTag
|
11
11
|
|
12
12
|
|
13
13
|
def __is_hyphen_at_line_end(line):
|
@@ -61,7 +61,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
61
61
|
if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
|
62
62
|
para_text = merge_para_with_text(para_block)
|
63
63
|
elif para_type == BlockType.Title:
|
64
|
-
|
64
|
+
title_level = get_title_level(para_block)
|
65
|
+
para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
|
65
66
|
elif para_type == BlockType.InterlineEquation:
|
66
67
|
para_text = merge_para_with_text(para_block)
|
67
68
|
elif para_type == BlockType.Image:
|
@@ -125,16 +126,6 @@ def detect_language(text):
|
|
125
126
|
return 'empty'
|
126
127
|
|
127
128
|
|
128
|
-
# 连写字符拆分
|
129
|
-
def __replace_ligatures(text: str):
|
130
|
-
text = re.sub(r'fi', 'fi', text) # 替换 fi 连写符
|
131
|
-
text = re.sub(r'fl', 'fl', text) # 替换 fl 连写符
|
132
|
-
text = re.sub(r'ff', 'ff', text) # 替换 ff 连写符
|
133
|
-
text = re.sub(r'ffi', 'ffi', text) # 替换 ffi 连写符
|
134
|
-
text = re.sub(r'ffl', 'ffl', text) # 替换 ffl 连写符
|
135
|
-
return text
|
136
|
-
|
137
|
-
|
138
129
|
def merge_para_with_text(para_block):
|
139
130
|
block_text = ''
|
140
131
|
for line in para_block['lines']:
|
@@ -165,8 +156,8 @@ def merge_para_with_text(para_block):
|
|
165
156
|
if content:
|
166
157
|
langs = ['zh', 'ja', 'ko']
|
167
158
|
# logger.info(f'block_lang: {block_lang}, content: {content}')
|
168
|
-
if block_lang in langs: #
|
169
|
-
if j == len(line['spans']) - 1:
|
159
|
+
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
|
160
|
+
if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]:
|
170
161
|
para_text += content
|
171
162
|
else:
|
172
163
|
para_text += f'{content} '
|
@@ -196,10 +187,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
|
|
196
187
|
'text': merge_para_with_text(para_block),
|
197
188
|
}
|
198
189
|
elif para_type == BlockType.Title:
|
190
|
+
title_level = get_title_level(para_block)
|
199
191
|
para_content = {
|
200
192
|
'type': 'text',
|
201
193
|
'text': merge_para_with_text(para_block),
|
202
|
-
'text_level':
|
194
|
+
'text_level': title_level,
|
203
195
|
}
|
204
196
|
elif para_type == BlockType.InterlineEquation:
|
205
197
|
para_content = {
|
@@ -299,3 +291,12 @@ def union_make(pdf_info_dict: list,
|
|
299
291
|
return '\n\n'.join(output_content)
|
300
292
|
elif make_mode == MakeMode.STANDARD_FORMAT:
|
301
293
|
return output_content
|
294
|
+
|
295
|
+
|
296
|
+
def get_title_level(block):
    """Return the heading level of a title block, limited to the range 1..4.

    Args:
        block (dict): title block; its optional ``'level'`` entry holds the heading level

    Returns:
        The level clamped to [1, 4]; 1 when ``'level'`` is missing or ``None``.
    """
    title_level = block.get('level')
    if title_level is None:
        # A present-but-None level would make the comparisons below raise TypeError.
        return 1
    if title_level > 4:
        return 4
    if title_level < 1:
        return 1
    return title_level
|
magic_pdf/filter/__init__.py
CHANGED
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
from magic_pdf.config.drop_reason import DropReason
|
3
|
+
from magic_pdf.config.enums import SupportedPdfParseMethod
|
4
|
+
from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
|
5
|
+
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
|
6
|
+
|
7
|
+
|
8
|
+
def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
    """Decide from the pdf's metadata whether it is a text pdf or an OCR pdf.

    Args:
        pdf_bytes (bytes): the bytes of the pdf

    Returns:
        SupportedPdfParseMethod: ``TXT`` when the classifier reports a text pdf, ``OCR`` otherwise

    Raises:
        Exception: the metadata scan flagged the pdf to be dropped, or the pdf
            is encrypted / needs a password
    """
    pdf_meta = pdf_meta_scan(pdf_bytes)

    # The scan returned a drop flag: stop with the reported reason.
    if pdf_meta.get('_need_drop', False):
        raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")

    # Encrypted or password-protected pdfs are not processed.
    is_encrypted = pdf_meta['is_encrypted']
    is_needs_password = pdf_meta['is_needs_password']
    if is_encrypted or is_needs_password:
        raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')

    # Metadata entries handed to the classifier, in positional order.
    classifier_inputs = (
        'total_page',
        'page_width_pts',
        'page_height_pts',
        'image_info_per_page',
        'text_len_per_page',
        'imgs_per_page',
        'text_layout_per_page',
        'invalid_chars',
    )
    is_text_pdf, _ = do_classify(*[pdf_meta[key] for key in classifier_inputs])
    return SupportedPdfParseMethod.TXT if is_text_pdf else SupportedPdfParseMethod.OCR
|
@@ -8,7 +8,7 @@ from loguru import logger
|
|
8
8
|
from magic_pdf.config.drop_reason import DropReason
|
9
9
|
from magic_pdf.libs.commons import get_top_percent_list, mymax
|
10
10
|
from magic_pdf.libs.language import detect_lang
|
11
|
-
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
|
11
|
+
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars
|
12
12
|
|
13
13
|
scan_max_page = 50
|
14
14
|
junk_limit_min = 10
|
@@ -323,7 +323,8 @@ def get_language(doc: fitz.Document):
|
|
323
323
|
|
324
324
|
def check_invalid_chars(pdf_bytes):
|
325
325
|
"""乱码检测."""
|
326
|
-
return detect_invalid_chars_by_pymupdf(pdf_bytes)
|
326
|
+
# return detect_invalid_chars_by_pymupdf(pdf_bytes)
|
327
|
+
return detect_invalid_chars(pdf_bytes)
|
327
328
|
|
328
329
|
|
329
330
|
def pdf_meta_scan(pdf_bytes: bytes):
|
magic_pdf/libs/clean_memory.py
CHANGED
@@ -3,8 +3,15 @@ import torch
|
|
3
3
|
import gc
|
4
4
|
|
5
5
|
|
6
|
-
def clean_memory():
|
7
|
-
if
|
8
|
-
torch.cuda.
|
9
|
-
|
6
|
+
def clean_memory(device='cuda'):
    """Release cached accelerator memory for ``device``, then run the garbage collector.

    The device is matched by the prefix of its string form, so indexed names
    such as ``'cuda:0'`` select the same branch as the bare name.

    Args:
        device: device name, e.g. ``'cuda'``, ``'cuda:0'``, ``'npu'``, ``'mps'``.
            Any other value only triggers ``gc.collect()``.
    """
    device_name = str(device)
    if device_name.startswith('cuda'):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    elif device_name.startswith('npu'):
        # torch_npu is imported only on this branch, not at module level.
        import torch_npu
        if torch_npu.npu.is_available():
            torch_npu.npu.empty_cache()
    elif device_name.startswith('mps'):
        torch.mps.empty_cache()
    gc.collect()
|
magic_pdf/libs/config_reader.py
CHANGED
@@ -116,6 +116,15 @@ def get_formula_config():
|
|
116
116
|
else:
|
117
117
|
return formula_config
|
118
118
|
|
119
|
+
def get_llm_aided_config():
    """Return the ``'llm-aided-config'`` entry of the configuration.

    Returns:
        The configured value, or ``None`` when the entry is absent (a warning
        is logged in that case).
    """
    llm_aided_config = read_config().get('llm-aided-config')
    if llm_aided_config is not None:
        return llm_aided_config

    logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
    return None
|
127
|
+
|
119
128
|
|
120
129
|
if __name__ == '__main__':
|
121
130
|
ak, sk, endpoint = get_s3_config('llm-raw')
|