magic-pdf 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/__init__.py +0 -0
- magic_pdf/config/enums.py +7 -0
- magic_pdf/config/exceptions.py +32 -0
- magic_pdf/data/__init__.py +0 -0
- magic_pdf/data/data_reader_writer/__init__.py +12 -0
- magic_pdf/data/data_reader_writer/base.py +51 -0
- magic_pdf/data/data_reader_writer/filebase.py +59 -0
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
- magic_pdf/data/data_reader_writer/s3.py +69 -0
- magic_pdf/data/dataset.py +194 -0
- magic_pdf/data/io/__init__.py +0 -0
- magic_pdf/data/io/base.py +42 -0
- magic_pdf/data/io/http.py +37 -0
- magic_pdf/data/io/s3.py +114 -0
- magic_pdf/data/read_api.py +95 -0
- magic_pdf/data/schemas.py +15 -0
- magic_pdf/data/utils.py +32 -0
- magic_pdf/dict2md/ocr_mkcontent.py +74 -234
- magic_pdf/libs/Constants.py +21 -8
- magic_pdf/libs/MakeContentConfig.py +1 -0
- magic_pdf/libs/boxbase.py +54 -0
- magic_pdf/libs/clean_memory.py +10 -0
- magic_pdf/libs/config_reader.py +53 -23
- magic_pdf/libs/draw_bbox.py +150 -65
- magic_pdf/libs/ocr_content_type.py +2 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
- magic_pdf/model/magic_model.py +418 -51
- magic_pdf/model/pdf_extract_kit.py +164 -80
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
- magic_pdf/model/ppTableModel.py +2 -2
- magic_pdf/model/pp_structure_v2.py +5 -2
- magic_pdf/model/v3/__init__.py +0 -0
- magic_pdf/model/v3/helpers.py +125 -0
- magic_pdf/para/para_split_v3.py +296 -0
- magic_pdf/pdf_parse_by_ocr.py +6 -3
- magic_pdf/pdf_parse_by_txt.py +6 -3
- magic_pdf/pdf_parse_union_core_v2.py +644 -0
- magic_pdf/pipe/AbsPipe.py +5 -1
- magic_pdf/pipe/OCRPipe.py +10 -4
- magic_pdf/pipe/TXTPipe.py +10 -4
- magic_pdf/pipe/UNIPipe.py +16 -7
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
- magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
- magic_pdf/resources/model_config/model_configs.yaml +5 -13
- magic_pdf/tools/cli.py +14 -1
- magic_pdf/tools/common.py +19 -9
- magic_pdf/user_api.py +25 -6
- magic_pdf/utils/__init__.py +0 -0
- magic_pdf/utils/annotations.py +11 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
- magic_pdf-0.9.0.dist-info/METADATA +507 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
- magic_pdf-0.8.0.dist-info/METADATA +0 -459
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
magic_pdf/data/io/s3.py
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
import boto3
|
2
|
+
from botocore.config import Config
|
3
|
+
|
4
|
+
from magic_pdf.data.io.base import IOReader, IOWriter
|
5
|
+
|
6
|
+
|
7
|
+
class S3Reader(IOReader):
    """Reader that fetches object bytes from a single S3 bucket."""

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 reader client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={'addressing_style': addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def read(self, key: str) -> bytes:
        """Read the whole object.

        Args:
            key (str): object key inside the configured bucket

        Returns:
            bytes: the content of the object
        """
        return self.read_at(key)

    def read_at(self, key: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read at offset and limit.

        Args:
            key (str): object key inside the configured bucket
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the length of bytes want to read. Any
                negative value means "read until the end". Defaults to -1.

        Returns:
            bytes: the requested slice of the object
        """
        if limit == 0:
            # A zero-length read would otherwise produce the malformed header
            # 'bytes=N-(N-1)'; there is nothing to fetch, so skip the request.
            return b''

        request = {'Bucket': self._bucket, 'Key': key}
        if limit > 0:
            # HTTP byte ranges are inclusive on both ends.
            request['Range'] = f'bytes={offset}-{offset + limit - 1}'
        elif offset != 0:
            request['Range'] = f'bytes={offset}-'
        # else: whole-object read. No Range header is sent, because a
        # 'bytes=0-' range cannot be satisfied for a zero-length object.

        res = self._s3_client.get_object(**request)
        return res['Body'].read()
|
72
|
+
|
73
|
+
|
74
|
+
class S3Writer(IOWriter):
    """Writer that uploads byte payloads into a single S3 bucket."""

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 writer client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk

        # Client-side behaviour: addressing style plus a bounded retry policy.
        client_config = Config(
            s3={'addressing_style': addressing_style},
            retries={'max_attempts': 5, 'mode': 'standard'},
        )
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=client_config,
        )

    def write(self, key: str, data: bytes):
        """Write data to the object stored under key.

        Args:
            key (str): object key inside the configured bucket
            data (bytes): the data want to write
        """
        self._s3_client.put_object(Body=data, Bucket=self._bucket, Key=key)
|
@@ -0,0 +1,95 @@
|
|
1
|
+
import json
|
2
|
+
import os
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
from magic_pdf.config.exceptions import EmptyData, InvalidParams
|
6
|
+
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
|
7
|
+
MultiBucketS3DataReader)
|
8
|
+
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
|
9
|
+
|
10
|
+
|
11
|
+
def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
    """Read the jsonl file and return the list of PymuDocDataset.

    Every non-blank line of the jsonl file must be a JSON object whose
    'file_location' (preferred) or 'path' entry points at a pdf file.

    Args:
        s3_path_or_local (str): local file or s3 path
        s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None.

    Raises:
        InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided.
        EmptyData: if no pdf file location is provided in some line of jsonl file.
        InvalidParams: if the file location is s3 path but s3_client is not provided

    Returns:
        list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset
    """

    def _read_bits(location: str) -> bytes:
        # Dispatch on the scheme: 's3://' goes through the caller's client,
        # everything else is read from the local filesystem.
        if location.startswith('s3://'):
            if s3_client is None:
                raise InvalidParams('s3_client is required when s3_path is provided')
            return s3_client.read(location)
        return FileBasedDataReader('').read(location)

    jsonl_bits = _read_bits(s3_path_or_local)
    records = [
        json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
    ]

    bits_arr = []
    # Process every record; the previous `[:5]` slice silently dropped
    # everything after the fifth line.
    for record in records:
        pdf_path = record.get('file_location', '') or record.get('path', '')
        # `not pdf_path` also covers an explicit JSON null, which used to
        # crash with TypeError in len(None).
        if not pdf_path:
            raise EmptyData('pdf file location is empty')
        bits_arr.append(_read_bits(pdf_path))
    return [PymuDocDataset(bits) for bits in bits_arr]
|
49
|
+
|
50
|
+
|
51
|
+
def read_local_pdfs(path: str) -> list[PymuDocDataset]:
    """Load one pdf file, or every '*.pdf' directly inside a directory.

    Args:
        path (str): pdf file path or directory that contains pdf files

    Returns:
        list[PymuDocDataset]: one PymuDocDataset per pdf file that was read
    """
    # Single-file case: wrap the lone document in a one-element list.
    if not os.path.isdir(path):
        return [PymuDocDataset(FileBasedDataReader().read(path))]

    # Directory case: only direct children matching '*.pdf' are picked up
    # (the glob is not recursive); they are read by name through a reader
    # constructed on the directory.
    dir_reader = FileBasedDataReader(path)
    datasets = []
    for pdf_file in Path(path).glob('*.pdf'):
        datasets.append(PymuDocDataset(dir_reader.read(pdf_file.name)))
    return datasets
|
70
|
+
|
71
|
+
|
72
|
+
def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
    """Read images from path or directory.

    When path is a directory it is walked recursively and every file whose
    extension (text after the last '.') is listed in suffixes is loaded.
    When path is a file it is loaded unconditionally; suffixes is ignored.

    Args:
        path (str): image file path or directory that contains image files
        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']

    Returns:
        list[ImageDataset]: each image file will converted to a ImageDataset
    """
    if not os.path.isdir(path):
        return [ImageDataset(FileBasedDataReader().read(path))]

    wanted = set(suffixes)
    reader = FileBasedDataReader(path)
    imgs_bits = []
    for root, _, files in os.walk(path):
        for file in files:
            # Require a real extension so a file literally named 'png'
            # is not mistaken for an image.
            _, dot, ext = file.rpartition('.')
            if not dot or ext not in wanted:
                continue
            # os.walk descends into sub-directories, so the bare file name is
            # not enough: hand the reader the location relative to `path`
            # (the reader was constructed on `path`; presumably it resolves
            # relative names against it, as read_local_pdfs relies on).
            rel_path = os.path.relpath(os.path.join(root, file), path)
            imgs_bits.append(reader.read(rel_path))
    return [ImageDataset(bits) for bits in imgs_bits]
|
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
from pydantic import BaseModel, Field
|
3
|
+
|
4
|
+
|
5
|
+
class S3Config(BaseModel):
    # Connection settings for one S3 bucket. Every field is a string that
    # must contain at least one character; only addressing_style has a
    # default ('auto').
    bucket_name: str = Field(min_length=1, description='s3 bucket name')
    access_key: str = Field(min_length=1, description='s3 access key')
    secret_key: str = Field(min_length=1, description='s3 secret key')
    endpoint_url: str = Field(min_length=1, description='s3 endpoint url')
    addressing_style: str = Field(
        default='auto',
        min_length=1,
        description='s3 addressing style',
    )
|
11
|
+
|
12
|
+
|
13
|
+
class PageInfo(BaseModel):
    # Dimensions of a single page; both fields are required floats.
    w: float = Field(
        description='the width of page',
    )
    h: float = Field(
        description='the height of page',
    )
|
magic_pdf/data/utils.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
import fitz
|
3
|
+
import numpy as np
|
4
|
+
|
5
|
+
from magic_pdf.utils.annotations import ImportPIL
|
6
|
+
|
7
|
+
|
8
|
+
@ImportPIL
def fitz_doc_to_image(doc, dpi=200) -> dict:
    """Render a fitz page to an RGB image and return it as a numpy array.

    Args:
        doc: object exposing ``get_pixmap`` (a pymupdf page)
        dpi (int, optional): target resolution; the page is scaled by
            ``dpi / 72`` on both axes. Defaults to 200.

    Returns:
        dict: {'img': numpy array, 'width': width, 'height': height }
    """
    from PIL import Image

    zoom = dpi / 72
    pixmap = doc.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # If either side exceeds 9000 pixels after scaling, render again
    # unscaled (identity matrix) instead of using the scaled pixmap.
    if max(pixmap.width, pixmap.height) > 9000:
        pixmap = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    size = (pixmap.width, pixmap.height)
    pixels = np.array(Image.frombytes('RGB', size, pixmap.samples))

    return {'img': pixels, 'width': pixmap.width, 'height': pixmap.height}
|