magic-pdf 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/__init__.py +0 -0
- magic_pdf/config/enums.py +7 -0
- magic_pdf/config/exceptions.py +32 -0
- magic_pdf/data/__init__.py +0 -0
- magic_pdf/data/data_reader_writer/__init__.py +12 -0
- magic_pdf/data/data_reader_writer/base.py +51 -0
- magic_pdf/data/data_reader_writer/filebase.py +59 -0
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
- magic_pdf/data/data_reader_writer/s3.py +69 -0
- magic_pdf/data/dataset.py +194 -0
- magic_pdf/data/io/__init__.py +0 -0
- magic_pdf/data/io/base.py +42 -0
- magic_pdf/data/io/http.py +37 -0
- magic_pdf/data/io/s3.py +114 -0
- magic_pdf/data/read_api.py +95 -0
- magic_pdf/data/schemas.py +15 -0
- magic_pdf/data/utils.py +32 -0
- magic_pdf/dict2md/ocr_mkcontent.py +74 -234
- magic_pdf/libs/Constants.py +21 -8
- magic_pdf/libs/MakeContentConfig.py +1 -0
- magic_pdf/libs/boxbase.py +54 -0
- magic_pdf/libs/clean_memory.py +10 -0
- magic_pdf/libs/config_reader.py +53 -23
- magic_pdf/libs/draw_bbox.py +150 -65
- magic_pdf/libs/ocr_content_type.py +2 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
- magic_pdf/model/magic_model.py +418 -51
- magic_pdf/model/pdf_extract_kit.py +164 -80
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
- magic_pdf/model/ppTableModel.py +2 -2
- magic_pdf/model/pp_structure_v2.py +5 -2
- magic_pdf/model/v3/__init__.py +0 -0
- magic_pdf/model/v3/helpers.py +125 -0
- magic_pdf/para/para_split_v3.py +296 -0
- magic_pdf/pdf_parse_by_ocr.py +6 -3
- magic_pdf/pdf_parse_by_txt.py +6 -3
- magic_pdf/pdf_parse_union_core_v2.py +644 -0
- magic_pdf/pipe/AbsPipe.py +5 -1
- magic_pdf/pipe/OCRPipe.py +10 -4
- magic_pdf/pipe/TXTPipe.py +10 -4
- magic_pdf/pipe/UNIPipe.py +16 -7
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
- magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
- magic_pdf/resources/model_config/model_configs.yaml +5 -13
- magic_pdf/tools/cli.py +14 -1
- magic_pdf/tools/common.py +19 -9
- magic_pdf/user_api.py +25 -6
- magic_pdf/utils/__init__.py +0 -0
- magic_pdf/utils/annotations.py +11 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
- magic_pdf-0.9.0.dist-info/METADATA +507 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
- magic_pdf-0.8.0.dist-info/METADATA +0 -459
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
magic_pdf/config/__init__.py: File without changes
magic_pdf/config/exceptions.py
@@ -0,0 +1,32 @@
+
+class FileNotExisted(Exception):
+
+    def __init__(self, path):
+        self.path = path
+
+    def __str__(self):
+        return f'File {self.path} does not exist.'
+
+
+class InvalidConfig(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str__(self):
+        return f'Invalid config: {self.msg}'
+
+
+class InvalidParams(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str__(self):
+        return f'Invalid params: {self.msg}'
+
+
+class EmptyData(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str__(self):
+        return f'Empty data: {self.msg}'
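For context, a minimal sketch (not taken from the package) of how the new exception types behave; the messages follow the `__str__` implementations shown above, and the arguments are illustrative:

```python
# Illustrative only: raising and reading the new magic_pdf exception types.
from magic_pdf.config.exceptions import FileNotExisted, InvalidParams

try:
    raise FileNotExisted('/tmp/missing.pdf')
except FileNotExisted as e:
    print(e)  # File /tmp/missing.pdf does not exist.

try:
    raise InvalidParams('offset must be >= 0')
except InvalidParams as e:
    print(e)  # Invalid params: offset must be >= 0
```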
magic_pdf/data/__init__.py: File without changes
magic_pdf/data/data_reader_writer/__init__.py
@@ -0,0 +1,12 @@
+from magic_pdf.data.data_reader_writer.filebase import \
+    FileBasedDataReader  # noqa: F401
+from magic_pdf.data.data_reader_writer.filebase import \
+    FileBasedDataWriter  # noqa: F401
+from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
+    MultiBucketS3DataReader  # noqa: F401
+from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
+    MultiBucketS3DataWriter  # noqa: F401
+from magic_pdf.data.data_reader_writer.s3 import S3DataReader  # noqa: F401
+from magic_pdf.data.data_reader_writer.s3 import S3DataWriter  # noqa: F401
+from magic_pdf.data.data_reader_writer.base import DataReader  # noqa: F401
+from magic_pdf.data.data_reader_writer.base import DataWriter  # noqa: F401
magic_pdf/data/data_reader_writer/base.py
@@ -0,0 +1,51 @@
+
+from abc import ABC, abstractmethod
+
+
+class DataReader(ABC):
+
+    def read(self, path: str) -> bytes:
+        """Read the file.
+
+        Args:
+            path (str): file path to read
+
+        Returns:
+            bytes: the content of the file
+        """
+        return self.read_at(path)
+
+    @abstractmethod
+    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+        """Read the file at offset and limit.
+
+        Args:
+            path (str): the file path
+            offset (int, optional): the number of bytes skipped. Defaults to 0.
+            limit (int, optional): the length of bytes want to read. Defaults to -1.
+
+        Returns:
+            bytes: the content of the file
+        """
+        pass
+
+
+class DataWriter(ABC):
+    @abstractmethod
+    def write(self, path: str, data: bytes) -> None:
+        """Write the data to the file.
+
+        Args:
+            path (str): the target file where to write
+            data (bytes): the data want to write
+        """
+        pass
+
+    def write_string(self, path: str, data: str) -> None:
+        """Write the data to file, the data will be encoded to bytes.
+
+        Args:
+            path (str): the target file where to write
+            data (str): the data want to write
+        """
+        self.write(path, data.encode())
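`DataReader`/`DataWriter` define the abstract contract that the concrete backends added in this release (file-based, S3, multi-bucket S3) implement. A minimal sketch, assuming only the interface above and not part of the package, of a custom in-memory implementation such as one might use in tests:

```python
# Hypothetical in-memory backend built on the DataReader/DataWriter contract.
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter


class MemoryStore(DataReader, DataWriter):
    def __init__(self):
        self._blobs: dict[str, bytes] = {}

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        # Slice the stored blob according to offset/limit, mirroring the base semantics.
        data = self._blobs[path][offset:]
        return data if limit == -1 else data[:limit]

    def write(self, path: str, data: bytes) -> None:
        self._blobs[path] = data


store = MemoryStore()
store.write_string('a.txt', 'hello')     # write_string is inherited from DataWriter
assert store.read('a.txt') == b'hello'   # read delegates to read_at
```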
magic_pdf/data/data_reader_writer/filebase.py
@@ -0,0 +1,59 @@
+import os
+
+from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
+
+
+class FileBasedDataReader(DataReader):
+    def __init__(self, parent_dir: str = ''):
+        """Initialized with parent_dir.
+
+        Args:
+            parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''.
+        """
+        self._parent_dir = parent_dir
+
+    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+        """Read at offset and limit.
+
+        Args:
+            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
+            offset (int, optional): the number of bytes skipped. Defaults to 0.
+            limit (int, optional): the length of bytes want to read. Defaults to -1.
+
+        Returns:
+            bytes: the content of file
+        """
+        fn_path = path
+        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
+            fn_path = os.path.join(self._parent_dir, path)
+
+        with open(fn_path, 'rb') as f:
+            f.seek(offset)
+            if limit == -1:
+                return f.read()
+            else:
+                return f.read(limit)
+
+
+class FileBasedDataWriter(DataWriter):
+    def __init__(self, parent_dir: str = '') -> None:
+        """Initialized with parent_dir.
+
+        Args:
+            parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''.
+        """
+        self._parent_dir = parent_dir
+
+    def write(self, path: str, data: bytes) -> None:
+        """Write file with data.
+
+        Args:
+            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
+            data (bytes): the data want to write
+        """
+        fn_path = path
+        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
+            fn_path = os.path.join(self._parent_dir, path)
+
+        with open(fn_path, 'wb') as f:
+            f.write(data)
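A hedged usage sketch of the new file-based reader/writer; the directory and file names are placeholders, not from the package:

```python
# Hypothetical usage of FileBasedDataReader/FileBasedDataWriter.
import os

from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter

os.makedirs('/tmp/magic_pdf_out', exist_ok=True)  # the writer does not create missing directories

writer = FileBasedDataWriter('/tmp/magic_pdf_out')     # relative paths are joined with this parent dir
writer.write_string('hello.txt', 'hello magic-pdf')

reader = FileBasedDataReader('/tmp/magic_pdf_out')
print(reader.read('hello.txt'))                        # b'hello magic-pdf'
print(reader.read_at('hello.txt', offset=6, limit=5))  # b'magic'
```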
magic_pdf/data/data_reader_writer/multi_bucket_s3.py
@@ -0,0 +1,137 @@
+from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
+from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
+from magic_pdf.data.io.s3 import S3Reader, S3Writer
+from magic_pdf.data.schemas import S3Config
+from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
+                                        remove_non_official_s3_args)
+
+
+class MultiS3Mixin:
+    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
+        """Initialized with multiple s3 configs.
+
+        Args:
+            default_bucket (str): the default bucket name of the relative path
+            s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
+
+        Raises:
+            InvalidConfig: default bucket config not in s3_configs
+            InvalidConfig: bucket name not unique in s3_configs
+            InvalidConfig: default bucket must be provided
+        """
+        if len(default_bucket) == 0:
+            raise InvalidConfig('default_bucket must be provided')
+
+        found_default_bucket_config = False
+        for conf in s3_configs:
+            if conf.bucket_name == default_bucket:
+                found_default_bucket_config = True
+                break
+
+        if not found_default_bucket_config:
+            raise InvalidConfig(
+                f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
+            )
+
+        uniq_bucket = set([conf.bucket_name for conf in s3_configs])
+        if len(uniq_bucket) != len(s3_configs):
+            raise InvalidConfig(
+                f'the bucket_name in s3_configs: {s3_configs} must be unique'
+            )
+
+        self.default_bucket = default_bucket
+        self.s3_configs = s3_configs
+        self._s3_clients_h: dict = {}
+
+
+class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
+    def read(self, path: str) -> bytes:
+        """Read the path from s3, select diffect bucket client for each request
+        based on the path, also support range read.
+
+        Args:
+            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
+            for example: s3://bucket_name/path?0,100
+
+        Returns:
+            bytes: the content of s3 file
+        """
+        may_range_params = parse_s3_range_params(path)
+        if may_range_params is None or 2 != len(may_range_params):
+            byte_start, byte_len = 0, -1
+        else:
+            byte_start, byte_len = int(may_range_params[0]), int(may_range_params[1])
+        path = remove_non_official_s3_args(path)
+        return self.read_at(path, byte_start, byte_len)
+
+    def __get_s3_client(self, bucket_name: str):
+        if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]):
+            raise InvalidParams(
+                f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}'
+            )
+        if bucket_name not in self._s3_clients_h:
+            conf = next(
+                filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs)
+            )
+            self._s3_clients_h[bucket_name] = S3Reader(
+                bucket_name,
+                conf.access_key,
+                conf.secret_key,
+                conf.endpoint_url,
+                conf.addressing_style,
+            )
+        return self._s3_clients_h[bucket_name]
+
+    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+        """Read the file with offset and limit, select diffect bucket client
+        for each request based on the path.
+
+        Args:
+            path (str): the file path
+            offset (int, optional): the number of bytes skipped. Defaults to 0.
+            limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.
+
+        Returns:
+            bytes: the file content
+        """
+        if path.startswith('s3://'):
+            bucket_name, path = parse_s3path(path)
+            s3_reader = self.__get_s3_client(bucket_name)
+        else:
+            s3_reader = self.__get_s3_client(self.default_bucket)
+        return s3_reader.read_at(path, offset, limit)
+
+
+class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
+    def __get_s3_client(self, bucket_name: str):
+        if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]):
+            raise InvalidParams(
+                f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}'
+            )
+        if bucket_name not in self._s3_clients_h:
+            conf = next(
+                filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs)
+            )
+            self._s3_clients_h[bucket_name] = S3Writer(
+                bucket_name,
+                conf.access_key,
+                conf.secret_key,
+                conf.endpoint_url,
+                conf.addressing_style,
+            )
+        return self._s3_clients_h[bucket_name]
+
+    def write(self, path: str, data: bytes) -> None:
+        """Write file with data, also select diffect bucket client for each
+        request based on the path.
+
+        Args:
+            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
+            data (bytes): the data want to write
+        """
+        if path.startswith('s3://'):
+            bucket_name, path = parse_s3path(path)
+            s3_writer = self.__get_s3_client(bucket_name)
+        else:
+            s3_writer = self.__get_s3_client(self.default_bucket)
+        return s3_writer.write(path, data)
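A hedged usage sketch of the multi-bucket reader; bucket names, credentials, endpoint and object keys are placeholders. The `?offset,length` suffix is the range-read format documented in `read` above:

```python
# Hypothetical multi-bucket setup; values are placeholders, not from the package.
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config

configs = [
    S3Config(bucket_name='bucket-a', access_key='AK', secret_key='SK',
             endpoint_url='https://s3.example.com', addressing_style='auto'),
    S3Config(bucket_name='bucket-b', access_key='AK', secret_key='SK',
             endpoint_url='https://s3.example.com', addressing_style='auto'),
]
reader = MultiBucketS3DataReader(default_bucket='bucket-a', s3_configs=configs)

doc = reader.read('papers/demo.pdf')                        # relative key, resolved against the default bucket
head = reader.read('s3://bucket-b/papers/other.pdf?0,100')  # explicit bucket plus a 100-byte range read
```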
magic_pdf/data/data_reader_writer/s3.py
@@ -0,0 +1,69 @@
+from magic_pdf.data.data_reader_writer.multi_bucket_s3 import (
+    MultiBucketS3DataReader, MultiBucketS3DataWriter)
+from magic_pdf.data.schemas import S3Config
+
+
+class S3DataReader(MultiBucketS3DataReader):
+    def __init__(
+        self,
+        bucket: str,
+        ak: str,
+        sk: str,
+        endpoint_url: str,
+        addressing_style: str = 'auto',
+    ):
+        """s3 reader client.
+
+        Args:
+            bucket (str): bucket name
+            ak (str): access key
+            sk (str): secret key
+            endpoint_url (str): endpoint url of s3
+            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
+            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
+        """
+        super().__init__(
+            bucket,
+            [
+                S3Config(
+                    bucket_name=bucket,
+                    access_key=ak,
+                    secret_key=sk,
+                    endpoint_url=endpoint_url,
+                    addressing_style=addressing_style,
+                )
+            ],
+        )
+
+
+class S3DataWriter(MultiBucketS3DataWriter):
+    def __init__(
+        self,
+        bucket: str,
+        ak: str,
+        sk: str,
+        endpoint_url: str,
+        addressing_style: str = 'auto',
+    ):
+        """s3 writer client.
+
+        Args:
+            bucket (str): bucket name
+            ak (str): access key
+            sk (str): secret key
+            endpoint_url (str): endpoint url of s3
+            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
+            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
+        """
+        super().__init__(
+            bucket,
+            [
+                S3Config(
+                    bucket_name=bucket,
+                    access_key=ak,
+                    secret_key=sk,
+                    endpoint_url=endpoint_url,
+                    addressing_style=addressing_style,
+                )
+            ],
+        )
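A hedged usage sketch of the single-bucket convenience wrappers; credentials, endpoint and keys are placeholders:

```python
# Hypothetical single-bucket usage of S3DataReader/S3DataWriter.
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter

writer = S3DataWriter('my-bucket', ak='AK', sk='SK', endpoint_url='https://s3.example.com')
writer.write_string('unittest/hello.md', '# hello')  # write_string is inherited from DataWriter

reader = S3DataReader('my-bucket', ak='AK', sk='SK', endpoint_url='https://s3.example.com')
print(reader.read('unittest/hello.md'))
```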
magic_pdf/data/dataset.py
@@ -0,0 +1,194 @@
+from abc import ABC, abstractmethod
+from typing import Iterator
+
+import fitz
+
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.schemas import PageInfo
+from magic_pdf.data.utils import fitz_doc_to_image
+
+
+class PageableData(ABC):
+    @abstractmethod
+    def get_image(self) -> dict:
+        """Transform data to image."""
+        pass
+
+    @abstractmethod
+    def get_doc(self) -> fitz.Page:
+        """Get the pymudoc page."""
+        pass
+
+    @abstractmethod
+    def get_page_info(self) -> PageInfo:
+        """Get the page info of the page.
+
+        Returns:
+            PageInfo: the page info of this page
+        """
+        pass
+
+
+class Dataset(ABC):
+    @abstractmethod
+    def __len__(self) -> int:
+        """The length of the dataset."""
+        pass
+
+    @abstractmethod
+    def __iter__(self) -> Iterator[PageableData]:
+        """Yield the page data."""
+        pass
+
+    @abstractmethod
+    def supported_methods(self) -> list[SupportedPdfParseMethod]:
+        """The methods that this dataset support.
+
+        Returns:
+            list[SupportedPdfParseMethod]: The supported methods, Valid methods are: OCR, TXT
+        """
+        pass
+
+    @abstractmethod
+    def data_bits(self) -> bytes:
+        """The bits used to create this dataset."""
+        pass
+
+    @abstractmethod
+    def get_page(self, page_id: int) -> PageableData:
+        """Get the page indexed by page_id.
+
+        Args:
+            page_id (int): the index of the page
+
+        Returns:
+            PageableData: the page doc object
+        """
+        pass
+
+
+class PymuDocDataset(Dataset):
+    def __init__(self, bits: bytes):
+        """Initialize the dataset, which wraps the pymudoc documents.
+
+        Args:
+            bits (bytes): the bytes of the pdf
+        """
+        self._records = [Doc(v) for v in fitz.open('pdf', bits)]
+        self._data_bits = bits
+        self._raw_data = bits
+
+    def __len__(self) -> int:
+        """The page number of the pdf."""
+        return len(self._records)
+
+    def __iter__(self) -> Iterator[PageableData]:
+        """Yield the page doc object."""
+        return iter(self._records)
+
+    def supported_methods(self) -> list[SupportedPdfParseMethod]:
+        """The method supported by this dataset.
+
+        Returns:
+            list[SupportedPdfParseMethod]: the supported methods
+        """
+        return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]
+
+    def data_bits(self) -> bytes:
+        """The pdf bits used to create this dataset."""
+        return self._data_bits
+
+    def get_page(self, page_id: int) -> PageableData:
+        """The page doc object.
+
+        Args:
+            page_id (int): the page doc index
+
+        Returns:
+            PageableData: the page doc object
+        """
+        return self._records[page_id]
+
+
+class ImageDataset(Dataset):
+    def __init__(self, bits: bytes):
+        """Initialize the dataset, which wraps the pymudoc documents.
+
+        Args:
+            bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
+        """
+        pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
+        self._records = [Doc(v) for v in fitz.open('pdf', pdf_bytes)]
+        self._raw_data = bits
+        self._data_bits = pdf_bytes
+
+    def __len__(self) -> int:
+        """The length of the dataset."""
+        return len(self._records)
+
+    def __iter__(self) -> Iterator[PageableData]:
+        """Yield the page object."""
+        return iter(self._records)
+
+    def supported_methods(self):
+        """The method supported by this dataset.
+
+        Returns:
+            list[SupportedPdfParseMethod]: the supported methods
+        """
+        return [SupportedPdfParseMethod.OCR]
+
+    def data_bits(self) -> bytes:
+        """The pdf bits used to create this dataset."""
+        return self._data_bits
+
+    def get_page(self, page_id: int) -> PageableData:
+        """The page doc object.
+
+        Args:
+            page_id (int): the page doc index
+
+        Returns:
+            PageableData: the page doc object
+        """
+        return self._records[page_id]
+
+
+class Doc(PageableData):
+    """Initialized with pymudoc object."""
+    def __init__(self, doc: fitz.Page):
+        self._doc = doc
+
+    def get_image(self):
+        """Return the imge info.
+
+        Returns:
+            dict: {
+                img: np.ndarray,
+                width: int,
+                height: int
+            }
+        """
+        return fitz_doc_to_image(self._doc)
+
+    def get_doc(self) -> fitz.Page:
+        """Get the pymudoc object.
+
+        Returns:
+            fitz.Page: the pymudoc object
+        """
+        return self._doc
+
+    def get_page_info(self) -> PageInfo:
+        """Get the page info of the page.
+
+        Returns:
+            PageInfo: the page info of this page
+        """
+        page_w = self._doc.rect.width
+        page_h = self._doc.rect.height
+        return PageInfo(w=page_w, h=page_h)
+
+    def __getattr__(self, name):
+        if hasattr(self._doc, name):
+            return getattr(self._doc, name)
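A hedged usage sketch of the new dataset wrappers; `demo.pdf` is a placeholder path:

```python
# Hypothetical usage of PymuDocDataset from the new data layer.
from magic_pdf.data.dataset import PymuDocDataset

with open('demo.pdf', 'rb') as f:
    ds = PymuDocDataset(f.read())

print(len(ds))                 # number of pages
page = ds.get_page(0)
print(page.get_page_info())    # PageInfo(w=..., h=...)
image = page.get_image()       # dict with 'img' (np.ndarray), 'width', 'height'
print(ds.supported_methods())  # [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]
```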
magic_pdf/data/io/__init__.py: File without changes
magic_pdf/data/io/base.py
@@ -0,0 +1,42 @@
+from abc import ABC, abstractmethod
+
+
+class IOReader(ABC):
+    @abstractmethod
+    def read(self, path: str) -> bytes:
+        """Read the file.
+
+        Args:
+            path (str): file path to read
+
+        Returns:
+            bytes: the content of the file
+        """
+        pass
+
+    @abstractmethod
+    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+        """Read at offset and limit.
+
+        Args:
+            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
+            offset (int, optional): the number of bytes skipped. Defaults to 0.
+            limit (int, optional): the length of bytes want to read. Defaults to -1.
+
+        Returns:
+            bytes: the content of file
+        """
+        pass
+
+
+class IOWriter:
+
+    @abstractmethod
+    def write(self, path: str, data: bytes) -> None:
+        """Write file with data.
+
+        Args:
+            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
+            data (bytes): the data want to write
+        """
+        pass
magic_pdf/data/io/http.py
@@ -0,0 +1,37 @@
+
+import io
+
+import requests
+
+from magic_pdf.data.io.base import IOReader, IOWriter
+
+
+class HttpReader(IOReader):
+
+    def read(self, url: str) -> bytes:
+        """Read the file.
+
+        Args:
+            path (str): file path to read
+
+        Returns:
+            bytes: the content of the file
+        """
+        return requests.get(url).content
+
+    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+        """Not Implemented."""
+        raise NotImplementedError
+
+
+class HttpWriter(IOWriter):
+    def write(self, url: str, data: bytes) -> None:
+        """Write file with data.
+
+        Args:
+            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
+            data (bytes): the data want to write
+        """
+        files = {'file': io.BytesIO(data)}
+        response = requests.post(url, files=files)
+        assert 300 > response.status_code and response.status_code > 199
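A hedged usage sketch of the HTTP reader/writer; both URLs are placeholders:

```python
# Hypothetical usage of HttpReader/HttpWriter from the new io layer.
from magic_pdf.data.io.http import HttpReader, HttpWriter

pdf_bytes = HttpReader().read('https://example.com/sample.pdf')

# HttpWriter POSTs the bytes as a multipart 'file' field and asserts a 2xx response.
HttpWriter().write('https://example.com/upload', pdf_bytes)
```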