magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57):
  1. magic_pdf/config/__init__.py +0 -0
  2. magic_pdf/config/enums.py +7 -0
  3. magic_pdf/config/exceptions.py +32 -0
  4. magic_pdf/data/__init__.py +0 -0
  5. magic_pdf/data/data_reader_writer/__init__.py +12 -0
  6. magic_pdf/data/data_reader_writer/base.py +51 -0
  7. magic_pdf/data/data_reader_writer/filebase.py +59 -0
  8. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +143 -0
  9. magic_pdf/data/data_reader_writer/s3.py +73 -0
  10. magic_pdf/data/dataset.py +194 -0
  11. magic_pdf/data/io/__init__.py +6 -0
  12. magic_pdf/data/io/base.py +42 -0
  13. magic_pdf/data/io/http.py +37 -0
  14. magic_pdf/data/io/s3.py +114 -0
  15. magic_pdf/data/read_api.py +95 -0
  16. magic_pdf/data/schemas.py +19 -0
  17. magic_pdf/data/utils.py +32 -0
  18. magic_pdf/dict2md/ocr_mkcontent.py +106 -244
  19. magic_pdf/libs/Constants.py +21 -8
  20. magic_pdf/libs/MakeContentConfig.py +1 -0
  21. magic_pdf/libs/boxbase.py +35 -0
  22. magic_pdf/libs/clean_memory.py +10 -0
  23. magic_pdf/libs/config_reader.py +53 -23
  24. magic_pdf/libs/draw_bbox.py +150 -65
  25. magic_pdf/libs/ocr_content_type.py +2 -0
  26. magic_pdf/libs/version.py +1 -1
  27. magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
  28. magic_pdf/model/magic_model.py +331 -15
  29. magic_pdf/model/pdf_extract_kit.py +170 -83
  30. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +40 -16
  31. magic_pdf/model/ppTableModel.py +8 -6
  32. magic_pdf/model/pp_structure_v2.py +5 -2
  33. magic_pdf/model/v3/__init__.py +0 -0
  34. magic_pdf/model/v3/helpers.py +125 -0
  35. magic_pdf/para/para_split_v3.py +322 -0
  36. magic_pdf/pdf_parse_by_ocr.py +6 -3
  37. magic_pdf/pdf_parse_by_txt.py +6 -3
  38. magic_pdf/pdf_parse_union_core_v2.py +644 -0
  39. magic_pdf/pipe/AbsPipe.py +5 -1
  40. magic_pdf/pipe/OCRPipe.py +10 -4
  41. magic_pdf/pipe/TXTPipe.py +10 -4
  42. magic_pdf/pipe/UNIPipe.py +16 -7
  43. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
  44. magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
  45. magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
  46. magic_pdf/resources/model_config/model_configs.yaml +5 -13
  47. magic_pdf/tools/cli.py +14 -1
  48. magic_pdf/tools/common.py +18 -8
  49. magic_pdf/user_api.py +25 -6
  50. magic_pdf/utils/__init__.py +0 -0
  51. magic_pdf/utils/annotations.py +11 -0
  52. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/LICENSE.md +1 -0
  53. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/METADATA +124 -78
  54. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/RECORD +57 -33
  55. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/WHEEL +0 -0
  56. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/entry_points.txt +0 -0
  57. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,114 @@
1
+ import boto3
2
+ from botocore.config import Config
3
+
4
+ from magic_pdf.data.io.base import IOReader, IOWriter
5
+
6
+
7
class S3Reader(IOReader):
    """Reader that fetches objects from a single s3 bucket."""

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 reader client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={'addressing_style': addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def read(self, key: str) -> bytes:
        """Read the whole object.

        Args:
            key (str): key of the object inside the bucket

        Returns:
            bytes: the content of the object
        """
        return self.read_at(key)

    def read_at(self, key: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read at offset and limit.

        Args:
            key (str): key of the object inside the bucket
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the length of bytes want to read. Any negative
                value reads up to the end of the object. Defaults to -1.

        Returns:
            bytes: the requested slice of the object
        """
        if limit == 0:
            # An HTTP byte range cannot express an empty span: the header would
            # become 'bytes=N-(N-1)', which is not a valid range. Answer the
            # zero-length read locally instead of sending it.
            return b''

        if limit > -1:
            # Range ends are inclusive, hence the -1.
            range_header = f'bytes={offset}-{offset + limit - 1}'
        else:
            # Open-ended range: from offset to the end of the object.
            range_header = f'bytes={offset}-'

        res = self._s3_client.get_object(
            Bucket=self._bucket, Key=key, Range=range_header
        )
        return res['Body'].read()
72
+
73
+
74
class S3Writer(IOWriter):
    """Writer that stores objects into a single s3 bucket."""

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 writer client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk

        # Client-side settings: bucket addressing style and the retry policy.
        client_config = Config(
            s3={'addressing_style': addressing_style},
            retries={'max_attempts': 5, 'mode': 'standard'},
        )
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=client_config,
        )

    def write(self, key: str, data: bytes):
        """Write data to the object stored under key.

        Args:
            key (str): key of the object inside the bucket
            data (bytes): the data want to write
        """
        target = {'Bucket': self._bucket, 'Key': key}
        self._s3_client.put_object(Body=data, **target)
@@ -0,0 +1,95 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from magic_pdf.config.exceptions import EmptyData, InvalidParams
6
+ from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
7
+ MultiBucketS3DataReader)
8
+ from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
9
+
10
+
11
def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
    """Read the jsonl file and return the list of PymuDocDataset.

    Every non-blank line of the jsonl file is parsed as a json object. The pdf
    location is taken from its 'file_location' field, falling back to 'path'.

    Args:
        s3_path_or_local (str): local file or s3 path
        s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None.

    Raises:
        InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided.
        EmptyData: if no pdf file location is provided in some line of jsonl file.
        InvalidParams: if the file location is s3 path but s3_client is not provided

    Returns:
        list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset
    """

    def _read_bits(location: str) -> bytes:
        # Fetch raw bytes from s3 or the local filesystem based on the prefix.
        if location.startswith('s3://'):
            if s3_client is None:
                raise InvalidParams('s3_client is required when s3_path is provided')
            return s3_client.read(location)
        return FileBasedDataReader('').read(location)

    jsonl_bits = _read_bits(s3_path_or_local)
    records = [
        json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
    ]

    # Process every record: the previous implementation sliced the list to its
    # first five entries, silently dropping the rest of the file.
    bits_arr = []
    for record in records:
        pdf_path = record.get('file_location', '') or record.get('path', '')
        if not pdf_path:
            raise EmptyData('pdf file location is empty')
        bits_arr.append(_read_bits(pdf_path))

    # Datasets are built only after all pdf bytes were fetched successfully.
    return [PymuDocDataset(bits) for bits in bits_arr]
49
+
50
+
51
def read_local_pdfs(path: str) -> list[PymuDocDataset]:
    """Load one pdf file, or every '*.pdf' file directly inside a directory.

    Args:
        path (str): pdf file path or directory that contains pdf files

    Returns:
        list[PymuDocDataset]: one PymuDocDataset per pdf file that was read
    """
    # Single file: read it as-is and wrap it in a one-element list.
    if not os.path.isdir(path):
        single_reader = FileBasedDataReader()
        return [PymuDocDataset(single_reader.read(path))]

    # Directory: only its direct children matching '*.pdf' are picked up; the
    # reader is rooted at the directory, so bare file names are passed to it.
    dir_reader = FileBasedDataReader(path)
    datasets = []
    for pdf_file in Path(path).glob('*.pdf'):
        datasets.append(PymuDocDataset(dir_reader.read(pdf_file.name)))
    return datasets
70
+
71
+
72
def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
    """Read images from path or directory.

    When path is a directory it is walked recursively, and every file whose
    text after the last '.' is in suffixes is read.

    Args:
        path (str): image file path or directory that contains image files
        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']

    Returns:
        list[ImageDataset]: each image file will converted to a ImageDataset
    """
    if not os.path.isdir(path):
        # Single file: no suffix filtering is applied.
        return [ImageDataset(FileBasedDataReader().read(path))]

    wanted = set(suffixes)
    reader = FileBasedDataReader(path)
    imgs_bits = []
    for root, _, files in os.walk(path):
        for file in files:
            if file.rsplit('.', 1)[-1] not in wanted:
                continue
            # os.walk descends into sub-directories, so the bare file name is
            # not enough: hand the reader the location relative to `path`
            # (for top-level files this is still just the file name).
            # NOTE(review): assumes the reader resolves relative paths against
            # the directory it was constructed with - confirm.
            rel_path = os.path.relpath(os.path.join(root, file), path)
            imgs_bits.append(reader.read(rel_path))
    return [ImageDataset(bits) for bits in imgs_bits]
@@ -0,0 +1,19 @@
1
+
2
+ from pydantic import BaseModel, Field
3
+
4
+
5
class S3Config(BaseModel):
    """Connection settings for one s3 bucket.

    Every string field must be non-empty (min_length=1); addressing_style
    falls back to 'auto' when it is not supplied.
    """

    bucket_name: str = Field(min_length=1, description='s3 bucket name')
    access_key: str = Field(min_length=1, description='s3 access key')
    secret_key: str = Field(min_length=1, description='s3 secret key')
    endpoint_url: str = Field(min_length=1, description='s3 endpoint url')
    addressing_style: str = Field(
        default='auto',
        min_length=1,
        description='s3 addressing style',
    )
13
+
14
+
15
class PageInfo(BaseModel):
    """Size of a page: its width and its height."""

    # Both fields are required floats.
    w: float = Field(
        description='the width of page',
    )
    h: float = Field(
        description='the height of page',
    )
@@ -0,0 +1,32 @@
1
+
2
+ import fitz
3
+ import numpy as np
4
+
5
+ from magic_pdf.utils.annotations import ImportPIL
6
+
7
+
8
@ImportPIL
def fitz_doc_to_image(doc, dpi=200) -> dict:
    """Render a page to an RGB image and return it as a numpy array.

    Args:
        doc: object exposing get_pixmap(matrix=..., alpha=...); the original
            documentation describes it as a pymudoc page.
        dpi (int, optional): target resolution; the page is rendered with a
            zoom factor of dpi / 72 on both axes. Defaults to 200.

    Returns:
        dict: {'img': numpy array, 'width': width, 'height': height }
    """
    from PIL import Image

    zoom = dpi / 72
    pixmap = doc.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # If either side exceeds 9000 pixels after scaling, render again with the
    # identity matrix (no scaling) instead.
    if max(pixmap.width, pixmap.height) > 9000:
        pixmap = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    size = (pixmap.width, pixmap.height)
    rgb_image = Image.frombytes('RGB', size, pixmap.samples)

    return {
        'img': np.array(rgb_image),
        'width': pixmap.width,
        'height': pixmap.height,
    }