magic-pdf 0.10.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (63)
  1. magic_pdf/config/constants.py +2 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  4. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  5. magic_pdf/data/dataset.py +13 -1
  6. magic_pdf/data/read_api.py +59 -12
  7. magic_pdf/data/utils.py +35 -0
  8. magic_pdf/dict2md/ocr_mkcontent.py +14 -13
  9. magic_pdf/libs/clean_memory.py +11 -4
  10. magic_pdf/libs/config_reader.py +9 -0
  11. magic_pdf/libs/draw_bbox.py +8 -12
  12. magic_pdf/libs/language.py +3 -0
  13. magic_pdf/libs/version.py +1 -1
  14. magic_pdf/model/__init__.py +1 -125
  15. magic_pdf/model/batch_analyze.py +275 -0
  16. magic_pdf/model/doc_analyze_by_custom_model.py +4 -51
  17. magic_pdf/model/magic_model.py +4 -435
  18. magic_pdf/model/model_list.py +1 -0
  19. magic_pdf/model/pdf_extract_kit.py +33 -22
  20. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  21. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  22. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  23. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  24. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  25. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  26. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  27. magic_pdf/model/sub_modules/model_init.py +30 -4
  28. magic_pdf/model/sub_modules/model_utils.py +8 -2
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  31. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  32. magic_pdf/operators/__init__.py +94 -0
  33. magic_pdf/{model/operators.py → operators/models.py} +2 -38
  34. magic_pdf/{pipe/operators.py → operators/pipes.py} +70 -17
  35. magic_pdf/pdf_parse_union_core_v2.py +68 -17
  36. magic_pdf/post_proc/__init__.py +1 -0
  37. magic_pdf/post_proc/llm_aided.py +133 -0
  38. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  39. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  40. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  41. magic_pdf/tools/cli.py +36 -11
  42. magic_pdf/tools/common.py +28 -18
  43. magic_pdf/utils/office_to_pdf.py +29 -0
  44. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +73 -23
  45. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +50 -53
  46. magic_pdf/para/__init__.py +0 -0
  47. magic_pdf/pdf_parse_by_ocr.py +0 -22
  48. magic_pdf/pdf_parse_by_txt.py +0 -23
  49. magic_pdf/pipe/AbsPipe.py +0 -99
  50. magic_pdf/pipe/OCRPipe.py +0 -80
  51. magic_pdf/pipe/TXTPipe.py +0 -42
  52. magic_pdf/pipe/UNIPipe.py +0 -150
  53. magic_pdf/pipe/__init__.py +0 -0
  54. magic_pdf/rw/AbsReaderWriter.py +0 -17
  55. magic_pdf/rw/DiskReaderWriter.py +0 -74
  56. magic_pdf/rw/S3ReaderWriter.py +0 -142
  57. magic_pdf/rw/__init__.py +0 -0
  58. magic_pdf/user_api.py +0 -144
  59. magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  60. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  61. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  62. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  63. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
magic_pdf/rw/DiskReaderWriter.py DELETED
@@ -1,74 +0,0 @@
1
- import os
2
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
3
- from loguru import logger
4
-
5
-
6
- class DiskReaderWriter(AbsReaderWriter):
7
- def __init__(self, parent_path, encoding="utf-8"):
8
- self.path = parent_path
9
- self.encoding = encoding
10
-
11
- def read(self, path, mode=AbsReaderWriter.MODE_TXT):
12
- if os.path.isabs(path):
13
- abspath = path
14
- else:
15
- abspath = os.path.join(self.path, path)
16
- if not os.path.exists(abspath):
17
- logger.error(f"file {abspath} not exists")
18
- raise Exception(f"file {abspath} no exists")
19
- if mode == AbsReaderWriter.MODE_TXT:
20
- with open(abspath, "r", encoding=self.encoding) as f:
21
- return f.read()
22
- elif mode == AbsReaderWriter.MODE_BIN:
23
- with open(abspath, "rb") as f:
24
- return f.read()
25
- else:
26
- raise ValueError("Invalid mode. Use 'text' or 'binary'.")
27
-
28
- def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
29
- if os.path.isabs(path):
30
- abspath = path
31
- else:
32
- abspath = os.path.join(self.path, path)
33
- directory_path = os.path.dirname(abspath)
34
- if not os.path.exists(directory_path):
35
- os.makedirs(directory_path)
36
- if mode == AbsReaderWriter.MODE_TXT:
37
- with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
38
- f.write(content)
39
-
40
- elif mode == AbsReaderWriter.MODE_BIN:
41
- with open(abspath, "wb") as f:
42
- f.write(content)
43
- else:
44
- raise ValueError("Invalid mode. Use 'text' or 'binary'.")
45
-
46
- def read_offset(self, path: str, offset=0, limit=None):
47
- abspath = path
48
- if not os.path.isabs(path):
49
- abspath = os.path.join(self.path, path)
50
- with open(abspath, "rb") as f:
51
- f.seek(offset)
52
- return f.read(limit)
53
-
54
-
55
- if __name__ == "__main__":
56
- if 0:
57
- file_path = "io/test/example.txt"
58
- drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
59
-
60
- # 写入内容到文件
61
- drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
62
-
63
- # 从文件读取内容
64
- content = drw.read(path=file_path)
65
- if content:
66
- logger.info(f"从 {file_path} 读取的内容: {content}")
67
- if 1:
68
- drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
69
- content_bin = drw.read_offset("1.txt")
70
- assert content_bin == b"ABCD!"
71
-
72
- content_bin = drw.read_offset("1.txt", offset=1, limit=2)
73
- assert content_bin == b"BC"
74
-
magic_pdf/rw/S3ReaderWriter.py DELETED
@@ -1,142 +0,0 @@
1
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
2
- from magic_pdf.libs.commons import parse_bucket_key, join_path
3
- import boto3
4
- from loguru import logger
5
- from botocore.config import Config
6
-
7
-
8
- class S3ReaderWriter(AbsReaderWriter):
9
- def __init__(
10
- self,
11
- ak: str,
12
- sk: str,
13
- endpoint_url: str,
14
- addressing_style: str = "auto",
15
- parent_path: str = "",
16
- ):
17
- self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
18
- self.path = parent_path
19
-
20
- def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
21
- s3_client = boto3.client(
22
- service_name="s3",
23
- aws_access_key_id=ak,
24
- aws_secret_access_key=sk,
25
- endpoint_url=endpoint_url,
26
- config=Config(
27
- s3={"addressing_style": addressing_style},
28
- retries={"max_attempts": 5, "mode": "standard"},
29
- ),
30
- )
31
- return s3_client
32
-
33
- def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
34
- if s3_relative_path.startswith("s3://"):
35
- s3_path = s3_relative_path
36
- else:
37
- s3_path = join_path(self.path, s3_relative_path)
38
- bucket_name, key = parse_bucket_key(s3_path)
39
- res = self.client.get_object(Bucket=bucket_name, Key=key)
40
- body = res["Body"].read()
41
- if mode == AbsReaderWriter.MODE_TXT:
42
- data = body.decode(encoding) # Decode bytes to text
43
- elif mode == AbsReaderWriter.MODE_BIN:
44
- data = body
45
- else:
46
- raise ValueError("Invalid mode. Use 'text' or 'binary'.")
47
- return data
48
-
49
- def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
50
- if s3_relative_path.startswith("s3://"):
51
- s3_path = s3_relative_path
52
- else:
53
- s3_path = join_path(self.path, s3_relative_path)
54
- if mode == AbsReaderWriter.MODE_TXT:
55
- body = content.encode(encoding) # Encode text data as bytes
56
- elif mode == AbsReaderWriter.MODE_BIN:
57
- body = content
58
- else:
59
- raise ValueError("Invalid mode. Use 'text' or 'binary'.")
60
- bucket_name, key = parse_bucket_key(s3_path)
61
- self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
62
- logger.info(f"内容已写入 {s3_path} ")
63
-
64
- def read_offset(self, path: str, offset=0, limit=None) -> bytes:
65
- if path.startswith("s3://"):
66
- s3_path = path
67
- else:
68
- s3_path = join_path(self.path, path)
69
- bucket_name, key = parse_bucket_key(s3_path)
70
-
71
- range_header = (
72
- f"bytes={offset}-{offset+limit-1}" if limit else f"bytes={offset}-"
73
- )
74
- res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
75
- return res["Body"].read()
76
-
77
-
78
- if __name__ == "__main__":
79
- if 0:
80
- # Config the connection info
81
- ak = ""
82
- sk = ""
83
- endpoint_url = ""
84
- addressing_style = "auto"
85
- bucket_name = ""
86
- # Create an S3ReaderWriter object
87
- s3_reader_writer = S3ReaderWriter(
88
- ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
89
- )
90
-
91
- # Write text data to S3
92
- text_data = "This is some text data"
93
- s3_reader_writer.write(
94
- text_data,
95
- s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
96
- mode=AbsReaderWriter.MODE_TXT,
97
- )
98
-
99
- # Read text data from S3
100
- text_data_read = s3_reader_writer.read(
101
- s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
102
- )
103
- logger.info(f"Read text data from S3: {text_data_read}")
104
- # Write binary data to S3
105
- binary_data = b"This is some binary data"
106
- s3_reader_writer.write(
107
- text_data,
108
- s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
109
- mode=AbsReaderWriter.MODE_BIN,
110
- )
111
-
112
- # Read binary data from S3
113
- binary_data_read = s3_reader_writer.read(
114
- s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
115
- )
116
- logger.info(f"Read binary data from S3: {binary_data_read}")
117
-
118
- # Range Read text data from S3
119
- binary_data_read = s3_reader_writer.read_offset(
120
- path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
121
- )
122
- logger.info(f"Read binary data from S3: {binary_data_read}")
123
- if 1:
124
- import os
125
- import json
126
-
127
- ak = os.getenv("AK", "")
128
- sk = os.getenv("SK", "")
129
- endpoint_url = os.getenv("ENDPOINT", "")
130
- bucket = os.getenv("S3_BUCKET", "")
131
- prefix = os.getenv("S3_PREFIX", "")
132
- key_basename = os.getenv("S3_KEY_BASENAME", "")
133
- s3_reader_writer = S3ReaderWriter(
134
- ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
135
- )
136
- content_bin = s3_reader_writer.read_offset(key_basename)
137
- assert content_bin[:10] == b'{"track_id'
138
- assert content_bin[-10:] == b'r":null}}\n'
139
-
140
- content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
141
- jso = json.dumps(content_bin.decode("utf-8"))
142
- print(jso)
magic_pdf/rw/__init__.py DELETED
File without changes
magic_pdf/user_api.py DELETED
@@ -1,144 +0,0 @@
1
- """用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
2
-
3
- 然后:
4
- 1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
5
- 2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
6
-
7
- 其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
8
- """
9
-
10
- from loguru import logger
11
-
12
- from magic_pdf.data.data_reader_writer import DataWriter
13
- from magic_pdf.data.dataset import Dataset
14
- from magic_pdf.libs.version import __version__
15
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
16
- from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
17
- from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
18
- from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
19
-
20
-
21
- def parse_txt_pdf(
22
- dataset: Dataset,
23
- model_list: list,
24
- imageWriter: DataWriter,
25
- is_debug=False,
26
- start_page_id=0,
27
- end_page_id=None,
28
- lang=None,
29
- *args,
30
- **kwargs
31
- ):
32
- """解析文本类pdf."""
33
- pdf_info_dict = parse_pdf_by_txt(
34
- dataset,
35
- model_list,
36
- imageWriter,
37
- start_page_id=start_page_id,
38
- end_page_id=end_page_id,
39
- debug_mode=is_debug,
40
- lang=lang,
41
- )
42
-
43
- pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
44
-
45
- pdf_info_dict['_version_name'] = __version__
46
-
47
- if lang is not None:
48
- pdf_info_dict['_lang'] = lang
49
-
50
- return pdf_info_dict
51
-
52
-
53
- def parse_ocr_pdf(
54
- dataset: Dataset,
55
- model_list: list,
56
- imageWriter: DataWriter,
57
- is_debug=False,
58
- start_page_id=0,
59
- end_page_id=None,
60
- lang=None,
61
- *args,
62
- **kwargs
63
- ):
64
- """解析ocr类pdf."""
65
- pdf_info_dict = parse_pdf_by_ocr(
66
- dataset,
67
- model_list,
68
- imageWriter,
69
- start_page_id=start_page_id,
70
- end_page_id=end_page_id,
71
- debug_mode=is_debug,
72
- lang=lang,
73
- )
74
-
75
- pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
76
-
77
- pdf_info_dict['_version_name'] = __version__
78
-
79
- if lang is not None:
80
- pdf_info_dict['_lang'] = lang
81
-
82
- return pdf_info_dict
83
-
84
-
85
- def parse_union_pdf(
86
- dataset: Dataset,
87
- model_list: list,
88
- imageWriter: DataWriter,
89
- is_debug=False,
90
- start_page_id=0,
91
- end_page_id=None,
92
- lang=None,
93
- *args,
94
- **kwargs
95
- ):
96
- """ocr和文本混合的pdf,全部解析出来."""
97
-
98
- def parse_pdf(method):
99
- try:
100
- return method(
101
- dataset,
102
- model_list,
103
- imageWriter,
104
- start_page_id=start_page_id,
105
- end_page_id=end_page_id,
106
- debug_mode=is_debug,
107
- lang=lang,
108
- )
109
- except Exception as e:
110
- logger.exception(e)
111
- return None
112
-
113
- pdf_info_dict = parse_pdf(parse_pdf_by_txt)
114
- if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
115
- logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
116
- if len(model_list) == 0:
117
- layout_model = kwargs.get('layout_model', None)
118
- formula_enable = kwargs.get('formula_enable', None)
119
- table_enable = kwargs.get('table_enable', None)
120
- infer_res = doc_analyze(
121
- dataset,
122
- ocr=True,
123
- start_page_id=start_page_id,
124
- end_page_id=end_page_id,
125
- lang=lang,
126
- layout_model=layout_model,
127
- formula_enable=formula_enable,
128
- table_enable=table_enable,
129
- )
130
- model_list = infer_res.get_infer_res()
131
- pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
132
- if pdf_info_dict is None:
133
- raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
134
- else:
135
- pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
136
- else:
137
- pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
138
-
139
- pdf_info_dict['_version_name'] = __version__
140
-
141
- if lang is not None:
142
- pdf_info_dict['_lang'] = lang
143
-
144
- return pdf_info_dict
File without changes