magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. magic_pdf/config/constants.py +7 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/base.py +13 -1
  4. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  5. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  6. magic_pdf/data/dataset.py +188 -5
  7. magic_pdf/data/read_api.py +59 -12
  8. magic_pdf/data/utils.py +35 -0
  9. magic_pdf/dict2md/ocr_mkcontent.py +16 -15
  10. magic_pdf/filter/__init__.py +32 -0
  11. magic_pdf/filter/pdf_meta_scan.py +3 -2
  12. magic_pdf/libs/clean_memory.py +11 -4
  13. magic_pdf/libs/config_reader.py +9 -0
  14. magic_pdf/libs/draw_bbox.py +19 -22
  15. magic_pdf/libs/language.py +3 -0
  16. magic_pdf/libs/pdf_check.py +30 -30
  17. magic_pdf/libs/version.py +1 -1
  18. magic_pdf/model/__init__.py +1 -1
  19. magic_pdf/model/batch_analyze.py +275 -0
  20. magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
  21. magic_pdf/model/magic_model.py +4 -435
  22. magic_pdf/model/model_list.py +1 -0
  23. magic_pdf/model/pdf_extract_kit.py +35 -5
  24. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  25. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  26. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  27. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  29. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  30. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  31. magic_pdf/model/sub_modules/model_init.py +43 -7
  32. magic_pdf/model/sub_modules/model_utils.py +17 -5
  33. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  34. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  35. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  36. magic_pdf/operators/__init__.py +94 -0
  37. magic_pdf/operators/models.py +154 -0
  38. magic_pdf/operators/pipes.py +191 -0
  39. magic_pdf/pdf_parse_union_core_v2.py +77 -27
  40. magic_pdf/post_proc/__init__.py +1 -0
  41. magic_pdf/post_proc/llm_aided.py +133 -0
  42. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  43. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  44. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  45. magic_pdf/tools/cli.py +36 -11
  46. magic_pdf/tools/common.py +120 -61
  47. magic_pdf/utils/office_to_pdf.py +29 -0
  48. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
  49. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
  50. magic_pdf/para/__init__.py +0 -0
  51. magic_pdf/pdf_parse_by_ocr.py +0 -23
  52. magic_pdf/pdf_parse_by_txt.py +0 -24
  53. magic_pdf/pipe/AbsPipe.py +0 -98
  54. magic_pdf/pipe/OCRPipe.py +0 -41
  55. magic_pdf/pipe/TXTPipe.py +0 -41
  56. magic_pdf/pipe/UNIPipe.py +0 -98
  57. magic_pdf/pipe/__init__.py +0 -0
  58. magic_pdf/rw/AbsReaderWriter.py +0 -17
  59. magic_pdf/rw/DiskReaderWriter.py +0 -74
  60. magic_pdf/rw/S3ReaderWriter.py +0 -142
  61. magic_pdf/rw/__init__.py +0 -0
  62. magic_pdf/user_api.py +0 -121
  63. /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  64. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  65. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  66. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,142 +0,0 @@
1
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
2
- from magic_pdf.libs.commons import parse_bucket_key, join_path
3
- import boto3
4
- from loguru import logger
5
- from botocore.config import Config
6
-
7
-
8
class S3ReaderWriter(AbsReaderWriter):
    """Reader/writer that stores and fetches objects through a boto3 S3 client.

    Every public method accepts either an absolute ``s3://`` path or a
    relative path; relative paths are joined onto the ``parent_path`` given
    at construction time.
    """

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        """Create the S3 client and remember the base path.

        Args:
            ak: Access key id handed to boto3.
            sk: Secret access key handed to boto3.
            endpoint_url: Endpoint URL of the S3 service.
            addressing_style: Value for the botocore ``addressing_style``
                S3 option.
            parent_path: Base path that relative paths are joined onto.
        """
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with at most 5 attempts in "standard" retry mode."""
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={"max_attempts": 5, "mode": "standard"},
            ),
        )

    def _resolve_path(self, path: str) -> str:
        """Return ``path`` unchanged if it is an ``s3://`` path, else join it onto ``self.path``."""
        if path.startswith("s3://"):
            return path
        return join_path(self.path, path)

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch an object and return it as text or bytes.

        Args:
            s3_relative_path: Absolute ``s3://`` path or path relative to the
                configured parent path.
            mode: ``AbsReaderWriter.MODE_TXT`` to decode the body with
                ``encoding``, ``AbsReaderWriter.MODE_BIN`` to return raw bytes.
            encoding: Text encoding used in text mode.

        Returns:
            The decoded string in text mode, the raw bytes in binary mode.

        Raises:
            ValueError: If ``mode`` is neither of the two supported modes.
        """
        # Reject a bad mode before touching the network so that a caller
        # mistake does not cost a full object download.
        if mode not in (AbsReaderWriter.MODE_TXT, AbsReaderWriter.MODE_BIN):
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

        s3_path = self._resolve_path(s3_relative_path)
        bucket_name, key = parse_bucket_key(s3_path)
        res = self.client.get_object(Bucket=bucket_name, Key=key)
        body = res["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            return body.decode(encoding)  # Decode bytes to text
        return body

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload ``content`` to the given path.

        Args:
            content: A string in text mode, a bytes-like body in binary mode.
            s3_relative_path: Absolute ``s3://`` path or path relative to the
                configured parent path.
            mode: ``AbsReaderWriter.MODE_TXT`` to encode ``content`` with
                ``encoding``, ``AbsReaderWriter.MODE_BIN`` to upload it as is.
            encoding: Text encoding used in text mode.

        Raises:
            ValueError: If ``mode`` is neither of the two supported modes.
        """
        s3_path = self._resolve_path(s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            body = content.encode(encoding)  # Encode text data as bytes
        elif mode == AbsReaderWriter.MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        # Log message text: "content has been written to <path>".
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Read a byte range of an object using an HTTP ``Range`` request.

        Args:
            path: Absolute ``s3://`` path or path relative to the configured
                parent path.
            offset: Index of the first byte to read.
            limit: Number of bytes to read. A falsy value (``None`` or ``0``)
                reads from ``offset`` to the end of the object.

        Returns:
            The bytes of the requested range.
        """
        s3_path = self._resolve_path(path)
        bucket_name, key = parse_bucket_key(s3_path)

        # The Range header is inclusive on both ends, hence the "- 1".
        if limit:
            range_header = f"bytes={offset}-{offset + limit - 1}"
        else:
            range_header = f"bytes={offset}-"
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()
76
-
77
-
78
if __name__ == "__main__":
    # Manual smoke tests against a real S3 endpoint. The first section is
    # switched off (``if 0``) and needs credentials filled in by hand; the
    # second one takes everything from environment variables.
    if 0:
        # Config the connection info
        ak = ""
        sk = ""
        endpoint_url = ""
        addressing_style = "auto"
        bucket_name = ""
        # Create an S3ReaderWriter object. The parent path is interpolated
        # from ``bucket_name`` (it used to be the literal text "bucket_name").
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, addressing_style, f"s3://{bucket_name}/"
        )

        # Write text data to S3
        text_data = "This is some text data"
        s3_reader_writer.write(
            text_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

        # Read text data from S3
        text_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
        )
        logger.info(f"Read text data from S3: {text_data_read}")
        # Write binary data to S3. The binary payload is what gets uploaded
        # here; passing the text string in binary mode was a copy-paste slip.
        binary_data = b"This is some binary data"
        s3_reader_writer.write(
            binary_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )

        # Read binary data from S3
        binary_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")

        # Range Read text data from S3
        binary_data_read = s3_reader_writer.read_offset(
            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
    if 1:
        import os
        import json

        ak = os.getenv("AK", "")
        sk = os.getenv("SK", "")
        endpoint_url = os.getenv("ENDPOINT", "")
        bucket = os.getenv("S3_BUCKET", "")
        prefix = os.getenv("S3_PREFIX", "")
        key_basename = os.getenv("S3_KEY_BASENAME", "")
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
        )
        # Full read: the object is expected to start and end with these bytes.
        content_bin = s3_reader_writer.read_offset(key_basename)
        assert content_bin[:10] == b'{"track_id'
        assert content_bin[-10:] == b'r":null}}\n'

        # Ranged read of 426 bytes starting at byte 424.
        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
        # NOTE(review): json.dumps only re-quotes the decoded text as a JSON
        # string; json.loads may have been intended - confirm before changing.
        jso = json.dumps(content_bin.decode("utf-8"))
        print(jso)
magic_pdf/rw/__init__.py DELETED
File without changes
magic_pdf/user_api.py DELETED
@@ -1,121 +0,0 @@
1
- """用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
2
-
3
- 然后:
4
- 1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
5
- 2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
6
-
7
- 其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
8
- """
9
-
10
- from loguru import logger
11
-
12
- from magic_pdf.data.data_reader_writer import DataWriter
13
- from magic_pdf.libs.version import __version__
14
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
15
- from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
16
- from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
17
-
18
- PARSE_TYPE_TXT = 'txt'
19
- PARSE_TYPE_OCR = 'ocr'
20
-
21
-
22
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                  start_page_id=0, end_page_id=None, lang=None,
                  *args, **kwargs):
    """Parse a text-based PDF.

    Runs ``parse_pdf_by_txt`` and stamps the returned dict with the parse
    type, the package version and, when given, the language.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        pdf_models: Model output list passed through to the parser.
        imageWriter: Writer passed through to the parser.
        is_debug: Forwarded to the parser as ``debug_mode``.
        start_page_id: Forwarded to the parser.
        end_page_id: Forwarded to the parser.
        lang: Forwarded to the parser; also stored under ``'_lang'`` when it
            is not ``None``.
        *args: Accepted but not used.
        **kwargs: Accepted but not used.

    Returns:
        The dict returned by ``parse_pdf_by_txt`` with the extra keys set.
    """
    parser_options = {
        'start_page_id': start_page_id,
        'end_page_id': end_page_id,
        'debug_mode': is_debug,
        'lang': lang,
    }
    result = parse_pdf_by_txt(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Record how and by which version the result was produced.
    result['_parse_type'] = PARSE_TYPE_TXT
    result['_version_name'] = __version__

    if lang is None:
        return result
    result['_lang'] = lang
    return result
44
-
45
-
46
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                  start_page_id=0, end_page_id=None, lang=None,
                  *args, **kwargs):
    """Parse an OCR-type PDF.

    Runs ``parse_pdf_by_ocr`` and stamps the returned dict with the parse
    type, the package version and, when given, the language.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        pdf_models: Model output list passed through to the parser.
        imageWriter: Writer passed through to the parser.
        is_debug: Forwarded to the parser as ``debug_mode``.
        start_page_id: Forwarded to the parser.
        end_page_id: Forwarded to the parser.
        lang: Forwarded to the parser; also stored under ``'_lang'`` when it
            is not ``None``.
        *args: Accepted but not used.
        **kwargs: Accepted but not used.

    Returns:
        The dict returned by ``parse_pdf_by_ocr`` with the extra keys set.
    """
    parser_options = {
        'start_page_id': start_page_id,
        'end_page_id': end_page_id,
        'debug_mode': is_debug,
        'lang': lang,
    }
    result = parse_pdf_by_ocr(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Record how and by which version the result was produced.
    result['_parse_type'] = PARSE_TYPE_OCR
    result['_version_name'] = __version__

    if lang is None:
        return result
    result['_lang'] = lang
    return result
68
-
69
-
70
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                    input_model_is_empty: bool = False,
                    start_page_id=0, end_page_id=None, lang=None,
                    *args, **kwargs):
    """Parse a PDF that may mix text and OCR content.

    The text parser runs first. If it raises, or its result has a truthy
    ``'_need_drop'`` entry, the OCR parser is used instead. When
    ``input_model_is_empty`` is true, the models are regenerated with
    ``doc_analyze(..., ocr=True)`` before that OCR pass.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        pdf_models: Model output list passed through to the parsers.
        imageWriter: Writer passed through to the parsers.
        is_debug: Forwarded to the parsers as ``debug_mode``.
        input_model_is_empty: Whether to rebuild the models before falling
            back to OCR.
        start_page_id: Forwarded to the parsers and to ``doc_analyze``.
        end_page_id: Forwarded to the parsers and to ``doc_analyze``.
        lang: Forwarded to the parsers and to ``doc_analyze``; also stored
            under ``'_lang'`` when it is not ``None``.
        *args: Accepted but not used.
        **kwargs: ``layout_model``, ``formula_enable`` and ``table_enable``
            are read from here and forwarded to ``doc_analyze``.

    Returns:
        The parser's dict with ``'_parse_type'``, ``'_version_name'`` and,
        optionally, ``'_lang'`` set.

    Raises:
        Exception: If both the text and the OCR parser fail.
    """

    def run_parser(parser, models):
        # A parser failure is logged and reported as None so that the caller
        # can decide whether to fall back.
        try:
            return parser(
                pdf_bytes,
                models,
                imageWriter,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                debug_mode=is_debug,
                lang=lang,
            )
        except Exception as e:
            logger.exception(e)
            return None

    info = run_parser(parse_pdf_by_txt, pdf_models)
    text_parse_usable = info is not None and not info.get('_need_drop', False)

    if text_parse_usable:
        info['_parse_type'] = PARSE_TYPE_TXT
    else:
        logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
        if input_model_is_empty:
            # No usable models were supplied: build them in OCR mode.
            pdf_models = doc_analyze(
                pdf_bytes,
                ocr=True,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang,
                layout_model=kwargs.get('layout_model', None),
                formula_enable=kwargs.get('formula_enable', None),
                table_enable=kwargs.get('table_enable', None),
            )
        info = run_parser(parse_pdf_by_ocr, pdf_models)
        if info is None:
            raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
        info['_parse_type'] = PARSE_TYPE_OCR

    info['_version_name'] = __version__

    if lang is not None:
        info['_lang'] = lang

    return info
File without changes