pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in a supported public registry, and is provided for informational purposes only.
Files changed (40)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/minio_handler.py +171 -171
  4. datamax/loader/oss_handler.py +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +466 -10
  10. datamax/parser/docx_parser.py +449 -11
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -215
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.14.dist-info/RECORD +0 -39
  38. tests/__init__.py +0 -0
  39. tests/test_basic.py +0 -20
  40. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/loader/oss_handler.py CHANGED
@@ -1,191 +1,191 @@
- import datetime
- import os
- import subprocess
-
- import oss2
- from dotenv import load_dotenv
- from loguru import logger
- from tqdm import tqdm
-
- load_dotenv()
-
-
- def removing(path):
-     for root, dirs, files in os.walk(path):
-         for dir in dirs:
-             if dir == "__pycache__":
-                 pycache_path = os.path.join(root, dir)
-                 subprocess.run(["rm", "-rf", pycache_path], check=False)
-
-
- def format_size_adaptive(value):
-     units = ["B", "KB", "MB", "GB", "TB", "PB"]
-     size = 1024.0
-     for i in range(len(units)):
-         if (value / size) < 1:
-             return "%.2f%s" % (value, units[i])
-         value = value / size
-
-
- def format_datetime_into_isoformat(date_time: datetime.datetime) -> str:
-     return (
-         date_time.replace(tzinfo=datetime.timezone.utc)
-         .isoformat()
-         .replace("+00:00", "Z")
-     )
-
-
- class OssClient:
-     def __init__(
-         self, oss_access_key_id, oss_access_key_secret, oss_endpoint, oss_bucket_name
-     ):
-         self.bucket_name = oss_bucket_name
-         self.auth = oss2.Auth(
-             os.getenv("OSS_ACCESS_KEY_ID", oss_access_key_id),
-             os.getenv("OSS_ACCESS_KEY_SECRET", oss_access_key_secret),
-         )
-         self.endpoint = os.getenv("OSS_ENDPOINT", oss_endpoint)
-         self.bucket = oss2.Bucket(
-             self.auth, self.endpoint, os.getenv("OSS_BUCKET_NAME", oss_bucket_name)
-         )
-
-     # Upload a file
-     # Usage: ossBucket.put_object_from_file("my-object-key", "path/to/local/file.txt")
-     def put_object_from_file(self, object_name, file_path, progress_callback=None):
-         self.bucket.put_object_from_file(
-             object_name, file_path, progress_callback=progress_callback
-         )
-
-     # Download a file
-     # Usage: ossBucket.get_object_to_file("my-object-key", "path/to/local/output-file.txt")
-     def get_object_to_file(self, object_name, file_path, progress_callback=None):
-         try:
-             self.bucket.get_object_to_file(
-                 object_name, file_path, progress_callback=progress_callback
-             )
-         except oss2.exceptions.NoSuchKey:
-             raise
-
-     # Upload a folder
-
-     # Usage: ossBucket.put_object_from_folder("my-object-folder", "path/to/local/folder")
-     def put_pdf_word_from_folder(
-         self, object_folder_name, local_folder_path, progress_callback=None
-     ):
-         for root, dirs, files in os.walk(local_folder_path):
-             for file in tqdm(files, desc=root):
-                 if file.endswith(".pdf") or file.endswith(".word"):
-                     file_path = os.path.join(root, file)
-                     object_name = os.path.join(
-                         object_folder_name, file_path[len(local_folder_path) + 1 :]
-                     )
-                     self.bucket.put_object_from_file(
-                         object_name, file_path, progress_callback=progress_callback
-                     )
-                     # logger.info("object name: {}, file path: {}".format(
-                     #     object_name, file_path))
-
-     # Upload a folder
-     # Usage: ossBucket.put_object_from_folder("my-object-folder", "path/to/local/folder")
-     def put_object_from_folder(
-         self, object_folder_name, local_folder_path, progress_callback=None
-     ):
-         for root, dirs, files in os.walk(local_folder_path):
-             for file in tqdm(files, desc=root):
-                 file_path = os.path.join(root, file)
-                 object_name = os.path.join(
-                     object_folder_name, file_path[len(local_folder_path) + 1 :]
-                 )
-                 self.bucket.put_object_from_file(
-                     object_name, file_path, progress_callback=progress_callback
-                 )
-                 logger.info(
-                     "object name: {}, file path: {}".format(object_name, file_path)
-                 )
-
-     # Download a folder
-     # Usage: ossBucket.get_object_to_folder("my-object-folder", "path/to/local/output-folder")
-     def get_object_to_folder(
-         self, object_folder_name, local_folder_path, progress_callback=None
-     ):
-         os.makedirs(local_folder_path, exist_ok=True)
-         for obj in oss2.ObjectIterator(self.bucket, prefix=object_folder_name):
-             file_path = os.path.join(
-                 local_folder_path, obj.key[len(object_folder_name) + 1 :]
-             )
-             self.bucket.get_object_to_file(
-                 obj.key, file_path, progress_callback=progress_callback
-             )
-
-     # Get all objects in the bucket
-     # Usage: ossBucket.get_all_objects_in_bucket()
-     def get_all_objects_in_bucket(self, prefix=None, delimiter=None):
-         for obj in oss2.ObjectIterator(self.bucket, prefix=prefix, delimiter=delimiter):
-             if obj.is_prefix():  # obj is folder
-                 logger.info("directory key: {}".format(obj.key))
-             else:  # obj is file
-                 logger.info(
-                     "file key: {}, object last modified: {}, object size: {}".format(
-                         obj.key,
-                         format_datetime_into_isoformat(
-                             datetime.datetime.fromtimestamp(obj.last_modified)
-                         ),
-                         format_size_adaptive(obj.size),
-                     )
-                 )
-
-     def get_objects_in_folders(self, prefix: str):
-         all_keys = []
-         for obj in oss2.ObjectIterator(self.bucket, prefix=prefix):
-             if obj.is_prefix():  # obj is folder
-                 pass
-             else:  # obj is file
-                 if obj.key.endswith("/"):
-                     continue
-                 all_keys.append(obj.key)
-         return all_keys
-
-     def delete_object(self, object_name="test"):
-         if object_name is None or object_name == "":
-             raise Exception(
-                 "Danger! object name is None or '' Will delete all objects in bucket!"
-             )
-         self.bucket.delete_object(key=object_name)
-
-     # Delete a folder
-     # Usage: ossBucket.delete_object_folder("my-object-folder")
-     def delete_object_folder(self, object_folder_name="test"):
-         if object_folder_name is None or object_folder_name == "":
-             raise Exception(
-                 "Danger! object name is None or '' Will delete all objects in bucket!"
-             )
-         for obj in oss2.ObjectIterator(self.bucket, prefix=object_folder_name):
-             self.bucket.delete_object(obj.key)
-             logger.info("delete object key: {}".format(obj.key))
-
-     def get_oss_url(
-         self, object_name, url_expires_time, aliyun_oss_url_prefix, csnt_url_prefix
-     ):
-         oss_prefix = "oss://" + os.getenv("OSS_BUCKET_NAME", self.bucket_name) + "/"
-         if object_name.__contains__(oss_prefix):
-             object_name = object_name.replace(oss_prefix, "")
-         aliyun_url = self.bucket.sign_url(
-             "GET",
-             object_name,
-             int(os.getenv("URL_EXPIRES_TIME", url_expires_time)),
-             slash_safe=True,
-         )
-         csnt_url = aliyun_url.replace(
-             os.getenv("ALIYUN_OSS_URL_PREFIX", aliyun_oss_url_prefix),
-             os.getenv("CSNT_URL_PREFIX", csnt_url_prefix),
-         )
-         return csnt_url
-
-     def get_default_oss_url(self, object_name: str, url_expires_time):
-         aliyun_url = self.bucket.sign_url(
-             "GET",
-             object_name,
-             int(os.getenv("url_expires_time", url_expires_time)),
-             slash_safe=True,
-         )
-         return aliyun_url
+ import datetime
+ import os
+ import subprocess
+
+ import oss2
+ from dotenv import load_dotenv
+ from loguru import logger
+ from tqdm import tqdm
+
+ load_dotenv()
+
+
+ def removing(path):
+     for root, dirs, files in os.walk(path):
+         for dir in dirs:
+             if dir == "__pycache__":
+                 pycache_path = os.path.join(root, dir)
+                 subprocess.run(["rm", "-rf", pycache_path], check=False)
+
+
+ def format_size_adaptive(value):
+     units = ["B", "KB", "MB", "GB", "TB", "PB"]
+     size = 1024.0
+     for i in range(len(units)):
+         if (value / size) < 1:
+             return "%.2f%s" % (value, units[i])
+         value = value / size
+
+
+ def format_datetime_into_isoformat(date_time: datetime.datetime) -> str:
+     return (
+         date_time.replace(tzinfo=datetime.timezone.utc)
+         .isoformat()
+         .replace("+00:00", "Z")
+     )
+
+
+ class OssClient:
+     def __init__(
+         self, oss_access_key_id, oss_access_key_secret, oss_endpoint, oss_bucket_name
+     ):
+         self.bucket_name = oss_bucket_name
+         self.auth = oss2.Auth(
+             os.getenv("OSS_ACCESS_KEY_ID", oss_access_key_id),
+             os.getenv("OSS_ACCESS_KEY_SECRET", oss_access_key_secret),
+         )
+         self.endpoint = os.getenv("OSS_ENDPOINT", oss_endpoint)
+         self.bucket = oss2.Bucket(
+             self.auth, self.endpoint, os.getenv("OSS_BUCKET_NAME", oss_bucket_name)
+         )
+
+     # Upload a file
+     # Usage: ossBucket.put_object_from_file("my-object-key", "path/to/local/file.txt")
+     def put_object_from_file(self, object_name, file_path, progress_callback=None):
+         self.bucket.put_object_from_file(
+             object_name, file_path, progress_callback=progress_callback
+         )
+
+     # Download a file
+     # Usage: ossBucket.get_object_to_file("my-object-key", "path/to/local/output-file.txt")
+     def get_object_to_file(self, object_name, file_path, progress_callback=None):
+         try:
+             self.bucket.get_object_to_file(
+                 object_name, file_path, progress_callback=progress_callback
+             )
+         except oss2.exceptions.NoSuchKey:
+             raise
+
+     # Upload a folder
+
+     # Usage: ossBucket.put_object_from_folder("my-object-folder", "path/to/local/folder")
+     def put_pdf_word_from_folder(
+         self, object_folder_name, local_folder_path, progress_callback=None
+     ):
+         for root, dirs, files in os.walk(local_folder_path):
+             for file in tqdm(files, desc=root):
+                 if file.endswith(".pdf") or file.endswith(".word"):
+                     file_path = os.path.join(root, file)
+                     object_name = os.path.join(
+                         object_folder_name, file_path[len(local_folder_path) + 1 :]
+                     )
+                     self.bucket.put_object_from_file(
+                         object_name, file_path, progress_callback=progress_callback
+                     )
+                     # logger.info("object name: {}, file path: {}".format(
+                     #     object_name, file_path))
+
+     # Upload a folder
+     # Usage: ossBucket.put_object_from_folder("my-object-folder", "path/to/local/folder")
+     def put_object_from_folder(
+         self, object_folder_name, local_folder_path, progress_callback=None
+     ):
+         for root, dirs, files in os.walk(local_folder_path):
+             for file in tqdm(files, desc=root):
+                 file_path = os.path.join(root, file)
+                 object_name = os.path.join(
+                     object_folder_name, file_path[len(local_folder_path) + 1 :]
+                 )
+                 self.bucket.put_object_from_file(
+                     object_name, file_path, progress_callback=progress_callback
+                 )
+                 logger.info(
+                     "object name: {}, file path: {}".format(object_name, file_path)
+                 )
+
+     # Download a folder
+     # Usage: ossBucket.get_object_to_folder("my-object-folder", "path/to/local/output-folder")
+     def get_object_to_folder(
+         self, object_folder_name, local_folder_path, progress_callback=None
+     ):
+         os.makedirs(local_folder_path, exist_ok=True)
+         for obj in oss2.ObjectIterator(self.bucket, prefix=object_folder_name):
+             file_path = os.path.join(
+                 local_folder_path, obj.key[len(object_folder_name) + 1 :]
+             )
+             self.bucket.get_object_to_file(
+                 obj.key, file_path, progress_callback=progress_callback
+             )
+
+     # Get all objects in the bucket
+     # Usage: ossBucket.get_all_objects_in_bucket()
+     def get_all_objects_in_bucket(self, prefix=None, delimiter=None):
+         for obj in oss2.ObjectIterator(self.bucket, prefix=prefix, delimiter=delimiter):
+             if obj.is_prefix():  # obj is folder
+                 logger.info("directory key: {}".format(obj.key))
+             else:  # obj is file
+                 logger.info(
+                     "file key: {}, object last modified: {}, object size: {}".format(
+                         obj.key,
+                         format_datetime_into_isoformat(
+                             datetime.datetime.fromtimestamp(obj.last_modified)
+                         ),
+                         format_size_adaptive(obj.size),
+                     )
+                 )
+
+     def get_objects_in_folders(self, prefix: str):
+         all_keys = []
+         for obj in oss2.ObjectIterator(self.bucket, prefix=prefix):
+             if obj.is_prefix():  # obj is folder
+                 pass
+             else:  # obj is file
+                 if obj.key.endswith("/"):
+                     continue
+                 all_keys.append(obj.key)
+         return all_keys
+
+     def delete_object(self, object_name="test"):
+         if object_name is None or object_name == "":
+             raise Exception(
+                 "Danger! object name is None or '' Will delete all objects in bucket!"
+             )
+         self.bucket.delete_object(key=object_name)
+
+     # Delete a folder
+     # Usage: ossBucket.delete_object_folder("my-object-folder")
+     def delete_object_folder(self, object_folder_name="test"):
+         if object_folder_name is None or object_folder_name == "":
+             raise Exception(
+                 "Danger! object name is None or '' Will delete all objects in bucket!"
+             )
+         for obj in oss2.ObjectIterator(self.bucket, prefix=object_folder_name):
+             self.bucket.delete_object(obj.key)
+             logger.info("delete object key: {}".format(obj.key))
+
+     def get_oss_url(
+         self, object_name, url_expires_time, aliyun_oss_url_prefix, csnt_url_prefix
+     ):
+         oss_prefix = "oss://" + os.getenv("OSS_BUCKET_NAME", self.bucket_name) + "/"
+         if object_name.__contains__(oss_prefix):
+             object_name = object_name.replace(oss_prefix, "")
+         aliyun_url = self.bucket.sign_url(
+             "GET",
+             object_name,
+             int(os.getenv("URL_EXPIRES_TIME", url_expires_time)),
+             slash_safe=True,
+         )
+         csnt_url = aliyun_url.replace(
+             os.getenv("ALIYUN_OSS_URL_PREFIX", aliyun_oss_url_prefix),
+             os.getenv("CSNT_URL_PREFIX", csnt_url_prefix),
+         )
+         return csnt_url
+
+     def get_default_oss_url(self, object_name: str, url_expires_time):
+         aliyun_url = self.bucket.sign_url(
+             "GET",
+             object_name,
+             int(os.getenv("url_expires_time", url_expires_time)),
+             slash_safe=True,
+         )
+         return aliyun_url
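Note: the removed and added blocks above are textually identical, so the rewrite is most likely a whitespace or line-ending change. For readers evaluating the module itself, here is a minimal usage sketch of OssClient, assuming valid Alibaba Cloud OSS credentials; the key, secret, endpoint, bucket, and object names below are placeholders, not values from the package. Because of the os.getenv(...) fallbacks in __init__, environment variables such as OSS_ACCESS_KEY_ID take precedence over the constructor arguments.

# Sketch only: placeholder credentials and object names.
from datamax.loader.oss_handler import OssClient

client = OssClient(
    oss_access_key_id="YOUR_KEY_ID",          # overridden by $OSS_ACCESS_KEY_ID if set
    oss_access_key_secret="YOUR_KEY_SECRET",  # overridden by $OSS_ACCESS_KEY_SECRET if set
    oss_endpoint="https://oss-cn-hangzhou.aliyuncs.com",
    oss_bucket_name="my-bucket",
)

# Round-trip a single object.
client.put_object_from_file("docs/report.pdf", "local/report.pdf")
client.get_object_to_file("docs/report.pdf", "local/report-copy.pdf")

# Collect file keys under a prefix; keys ending in "/" (folder markers) are skipped.
print(client.get_objects_in_folders(prefix="docs/"))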
datamax/parser/__init__.py CHANGED
@@ -1,4 +1,2 @@
- from .core import DataMax
- import logging
- logger = logging.getLogger()
- logger.addHandler(logging.NullHandler())
+ from .core import DataMax
+ from loguru import logger
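In 0.1.15 the parser package drops the stdlib NullHandler setup and re-exports loguru's global logger instead. Under standard loguru semantics, a consuming application would therefore control datamax's log output like this (a minimal sketch):

# With loguru there is no NullHandler; the library's records are toggled by name.
from loguru import logger

logger.disable("datamax")  # silence all records emitted from the datamax package
logger.enable("datamax")   # re-enable them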
datamax/parser/base.py CHANGED
@@ -1,77 +1,77 @@
- import os
- from datetime import datetime
- from pathlib import Path
- from typing import List, Dict
- from datamax.utils.tokenizer import DashScopeClient
-
-
- class LifeCycle:
-     """
-     Life cycle class
-     """
-
-     def __init__(self, update_time: str, life_type: list, life_metadata: Dict[str, str]):
-         self.update_time = update_time  # Update time
-         self.life_type = life_type  # Life cycle type
-         self.life_metadata = life_metadata  # Life cycle metadata
-
-     def update(self, update_time: str, life_type: list, life_metadata: Dict[str, str]):
-         self.update_time = update_time
-         self.life_type = life_type
-         self.life_metadata.update(life_metadata)
-
-     def __str__(self):
-         metadata_str = ', '.join(f'{k}: {v}' for k, v in self.life_metadata.items())
-         return f'update_time: {self.update_time}, life_type: {self.life_type}, life_metadata: {{{metadata_str}}}'
-
-     def to_dict(self):
-         return {
-             'update_time': self.update_time,
-             'life_type': self.life_type,
-             'life_metadata': self.life_metadata
-         }
-
-
- class MarkdownOutputVo:
-     """
-     Markdown output conversion
-     """
-
-     def __init__(self, title: str, content: str):
-         self.title: str = title  # File type
-         self.content: str = content  # Markdown content
-         self.lifecycle: List[LifeCycle] = []  # Life cycle data
-
-     def add_lifecycle(self, lifecycle: LifeCycle):
-         self.lifecycle.append(lifecycle)
-
-     def to_dict(self):
-         data_dict = {
-             'title': self.title,
-             'content': self.content,
-             'lifecycle': [lc.to_dict() for lc in self.lifecycle]
-         }
-         return data_dict
-
-
- class BaseLife:
-     tk_client = DashScopeClient()
-
-     @staticmethod
-     def generate_lifecycle(source_file, domain, life_type, usage_purpose) -> LifeCycle:
-         update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-         life_type = [life_type]
-         storage = os.stat(source_file)
-         life_metadata = {
-             # "token_count": token_count,  # Token count of the text
-             "storage_size": storage.st_size,  # Storage size in bytes
-             "source_file": source_file,  # Source file
-             "domain": domain,  # Domain
-             "usage_purpose": usage_purpose  # Usage purpose
-         }
-         return LifeCycle(update_time, life_type, life_metadata)
-
-     @staticmethod
-     def get_file_extension(file_path):
-         file_path = Path(file_path)
+ import os
+ from datetime import datetime
+ from pathlib import Path
+ from typing import List, Dict
+ from datamax.utils.tokenizer import DashScopeClient
+
+
+ class LifeCycle:
+     """
+     Life cycle class
+     """
+
+     def __init__(self, update_time: str, life_type: list, life_metadata: Dict[str, str]):
+         self.update_time = update_time  # Update time
+         self.life_type = life_type  # Life cycle type
+         self.life_metadata = life_metadata  # Life cycle metadata
+
+     def update(self, update_time: str, life_type: list, life_metadata: Dict[str, str]):
+         self.update_time = update_time
+         self.life_type = life_type
+         self.life_metadata.update(life_metadata)
+
+     def __str__(self):
+         metadata_str = ', '.join(f'{k}: {v}' for k, v in self.life_metadata.items())
+         return f'update_time: {self.update_time}, life_type: {self.life_type}, life_metadata: {{{metadata_str}}}'
+
+     def to_dict(self):
+         return {
+             'update_time': self.update_time,
+             'life_type': self.life_type,
+             'life_metadata': self.life_metadata
+         }
+
+
+ class MarkdownOutputVo:
+     """
+     Markdown output conversion
+     """
+
+     def __init__(self, title: str, content: str):
+         self.title: str = title  # File type
+         self.content: str = content  # Markdown content
+         self.lifecycle: List[LifeCycle] = []  # Life cycle data
+
+     def add_lifecycle(self, lifecycle: LifeCycle):
+         self.lifecycle.append(lifecycle)
+
+     def to_dict(self):
+         data_dict = {
+             'title': self.title,
+             'content': self.content,
+             'lifecycle': [lc.to_dict() for lc in self.lifecycle]
+         }
+         return data_dict
+
+
+ class BaseLife:
+     tk_client = DashScopeClient()
+
+     @staticmethod
+     def generate_lifecycle(source_file, domain, life_type, usage_purpose) -> LifeCycle:
+         update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+         life_type = [life_type]
+         storage = os.stat(source_file)
+         life_metadata = {
+             # "token_count": token_count,  # Token count of the text
+             "storage_size": storage.st_size,  # Storage size in bytes
+             "source_file": source_file,  # Source file
+             "domain": domain,  # Domain
+             "usage_purpose": usage_purpose  # Usage purpose
+         }
+         return LifeCycle(update_time, life_type, life_metadata)
+
+     @staticmethod
+     def get_file_extension(file_path):
+         file_path = Path(file_path)
          return file_path.suffix[1:].lower()
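For context, the classes above chain together as follows. This is a minimal sketch, assuming example.txt exists locally (generate_lifecycle calls os.stat on it) and that DashScopeClient(), which is instantiated at class-definition time on BaseLife, can be constructed without extra configuration; the domain, life_type, and usage_purpose values are illustrative placeholders, not values defined by the package.

# Sketch only: builds one MarkdownOutputVo with a single lifecycle entry.
from datamax.parser.base import BaseLife, MarkdownOutputVo

vo = MarkdownOutputVo(title="txt", content="# Example\n\nMarkdown body.")
vo.add_lifecycle(
    BaseLife.generate_lifecycle(
        source_file="example.txt",    # must exist: os.stat() reads its size
        domain="demo",                # placeholder domain label
        life_type="DATA_PROCESSING",  # placeholder; wrapped into a one-element list
        usage_purpose="testing",      # placeholder purpose
    )
)
print(vo.to_dict())  # title, content, and the lifecycle entries as plain dicts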