pydatamax 0.1.14__tar.gz → 0.1.15.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/LICENSE +21 -21
  2. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/PKG-INFO +117 -5
  3. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/README.md +116 -4
  4. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/__init__.py +1 -1
  5. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/__init__.py +0 -0
  6. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/core.py +118 -118
  7. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/minio_handler.py +171 -171
  8. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/oss_handler.py +191 -191
  9. pydatamax-0.1.15.post2/datamax/parser/__init__.py +2 -0
  10. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/base.py +76 -76
  11. pydatamax-0.1.15.post2/datamax/parser/core.py +406 -0
  12. pydatamax-0.1.15.post2/datamax/parser/csv_parser.py +31 -0
  13. pydatamax-0.1.15.post2/datamax/parser/doc_parser.py +659 -0
  14. pydatamax-0.1.15.post2/datamax/parser/docx_parser.py +662 -0
  15. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/epub_parser.py +41 -41
  16. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/html_parser.py +37 -37
  17. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/image_parser.py +34 -34
  18. pydatamax-0.1.15.post2/datamax/parser/json_parser.py +32 -0
  19. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/md_parser.py +72 -72
  20. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/pdf_parser.py +101 -101
  21. pydatamax-0.1.15.post2/datamax/parser/ppt_parser.py +124 -0
  22. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/pptx_parser.py +45 -45
  23. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/txt_parser.py +45 -45
  24. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/xls_parser.py +26 -26
  25. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/xlsx_parser.py +212 -215
  26. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/__init__.py +23 -2
  27. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/constants.py +58 -58
  28. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/data_cleaner.py +275 -237
  29. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/env_setup.py +79 -79
  30. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/gotocr_pdf.py +265 -265
  31. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/mineru_operator.py +62 -62
  32. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/paddleocr_pdf_operator.py +90 -90
  33. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/ppt_extract.py +140 -140
  34. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/qa_generator.py +369 -376
  35. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/tokenizer.py +21 -21
  36. pydatamax-0.1.15.post2/datamax/utils/uno_handler.py +426 -0
  37. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/PKG-INFO +117 -5
  38. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/SOURCES.txt +5 -2
  39. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/dependency_links.txt +0 -0
  40. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/requires.txt +0 -0
  41. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/top_level.txt +0 -1
  42. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/setup.cfg +0 -0
  43. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/setup.py +58 -58
  44. pydatamax-0.1.15.post2/tests/test_doc_parser.py +247 -0
  45. pydatamax-0.1.15.post2/tests/test_docx_format_analysis.py +340 -0
  46. pydatamax-0.1.15.post2/tests/test_docx_parser.py +310 -0
  47. pydatamax-0.1.15.post2/tests/test_wps_doc.py +138 -0
  48. pydatamax-0.1.14/datamax/parser/__init__.py +0 -4
  49. pydatamax-0.1.14/datamax/parser/core.py +0 -288
  50. pydatamax-0.1.14/datamax/parser/csv_parser.py +0 -10
  51. pydatamax-0.1.14/datamax/parser/doc_parser.py +0 -203
  52. pydatamax-0.1.14/datamax/parser/docx_parser.py +0 -224
  53. pydatamax-0.1.14/datamax/parser/json_parser.py +0 -10
  54. pydatamax-0.1.14/datamax/parser/ppt_parser.py +0 -74
  55. pydatamax-0.1.14/tests/__init__.py +0 -0
  56. pydatamax-0.1.14/tests/test_basic.py +0 -20
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2024 Hi-Dolphin
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2024 Hi-Dolphin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydatamax
-Version: 0.1.14
+Version: 0.1.15.post2
 Summary: A library for parsing and converting various file formats.
 Home-page: https://github.com/Hi-Dolphin/datamax
 Author: ccy
@@ -105,10 +105,15 @@ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
 
 # AI annotation
 qa_data = dm.get_pre_label(
-    api_key="your-api-key",
-    base_url="https://api.openai.com/v1",
-    model_name="gpt-3.5-turbo"
+    api_key="sk-xxx",
+    base_url="https://api.provider.com/v1",
+    model_name="model-name",
+    chunk_size=500,      # Text chunk size
+    chunk_overlap=100,   # Overlap length
+    question_number=5,   # Questions generated per chunk
+    max_workers=5        # Number of concurrent workers
 )
+dm.save_label_data(res)
 ```
 
 ## 📖 Detailed Documentation
@@ -138,8 +143,54 @@ dm = DataMax(file_path="document.docx", to_markdown=True)
 # Image OCR
 dm = DataMax(file_path="image.jpg", use_ocr=True)
 ```
+### Batch Processing
+```python
+# Parse multiple files in batch
+dm = DataMax(
+    file_path=["file1.pdf", "file2.docx"],
+    use_mineru=True
+)
+data = dm.get_data()
+```
+
+### Cache parsed results
+```python
+# Cache parsed results to avoid repeated parsing
+dm = DataMax(
+    file_path=["file1.pdf", "file2.docx"],
+    ttl=3600  # Cache duration in seconds, default 3600s, 0 means no caching
+)
+data = dm.get_data()
+```
 
 ### Data Cleaning
+## Exception Handling
+
+- remove_abnormal_chars Remove abnormal characters from text
+- remove_html_tags Remove HTML tags
+- convert_newlines Convert \r to \n and merge multiple \n into single \n
+- single_space Convert multiple spaces (more than 2) to single space
+- tabs_to_spaces Convert tabs to 4 spaces
+- remove_invisible_chars Remove invisible ASCII characters
+- simplify_chinese Convert traditional Chinese to simplified Chinese
+
+## Text Filtering
+
+- filter_by_word_repetition Filter by word repetition rate
+- filter_by_char_count Filter by character count
+- filter_by_numeric_content Filter by numeric content ratio
+
+## Privacy Desensitization
+
+- replace_ip
+- replace_email
+- replace_customer_number Clean hotline numbers like 4008-123-123
+- replace_bank_id
+- replace_phone_number
+- replace_qq
+- replace_id_card
+
+
 
 ```python
 # Three cleaning modes
@@ -148,6 +199,67 @@ dm.clean_data(method_list=[
     "private",  # Privacy information masking
     "filter"    # Text filtering and normalization
 ])
+
+# Custom cleaning mode
+from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
+dm = DataMax(
+    file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
+)
+parsed_data = dm.get_data().get('content')
+# 1. Text filtering
+tf = TextFilter(parsed_data=parsed_data)
+# Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
+tf_bool = tf.filter_by_word_repetition(threshold=0.6)
+if tf_bool:
+    print("Text passed word repetition filtering")
+else:
+    print("Text failed word repetition filtering")
+
+# Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
+tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
+if tf_bool:
+    print("Text passed character count filtering")
+else:
+    print("Text failed character count filtering")
+
+# Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
+tf_bool = tf.filter_by_numeric_content(threshold=0.6)
+if tf_bool:
+    print("Text passed numeric ratio filtering")
+else:
+    print("Text failed numeric ratio filtering")
+
+# 2. Privacy desensitization
+pd = PrivacyDesensitization(parsed_data=parsed_data)
+res = pd.replace_ip(
+    token="MyIP"
+)
+print(res)
+
+# 3. Abnormal character cleaning
+ac = AbnormalCleaner(parsed_data=parsed_data)
+res = ac.remove_abnormal_chars()
+res = ac.remove_html_tags()
+res = ac.convert_newlines()
+res = ac.single_space()
+res = ac.tabs_to_spaces()
+res = ac.remove_invisible_chars()
+res = ac.simplify_chinese()
+print(res)
+```
+# Text Segmentation
+```python
+dm.split_data(
+    chunk_size=500,      # Chunk size
+    chunk_overlap=100,   # Overlap length
+    use_langchain=True   # Use LangChain for text segmentation
+)
+
+# When use_langchain is False, use custom segmentation method
+# Using 。!? as separators, consecutive separators will be merged
+# chunk_size strictly limits the string length
+for chunk in parser.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
+    print(chunk)
 ```
 
 ### AI Annotation
@@ -225,4 +337,4 @@ This project is licensed under the [MIT License](LICENSE).
 
 ---
 
-⭐ If this project helps you, please give us a star!
+⭐ If this project helps you, please give us a star!
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/README.md
@@ -44,10 +44,15 @@ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
 
 # AI annotation
 qa_data = dm.get_pre_label(
-    api_key="your-api-key",
-    base_url="https://api.openai.com/v1",
-    model_name="gpt-3.5-turbo"
+    api_key="sk-xxx",
+    base_url="https://api.provider.com/v1",
+    model_name="model-name",
+    chunk_size=500,      # Text chunk size
+    chunk_overlap=100,   # Overlap length
+    question_number=5,   # Questions generated per chunk
+    max_workers=5        # Number of concurrent workers
 )
+dm.save_label_data(res)
 ```
 
 ## 📖 Detailed Documentation
@@ -77,8 +82,54 @@ dm = DataMax(file_path="document.docx", to_markdown=True)
 # Image OCR
 dm = DataMax(file_path="image.jpg", use_ocr=True)
 ```
+### Batch Processing
+```python
+# Parse multiple files in batch
+dm = DataMax(
+    file_path=["file1.pdf", "file2.docx"],
+    use_mineru=True
+)
+data = dm.get_data()
+```
+
+### Cache parsed results
+```python
+# Cache parsed results to avoid repeated parsing
+dm = DataMax(
+    file_path=["file1.pdf", "file2.docx"],
+    ttl=3600  # Cache duration in seconds, default 3600s, 0 means no caching
+)
+data = dm.get_data()
+```
 
 ### Data Cleaning
+## Exception Handling
+
+- remove_abnormal_chars Remove abnormal characters from text
+- remove_html_tags Remove HTML tags
+- convert_newlines Convert \r to \n and merge multiple \n into single \n
+- single_space Convert multiple spaces (more than 2) to single space
+- tabs_to_spaces Convert tabs to 4 spaces
+- remove_invisible_chars Remove invisible ASCII characters
+- simplify_chinese Convert traditional Chinese to simplified Chinese
+
+## Text Filtering
+
+- filter_by_word_repetition Filter by word repetition rate
+- filter_by_char_count Filter by character count
+- filter_by_numeric_content Filter by numeric content ratio
+
+## Privacy Desensitization
+
+- replace_ip
+- replace_email
+- replace_customer_number Clean hotline numbers like 4008-123-123
+- replace_bank_id
+- replace_phone_number
+- replace_qq
+- replace_id_card
+
+
 
 ```python
 # Three cleaning modes
@@ -87,6 +138,67 @@ dm.clean_data(method_list=[
     "private",  # Privacy information masking
     "filter"    # Text filtering and normalization
 ])
+
+# Custom cleaning mode
+from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
+dm = DataMax(
+    file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
+)
+parsed_data = dm.get_data().get('content')
+# 1. Text filtering
+tf = TextFilter(parsed_data=parsed_data)
+# Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
+tf_bool = tf.filter_by_word_repetition(threshold=0.6)
+if tf_bool:
+    print("Text passed word repetition filtering")
+else:
+    print("Text failed word repetition filtering")
+
+# Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
+tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
+if tf_bool:
+    print("Text passed character count filtering")
+else:
+    print("Text failed character count filtering")
+
+# Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
+tf_bool = tf.filter_by_numeric_content(threshold=0.6)
+if tf_bool:
+    print("Text passed numeric ratio filtering")
+else:
+    print("Text failed numeric ratio filtering")
+
+# 2. Privacy desensitization
+pd = PrivacyDesensitization(parsed_data=parsed_data)
+res = pd.replace_ip(
+    token="MyIP"
+)
+print(res)
+
+# 3. Abnormal character cleaning
+ac = AbnormalCleaner(parsed_data=parsed_data)
+res = ac.remove_abnormal_chars()
+res = ac.remove_html_tags()
+res = ac.convert_newlines()
+res = ac.single_space()
+res = ac.tabs_to_spaces()
+res = ac.remove_invisible_chars()
+res = ac.simplify_chinese()
+print(res)
+```
+# Text Segmentation
+```python
+dm.split_data(
+    chunk_size=500,      # Chunk size
+    chunk_overlap=100,   # Overlap length
+    use_langchain=True   # Use LangChain for text segmentation
+)
+
+# When use_langchain is False, use custom segmentation method
+# Using 。!? as separators, consecutive separators will be merged
+# chunk_size strictly limits the string length
+for chunk in parser.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
+    print(chunk)
 ```
 
 ### AI Annotation
@@ -164,4 +276,4 @@ This project is licensed under the [MIT License](LICENSE).
 
 ---
 
-⭐ If this project helps you, please give us a star!
+⭐ If this project helps you, please give us a star!
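Read together, the README additions above (batch parsing, result caching, the cleaning classes, split_data, and the extended get_pre_label signature) describe the full 0.1.15 workflow. A minimal end-to-end sketch using only the API as documented in those hunks; the file path, API key, and endpoint are placeholders, and it assumes save_label_data accepts the QA list returned by get_pre_label (the README snippet passes a variable named res):

```python
from datamax import DataMax

# Parse a single document to Markdown (placeholder path)
dm = DataMax(file_path="document.docx", to_markdown=True)
data = dm.get_data()

# Built-in cleaning modes documented above
cleaned = dm.clean_data(method_list=["abnormal", "private", "filter"])

# Chunk the text, then generate and persist QA pairs
chunks = dm.split_data(chunk_size=500, chunk_overlap=100, use_langchain=True)
qa_data = dm.get_pre_label(
    api_key="sk-xxx",                        # placeholder credential
    base_url="https://api.provider.com/v1",  # placeholder endpoint
    model_name="model-name",
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    max_workers=5,
)
dm.save_label_data(qa_data)  # assumption: takes the list returned by get_pre_label
```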
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/__init__.py
@@ -1 +1 @@
-from .parser import DataMax
+from .parser import DataMax
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/core.py
@@ -1,119 +1,119 @@
-import os
-from typing import List
-from datamax.loader.minio_handler import MinIOClient
-from datamax.loader.oss_handler import OssClient
-
-
-class DataLoader:
-    def __init__(self, endpoint: str = None, secret_key: str = None, access_key: str = None,
-                 bucket_name: str = None, source: str = None):
-        if source and source == 'Oss':
-            self.oss = OssClient(
-                oss_endpoint=endpoint,
-                oss_access_key_secret=secret_key,
-                oss_access_key_id=access_key,
-                oss_bucket_name=bucket_name
-            )
-        elif source and source == 'MinIO':
-            self.mi = MinIOClient(
-                endpoint=endpoint,
-                secret_key=secret_key,
-                access_key=access_key,
-                bucket_name=bucket_name
-            )
-        self.download_path = str('./download_file')
-        self.source = source
-        self.bucket_name = bucket_name
-
-    @staticmethod
-    def load_from_file(local_file_path) -> List[str]:
-        if os.path.isfile(local_file_path):
-            if os.path.exists(local_file_path):
-                if os.access(local_file_path, os.R_OK):
-                    return [local_file_path]
-                else:
-                    return []
-            else:
-                return []
-        elif os.path.isdir(local_file_path):
-            access_path = []
-            for root, dirs, files in os.walk(local_file_path):
-                for file in files:
-                    file_path = os.path.join(root, file)
-                    if os.path.exists(file_path):
-                        if os.access(file_path, os.R_OK):
-                            access_path.append(file_path)
-                        else:
-                            continue
-                    else:
-                        continue
-            return access_path
-        else:
-            return []
-
-    def load_from_oss_source(self, oss_path: str) -> List[str]:
-        if not os.path.exists(self.download_path):
-            os.makedirs(self.download_path)
-
-        self.download(oss_path=oss_path)
-
-        file_list = []
-        for root, dirs, files in os.walk(self.download_path):
-            for file in files:
-                file_path = os.path.join(self.download_path, file)
-                file_list.append(file_path)
-
-        success_file_list = []
-        for file_path in file_list:
-            if self.load_from_file(file_path):
-                success_file_list.append(file_path)
-
-        return success_file_list
-
-    def download(self, oss_path: str):
-        if self.source == 'MinIO':
-            file_list = self.mi.list_objects(bucket_name=self.bucket_name, prefix=oss_path)
-            for path in file_list:
-                self.mi.download_file(bucket_name=self.bucket_name, object_name=path,
-                                      file_path=f'{self.download_path}/{path.split("/")[-1]}')
-        elif self.source == "Oss":
-            keys = self.oss.get_objects_in_folders(prefix=oss_path)
-            for path in keys:
-                self.oss.get_object_to_file(object_name=path,
-                                            file_path=f'{self.download_path}/{path.split("/")[-1]}')
-
-    def upload(self, local_file_path: str, save_prefix: str):
-        if self.source == 'MinIO':
-            if os.path.isdir(local_file_path):
-                for root, dirs, files in os.walk(local_file_path):
-                    for file in files:
-                        file_path = os.path.join(root, file)
-                        self.mi.upload_file(bucket_name=self.bucket_name, object_name=save_prefix + f'{file}',
-                                            file_path=file_path)
-            elif os.path.isfile(local_file_path):
-                self.mi.upload_file(bucket_name=self.bucket_name,
-                                    object_name=save_prefix + os.path.basename(local_file_path),
-                                    file_path=local_file_path)
-            else:
-                pass
-
-        elif self.source == "Oss":
-            if os.path.isdir(local_file_path):
-                self.oss.put_object_from_folder(object_folder_name=save_prefix, local_folder_path=local_file_path)
-            elif os.path.isfile(local_file_path):
-                self.oss.put_object_from_file(object_name=save_prefix + os.path.basename(local_file_path),
-                                              file_path=local_file_path)
-            else:
-                pass
-
-    def share(self, oss_path: str,
-              expires: int = None,
-              aliyun_oss_url_prefix: str = None,
-              csnt_url_prefix: str = None):
-        if self.source == 'MinIO':
-            return self.mi.get_object_tmp_link(bucket_name=self.bucket_name, object_name=oss_path, expires=expires)
-        elif self.source == "Oss":
-            return self.oss.get_oss_url(object_name=oss_path,
-                                        url_expires_time=expires,
-                                        aliyun_oss_url_prefix=aliyun_oss_url_prefix,
+import os
+from typing import List
+from datamax.loader.minio_handler import MinIOClient
+from datamax.loader.oss_handler import OssClient
+
+
+class DataLoader:
+    def __init__(self, endpoint: str = None, secret_key: str = None, access_key: str = None,
+                 bucket_name: str = None, source: str = None):
+        if source and source == 'Oss':
+            self.oss = OssClient(
+                oss_endpoint=endpoint,
+                oss_access_key_secret=secret_key,
+                oss_access_key_id=access_key,
+                oss_bucket_name=bucket_name
+            )
+        elif source and source == 'MinIO':
+            self.mi = MinIOClient(
+                endpoint=endpoint,
+                secret_key=secret_key,
+                access_key=access_key,
+                bucket_name=bucket_name
+            )
+        self.download_path = str('./download_file')
+        self.source = source
+        self.bucket_name = bucket_name
+
+    @staticmethod
+    def load_from_file(local_file_path) -> List[str]:
+        if os.path.isfile(local_file_path):
+            if os.path.exists(local_file_path):
+                if os.access(local_file_path, os.R_OK):
+                    return [local_file_path]
+                else:
+                    return []
+            else:
+                return []
+        elif os.path.isdir(local_file_path):
+            access_path = []
+            for root, dirs, files in os.walk(local_file_path):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    if os.path.exists(file_path):
+                        if os.access(file_path, os.R_OK):
+                            access_path.append(file_path)
+                        else:
+                            continue
+                    else:
+                        continue
+            return access_path
+        else:
+            return []
+
+    def load_from_oss_source(self, oss_path: str) -> List[str]:
+        if not os.path.exists(self.download_path):
+            os.makedirs(self.download_path)
+
+        self.download(oss_path=oss_path)
+
+        file_list = []
+        for root, dirs, files in os.walk(self.download_path):
+            for file in files:
+                file_path = os.path.join(self.download_path, file)
+                file_list.append(file_path)
+
+        success_file_list = []
+        for file_path in file_list:
+            if self.load_from_file(file_path):
+                success_file_list.append(file_path)
+
+        return success_file_list
+
+    def download(self, oss_path: str):
+        if self.source == 'MinIO':
+            file_list = self.mi.list_objects(bucket_name=self.bucket_name, prefix=oss_path)
+            for path in file_list:
+                self.mi.download_file(bucket_name=self.bucket_name, object_name=path,
+                                      file_path=f'{self.download_path}/{path.split("/")[-1]}')
+        elif self.source == "Oss":
+            keys = self.oss.get_objects_in_folders(prefix=oss_path)
+            for path in keys:
+                self.oss.get_object_to_file(object_name=path,
+                                            file_path=f'{self.download_path}/{path.split("/")[-1]}')
+
+    def upload(self, local_file_path: str, save_prefix: str):
+        if self.source == 'MinIO':
+            if os.path.isdir(local_file_path):
+                for root, dirs, files in os.walk(local_file_path):
+                    for file in files:
+                        file_path = os.path.join(root, file)
+                        self.mi.upload_file(bucket_name=self.bucket_name, object_name=save_prefix + f'{file}',
+                                            file_path=file_path)
+            elif os.path.isfile(local_file_path):
+                self.mi.upload_file(bucket_name=self.bucket_name,
+                                    object_name=save_prefix + os.path.basename(local_file_path),
+                                    file_path=local_file_path)
+            else:
+                pass
+
+        elif self.source == "Oss":
+            if os.path.isdir(local_file_path):
+                self.oss.put_object_from_folder(object_folder_name=save_prefix, local_folder_path=local_file_path)
+            elif os.path.isfile(local_file_path):
+                self.oss.put_object_from_file(object_name=save_prefix + os.path.basename(local_file_path),
+                                              file_path=local_file_path)
+            else:
+                pass
+
+    def share(self, oss_path: str,
+              expires: int = None,
+              aliyun_oss_url_prefix: str = None,
+              csnt_url_prefix: str = None):
+        if self.source == 'MinIO':
+            return self.mi.get_object_tmp_link(bucket_name=self.bucket_name, object_name=oss_path, expires=expires)
+        elif self.source == "Oss":
+            return self.oss.get_oss_url(object_name=oss_path,
+                                        url_expires_time=expires,
+                                        aliyun_oss_url_prefix=aliyun_oss_url_prefix,
                                         csnt_url_prefix=csnt_url_prefix)
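The loader/core.py hunk above carries the DataLoader class over unchanged apart from line endings. A short usage sketch derived only from the constructor and methods shown in that hunk; the endpoint, keys, bucket, and paths below are placeholders, and purely local loading needs no object-storage client at all:

```python
from datamax.loader.core import DataLoader

# Local loading: load_from_file is a staticmethod that returns readable paths
# for a single file or for every file under a directory.
readable_files = DataLoader.load_from_file("./docs")
print(readable_files)

# Object-storage loading: `source` selects the MinIO or Oss client built in __init__.
loader = DataLoader(
    endpoint="minio.example.com:9000",  # placeholder endpoint
    access_key="YOUR_ACCESS_KEY",       # placeholder credentials
    secret_key="YOUR_SECRET_KEY",
    bucket_name="my-bucket",
    source="MinIO",
)
# Downloads objects under the prefix into ./download_file and returns the readable local paths.
downloaded = loader.load_from_oss_source(oss_path="datasets/")
```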