tfduck-bsd 0.18.9__tar.gz → 0.19.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tfduck-bsd might be problematic.
- {tfduck-bsd-0.18.9/tfduck_bsd.egg-info → tfduck-bsd-0.19.1}/PKG-INFO +1 -1
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/setup.py +1 -1
- tfduck-bsd-0.19.1/tfduck/__init__.py +1 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/s3/s3oper.py +172 -71
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/serverless_k8s/k8s_task.py +8 -1
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1/tfduck_bsd.egg-info}/PKG-INFO +1 -1
- tfduck-bsd-0.18.9/tfduck/__init__.py +0 -1
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/LICENSE +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/README.md +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/bin/tfduck +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/setup.cfg +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/bdp_sdk_py/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/bdp_sdk_py/config/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/bdp_sdk_py/config/bdpmanager.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/bdp_sdk_py/config/table_config.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/bdp_sdk_py/example.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/bdp_sdk_py/opends/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/bdp_sdk_py/opends/opends.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/bdp_sdk_py/opends/sdk.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/common/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/common/defines.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/common/defines_clean.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/common/extendEncoder.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/main.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/oss/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/oss/oss.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/pyspark_k8s/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/pyspark_k8s/k8s_manage.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/pyspark_k8s/spark_manage.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/s3/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/sagemaker/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/sagemaker/saoper.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/serverless_k8s/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/serverless_k8s/k8s_manage.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/base_tga.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/predict_sql_ltv.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/predict_sql_retain.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/predict_sql_yh.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/tga.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/tga_test.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/train_sql_ltv.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/train_sql_retain.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/tga/train_sql_yh.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/thinkdata/__init__.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck/thinkdata/query.py +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck_bsd.egg-info/SOURCES.txt +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck_bsd.egg-info/dependency_links.txt +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck_bsd.egg-info/requires.txt +0 -0
- {tfduck-bsd-0.18.9 → tfduck-bsd-0.19.1}/tfduck_bsd.egg-info/top_level.txt +0 -0
tfduck/__init__.py (added in 0.19.1)

@@ -0,0 +1 @@
+__version__="0.19.1"

tfduck/s3/s3oper.py

@@ -2,6 +2,7 @@
 Common S3 operations
 See setup.py for the version number
 """
+
 # coding=utf-8
 import boto3
 import time

@@ -14,7 +15,13 @@ import gzip
 import pprint
 from tfduck.common.defines import BMOBJ, Et
 from botocore.exceptions import ClientError
-from concurrent.futures import ThreadPoolExecutor, as_completed, wait, ALL_COMPLETED, FIRST_COMPLETED
+from concurrent.futures import (
+    ThreadPoolExecutor,
+    as_completed,
+    wait,
+    ALL_COMPLETED,
+    FIRST_COMPLETED,
+)
 from io import StringIO, BytesIO
 from botocore.client import Config
 
@@ -24,7 +31,9 @@ class S3Oper(object):
     @des: common S3 operations, file upload
     """

-    def __init__(self, ctx={}, ak="", sk="", bucket="", region_name="", print_files_info=False):
+    def __init__(
+        self, ctx={}, ak="", sk="", bucket="", region_name="", print_files_info=False
+    ):
         """
         @des: initialization
         """

@@ -44,7 +53,6 @@ class S3Oper(object):
         initialize the client
         """
         self.set_s3_client()
-

     def get_s3_config(self):
         """

@@ -52,11 +60,14 @@ class S3Oper(object):
         """
         # connect_timeout and read_timeout both default to 60 seconds
         # max_pool_connections: the maximum connection pool size, default 10
-        config = Config(connect_timeout=60, read_timeout=600,
-                        retries={'max_attempts': 0},
-                        max_pool_connections=10)
+        config = Config(
+            connect_timeout=60,
+            read_timeout=600,
+            retries={"max_attempts": 0},  # set the default retry count to 0: no retries
+            # retries={"max_attempts": 5, "mode": "standard"},  # set the default retry count to 5
+            # if threads share one client, upload_folder's max_workers must be smaller than this value; but _upload_i currently creates one client per thread, so exceeding it has no effect here
+            max_pool_connections=10,
+        )
         # s3 = boto3.client('s3', config=config)
         return config

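For reference, this is standard `botocore.client.Config` usage; a minimal standalone sketch of the two retry policies mentioned in the comments above (it assumes default AWS credentials and a region are configured in the environment, and is not part of the package itself):

```python
# Minimal sketch of the two retry policies referenced above; assumes default
# AWS credentials/region are configured. Not part of tfduck itself.
import boto3
from botocore.client import Config

# Policy shipped in this release: no automatic retries, long read timeout.
no_retry_cfg = Config(
    connect_timeout=60,
    read_timeout=600,
    retries={"max_attempts": 0},
    max_pool_connections=10,
)

# The commented-out alternative: up to 5 attempts in "standard" mode, which
# retries throttling and transient errors with client-side backoff.
std_retry_cfg = Config(retries={"max_attempts": 5, "mode": "standard"})

s3_no_retry = boto3.client("s3", config=no_retry_cfg)
s3_std_retry = boto3.client("s3", config=std_retry_cfg)
```
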
@@ -67,7 +78,7 @@ class S3Oper(object):
         s3_session = boto3.Session(
             aws_access_key_id=self.access_key,
             aws_secret_access_key=self.secret_key,
-            region_name=self.region_name
+            region_name=self.region_name,
         )
         return s3_session

@@ -77,7 +88,7 @@ class S3Oper(object):
         """
         s3_session = self.get_s3_session()
         config = self.get_s3_config()
-        s3_resource = s3_session.resource('s3')
+        s3_resource = s3_session.resource("s3")
         # s3_resource = s3_session.resource('s3', config=config)
         return s3_resource

@@ -143,16 +154,16 @@ class S3Oper(object):
         can exceed 1000
         """
         paginator = client.get_paginator(
-            'list_objects_v2')  # MaxKeys is capped at 1000,
+            "list_objects_v2"
+        )  # MaxKeys is capped at 1000,
         # so paginate: many pages together can go past 1000
-        pages = paginator.paginate(
-            Bucket=self.bucket, Prefix=remote_path, MaxKeys=1000)
+        pages = paginator.paginate(Bucket=self.bucket, Prefix=remote_path, MaxKeys=1000)
         remote_path_parents_count = len(pathlib.Path(remote_path).parents)
         for page in pages:
-            for obj in page['Contents']:
-                key = obj['Key']
+            for obj in page["Contents"]:
+                key = obj["Key"]
                 if not isrm:
-                    if len(pathlib.Path(key).parents)-1 == remote_path_parents_count:
+                    if len(pathlib.Path(key).parents) - 1 == remote_path_parents_count:
                         file_list.append(key)
                     else:
                         file_list.append(key)

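Since `MaxKeys` caps a single `list_objects_v2` call at 1000 keys, the paginator is what lets the listing exceed that limit; a standalone sketch of the same pattern (the bucket and prefix names here are placeholders, not values from this package):

```python
# Standalone sketch of paginating past the 1000-key limit; "my-bucket" and
# "some/prefix" are placeholders. Uses page.get("Contents", []) so an empty
# prefix does not raise a KeyError.
import boto3

client = boto3.client("s3")
paginator = client.get_paginator("list_objects_v2")
keys = []
for page in paginator.paginate(Bucket="my-bucket", Prefix="some/prefix", MaxKeys=1000):
    for obj in page.get("Contents", []):
        keys.append(obj["Key"])
print(len(keys))  # can exceed 1000 across pages
```
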
@@ -166,23 +177,96 @@ class S3Oper(object):
         s3_client = self.get_s3_client()
         try:
             resp = s3_client.list_objects(
-                Bucket=self.bucket, Delimiter='/', Prefix=remote_path + '/')
+                Bucket=self.bucket, Delimiter="/", Prefix=remote_path + "/"
+            )
             return [d.get("Prefix") for d in resp.get("CommonPrefixes")]
         except ClientError as e:
             BMOBJ.clog(self.ctx, "get s3 folders error:", e)
             return []

-    def upload_file(self, local_path, remote_path):
+    # def upload_file(self, local_path, remote_path):
+    #     """
+    #     @des: upload a local file to s3
+    #     """
+    #     s3_client = self.get_s3_client()
+    #     try:
+    #         _ = s3_client.upload_file(local_path, self.bucket, remote_path)
+    #     except ClientError as e:
+    #         BMOBJ.clog(self.ctx, "upload s3 file error:", e)
+    #         return False
+    #     return True
+
+    def upload_file(self, local_path, remote_path, max_attempts=5, initial_delay=1):
         """
-        @des: upload a local file to s3
+        @des: upload a local file to s3, now with robust retry logic.
+        @param max_attempts: maximum number of attempts (including the first).
+        @param initial_delay: initial retry delay in seconds.
         """
         s3_client = self.get_s3_client()
-        try:
-            _ = s3_client.upload_file(local_path, self.bucket, remote_path)
-        except ClientError as e:
-            BMOBJ.clog(self.ctx, "upload s3 file error:", e)
-            return False
-        return True
+        last_exception = None
+
+        # Define which S3 error codes are retryable.
+        # 'InternalError' and 'SlowDown' are the most common; 'ThrottlingException' can also appear.
+        retryable_error_codes = {"InternalError", "SlowDown", "ThrottlingException"}
+
+        for attempt in range(max_attempts):
+            try:
+                # attempt the upload
+                _ = s3_client.upload_file(local_path, self.bucket, remote_path)
+                # on success, log and return True
+                if attempt > 0:  # worth a log line if success took more than one attempt
+                    BMOBJ.clog(
+                        self.ctx,
+                        f"Successfully uploaded {local_path} on attempt {attempt + 1}",
+                    )
+                return True
+            except ClientError as e:
+                last_exception = e
+                error_code = e.response.get("Error", {}).get("Code")
+
+                # check whether the error code is in our retryable set
+                if error_code in retryable_error_codes:
+                    # on the final attempt, skip the wait and break out to handle the failure
+                    if attempt == max_attempts - 1:
+                        BMOBJ.clog(
+                            self.ctx,
+                            f"Upload of {local_path} failed on the final attempt ({max_attempts}). Error: {e}",
+                        )
+                        break
+
+                    # compute the next wait (exponential backoff + random jitter)
+                    # wait = initial_delay * 2^attempt + a random 0-10 second jitter
+                    # base waits, e.g.: 1s, 2s, 4s, 8s, 16s
+                    sleep_time = (initial_delay * (2**attempt)) + random.randint(0, 10)
+
+                    BMOBJ.clog(
+                        self.ctx,
+                        f"Attempt {attempt + 1}/{max_attempts} to upload {local_path} failed with a retryable error: {error_code}. "
+                        f"Retrying in {sleep_time:.2f} seconds...",
+                    )
+                    time.sleep(sleep_time)
+                else:
+                    # for a non-retryable error (e.g. a permissions problem), log it and return False immediately
+                    BMOBJ.clog(
+                        self.ctx,
+                        f"Upload of {local_path} failed with a non-retryable error: {e}",
+                    )
+                    return False
+            except Exception as e:
+                # catch any other exception (e.g. network problems) and treat it as a failure
+                BMOBJ.clog(
+                    self.ctx,
+                    f"An unexpected error occurred during upload of {local_path}: {e}",
+                )
+                last_exception = e
+                break  # unknown exception: stop retrying
+
+        # if the loop ends without success, every attempt failed
+        BMOBJ.clog(
+            self.ctx,
+            f"Failed to upload {local_path} to {remote_path} after {max_attempts} attempts. Last error: {last_exception}",
+        )
+        return False

     def upload_fileobj(self, io_obj, remote_path):
         """

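For intuition, a quick standalone sketch of the backoff schedule the new `upload_file` produces with the default `initial_delay=1` (illustration only, not package code):

```python
# Quick standalone check of the backoff schedule used above:
# wait = initial_delay * 2**attempt + a random jitter in [0, 10] seconds.
import random

initial_delay = 1
for attempt in range(5):
    base = initial_delay * (2 ** attempt)
    jitter = random.randint(0, 10)
    print(f"attempt {attempt + 1}: sleep {base + jitter}s (base {base}s)")
# base waits: 1s, 2s, 4s, 8s, 16s
```
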
@@ -251,7 +335,7 @@ class S3Oper(object):
         try:
             s3_client.head_object(Bucket=self.bucket, Key=remote_path)
         except ClientError as e:
-            return int(e.response['Error']['Code']) != 404
+            return int(e.response["Error"]["Code"]) != 404
         return True

     def delete_file(self, remote_path):

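Worth noting: this existence check treats any non-404 `ClientError` (for example a 403 from missing permissions) as "file exists". A standalone sketch of the same `head_object` pattern (the client, bucket, and key here are placeholders):

```python
# Standalone sketch of the head_object existence-check pattern used above;
# note that any non-404 error code (e.g. 403 AccessDenied) reports "exists".
import boto3
from botocore.exceptions import ClientError

def s3_key_exists(client, bucket: str, key: str) -> bool:
    try:
        client.head_object(Bucket=bucket, Key=key)
    except ClientError as e:
        return int(e.response["Error"]["Code"]) != 404
    return True

# usage (placeholders): s3_key_exists(boto3.client("s3"), "my-bucket", "path/key")
```
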
@@ -274,7 +358,7 @@ class S3Oper(object):
         bucket.objects.filter(Prefix=remote_path).delete()
         return True

-    def _upload_i(self,
+    def _upload_i(self, remote_file, local_file, retry_count):
         """
         @des: multi-threaded batch upload
         """

@@ -286,33 +370,39 @@ class S3Oper(object):
         """
         for i in range(retry_count):  # retry up to retry_count times, since the network can be unstable
             try:
-
                 _s = time.time()
                 _ = s3_client.upload_file(  # this method already does chunked, multi-threaded uploads, so one client vs. several makes little speed difference on large files
-                    local_file, self.bucket, remote_file)
+                    local_file, self.bucket, remote_file
+                )  # returns None
                 _e = time.time()
                 if self.print_files_info:
                     BMOBJ.clog(
-                        ctx, f"{local_file} upload success, sub time {_e - _s}")
+                        ctx,
+                        f"{local_file} upload success, sub time {_e - _s}",
+                    )
                 break
             except Exception as e:
-                BMOBJ.clog(
-                    ctx, f"{local_file} upload fail, repeat {i}, error: {e}")
+                BMOBJ.clog(ctx, f"{local_file} upload fail, repeat {i}, error: {e}")
                 #
-                if i < retry_count-1:
+                if i < retry_count - 1:
                     sleep_time = random.randint(60, 120)
                     time.sleep(sleep_time)
                     continue
                 else:
-                    BMOBJ.clog(
-                        ctx, f"{local_file} upload finally fail: {e}")
+                    BMOBJ.clog(ctx, f"{local_file} upload finally fail: {e}")
                     raise Et(2, f"upload fail {remote_file} {local_file}")

-    def upload_folder(
-        self, local_path, remote_path, add_success=False, add_empty=False,
-        max_workers=50, isrm=True, isdel=True, retry_count=5):
+    def upload_folder(
+        self,
+        local_path,
+        remote_path,
+        add_success=False,
+        add_empty=False,
+        max_workers=50,
+        isrm=True,
+        isdel=True,
+        retry_count=5,
+    ):
         """
         @des: upload to s3 --- multi-threaded --- uploads a whole folder
         """

@@ -330,17 +420,22 @@ class S3Oper(object):
         total_files = []
         total_size = 0
         for subfile in subfiles:
-            size = round(subfile.stat().st_size/1024/1024, 4)
+            size = round(subfile.stat().st_size / 1024 / 1024, 4)
             total_size += size
             name = subfile.name
             total_files.append(f"{size}M {name}")
-        _infos = '\n'.join(total_files)
+        _infos = "\n".join(total_files)
         if self.print_files_info:
             BMOBJ.clog(
-                ctx, f"""upload file info * file total count {len(subfiles)} file total size {total_size}M""", _infos)
+                ctx,
+                f"""upload file info * file total count {len(subfiles)} file total size {total_size}M""",
+                _infos,
+            )
         else:
             BMOBJ.clog(
-                ctx, f"""upload file info * file total count {len(subfiles)} file total size {total_size}M""")
+                ctx,
+                f"""upload file info * file total count {len(subfiles)} file total size {total_size}M""",
+            )

         # see https://www.jianshu.com/p/b9b3d66aa0be
         # cap the max queue size at 200; the redis queue size in settings.py must be set larger than this

@@ -357,7 +452,8 @@ class S3Oper(object):
                 local_file = str(subfile)
                 # submit the work function to the thread pool; submit returns immediately without blocking
                 task_i = executor.submit(
-                    self._upload_i, *(remote_file, local_file, retry_count))
+                    self._upload_i, *(remote_file, local_file, retry_count)
+                )
                 all_tasks.append(task_i)
         # after waiting for all tasks to complete
         # wait(all_tasks, timeout=timeout, return_when=ALL_COMPLETED)

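`upload_folder` fans the per-file uploads out through `concurrent.futures`, as the hunk above shows; here is a minimal self-contained sketch of the same submit-and-collect pattern, where `fake_upload` and the file list are illustrative stand-ins for `_upload_i` and the real folder listing:

```python
# Minimal standalone sketch of the thread-pool fan-out used by upload_folder;
# fake_upload is a stand-in for _upload_i, not part of the package.
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_upload(remote_file: str, local_file: str, retry_count: int) -> str:
    return f"uploaded {local_file} -> {remote_file}"

files = [(f"remote/{i}.gz", f"/tmp/{i}.gz") for i in range(5)]
with ThreadPoolExecutor(max_workers=50) as executor:
    tasks = [executor.submit(fake_upload, r, l, 5) for r, l in files]
    for task in as_completed(tasks):
        print(task.result())  # re-raises any exception from the worker thread
```
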
@@ -367,17 +463,16 @@ class S3Oper(object):
         if add_success:
             # after a successful upload, upload an empty file as a success marker
             with BytesIO() as f:
-                with gzip.open(f, 'wb') as r:
-                    r.write(b'')
+                with gzip.open(f, "wb") as r:
+                    r.write(b"")
                 f.seek(0)
-                self.upload_fileobj(
-                    f, os.path.join(remote_path, "_SUCCESS"))
+                self.upload_fileobj(f, os.path.join(remote_path, "_SUCCESS"))
         else:
             if add_empty:
                 # upload an empty marker file to signal there was no data
                 with BytesIO() as f:
-                    with gzip.open(f, 'wb') as r:
-                        r.write(b'')
+                    with gzip.open(f, "wb") as r:
+                        r.write(b"")
                     f.seek(0)
                     self.upload_fileobj(f, os.path.join(remote_path, "_EMPTY"))
         e = time.time()

@@ -387,8 +482,7 @@ class S3Oper(object):
             raise Et(2, "cannt del root folder")
         BMOBJ.remove_folder(local_path)
         #
-        BMOBJ.clog(
-            ctx, f"{remote_path} upload all time", e-s)
+        BMOBJ.clog(ctx, f"{remote_path} upload all time", e - s)

     def _download_i(self, remote_file, local_file, retry_count):
         """

@@ -402,26 +496,33 @@ class S3Oper(object):
             try:
                 _s = time.time()
                 _ = s3_client.download_file(
-                    self.bucket, remote_file, local_file)
+                    self.bucket, remote_file, local_file
+                )  # returns None
                 _e = time.time()
                 BMOBJ.clog(
-                    ctx, f"{local_file} download success, sub time {_e - _s}")
+                    ctx,
+                    f"{local_file} download success, sub time {_e - _s}",
+                )
                 break
             except Exception as e:
-                BMOBJ.clog(
-                    ctx, f"{local_file} download fail, repeat {i}, error: {e}")
-                if i < retry_count-1:
+                BMOBJ.clog(ctx, f"{local_file} download fail, repeat {i}, error: {e}")
+                if i < retry_count - 1:
                     sleep_time = random.randint(60, 120)
                     time.sleep(sleep_time)
                     continue
                 else:
-                    BMOBJ.clog(
-                        ctx, f"{local_file} download finally fail: {e}")
+                    BMOBJ.clog(ctx, f"{local_file} download finally fail: {e}")
                     raise Et(2, f"download fail {remote_file} {local_file}")

-    def download_folder(
-        self, local_path, remote_path, max_workers=50, isrm=True,
-        isdel=True, retry_count=5):
+    def download_folder(
+        self,
+        local_path,
+        remote_path,
+        max_workers=50,
+        isrm=True,
+        isdel=True,
+        retry_count=5,
+    ):
         """
         @des: download to local --- multi-threaded --- downloads a folder -- deletes the s3 files after downloading
         """

@@ -442,15 +543,16 @@ class S3Oper(object):
                 if not isrm:
                     local_file = os.path.join(local_path, subfile_name)
                 else:
-                    l_name = str(pathlib.PurePath(remote_file)
-                                 ).replace(remote_path, "")
+                    l_name = str(pathlib.PurePath(remote_file)).replace(remote_path, "")
                     l_name = l_name.lstrip("/")
                     local_file = os.path.join(local_path, l_name)
-                    os.makedirs(
-                        os.path.dirname(local_file), exist_ok=True)  # create any missing subfolders
+                    os.makedirs(
+                        os.path.dirname(local_file), exist_ok=True
+                    )  # create any missing subfolders
                 # submit the work function to the thread pool; submit returns immediately without blocking
                 task_i = executor.submit(
-                    self._download_i, *(remote_file, local_file, retry_count))
+                    self._download_i, *(remote_file, local_file, retry_count)
+                )
                 all_tasks.append(task_i)
         # after waiting for all tasks to complete
         # wait(all_tasks, timeout=timeout, return_when=ALL_COMPLETED)

@@ -464,11 +566,10 @@ class S3Oper(object):
             raise Et(2, "cannt del root folder")
         self.delete_folder(remote_path)
         #
-        BMOBJ.clog(
-            ctx, f"{remote_path} download all time", e-s)
+        BMOBJ.clog(ctx, f"{remote_path} download all time", e - s)


-if __name__ == '__main__':  # when cutting a release, always remember to scrub credentials
+if __name__ == "__main__":  # when cutting a release, always remember to scrub credentials
     pass
     # s3 = S3Oper(ctx = {}, ak="xx", sk="yy", bucket="xx", region_name="us-east-2")
     # s3.upload_folder(local_path="/Users/yuanxiao/Downloads/train/samples",

tfduck/serverless_k8s/k8s_task.py

@@ -72,6 +72,8 @@ class ServerlessTaskManage(object):
             "pvc_name": "tfduck-k8s-pvc",
             # the pypi index; if an empty string, the official index is used
             "pypi_mirror": "https://pypi.tuna.tsinghua.edu.cn/simple",
+            # parameters passed to the python script --- ideally a base64-encoded JSON string
+            "params": "xxx",
         },
         #
         is_debug=False,

@@ -86,6 +88,11 @@ class ServerlessTaskManage(object):
         code_path: the code path
         project_configs: all project configurations
         project_name: the project name, used to find the matching key in project_configs
+        params: usage
+            param_b64_content = base64.b64encode(json.dumps(self.task_params).encode(
+                'utf8')).decode()  # base64-encode the params so special characters cannot split the argument
+            # decode
+            # ds = json.loads(base64.b64decode(param_b64_content).decode('utf8'))
         ######################################
         """
         """

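The encode and decode lines in this docstring form a round trip; a self-contained sketch of passing such params into a script (the argparse wiring in main.py is an assumption for illustration, not shown in this diff):

```python
# Standalone sketch of the base64 params round trip described above; the
# argparse wiring in main.py is an assumption, not shown in this diff.
import argparse
import base64
import json

task_params = {"date": "2024-01-01", "mode": "full"}  # example payload

# producer side: base64-encode so shell-special characters cannot split the argument
param_b64_content = base64.b64encode(json.dumps(task_params).encode("utf8")).decode()

# consumer side (e.g. inside main.py): decode back to a dict
parser = argparse.ArgumentParser()
parser.add_argument("--params")
args = parser.parse_args(["--params", param_b64_content])
ds = json.loads(base64.b64decode(args.params).decode("utf8"))
assert ds == task_params
```
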
@@ -213,7 +220,7 @@ class ServerlessTaskManage(object):
                     pip install -U pip {pypi_i} && \
                     pip install arrow {pypi_i} && \
                     pip install -r requirements.txt {pypi_i} && \
-                    python main.py
+                    python main.py --params {self.task_config["params"]}
                 """
             ],
             "resources": {

tfduck-bsd-0.18.9/tfduck/__init__.py (old version, removed)

@@ -1 +0,0 @@
-__version__="0.18.9"