smartlibs 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- auto_tasks/aaas/__init__.py +0 -0
- auto_tasks/aaas/__utils.py +6 -0
- auto_tasks/aaas/aaas_task.py +62 -0
- auto_tasks/gpu/GpuTool.py +82 -0
- auto_tasks/gpu/__init__.py +0 -0
- auto_tasks/gpu/__utils.py +8 -0
- auto_tasks/gpu/cuda_tool.py +204 -0
- auto_tasks/jsonl/__init__.py +0 -0
- auto_tasks/jsonl/__utils.py +6 -0
- auto_tasks/jsonl/jsonl_file.py +132 -0
- auto_tasks/lib/__init__.py +0 -0
- auto_tasks/lib/__utils.py +6 -0
- auto_tasks/lib/auto.py +26 -0
- auto_tasks/redis/RedisQueue.py +296 -0
- auto_tasks/redis/__init__.py +0 -0
- auto_tasks/redis/__utils.py +6 -0
- auto_tasks/redis/redis_pip.py +308 -0
- auto_tasks/redis/redis_utils.py +90 -0
- auto_tasks/tasks.yml +5 -0
- auto_tasks/test.yml +25 -0
- auto_tasks/tools/__init__.py +0 -0
- auto_tasks/tools/__utils.py +8 -0
- auto_tasks/tools/batch.py +29 -0
- auto_tasks/tools/item_tool.py +110 -0
- auto_tasks/tools/t_pager.py +29 -0
- auto_tasks/tools/t_print.py +116 -0
- auto_tasks/tools/t_statefile.py +158 -0
- auto_tasks/tools/t_tool.py +175 -0
- smart/aaas/Runner.py +48 -0
- smart/aaas/__init__.py +2 -0
- smart/aaas/__logger.py +4 -0
- smart/aaas/auto_manage.py +505 -0
- smart/aaas/base.py +20 -0
- smart/aaas/client.py +382 -0
- smart/aaas/config.py +3 -0
- smart/aaas/process_pool.py +23 -0
- smart/aaas/run.py +87 -0
- smart/aaas/run_debug.py +24 -0
- smart/aaas/service/__init__.py +0 -0
- smart/aaas/service/admin.py +59 -0
- smart/aaas/service/auto.py +209 -0
- smart/aaas/state/__init__.py +0 -0
- smart/aaas/state/async_redis_hook.py +67 -0
- smart/aaas/state/state_hook.py +112 -0
- smart/aaas/task_log/__init__.py +1 -0
- smart/aaas/task_log/file_log.py +65 -0
- smart/aaas/utils/__init__.py +0 -0
- smart/aaas/utils/task_info.py +59 -0
- smart/aaas/wsgi.py +46 -0
- smart/auto/Runner.py +267 -0
- smart/auto/__init__.py +12 -0
- smart/auto/__logger.py +7 -0
- smart/auto/base.py +119 -0
- smart/auto/constants.py +18 -0
- smart/auto/ctx/__init__.py +0 -0
- smart/auto/ctx/runner_context.py +10 -0
- smart/auto/ctx/tree_context.py +95 -0
- smart/auto/ctx/worker_state.py +64 -0
- smart/auto/exec/__init__.py +0 -0
- smart/auto/exec/fn_chain.py +96 -0
- smart/auto/exec/task_pod.py +252 -0
- smart/auto/exec/tree_exec.py +195 -0
- smart/auto/exec/tree_pod.py +100 -0
- smart/auto/exec/win_mp.py +12 -0
- smart/auto/exec/worker/ProcessWorker.py +41 -0
- smart/auto/exec/worker/ThreadWorker.py +31 -0
- smart/auto/exec/worker/__init__.py +49 -0
- smart/auto/loader/AutoLoad.py +173 -0
- smart/auto/loader/AutoLoader.py +57 -0
- smart/auto/loader/TaskHook.py +32 -0
- smart/auto/loader/__init__.py +0 -0
- smart/auto/loader/manage.py +70 -0
- smart/auto/loader/meta.py +118 -0
- smart/auto/meta.py +150 -0
- smart/auto/parser/__init__.py +0 -0
- smart/auto/parser/auto_yml.py +784 -0
- smart/auto/parser/cmd_args.py +51 -0
- smart/auto/parser/hook.py +35 -0
- smart/auto/parser/json_extend.py +28 -0
- smart/auto/parser/path_ctx.py +80 -0
- smart/auto/parser/task.py +184 -0
- smart/auto/parser/tools.py +243 -0
- smart/auto/pip/Broadcast.py +43 -0
- smart/auto/pip/QueuePip.py +119 -0
- smart/auto/pip/QueuePipItemRecv.py +45 -0
- smart/auto/pip/__init__.py +3 -0
- smart/auto/pip/cmd.py +28 -0
- smart/auto/pip/event.py +8 -0
- smart/auto/run.py +146 -0
- smart/auto/run_debug.py +38 -0
- smart/auto/tree.py +204 -0
- smart/auto/util/__init__.py +0 -0
- smart/auto/util/task_util.py +28 -0
- smart/evals/__init__.py +2 -0
- smart/evals/__logger.py +3 -0
- smart/evals/core/__init__.py +4 -0
- smart/evals/core/aggregate.py +76 -0
- smart/evals/core/filter_op.py +87 -0
- smart/evals/core/item.py +256 -0
- smart/evals/core/label_matcher.py +121 -0
- smart/rest/__init__.py +11 -0
- smart/rest/__logger.py +4 -0
- smart/rest/aio/__init__.py +0 -0
- smart/rest/aio/application.py +142 -0
- smart/rest/aio/handler.py +16 -0
- smart/rest/aio/queue_handler.py +22 -0
- smart/rest/aio/request.py +150 -0
- smart/rest/app/__init__.py +0 -0
- smart/rest/app/application.py +89 -0
- smart/rest/app/base_app.py +50 -0
- smart/rest/app/boot.py +121 -0
- smart/rest/app/cron.py +50 -0
- smart/rest/app/crond.py +235 -0
- smart/rest/app/dispatch.py +136 -0
- smart/rest/app/handler.py +35 -0
- smart/rest/app/interceptor.py +11 -0
- smart/rest/app/interceptor_manage.py +21 -0
- smart/rest/app/module_manage.py +44 -0
- smart/rest/app/route.py +82 -0
- smart/rest/app/route_manage.py +157 -0
- smart/rest/app/service.py +11 -0
- smart/rest/base.py +39 -0
- smart/rest/base_req.py +152 -0
- smart/rest/http/ThreadingHTTPServer.py +6 -0
- smart/rest/http/__init__.py +1 -0
- smart/rest/http/dispatch.py +13 -0
- smart/rest/http/handler.py +47 -0
- smart/rest/http/request.py +141 -0
- smart/rest/http/server.py +43 -0
- smart/rest/main.py +2 -0
- smart/rest/util/__init__.py +0 -0
- smart/rest/util/url_path.py +66 -0
- smart/rest/websock/__init__.py +0 -0
- smart/rest/websock/ws_client.py +130 -0
- smart/rest/websock/ws_ctx.py +73 -0
- smart/rest/wsgi/__init__.py +0 -0
- smart/rest/wsgi/dispatch.py +5 -0
- smart/rest/wsgi/request.py +144 -0
- smart/rest/wsgi/server.py +27 -0
- smart/utils/__init__.py +15 -0
- smart/utils/__logger.py +5 -0
- smart/utils/base.py +7 -0
- smart/utils/batch/BatchItemRecv.py +78 -0
- smart/utils/batch/BatchIter.py +22 -0
- smart/utils/batch/ItemRecv.py +89 -0
- smart/utils/batch/__init__.py +0 -0
- smart/utils/bound.py +90 -0
- smart/utils/cast.py +25 -0
- smart/utils/common/__init__.py +0 -0
- smart/utils/common/cluster.py +28 -0
- smart/utils/common/filter.py +5 -0
- smart/utils/common/timeout.py +17 -0
- smart/utils/common/value.py +15 -0
- smart/utils/config.py +80 -0
- smart/utils/dag.py +211 -0
- smart/utils/dict.py +204 -0
- smart/utils/dot_path.py +204 -0
- smart/utils/env.py +86 -0
- smart/utils/file/__init__.py +1 -0
- smart/utils/file/cat.py +407 -0
- smart/utils/file/manage.py +58 -0
- smart/utils/func.py +90 -0
- smart/utils/inspect.py +20 -0
- smart/utils/item.py +182 -0
- smart/utils/iter.py +23 -0
- smart/utils/json.py +39 -0
- smart/utils/jsonl.py +107 -0
- smart/utils/kafka/KafkaQueue.py +502 -0
- smart/utils/kafka/__init__.py +0 -0
- smart/utils/lang/DictObj.py +28 -0
- smart/utils/lang/UnSupport.py +21 -0
- smart/utils/lang/__init__.py +0 -0
- smart/utils/list.py +31 -0
- smart/utils/loader.py +234 -0
- smart/utils/log.py +62 -0
- smart/utils/number.py +19 -0
- smart/utils/path.py +104 -0
- smart/utils/process.py +54 -0
- smart/utils/ratio.py +63 -0
- smart/utils/remote_debug.py +61 -0
- smart/utils/retry.py +39 -0
- smart/utils/serialize.py +124 -0
- smart/utils/signal.py +37 -0
- smart/utils/storage/__init__.py +0 -0
- smart/utils/storage/base.py +14 -0
- smart/utils/storage/local_storage.py +48 -0
- smart/utils/storage/minio_storage.py +17 -0
- smart/utils/storage/obj_factory.py +71 -0
- smart/utils/storage/obj_storage.py +40 -0
- smart/utils/store/__init__.py +0 -0
- smart/utils/store/mp_store.py +114 -0
- smart/utils/store/store.py +408 -0
- smart/utils/template.py +234 -0
- smart/utils/thread.py +68 -0
- smart/utils/tuple.py +31 -0
- smart/utils/yaml.py +60 -0
- smartlibs-0.1.9-py3.14-nspkg.pth +2 -0
- smartlibs-0.1.9.dist-info/METADATA +143 -0
- smartlibs-0.1.9.dist-info/RECORD +211 -0
- smartlibs-0.1.9.dist-info/WHEEL +5 -0
- smartlibs-0.1.9.dist-info/entry_points.txt +5 -0
- smartlibs-0.1.9.dist-info/namespace_packages.txt +2 -0
- smartlibs-0.1.9.dist-info/top_level.txt +3 -0
- starter/aaas/guid.py +26 -0
- starter/helloworld/__init__.py +4 -0
- starter/helloworld/bind_obj.py +77 -0
- starter/helloworld/cfg_hook.py +19 -0
- starter/helloworld/example_task.py +136 -0
- starter/helloworld/func_task.py +22 -0
- starter/helloworld/join_ext.py +48 -0
- starter/helloworld/utils.py +20 -0
|
File without changes
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from smart.auto.tree import TreeMultiTask
|
|
2
|
+
from smart.aaas.client import AaasClient
|
|
3
|
+
from smart.utils.yaml import yaml_dumps
|
|
4
|
+
|
|
5
|
+
from .__utils import auto_load, logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@auto_load.task('aaas__client')
class AaasTask(TreeMultiTask):
    """Task methods that drive an ``AaasClient``: connect, create a task, query asdl."""

    # Key of the context list in which created tasks are recorded.
    CTX_TASK_LIST_NAME = 'aaas__client:tasks'

    def conn(self, entrypoint:str=None, namespace=None, module=None, enable_https=False):
        """Create an ``AaasClient`` and hand it back to the task tree.

        Args:
            entrypoint (str, optional): forwarded to ``AaasClient``. Defaults to None.
            namespace (optional): forwarded to ``AaasClient``. Defaults to None.
            module (optional): when truthy, applied with ``client.set_module``. Defaults to None.
            enable_https (bool, optional): forwarded to ``AaasClient``. Defaults to False.

        Returns:
            dict: {"client": AaasClient}
        """
        aaas_client = AaasClient(
            entrypoint=entrypoint,
            namespace=namespace,
            enable_https=enable_https,
        )

        if module:
            aaas_client.set_module(module)

        return {'client': aaas_client}

    def run(self, client:AaasClient, task_name, task_module=None, task_id=None,
            task_configs=None, bind_arg=None, run_opts=None, state_hook=None):
        """Create a task through ``client.create_task`` and remember its id.

        When the creation result carries a truthy ``task_id``, an entry
        ``{"client": client.init_args(), "task_id": ...}`` is appended to the
        context list named by ``CTX_TASK_LIST_NAME``. The creation result is
        logged in every case. Nothing is returned.
        """
        create_kwargs = {
            'task_name': task_name,
            'task_id': task_id,
            'module': task_module,
            'configs': task_configs,
            'bind_arg': bind_arg,
            'run_opts': run_opts,
            'state_hook': state_hook,
        }
        create_rst = client.create_task(**create_kwargs)

        created_id = create_rst.get('task_id')
        if created_id:
            self.context.list(self.CTX_TASK_LIST_NAME).append({
                'client': client.init_args(),
                'task_id': created_id,
            })

        logger.info('aaas__client create_task: %s', create_rst)

    def asdl(self, client:AaasClient, task_module=None, task_configs=None, bind_arg=None, run_opts=None):
        """Call ``client.asdl`` and log the answer as YAML.

        The logged value is the ``"result"`` entry of the response when it is
        truthy, otherwise the whole response.

        Returns:
            dict: {"asdl": <response of client.asdl>}
        """
        response = client.asdl(
            module=task_module,
            configs=task_configs,
            bind_arg=bind_arg,
            run_opts=run_opts,
        )

        printable = response.get("result") or response
        logger.info('aaas__client asdl_rst:\n%s', yaml_dumps(printable))

        return {"asdl": response}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from .__utils import logger
|
|
2
|
+
from collections import namedtuple
|
|
3
|
+
import random
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Memory figures of one GPU: total, free and used amounts.
MemInfo = namedtuple('MemInfo', ['total', 'free', 'used'])


class GpuInfoGetter:
    """Base class for querying GPUs and picking the ones with enough free memory.

    ``get_device_count`` and ``get_memory_info`` are placeholders here (they
    return None); a subclass is expected to override both.
    """

    def get_device_count(self) -> int:
        """Get the number of GPUs.

        Returns:
            int: number of GPUs
        """
        pass

    def get_memory_info(self, index:int) -> MemInfo:
        """Get the memory information of one GPU.

        Args:
            index (int): GPU index, 0 is the first GPU

        Returns:
            MemInfo: memory information
        """
        pass

    def find_mem_free_device(self, gpu_num:int=1, shuffle:bool=False,
            free_memory:int=None, free_memory_ratio:float=None, filter_fn:callable=None)->list:
        """Find GPUs that have enough free memory.

        When free_memory and free_memory_ratio are both None, every GPU qualifies.
        shuffle=True inspects the GPUs in random order; shuffle=False inspects them in index order.
        The decision is based on the memory figures read at call time, which is
        usually some time before the GPU is actually occupied. With several
        concurrent tasks, more than one task may therefore pick the same GPU;
        shuffle=True reduces how often that happens.

        A GPU whose inspection raises (memory query, ratio computation or
        filter_fn) is logged as a warning and skipped; the search continues
        with the remaining GPUs.

        Args:
            gpu_num (int, optional): number of devices wanted; values <= 0 yield an empty list. Defaults to 1.
            shuffle (bool, optional): whether to shuffle the GPU inspection order. Defaults to False.
            free_memory (int, optional): skip GPUs whose free memory is below free_memory
                (same unit as MemInfo.free). Defaults to None.
            free_memory_ratio (float, optional): skip GPUs whose free/total ratio is below
                free_memory_ratio. Defaults to None.
            filter_fn (callable, optional): called as filter_fn(index, meminfo); a truthy result
                EXCLUDES the GPU, e.g. ``lambda idx, mem_info: True`` excludes every GPU.
                Defaults to None.

        Returns:
            list: [tuple(GPU index:int, memory information:MemInfo)], at most gpu_num entries
        """
        # The count check below only runs after an append, so a request for
        # zero (or fewer) devices has to be answered before the loop.
        if gpu_num <= 0:
            return []

        gpu_indexes = list(range(self.get_device_count()))
        if shuffle:
            random.shuffle(gpu_indexes)

        choosed_gpu = []

        for gpu_index in gpu_indexes:
            _choose = True
            try:
                # Queried inside the try so that one failing device cannot
                # abort the search over the remaining devices.
                meminfo = self.get_memory_info(gpu_index)

                if free_memory_ratio is not None:
                    if float(meminfo.free) / meminfo.total < free_memory_ratio:
                        _choose = False

                if free_memory is not None and meminfo.free < free_memory:
                    _choose = False

                # filter_fn is consulted even for an already rejected GPU.
                if filter_fn and filter_fn(gpu_index, meminfo):
                    _choose = False
            except Exception as err:
                logger.warning("find_mem_free_device err: %s", err)
                _choose = False

            if _choose:
                choosed_gpu.append((gpu_index, meminfo))
                if len(choosed_gpu) >= gpu_num:
                    break

        return choosed_gpu
|
|
File without changes
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import os, time
|
|
2
|
+
|
|
3
|
+
from smart.auto import TreeMultiTask
|
|
4
|
+
from smart.utils.env import auto_set_env_by_prefix, AppEnv
|
|
5
|
+
from smart.utils.cast import cast_bool
|
|
6
|
+
from .__utils import auto_load, logger, task_hook
|
|
7
|
+
from .GpuTool import GpuInfoGetter, MemInfo
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Module-level NVML availability flags, filled in once at import time.
#   nvml_is_disable: True when pynvml could not be imported.
#   err_msg: short reason, only present when nvml_is_disable is True.
_options = {
    "nvml_is_disable": False
}

try:
    import pynvml
except Exception:
    # pynvml is optional: any import-time failure only disables the GPU
    # lookup. `Exception` (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate instead of being reported as a missing package.
    _options['nvml_is_disable'] = True
    _options['err_msg'] = "miss pynvml"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CudaInfoGetter(GpuInfoGetter):
    """``GpuInfoGetter`` that answers its queries through ``pynvml``.

    ``pynvml.nvmlInit()`` is called lazily, once per instance, right before
    the first query.
    """

    def __init__(self) -> None:
        # Becomes True after pynvml.nvmlInit() has been called by this instance.
        self.__inited = False

    def __nvmlInit(self):
        """Call ``pynvml.nvmlInit()`` unless this instance already did."""
        if self.__inited:
            return

        pynvml.nvmlInit()
        self.__inited = True

    def get_device_count(self):
        """Return ``pynvml.nvmlDeviceGetCount()``."""
        self.__nvmlInit()
        device_total = pynvml.nvmlDeviceGetCount()
        return device_total

    def get_memory_info(self, index:int):
        """Return the total/free/used memory of GPU ``index`` as a ``MemInfo``."""
        self.__nvmlInit()
        raw_info = pynvml.nvmlDeviceGetMemoryInfo(
            pynvml.nvmlDeviceGetHandleByIndex(index)
        )
        return MemInfo(total=raw_info.total, free=raw_info.free, used=raw_info.used)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@auto_load.task('gpu_tools.cuda_tool')
class CudaToolTask(TreeMultiTask):
    """Tasks that pick usable GPUs and publish them through an environment variable and the context."""

    def find_available_gpu(self, gpu_num:int=1, shuffle:bool=False,
            free_memory:int=None, free_memory_ratio:float=None,
            device_env_key:str='CUDA_VISIBLE_DEVICES', ctx_state_name:str='cuda_tool'):
        """Find available GPUs.

        When free_memory and free_memory_ratio are both None, every GPU counts as
        available and the returned available_gpu list has
        min(gpu_num, number of GPUs on the machine) entries.
        shuffle=True inspects the GPUs in random order; shuffle=False inspects them in index order.
        Availability is judged from the memory figures read at call time, which is
        usually some time before the GPU is actually occupied. With several
        concurrent tasks, more than one task may grab the same GPU;
        shuffle=True reduces how often that happens.

        GPUs already recorded in the context list (ctx_state_name, "used_gpu") are
        excluded, and the newly chosen ones are appended to that list.

        Args:
            gpu_num (int, optional): number of GPUs wanted. Defaults to 1.
            shuffle (bool, optional): whether to shuffle the GPU order. Defaults to False.
            free_memory (int, optional): a GPU with less free memory than free_memory is unavailable. Defaults to None.
            free_memory_ratio (float, optional): a GPU whose free/total memory ratio is below
                free_memory_ratio is unavailable. Defaults to None.
            device_env_key (str, optional): environment variable that receives the chosen device
                indexes; an empty value means no environment variable is set.
                Defaults to 'CUDA_VISIBLE_DEVICES'.
            ctx_state_name (str, optional): context name under which the used GPU list is kept.
                Defaults to 'cuda_tool'.

        Returns:
            dict: {"available_gpu":[(GPU index:int, memory information:MemInfo)]};
            None when USE_GPU is disabled or pynvml is unavailable.
        """
        if not cast_bool(AppEnv.get("USE_GPU", True)):
            return

        if _options['nvml_is_disable']:
            logger.error("nvml is not support. %s", _options.get("err_msg", ""))
            return

        cuda = CudaInfoGetter()

        # NOTE(review): the flattened source does not show how far the lock
        # extends; the used_gpu read-modify-write is kept inside it so the
        # bookkeeping cannot interleave - confirm against the original file.
        with self.context.store.lock((ctx_state_name, "find_available_gpu")):
            used_gpu = self.context.list((ctx_state_name, "used_gpu"))
            used_gpu_idx = [idx for idx, _ in used_gpu]

            choosed_gpu = cuda.find_mem_free_device(
                gpu_num=gpu_num,
                shuffle=shuffle,
                free_memory=free_memory,
                free_memory_ratio=free_memory_ratio,
                # A truthy filter result excludes the GPU: skip the ones already handed out.
                filter_fn=lambda idx, _: (idx in used_gpu_idx)
            )

            if choosed_gpu:
                self.context.list((ctx_state_name, "used_gpu")).extend(choosed_gpu)

        if device_env_key:
            device_env_val = ','.join(str(val[0]) for val in choosed_gpu)
            auto_set_env_by_prefix(device_env_key, device_env_val)
            logger.info("cuda_tool.find_available_gpu set_env %s: %s", device_env_key, device_env_val)

        return {
            "available_gpu": choosed_gpu
        }

    @task_hook.before_task()
    def hook_available_gpu(self, gpu_num:int=1, shuffle:bool=False,
            free_memory:int=None, free_memory_ratio:float=None,
            device_env_key:str=None, ctx_state_name:str='cuda_tool'):
        """Hook that finds available GPUs (registered with ``task_hook.before_task``).

        When free_memory and free_memory_ratio are both None, every GPU counts as
        available and the returned available_gpu list has
        min(gpu_num, number of GPUs on the machine) entries.
        shuffle=True inspects the GPUs in random order; shuffle=False inspects them in index order.
        Availability is judged from the memory figures read at call time, which is
        usually some time before the GPU is actually occupied. With several
        concurrent tasks, more than one task may grab the same GPU;
        shuffle=True reduces how often that happens.

        Args:
            gpu_num (int, optional): number of GPUs wanted. Defaults to 1.
            shuffle (bool, optional): whether to shuffle the GPU order. Defaults to False.
            free_memory (int, optional): a GPU with less free memory than free_memory is unavailable. Defaults to None.
            free_memory_ratio (float, optional): a GPU whose free/total memory ratio is below
                free_memory_ratio is unavailable. Defaults to None.
            device_env_key (str, optional): environment variable that receives the chosen device
                indexes; an empty value means no environment variable is set. Defaults to None.
            ctx_state_name (str, optional): context state that receives "available_gpu_num" and
                "available_gpu_list"; an empty value skips that step. Defaults to 'cuda_tool'.

        Returns:
            dict: {"available_gpu":[(GPU index:int, memory information:MemInfo)]};
            None when USE_GPU is disabled or pynvml is unavailable.
        """
        if not cast_bool(AppEnv.get("USE_GPU", True)):
            return

        if _options['nvml_is_disable']:
            logger.error("nvml is not support. %s", _options.get("err_msg", ""))
            return

        cuda = CudaInfoGetter()

        choosed_gpu = cuda.find_mem_free_device(
            gpu_num=gpu_num,
            shuffle=shuffle,
            free_memory=free_memory,
            free_memory_ratio=free_memory_ratio
        )

        if device_env_key:
            device_env_val = ','.join(str(val[0]) for val in choosed_gpu)
            auto_set_env_by_prefix(device_env_key, device_env_val)
            logger.info("cuda_tool.hook_available_gpu set_env %s: %s", device_env_key, device_env_val)

        if ctx_state_name:
            # Published for pop_gpu_from_ctx, which reads both keys.
            self.context.state(ctx_state_name).update({
                "available_gpu_num": len(choosed_gpu),
                "available_gpu_list": choosed_gpu
            })

        return {
            "available_gpu": choosed_gpu
        }

    def pop_gpu_from_ctx(self, gpu_num:int=1, min_gpu_num:int=0,
            device_env_key:str='CUDA_VISIBLE_DEVICES', ctx_state_name:str='cuda_tool'):
        """Take available GPUs from the context and store their indexes in an environment variable.

        This method is meant to be used together with hook_available_gpu.
        When fewer than min_gpu_num GPUs are available, nothing is popped from the
        context's available_gpu_list and the environment variable is set to "-1".

        Args:
            gpu_num (int, optional): number of GPUs wanted; raised to min_gpu_num when smaller. Defaults to 1.
            min_gpu_num (int, optional): minimum number of GPUs required. Defaults to 0.
            device_env_key (str, optional): environment variable that receives the GPU indexes,
                normally left unchanged; an empty value skips setting it.
                Defaults to 'CUDA_VISIBLE_DEVICES'.
            ctx_state_name (str, optional): context name that holds the available GPU list,
                normally left unchanged. Defaults to 'cuda_tool'.

        Returns:
            dict: {available_gpu:[(GPU index:int, memory information:MemInfo)]};
            None when USE_GPU is disabled.
        """
        if not cast_bool(AppEnv.get("USE_GPU", True)):
            return

        if gpu_num < min_gpu_num:
            gpu_num = min_gpu_num

        ctx_state = self.context.state(ctx_state_name)
        # presumably blocks until hook_available_gpu has published the value - TODO confirm
        available_gpu_num = ctx_state.wait("available_gpu_num")
        choosed_gpu = []

        if available_gpu_num >= min_gpu_num:
            with self.context.store.lock((ctx_state_name, "pop_gpu_from_ctx")):
                gpu_list = ctx_state.get("available_gpu_list")
                logger.debug("CudaToolTask.pop_gpu_from_ctx current gpu_list=%s", gpu_list)

                try:
                    if gpu_list and len(gpu_list) >= min_gpu_num:
                        for _ in range(gpu_num):
                            choosed_gpu.append(gpu_list.pop(0))
                except IndexError:
                    # Fewer than gpu_num entries were left; keep what was popped so far.
                    logger.info("CudaToolTask.pop_gpu_from_ctx no enough gpu")

                # Write the shortened list back so later callers do not get the same GPUs.
                ctx_state.set("available_gpu_list", gpu_list)

        if device_env_key:
            # Plain "," separator, matching find_available_gpu and
            # hook_available_gpu (this method used ", " before); "-1" marks
            # that no GPU was assigned.
            if choosed_gpu:
                os.environ[device_env_key] = ",".join(str(i) for i, _ in choosed_gpu)
            else:
                os.environ[device_env_key] = "-1"
            logger.info("CudaToolTask.pop_gpu_from_ctx set %s=%s", device_env_key, os.environ[device_env_key])

        return {
            "available_gpu": choosed_gpu
        }
|
|
203
|
+
|
|
204
|
+
|
|
File without changes
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import os, json, logging
|
|
2
|
+
|
|
3
|
+
from smart.auto import TreeMultiTask, AutoLoad
|
|
4
|
+
from smart.utils import list_safe_iter, path_join
|
|
5
|
+
|
|
6
|
+
from .__utils import auto_load, logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@auto_load.task('jsonl__file')
class JsonlFileTask(TreeMultiTask):
    """Tasks for reading and writing jsonl files (one JSON document per line)."""

    def __resolve_file_name_keys(self, file_name_keys, file_name_idx_or_key=None):
        """Yield the file name keys selected by ``file_name_idx_or_key``.

        Arguments:
            file_name_keys {list|str} -- keys, or one comma separated string of keys

        Keyword Arguments:
            file_name_idx_or_key {int|str} -- an int selects that position of the key list,
                any other truthy value is yielded as the single key, a falsy value
                yields every key (default: {None})
        """
        if isinstance(file_name_keys, str):
            file_name_key_list = [k.strip() for k in file_name_keys.split(',')]
        else:
            file_name_key_list = file_name_keys

        if isinstance(file_name_idx_or_key, int):
            yield file_name_key_list[file_name_idx_or_key]
        elif file_name_idx_or_key:
            yield file_name_idx_or_key
        else:
            yield from file_name_key_list

    def pattern_read(self, file_name_keys, file_name_pattern:str='{}', dir_path=None, file_open_opts=None, root_dir=None, group_key='_group', file_path=None):
        """Read several jsonl files

        Files that do not exist are reported with a warning and skipped.
        Blank lines are skipped.

        Arguments:
            file_name_keys {list} -- list of file name keys

        Keyword Arguments:
            file_name_pattern {str} -- file name template, '{}' is the placeholder for file_name_key (default: {'{}'})
            dir_path {str} -- directory of the files (default: {None})
            file_open_opts {dict} -- options passed to open() (default: {None})
            root_dir {str} -- root path (default: {None})
            group_key {str} -- key added to every dict item, its value is file_name_key (default: {'_group'})
            file_path {str} -- deprecated, use dir_path instead (default: {None})

        Returns:
            dict -- {item_iter_fn}
        """
        _file_name_keys = file_name_keys
        dir_path = dir_path or file_path
        file_open_opts = {'mode': 'r', 'encoding': 'utf8', **(file_open_opts or {})}

        def item_iter_fn(file_name_idx_or_key = None, file_name_keys=None):
            file_name_key_list = self.__resolve_file_name_keys(file_name_keys or _file_name_keys, file_name_idx_or_key)

            for file_name_key in file_name_key_list:
                num_items = 0
                file_name = file_name_pattern.format(file_name_key)
                file = path_join(root_dir, dir_path, file_name)

                if not os.path.exists(file):
                    logger.warning('JsonlFileTask.pattern_read: no found file %s', file)
                    continue

                with open(file, **file_open_opts) as f:
                    for line in f:
                        # Lines keep their newline, so a blank line is "\n",
                        # not ""; strip before testing for emptiness.
                        if not line.strip():
                            continue
                        item = json.loads(line)
                        if isinstance(item, dict) and group_key:
                            item[group_key] = file_name_key
                        yield item
                        num_items += 1

                logger.debug('JsonlFileTask.pattern_read %s items from %s', num_items, file)

        return {
            'item_iter_fn': item_iter_fn
        }

    def read(self, file_name, dir_path=None, file_open_opts:dict=None, root_dir=None, file_path=None):
        """Read a jsonl file

        Blank lines are skipped.

        Arguments:
            file_name {str} -- file name

        Keyword Arguments:
            dir_path {str} -- directory of the file (default: {None})
            file_open_opts {dict} -- options passed to open() (default: {None})
            root_dir {str} -- root path (default: {None})
            file_path {str} -- deprecated, use dir_path instead (default: {None})

        Returns:
            dict -- {item_iter_fn}
        """
        dir_path = dir_path or file_path
        file = path_join(root_dir, dir_path, file_name)
        file_open_opts = {'mode': 'r', 'encoding': 'utf8', **(file_open_opts or {})}

        def item_iter_fn():
            with open(file, **file_open_opts) as f:
                for line in f:
                    # Same blank-line handling as pattern_read: a trailing
                    # empty line must not abort the iteration.
                    if not line.strip():
                        continue
                    yield json.loads(line)

        return {
            'item_iter_fn': item_iter_fn
        }

    def write(self, file_name, dir_path=None, file_open_opts:dict=None, root_dir=None, item_iter=None, item_iter_fn=None, recv_args=None, file_path=None):
        """Write a jsonl file

        Arguments:
            file_name {str} -- file name

        Keyword Arguments:
            dir_path {str} -- directory of the file (default: {None})
            file_open_opts {dict} -- options passed to open() (default: {None})
            root_dir {str} -- root path (default: {None})
            item_iter {generator} -- item generator (default: {None})
            item_iter_fn {callable} -- factory of the item generator; ignored when item_iter is truthy (default: {None})
            recv_args {dict} -- keyword options for the receive function; ignored when item_iter is truthy,
                None means no options (default: {None})
            file_path {str} -- deprecated, use dir_path instead (default: {None})
        """
        assert file_name
        dir_path = dir_path or file_path
        file = path_join(root_dir, dir_path, file_name, auto_mkdir=True)
        file_open_opts = {'mode': 'w', 'encoding': 'utf8', **(file_open_opts or {})}
        logger.info('jsonl__file.write %s', file_name)

        # Source of items: explicit iterator, else item_iter_fn, else self.recv_data.
        item_iter = item_iter or (item_iter_fn or self.recv_data)(**(recv_args or {}))

        count = 0
        with open(file, **file_open_opts) as f:
            for item in item_iter:
                json.dump(item, f, ensure_ascii=False)
                f.write('\n')
                count += 1

        logger.debug('jsonl__file.write %s items to %s', count, file_name)
|
|
132
|
+
|
|
File without changes
|
auto_tasks/lib/auto.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from smart.auto.tree import TreeMultiTask
|
|
2
|
+
from smart.auto.Runner import AutoRunner
|
|
3
|
+
from smart.auto.ctx.runner_context import WithAutoRunner
|
|
4
|
+
|
|
5
|
+
from .__utils import auto_load, logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@auto_load.task('lib.auto')
class LibAuto(TreeMultiTask, WithAutoRunner):
    """Tasks that start another tree or task through the attached ``auto_runner``."""

    def run_tree(self, tree_name):
        """Start ``tree_name`` with ``auto_runner.start``; log an error when no runner is attached."""
        runner:AutoRunner = getattr(self, 'auto_runner', None)
        logger.debug('LibAuto.run_tree: %s', tree_name)

        if not runner:
            logger.error('!!! lib.auto.run_tree fail because auto_runner is empty, maybe you should use lib.auto as task_cls')
            return

        runner.start(tree_name)

    def run_task(self, task_exp):
        """Start ``task_exp`` with ``auto_runner.start(..., default_ns='task')``; log an error when no runner is attached."""
        runner:AutoRunner = getattr(self, 'auto_runner', None)
        logger.debug('LibAuto.run_task: %s', task_exp)

        if not runner:
            logger.error('!!! lib.auto.run_task fail because auto_runner is empty, maybe you should use lib.auto as task_cls')
            return

        runner.start(task_exp, default_ns='task')
|