smartlibs 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smartlibs-0.1.9/PKG-INFO +143 -0
- smartlibs-0.1.9/auto_tasks/__init__.py +6 -0
- smartlibs-0.1.9/auto_tasks/aaas/__init__.py +0 -0
- smartlibs-0.1.9/auto_tasks/aaas/__utils.py +6 -0
- smartlibs-0.1.9/auto_tasks/aaas/aaas_task.py +62 -0
- smartlibs-0.1.9/auto_tasks/gpu/GpuTool.py +82 -0
- smartlibs-0.1.9/auto_tasks/gpu/__init__.py +0 -0
- smartlibs-0.1.9/auto_tasks/gpu/__utils.py +8 -0
- smartlibs-0.1.9/auto_tasks/gpu/cuda_tool.py +204 -0
- smartlibs-0.1.9/auto_tasks/jsonl/__init__.py +0 -0
- smartlibs-0.1.9/auto_tasks/jsonl/__utils.py +6 -0
- smartlibs-0.1.9/auto_tasks/jsonl/jsonl_file.py +132 -0
- smartlibs-0.1.9/auto_tasks/lib/__init__.py +0 -0
- smartlibs-0.1.9/auto_tasks/lib/__utils.py +6 -0
- smartlibs-0.1.9/auto_tasks/lib/auto.py +26 -0
- smartlibs-0.1.9/auto_tasks/redis/RedisQueue.py +296 -0
- smartlibs-0.1.9/auto_tasks/redis/__init__.py +0 -0
- smartlibs-0.1.9/auto_tasks/redis/__utils.py +6 -0
- smartlibs-0.1.9/auto_tasks/redis/redis_pip.py +308 -0
- smartlibs-0.1.9/auto_tasks/redis/redis_utils.py +90 -0
- smartlibs-0.1.9/auto_tasks/tasks.yml +5 -0
- smartlibs-0.1.9/auto_tasks/test.yml +25 -0
- smartlibs-0.1.9/auto_tasks/tools/__init__.py +0 -0
- smartlibs-0.1.9/auto_tasks/tools/__utils.py +8 -0
- smartlibs-0.1.9/auto_tasks/tools/batch.py +29 -0
- smartlibs-0.1.9/auto_tasks/tools/item_tool.py +110 -0
- smartlibs-0.1.9/auto_tasks/tools/t_pager.py +29 -0
- smartlibs-0.1.9/auto_tasks/tools/t_print.py +116 -0
- smartlibs-0.1.9/auto_tasks/tools/t_statefile.py +158 -0
- smartlibs-0.1.9/auto_tasks/tools/t_tool.py +175 -0
- smartlibs-0.1.9/pyproject.toml +46 -0
- smartlibs-0.1.9/readme.md +118 -0
- smartlibs-0.1.9/setup.cfg +4 -0
- smartlibs-0.1.9/setup.py +56 -0
- smartlibs-0.1.9/smart/__init__.py +6 -0
- smartlibs-0.1.9/smart/aaas/Runner.py +48 -0
- smartlibs-0.1.9/smart/aaas/__init__.py +2 -0
- smartlibs-0.1.9/smart/aaas/__logger.py +4 -0
- smartlibs-0.1.9/smart/aaas/auto_manage.py +505 -0
- smartlibs-0.1.9/smart/aaas/base.py +20 -0
- smartlibs-0.1.9/smart/aaas/client.py +382 -0
- smartlibs-0.1.9/smart/aaas/config.py +3 -0
- smartlibs-0.1.9/smart/aaas/process_pool.py +23 -0
- smartlibs-0.1.9/smart/aaas/run.py +87 -0
- smartlibs-0.1.9/smart/aaas/run_debug.py +24 -0
- smartlibs-0.1.9/smart/aaas/service/__init__.py +0 -0
- smartlibs-0.1.9/smart/aaas/service/admin.py +59 -0
- smartlibs-0.1.9/smart/aaas/service/auto.py +209 -0
- smartlibs-0.1.9/smart/aaas/state/__init__.py +0 -0
- smartlibs-0.1.9/smart/aaas/state/async_redis_hook.py +67 -0
- smartlibs-0.1.9/smart/aaas/state/state_hook.py +112 -0
- smartlibs-0.1.9/smart/aaas/task_log/__init__.py +1 -0
- smartlibs-0.1.9/smart/aaas/task_log/file_log.py +65 -0
- smartlibs-0.1.9/smart/aaas/utils/__init__.py +0 -0
- smartlibs-0.1.9/smart/aaas/utils/task_info.py +59 -0
- smartlibs-0.1.9/smart/aaas/wsgi.py +46 -0
- smartlibs-0.1.9/smart/auto/Runner.py +267 -0
- smartlibs-0.1.9/smart/auto/__init__.py +12 -0
- smartlibs-0.1.9/smart/auto/__logger.py +7 -0
- smartlibs-0.1.9/smart/auto/base.py +119 -0
- smartlibs-0.1.9/smart/auto/constants.py +18 -0
- smartlibs-0.1.9/smart/auto/ctx/__init__.py +0 -0
- smartlibs-0.1.9/smart/auto/ctx/runner_context.py +10 -0
- smartlibs-0.1.9/smart/auto/ctx/tree_context.py +95 -0
- smartlibs-0.1.9/smart/auto/ctx/worker_state.py +64 -0
- smartlibs-0.1.9/smart/auto/exec/__init__.py +0 -0
- smartlibs-0.1.9/smart/auto/exec/fn_chain.py +96 -0
- smartlibs-0.1.9/smart/auto/exec/task_pod.py +252 -0
- smartlibs-0.1.9/smart/auto/exec/tree_exec.py +195 -0
- smartlibs-0.1.9/smart/auto/exec/tree_pod.py +100 -0
- smartlibs-0.1.9/smart/auto/exec/win_mp.py +12 -0
- smartlibs-0.1.9/smart/auto/exec/worker/ProcessWorker.py +41 -0
- smartlibs-0.1.9/smart/auto/exec/worker/ThreadWorker.py +31 -0
- smartlibs-0.1.9/smart/auto/exec/worker/__init__.py +49 -0
- smartlibs-0.1.9/smart/auto/loader/AutoLoad.py +173 -0
- smartlibs-0.1.9/smart/auto/loader/AutoLoader.py +57 -0
- smartlibs-0.1.9/smart/auto/loader/TaskHook.py +32 -0
- smartlibs-0.1.9/smart/auto/loader/__init__.py +0 -0
- smartlibs-0.1.9/smart/auto/loader/manage.py +70 -0
- smartlibs-0.1.9/smart/auto/loader/meta.py +118 -0
- smartlibs-0.1.9/smart/auto/meta.py +150 -0
- smartlibs-0.1.9/smart/auto/parser/__init__.py +0 -0
- smartlibs-0.1.9/smart/auto/parser/auto_yml.py +784 -0
- smartlibs-0.1.9/smart/auto/parser/cmd_args.py +51 -0
- smartlibs-0.1.9/smart/auto/parser/hook.py +35 -0
- smartlibs-0.1.9/smart/auto/parser/json_extend.py +28 -0
- smartlibs-0.1.9/smart/auto/parser/path_ctx.py +80 -0
- smartlibs-0.1.9/smart/auto/parser/task.py +184 -0
- smartlibs-0.1.9/smart/auto/parser/tools.py +243 -0
- smartlibs-0.1.9/smart/auto/pip/Broadcast.py +43 -0
- smartlibs-0.1.9/smart/auto/pip/QueuePip.py +119 -0
- smartlibs-0.1.9/smart/auto/pip/QueuePipItemRecv.py +45 -0
- smartlibs-0.1.9/smart/auto/pip/__init__.py +3 -0
- smartlibs-0.1.9/smart/auto/pip/cmd.py +28 -0
- smartlibs-0.1.9/smart/auto/pip/event.py +8 -0
- smartlibs-0.1.9/smart/auto/run.py +146 -0
- smartlibs-0.1.9/smart/auto/run_debug.py +38 -0
- smartlibs-0.1.9/smart/auto/tree.py +204 -0
- smartlibs-0.1.9/smart/auto/util/__init__.py +0 -0
- smartlibs-0.1.9/smart/auto/util/task_util.py +28 -0
- smartlibs-0.1.9/smart/evals/__init__.py +2 -0
- smartlibs-0.1.9/smart/evals/__logger.py +3 -0
- smartlibs-0.1.9/smart/evals/core/__init__.py +4 -0
- smartlibs-0.1.9/smart/evals/core/aggregate.py +76 -0
- smartlibs-0.1.9/smart/evals/core/filter_op.py +87 -0
- smartlibs-0.1.9/smart/evals/core/item.py +256 -0
- smartlibs-0.1.9/smart/evals/core/label_matcher.py +121 -0
- smartlibs-0.1.9/smart/rest/__init__.py +11 -0
- smartlibs-0.1.9/smart/rest/__logger.py +4 -0
- smartlibs-0.1.9/smart/rest/aio/__init__.py +0 -0
- smartlibs-0.1.9/smart/rest/aio/application.py +142 -0
- smartlibs-0.1.9/smart/rest/aio/handler.py +16 -0
- smartlibs-0.1.9/smart/rest/aio/queue_handler.py +22 -0
- smartlibs-0.1.9/smart/rest/aio/request.py +150 -0
- smartlibs-0.1.9/smart/rest/app/__init__.py +0 -0
- smartlibs-0.1.9/smart/rest/app/application.py +89 -0
- smartlibs-0.1.9/smart/rest/app/base_app.py +50 -0
- smartlibs-0.1.9/smart/rest/app/boot.py +121 -0
- smartlibs-0.1.9/smart/rest/app/cron.py +50 -0
- smartlibs-0.1.9/smart/rest/app/crond.py +235 -0
- smartlibs-0.1.9/smart/rest/app/dispatch.py +136 -0
- smartlibs-0.1.9/smart/rest/app/handler.py +35 -0
- smartlibs-0.1.9/smart/rest/app/interceptor.py +11 -0
- smartlibs-0.1.9/smart/rest/app/interceptor_manage.py +21 -0
- smartlibs-0.1.9/smart/rest/app/module_manage.py +44 -0
- smartlibs-0.1.9/smart/rest/app/route.py +82 -0
- smartlibs-0.1.9/smart/rest/app/route_manage.py +157 -0
- smartlibs-0.1.9/smart/rest/app/service.py +11 -0
- smartlibs-0.1.9/smart/rest/base.py +39 -0
- smartlibs-0.1.9/smart/rest/base_req.py +152 -0
- smartlibs-0.1.9/smart/rest/http/ThreadingHTTPServer.py +6 -0
- smartlibs-0.1.9/smart/rest/http/__init__.py +1 -0
- smartlibs-0.1.9/smart/rest/http/dispatch.py +13 -0
- smartlibs-0.1.9/smart/rest/http/handler.py +47 -0
- smartlibs-0.1.9/smart/rest/http/request.py +141 -0
- smartlibs-0.1.9/smart/rest/http/server.py +43 -0
- smartlibs-0.1.9/smart/rest/main.py +2 -0
- smartlibs-0.1.9/smart/rest/util/__init__.py +0 -0
- smartlibs-0.1.9/smart/rest/util/url_path.py +66 -0
- smartlibs-0.1.9/smart/rest/websock/__init__.py +0 -0
- smartlibs-0.1.9/smart/rest/websock/ws_client.py +130 -0
- smartlibs-0.1.9/smart/rest/websock/ws_ctx.py +73 -0
- smartlibs-0.1.9/smart/rest/wsgi/__init__.py +0 -0
- smartlibs-0.1.9/smart/rest/wsgi/dispatch.py +5 -0
- smartlibs-0.1.9/smart/rest/wsgi/request.py +144 -0
- smartlibs-0.1.9/smart/rest/wsgi/server.py +27 -0
- smartlibs-0.1.9/smart/utils/__init__.py +15 -0
- smartlibs-0.1.9/smart/utils/__logger.py +5 -0
- smartlibs-0.1.9/smart/utils/base.py +7 -0
- smartlibs-0.1.9/smart/utils/batch/BatchItemRecv.py +78 -0
- smartlibs-0.1.9/smart/utils/batch/BatchIter.py +22 -0
- smartlibs-0.1.9/smart/utils/batch/ItemRecv.py +89 -0
- smartlibs-0.1.9/smart/utils/batch/__init__.py +0 -0
- smartlibs-0.1.9/smart/utils/bound.py +90 -0
- smartlibs-0.1.9/smart/utils/cast.py +25 -0
- smartlibs-0.1.9/smart/utils/common/__init__.py +0 -0
- smartlibs-0.1.9/smart/utils/common/cluster.py +28 -0
- smartlibs-0.1.9/smart/utils/common/filter.py +5 -0
- smartlibs-0.1.9/smart/utils/common/timeout.py +17 -0
- smartlibs-0.1.9/smart/utils/common/value.py +15 -0
- smartlibs-0.1.9/smart/utils/config.py +80 -0
- smartlibs-0.1.9/smart/utils/dag.py +211 -0
- smartlibs-0.1.9/smart/utils/dict.py +204 -0
- smartlibs-0.1.9/smart/utils/dot_path.py +204 -0
- smartlibs-0.1.9/smart/utils/env.py +86 -0
- smartlibs-0.1.9/smart/utils/file/__init__.py +1 -0
- smartlibs-0.1.9/smart/utils/file/cat.py +407 -0
- smartlibs-0.1.9/smart/utils/file/manage.py +58 -0
- smartlibs-0.1.9/smart/utils/func.py +90 -0
- smartlibs-0.1.9/smart/utils/inspect.py +20 -0
- smartlibs-0.1.9/smart/utils/item.py +182 -0
- smartlibs-0.1.9/smart/utils/iter.py +23 -0
- smartlibs-0.1.9/smart/utils/json.py +39 -0
- smartlibs-0.1.9/smart/utils/jsonl.py +107 -0
- smartlibs-0.1.9/smart/utils/kafka/KafkaQueue.py +502 -0
- smartlibs-0.1.9/smart/utils/kafka/__init__.py +0 -0
- smartlibs-0.1.9/smart/utils/lang/DictObj.py +28 -0
- smartlibs-0.1.9/smart/utils/lang/UnSupport.py +21 -0
- smartlibs-0.1.9/smart/utils/lang/__init__.py +0 -0
- smartlibs-0.1.9/smart/utils/list.py +31 -0
- smartlibs-0.1.9/smart/utils/loader.py +234 -0
- smartlibs-0.1.9/smart/utils/log.py +62 -0
- smartlibs-0.1.9/smart/utils/number.py +19 -0
- smartlibs-0.1.9/smart/utils/path.py +104 -0
- smartlibs-0.1.9/smart/utils/process.py +54 -0
- smartlibs-0.1.9/smart/utils/ratio.py +63 -0
- smartlibs-0.1.9/smart/utils/remote_debug.py +61 -0
- smartlibs-0.1.9/smart/utils/retry.py +39 -0
- smartlibs-0.1.9/smart/utils/serialize.py +124 -0
- smartlibs-0.1.9/smart/utils/signal.py +37 -0
- smartlibs-0.1.9/smart/utils/storage/__init__.py +0 -0
- smartlibs-0.1.9/smart/utils/storage/base.py +14 -0
- smartlibs-0.1.9/smart/utils/storage/local_storage.py +48 -0
- smartlibs-0.1.9/smart/utils/storage/minio_storage.py +17 -0
- smartlibs-0.1.9/smart/utils/storage/obj_factory.py +71 -0
- smartlibs-0.1.9/smart/utils/storage/obj_storage.py +40 -0
- smartlibs-0.1.9/smart/utils/store/__init__.py +0 -0
- smartlibs-0.1.9/smart/utils/store/mp_store.py +114 -0
- smartlibs-0.1.9/smart/utils/store/store.py +408 -0
- smartlibs-0.1.9/smart/utils/template.py +234 -0
- smartlibs-0.1.9/smart/utils/thread.py +68 -0
- smartlibs-0.1.9/smart/utils/tuple.py +31 -0
- smartlibs-0.1.9/smart/utils/yaml.py +60 -0
- smartlibs-0.1.9/smartlibs.egg-info/PKG-INFO +143 -0
- smartlibs-0.1.9/smartlibs.egg-info/SOURCES.txt +216 -0
- smartlibs-0.1.9/smartlibs.egg-info/dependency_links.txt +1 -0
- smartlibs-0.1.9/smartlibs.egg-info/entry_points.txt +5 -0
- smartlibs-0.1.9/smartlibs.egg-info/namespace_packages.txt +2 -0
- smartlibs-0.1.9/smartlibs.egg-info/requires.txt +9 -0
- smartlibs-0.1.9/smartlibs.egg-info/top_level.txt +4 -0
- smartlibs-0.1.9/starter/aaas/guid.py +26 -0
- smartlibs-0.1.9/starter/helloworld/__init__.py +4 -0
- smartlibs-0.1.9/starter/helloworld/bind_obj.py +77 -0
- smartlibs-0.1.9/starter/helloworld/cfg_hook.py +19 -0
- smartlibs-0.1.9/starter/helloworld/example_task.py +136 -0
- smartlibs-0.1.9/starter/helloworld/func_task.py +22 -0
- smartlibs-0.1.9/starter/helloworld/join_ext.py +48 -0
- smartlibs-0.1.9/starter/helloworld/utils.py +20 -0
smartlibs-0.1.9/PKG-INFO
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: smartlibs
|
|
3
|
+
Version: 0.1.9
|
|
4
|
+
Summary: Smart Platforms Libs
|
|
5
|
+
Author: huanghongwu
|
|
6
|
+
Author-email: huanghongwu <huanghongwu@sipuai.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/huanghw1989/smartlibs
|
|
9
|
+
Project-URL: Repository, https://github.com/huanghw1989/smartlibs
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.5.0
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: debugpy
|
|
16
|
+
Requires-Dist: fire
|
|
17
|
+
Requires-Dist: pyyaml
|
|
18
|
+
Requires-Dist: requests
|
|
19
|
+
Requires-Dist: redis
|
|
20
|
+
Requires-Dist: nvidia-ml-py
|
|
21
|
+
Provides-Extra: kafka
|
|
22
|
+
Requires-Dist: confluent-kafka; extra == "kafka"
|
|
23
|
+
Dynamic: author
|
|
24
|
+
Dynamic: requires-python
|
|
25
|
+
|
|
26
|
+
Smart Platforms Libs
|
|
27
|
+
|
|
28
|
+
# 项目结构
|
|
29
|
+
**主框架**
|
|
30
|
+
* smart.auto: 自动化框架
|
|
31
|
+
* smart.aaas: auto as a service 自动化任务发布为rest服务
|
|
32
|
+
* smart.utils: 工具类
|
|
33
|
+
* smart.rest: simple rest server
|
|
34
|
+
* smart.rulenet: 规则网络 (原型版本)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
**任务类库**
|
|
38
|
+
* auto_tasks.tools: 基础任务工具
|
|
39
|
+
* auto_tasks.jsonl: jsonl文件读写
|
|
40
|
+
* auto_tasks.redis: redis管道服务, 用于向aaas发送数据和读取结果
|
|
41
|
+
* auto_tasks.aaas: aaas客户端, 启动远程任务
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# How-to-Use
|
|
46
|
+
## Install
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# 安装方法1
|
|
50
|
+
## 仓库安装
|
|
51
|
+
pip install smartlibs
|
|
52
|
+
|
|
53
|
+
## 源码打包安装
|
|
54
|
+
python3 setup.py sdist
|
|
55
|
+
pip install dist/smartlibs-0.1.9.tar.gz
|
|
56
|
+
|
|
57
|
+
# 记录安装文件
|
|
58
|
+
sudo python3 setup.py install --record logs/files.txt
|
|
59
|
+
|
|
60
|
+
## 更新
|
|
61
|
+
pip install --force-reinstall --no-deps smartlibs
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
# Quick Start
|
|
65
|
+
## smart_auto quick start
|
|
66
|
+
*查看命令说明*: `smart_auto -- --help`
|
|
67
|
+
|
|
68
|
+
*Run Cmd Example*:
|
|
69
|
+
> smart_auto auto_tasks.tasks 'task:tools__tool.range~@tools__print.item_iter'
|
|
70
|
+
|
|
71
|
+
*Bind Arg*:
|
|
72
|
+
> smart_auto auto_tasks.tasks task:tools__tool.range~@tools__print.item_iter --bind_arg.tools__tool.range.size=20 --bind_arg.tools__print.item_iter.head=None
|
|
73
|
+
|
|
74
|
+
*Multi Tasks*:
|
|
75
|
+
> smart_auto auto_tasks.test helloworld,helloworld
|
|
76
|
+
> smart_auto auto_tasks.test '["task:tools__tool.range~@tools__print.item_iter","helloworld"]'
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
## smart_aaas quick start
|
|
80
|
+
|
|
81
|
+
*启动aaas服务*: `smart_aaas`
|
|
82
|
+
|
|
83
|
+
*查看命令说明*: `smart_aaas -- --help`
|
|
84
|
+
|
|
85
|
+
[推荐使用 Postman 测试后面的Api]
|
|
86
|
+
|
|
87
|
+
*查看服务描述*(asdl: auto service description language):
|
|
88
|
+
```
|
|
89
|
+
curl 'http://127.0.0.1/auto/run?module=starter.aaas.client&only_parse=1'
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
*创建任务*:
|
|
93
|
+
```
|
|
94
|
+
curl 'http://127.0.0.1/auto/run?module=starter.aaas.client&name=task:tools__tool.range~@tools__print.item_iter'
|
|
95
|
+
在运行smart_aaas的终端将看到任务执行日志
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# 名词解释
|
|
101
|
+
* smartlibs: smartnlp底层框架, 设计用于数据科学
|
|
102
|
+
* smart_auto: 任务自动化工具命令行, 等同于`python -m smart.auto.run`
|
|
103
|
+
* smart_aaas: 自动化发布成服务的工具命令行, 等同于`python -m smart.aaas.run`
|
|
104
|
+
|
|
105
|
+
**smart_auto**
|
|
106
|
+
* asdl: auto service description language 自动化服务描述语言, 基于 [yaml](https://yaml.org/spec/1.1/)
|
|
107
|
+
* auto yml: 使用asdl编写的yml文件
|
|
108
|
+
* task: 最小任务单元, 即使是单进程运行模式, 任务的设计应符合进程隔离; 即任务间应通过管道传递数据
|
|
109
|
+
* tree: 任务树, 每个任务最多一个前置任务, 可以有多个后置任务, 不能出现环形依赖
|
|
110
|
+
* task expression: 任务表达式, 设计简要的字符串规则, 可被解析为任务名、任务方法、连接方法、绑定配置
|
|
111
|
+
* task key: 任务表达式中的任务名、任务方法作为任务关键字, 同一个tree下task key不能重复; 如需复用任务函数, 可用$+数字追加到任务方法之后
|
|
112
|
+
* module: 以'.'分割的路径名(dotted_module_path), 追加'.yml'后缀应指向 auto yml 文件; 完整格式: python包名.下属文件路径(路径分隔符替换成'.').auto_yml文件名
|
|
113
|
+
* pip: 数据管道, 为任务树的依赖任务之间提供数据队列; 发送机制为广播, 即前置任务同时向多个后置任务发送相同数据, 后置任务不能向前置任务发送数据
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# 教程
|
|
118
|
+
* [smart_auto doc](./starter/docs/auto/quick_start.md#Directory)
|
|
119
|
+
|
|
120
|
+
* [smart_aaas doc](./starter/docs/aaas/quick_start.md#Directory)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# 参考代码
|
|
125
|
+
* tests: 单元测试
|
|
126
|
+
|
|
127
|
+
**starter: 入门教程**
|
|
128
|
+
* starter.helloworld: 学习创建auto.yml, 编写任务类
|
|
129
|
+
|
|
130
|
+
* starter.aaas: aaas客户端和服务端示例 (通过Redis建立数据管道)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# 开发
|
|
134
|
+
## Windows环境
|
|
135
|
+
|
|
136
|
+
**注意**: Windows需在命令行执行`git config --global core.autocrlf input`
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
## Start Redis Server
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
docker run -d --name myredis -p 6379:6379 redis redis-server --appendonly yes
|
|
143
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from smart.auto.tree import TreeMultiTask
|
|
2
|
+
from smart.aaas.client import AaasClient
|
|
3
|
+
from smart.utils.yaml import yaml_dumps
|
|
4
|
+
|
|
5
|
+
from .__utils import auto_load, logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@auto_load.task('aaas__client')
class AaasTask(TreeMultiTask):
    """Task group that talks to a remote AaaS service through ``AaasClient``."""

    # Name of the context list in which created remote tasks are recorded.
    CTX_TASK_LIST_NAME = 'aaas__client:tasks'

    def conn(self, entrypoint:str=None, namespace=None, module=None, enable_https=False):
        """Create an ``AaasClient`` and hand it to the following steps.

        Args:
            entrypoint: Forwarded to ``AaasClient``.
            namespace: Forwarded to ``AaasClient``.
            module: When truthy, applied with ``client.set_module``.
            enable_https: Forwarded to ``AaasClient``.

        Returns:
            dict: ``{'client': <AaasClient>}``
        """
        aaas_client = AaasClient(
            entrypoint=entrypoint,
            namespace=namespace,
            enable_https=enable_https,
        )
        if module:
            aaas_client.set_module(module)
        return {'client': aaas_client}

    def run(self, client:AaasClient, task_name, task_module=None, task_id=None,
            task_configs=None, bind_arg=None, run_opts=None, state_hook=None):
        """Create a task via ``client.create_task`` and remember its id.

        When the response carries a truthy ``task_id``, an entry holding
        ``client.init_args()`` and that id is appended to the context list
        named by ``CTX_TASK_LIST_NAME``. The response is logged either way.
        """
        response = client.create_task(
            task_name=task_name,
            task_id=task_id,
            module=task_module,
            configs=task_configs,
            bind_arg=bind_arg,
            run_opts=run_opts,
            state_hook=state_hook,
        )
        created_id = response.get('task_id')

        if created_id:
            self.context.list(self.CTX_TASK_LIST_NAME).append({
                'client': client.init_args(),
                'task_id': created_id,
            })

        logger.info('aaas__client create_task: %s', response)

    def asdl(self, client:AaasClient, task_module=None, task_configs=None, bind_arg=None, run_opts=None):
        """Fetch the service description via ``client.asdl`` and log it as YAML.

        The logged payload is the response's ``"result"`` entry when that is
        truthy, otherwise the whole response.

        Returns:
            dict: ``{"asdl": <response>}``
        """
        description = client.asdl(
            module=task_module,
            configs=task_configs,
            bind_arg=bind_arg,
            run_opts=run_opts,
        )
        printable = description.get("result") or description
        logger.info('aaas__client asdl_rst:\n%s', yaml_dumps(printable))
        return {"asdl": description}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from .__utils import logger
|
|
2
|
+
from collections import namedtuple
|
|
3
|
+
import random
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
MemInfo = namedtuple('MemInfo', ['total', 'free', 'used'])


class GpuInfoGetter:
    """Base class for GPU inspectors.

    Subclasses provide ``get_device_count`` and ``get_memory_info``;
    ``find_mem_free_device`` is implemented here on top of those two.
    """

    def get_device_count(self) -> int:
        """Return the number of GPUs.

        The base implementation is a placeholder and returns ``None``.

        Returns:
            int: Number of GPUs.
        """
        pass

    def get_memory_info(self, index:int) -> MemInfo:
        """Return the memory information of one GPU.

        The base implementation is a placeholder and returns ``None``.

        Args:
            index (int): GPU index; 0 is the first GPU.

        Returns:
            MemInfo: Memory information of that GPU.
        """
        pass

    def find_mem_free_device(self, gpu_num:int=1, shuffle:bool=False,
            free_memory:int=None, free_memory_ratio:float=None, filter_fn:callable=None)->list:
        """Find GPUs that have enough free memory.

        When both ``free_memory`` and ``free_memory_ratio`` are ``None`` every
        GPU qualifies. ``shuffle=True`` visits GPUs in random order,
        ``shuffle=False`` visits them by ascending index. The decision is
        based on the memory reading taken while this method runs, so when
        several tasks search concurrently they may pick the same GPU;
        shuffling reduces that.

        Args:
            gpu_num (int, optional): Maximum number of devices to return.
                Values <= 0 yield an empty list. Defaults to 1.
            shuffle (bool, optional): Randomise the visiting order. Defaults to False.
            free_memory (int, optional): Reject GPUs whose free memory is
                below this value. Defaults to None.
            free_memory_ratio (float, optional): Reject GPUs whose
                free/total ratio is below this value. Defaults to None.
            filter_fn (callable, optional): ``filter_fn(index, meminfo)``;
                a truthy result rejects the GPU. Defaults to None.

        Returns:
            list: ``[(gpu_index: int, meminfo: MemInfo), ...]``
        """
        gpu_count = self.get_device_count()
        gpu_index_iter = range(gpu_count)

        if shuffle:
            gpu_index_iter = list(gpu_index_iter)
            random.shuffle(gpu_index_iter)

        choosed_gpu = []

        for gpu_index in gpu_index_iter:
            # Check the quota before inspecting the next device so that
            # gpu_num <= 0 returns nothing instead of one device.
            if len(choosed_gpu) >= gpu_num:
                break

            meminfo:MemInfo = self.get_memory_info(gpu_index)

            _choose = True
            try:
                if free_memory_ratio is not None:
                    _ratio = float(meminfo.free) / meminfo.total
                    if _ratio < free_memory_ratio:
                        _choose = False

                if free_memory is not None:
                    if meminfo.free < free_memory:
                        _choose = False

                if filter_fn and filter_fn(gpu_index, meminfo):
                    _choose = False
            except Exception as err:
                # A device whose checks fail (e.g. total == 0) is skipped.
                logger.warning("find_mem_free_device err: %s", err)
                _choose = False

            if _choose:
                choosed_gpu.append((gpu_index, meminfo))

        return choosed_gpu
|
|
File without changes
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import os, time
|
|
2
|
+
|
|
3
|
+
from smart.auto import TreeMultiTask
|
|
4
|
+
from smart.utils.env import auto_set_env_by_prefix, AppEnv
|
|
5
|
+
from smart.utils.cast import cast_bool
|
|
6
|
+
from .__utils import auto_load, logger, task_hook
|
|
7
|
+
from .GpuTool import GpuInfoGetter, MemInfo
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Module-level NVML availability flags; CudaToolTask checks
# "nvml_is_disable" before touching pynvml.
_options = {
    "nvml_is_disable": False
}

try:
    import pynvml
except Exception:
    # pynvml is optional. Catch Exception rather than using a bare
    # ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
    _options['nvml_is_disable'] = True
    _options['err_msg'] = "miss pynvml"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CudaInfoGetter(GpuInfoGetter):
    """``GpuInfoGetter`` implementation backed by ``pynvml``."""

    def __init__(self) -> None:
        # NVML is initialised on first query, not at construction time.
        self.__inited = False

    def __nvmlInit(self):
        """Call ``pynvml.nvmlInit()`` the first time it is needed on this instance."""
        if self.__inited:
            return
        pynvml.nvmlInit()
        self.__inited = True

    def get_device_count(self):
        """Return ``pynvml.nvmlDeviceGetCount()``."""
        self.__nvmlInit()
        return pynvml.nvmlDeviceGetCount()

    def get_memory_info(self, index:int):
        """Return the total/free/used memory of device ``index`` as a ``MemInfo``."""
        self.__nvmlInit()
        device = pynvml.nvmlDeviceGetHandleByIndex(index)
        nvml_mem = pynvml.nvmlDeviceGetMemoryInfo(device)
        return MemInfo(total=nvml_mem.total, free=nvml_mem.free, used=nvml_mem.used)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@auto_load.task('gpu_tools.cuda_tool')
class CudaToolTask(TreeMultiTask):
    """Tasks that pick GPUs by free memory and export them via an environment variable."""

    def find_available_gpu(self, gpu_num:int=1, shuffle:bool=False,
            free_memory:int=None, free_memory_ratio:float=None,
            device_env_key:str='CUDA_VISIBLE_DEVICES', ctx_state_name:str='cuda_tool'):
        """Find available GPUs, skipping those already recorded as used.

        When both ``free_memory`` and ``free_memory_ratio`` are ``None`` every
        GPU counts as available. ``shuffle=True`` visits GPUs in random order,
        ``shuffle=False`` by ascending index. Availability is judged from the
        memory reading taken while this method runs, so concurrent tasks may
        still end up on the same GPU; shuffling reduces that.

        The chosen GPUs are appended to the context list
        ``(ctx_state_name, "used_gpu")`` while holding the context-store lock
        ``(ctx_state_name, "find_available_gpu")``.

        Args:
            gpu_num (int, optional): Number of GPUs wanted. Defaults to 1.
            shuffle (bool, optional): Randomise the GPU order. Defaults to False.
            free_memory (int, optional): GPUs with less free memory than this
                are unavailable. Defaults to None.
            free_memory_ratio (float, optional): GPUs whose free/total ratio
                is below this are unavailable. Defaults to None.
            device_env_key (str, optional): Environment variable that receives
                the chosen indexes; a falsy value skips it.
                Defaults to 'CUDA_VISIBLE_DEVICES'.
            ctx_state_name (str, optional): Context name prefix for the
                used-GPU list and the lock. Defaults to 'cuda_tool'.

        Returns:
            dict: ``{"available_gpu": [(gpu_index: int, meminfo: MemInfo)]}``,
            or ``None`` when the ``USE_GPU`` app setting is false or pynvml
            is unavailable.
        """
        if not cast_bool(AppEnv.get("USE_GPU", True)):
            return

        if _options['nvml_is_disable']:
            logger.error("nvml is not support. %s", _options.get("err_msg", ""))
            return

        cuda = CudaInfoGetter()

        with self.context.store.lock((ctx_state_name, "find_available_gpu")):
            used_gpu = self.context.list((ctx_state_name, "used_gpu"))
            # A set gives O(1) membership tests inside the filter callback.
            used_gpu_idx = {idx for idx, _ in used_gpu}

            choosed_gpu = cuda.find_mem_free_device(
                gpu_num=gpu_num,
                shuffle=shuffle,
                free_memory=free_memory,
                free_memory_ratio=free_memory_ratio,
                filter_fn=lambda idx, _: (idx in used_gpu_idx)
            )

            if choosed_gpu:
                self.context.list((ctx_state_name, "used_gpu")).extend(choosed_gpu)

        if device_env_key:
            device_env_val = ','.join([
                str(val[0]) for val in choosed_gpu
            ])
            auto_set_env_by_prefix(device_env_key, device_env_val)
            logger.info("cuda_tool.find_available_gpu set_env %s: %s", device_env_key, device_env_val)

        return {
            "available_gpu": choosed_gpu
        }

    @task_hook.before_task()
    def hook_available_gpu(self, gpu_num:int=1, shuffle:bool=False,
            free_memory:int=None, free_memory_ratio:float=None,
            device_env_key:str=None, ctx_state_name:str='cuda_tool'):
        """Hook variant of the GPU search, registered with ``task_hook.before_task()``.

        When both ``free_memory`` and ``free_memory_ratio`` are ``None`` every
        GPU counts as available. ``shuffle=True`` visits GPUs in random order,
        ``shuffle=False`` by ascending index. Availability is judged from the
        memory reading taken while this method runs, so concurrent tasks may
        still end up on the same GPU; shuffling reduces that.

        Args:
            gpu_num (int, optional): Number of GPUs wanted. Defaults to 1.
            shuffle (bool, optional): Randomise the GPU order. Defaults to False.
            free_memory (int, optional): GPUs with less free memory than this
                are unavailable. Defaults to None.
            free_memory_ratio (float, optional): GPUs whose free/total ratio
                is below this are unavailable. Defaults to None.
            device_env_key (str, optional): Environment variable that receives
                the chosen indexes; a falsy value skips it. Defaults to None.
            ctx_state_name (str, optional): Context state that receives
                ``available_gpu_num`` and ``available_gpu_list``; a falsy
                value skips it. Defaults to 'cuda_tool'.

        Returns:
            dict: ``{"available_gpu": [(gpu_index: int, meminfo: MemInfo)]}``,
            or ``None`` when the ``USE_GPU`` app setting is false or pynvml
            is unavailable.
        """
        if not cast_bool(AppEnv.get("USE_GPU", True)):
            return
        if _options['nvml_is_disable']:
            logger.error("nvml is not support. %s", _options.get("err_msg", ""))
            return

        cuda = CudaInfoGetter()

        choosed_gpu = cuda.find_mem_free_device(
            gpu_num=gpu_num,
            shuffle=shuffle,
            free_memory=free_memory,
            free_memory_ratio=free_memory_ratio
        )

        if device_env_key:
            device_env_val = ','.join([
                str(val[0]) for val in choosed_gpu
            ])
            auto_set_env_by_prefix(device_env_key, device_env_val)
            logger.info("cuda_tool.hook_available_gpu set_env %s: %s", device_env_key, device_env_val)

        if ctx_state_name:
            self.context.state(ctx_state_name).update({
                "available_gpu_num": len(choosed_gpu),
                "available_gpu_list": choosed_gpu
            })

        return {
            "available_gpu": choosed_gpu
        }

    def pop_gpu_from_ctx(self, gpu_num:int=1, min_gpu_num:int=0,
            device_env_key:str='CUDA_VISIBLE_DEVICES', ctx_state_name:str='cuda_tool'):
        """Take GPUs out of the context state and export their indexes.

        Meant to be used together with ``hook_available_gpu``, which fills
        ``available_gpu_num`` and ``available_gpu_list``. When fewer than
        ``min_gpu_num`` GPUs are available nothing is popped and the
        environment variable is set to ``"-1"``.

        Args:
            gpu_num (int, optional): Number of GPUs wanted; raised to
                ``min_gpu_num`` if smaller. Defaults to 1.
            min_gpu_num (int, optional): Minimum number of GPUs required.
                Defaults to 0.
            device_env_key (str, optional): Environment variable that receives
                the GPU indexes; a falsy value skips it.
                Defaults to 'CUDA_VISIBLE_DEVICES'.
            ctx_state_name (str, optional): Context state holding the
                available-GPU list. Defaults to 'cuda_tool'.

        Returns:
            dict: ``{"available_gpu": [(gpu_index: int, meminfo: MemInfo)]}``,
            or ``None`` when the ``USE_GPU`` app setting is false.
        """
        if not cast_bool(AppEnv.get("USE_GPU", True)):
            return

        if gpu_num < min_gpu_num:
            gpu_num = min_gpu_num

        ctx_state = self.context.state(ctx_state_name)
        available_gpu_num = ctx_state.wait("available_gpu_num")
        choosed_gpu = []

        if available_gpu_num >= min_gpu_num:
            with self.context.store.lock((ctx_state_name, "pop_gpu_from_ctx")):
                gpu_list = ctx_state.get("available_gpu_list")
                logger.debug("CudaToolTask.pop_gpu_from_ctx current gpu_list=%s", gpu_list)

                try:
                    if gpu_list and len(gpu_list) >= min_gpu_num:
                        for _ in range(gpu_num):
                            choosed_gpu.append(gpu_list.pop(0))
                except IndexError:
                    # The list ran out before gpu_num entries were taken;
                    # whatever was popped so far is kept.
                    logger.info("CudaToolTask.pop_gpu_from_ctx no enough gpu")

                ctx_state.set("available_gpu_list", gpu_list)

        if device_env_key:
            # Plain comma separator (no embedded space), matching
            # find_available_gpu and hook_available_gpu.
            os.environ[device_env_key] = ",".join(
                str(i) for i, _ in choosed_gpu
            ) if len(choosed_gpu) else "-1"
            logger.info("CudaToolTask.pop_gpu_from_ctx set %s=%s", device_env_key, os.environ[device_env_key])

        return {
            "available_gpu": choosed_gpu
        }
|
|
203
|
+
|
|
204
|
+
|
|
File without changes
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import os, json, logging
|
|
2
|
+
|
|
3
|
+
from smart.auto import TreeMultiTask, AutoLoad
|
|
4
|
+
from smart.utils import list_safe_iter, path_join
|
|
5
|
+
|
|
6
|
+
from .__utils import auto_load, logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@auto_load.task('jsonl__file')
class JsonlFileTask(TreeMultiTask):
    """Tasks for reading and writing JSON Lines files."""

    def __resolve_file_name_keys(self, file_name_keys, file_name_idx_or_key=None):
        """Yield the file-name keys selected by ``file_name_idx_or_key``.

        ``file_name_keys`` may be a comma-separated string or a sequence.
        An int selector yields the key at that position, any other truthy
        selector is yielded as the key itself, and a falsy selector yields
        every key.
        """
        if isinstance(file_name_keys, str):
            file_name_key_list = [k.strip() for k in file_name_keys.split(',')]
        else:
            file_name_key_list = file_name_keys

        if isinstance(file_name_idx_or_key, int):
            yield file_name_key_list[file_name_idx_or_key]
        elif file_name_idx_or_key:
            yield file_name_idx_or_key
        else:
            yield from file_name_key_list

    def pattern_read(self, file_name_keys, file_name_pattern:str='{}', dir_path=None, file_open_opts=None, root_dir=None, group_key='_group', file_path=None):
        """Read several jsonl files whose names are built from a pattern.

        Arguments:
            file_name_keys {list} -- keys used to build the file names

        Keyword Arguments:
            file_name_pattern {str} -- file-name template; '{}' is replaced by the file_name_key (default: {'{}'})
            dir_path {str} -- directory path (default: {None})
            file_open_opts {dict} -- extra options for open() (default: {None})
            root_dir {str} -- root path (default: {None})
            group_key {str} -- key set on each dict item to the file_name_key; falsy disables it (default: {'_group'})
            file_path {str} -- deprecated, use dir_path instead (default: {None})

        Returns:
            dict -- {item_iter_fn}
        """
        _file_name_keys = file_name_keys
        dir_path = dir_path or file_path
        file_open_opts = {'mode': 'r', 'encoding': 'utf8', **(file_open_opts or {})}

        def item_iter_fn(file_name_idx_or_key = None, file_name_keys=None):
            file_name_key_list = self.__resolve_file_name_keys(file_name_keys or _file_name_keys, file_name_idx_or_key)

            for file_name_key in file_name_key_list:
                num_items = 0
                file_name = file_name_pattern.format(file_name_key)
                file = path_join(root_dir, dir_path, file_name)

                if not os.path.exists(file):
                    logger.warning('JsonlFileTask.pattern_read: no found file %s', file)
                    continue

                with open(file, **file_open_opts) as f:
                    for line in f:
                        # Iterated lines keep their newline, so a blank line
                        # is "\n" and must be detected after stripping.
                        if not line.strip():
                            continue
                        item = json.loads(line)
                        if isinstance(item, dict) and group_key:
                            item[group_key] = file_name_key
                        yield item
                        num_items += 1
                logger.debug('JsonlFileTask.pattern_read %s items from %s', num_items, file)

        return {
            'item_iter_fn': item_iter_fn
        }

    def read(self, file_name, dir_path=None, file_open_opts:dict=None, root_dir=None, file_path=None):
        """Read a jsonl file.

        Arguments:
            file_name {str} -- file name

        Keyword Arguments:
            dir_path {str} -- directory path (default: {None})
            file_open_opts {dict} -- extra options for open() (default: {None})
            root_dir {str} -- root path (default: {None})
            file_path {str} -- deprecated, use dir_path instead (default: {None})

        Returns:
            dict -- {item_iter_fn}
        """
        dir_path = dir_path or file_path
        file = path_join(root_dir, dir_path, file_name)
        file_open_opts = {'mode': 'r', 'encoding': 'utf8', **(file_open_opts or {})}

        def item_iter_fn():
            with open(file, **file_open_opts) as f:
                for line in f:
                    # Skip blank lines (e.g. a trailing newline) instead of
                    # failing in json.loads; consistent with pattern_read.
                    if not line.strip():
                        continue
                    yield json.loads(line)

        return {
            'item_iter_fn': item_iter_fn
        }

    def write(self, file_name, dir_path=None, file_open_opts:dict=None, root_dir=None, item_iter=None, item_iter_fn=None, recv_args=None, file_path=None):
        """Write items to a jsonl file, one JSON document per line.

        Arguments:
            file_name {str} -- file name

        Keyword Arguments:
            dir_path {str} -- directory path (default: {None})
            file_open_opts {dict} -- extra options for open() (default: {None})
            root_dir {str} -- root path (default: {None})
            item_iter {generator} -- item iterator (default: {None})
            item_iter_fn {callable} -- factory for the item iterator; ignored when item_iter is truthy (default: {None})
            recv_args {dict} -- keyword arguments for the factory (item_iter_fn, or self.recv_data when that is not given); ignored when item_iter is truthy (default: {None}, treated as {})
            file_path {str} -- deprecated, use dir_path instead (default: {None})
        """
        assert file_name
        dir_path = dir_path or file_path
        file = path_join(root_dir, dir_path, file_name, auto_mkdir=True)
        file_open_opts = {'mode': 'w', 'encoding': 'utf8', **(file_open_opts or {})}
        logger.info('jsonl__file.write %s', file_name)

        # recv_args defaults to None rather than a shared mutable dict.
        item_iter = item_iter or (item_iter_fn or self.recv_data)(**(recv_args or {}))

        count = 0
        with open(file, **file_open_opts) as f:
            for item in item_iter:
                json.dump(item, f, ensure_ascii=False)
                f.write('\n')
                count += 1

        logger.debug('jsonl__file.write %s items to %s', count, file_name)
|
|
132
|
+
|
|
File without changes
|