addftool 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- addftool/process/__init__.py +6 -3
- addftool/process/utils.py +12 -1
- addftool/sync.py +52 -12
- {addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/METADATA +1 -1
- {addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/RECORD +8 -8
- {addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/WHEEL +0 -0
- {addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/entry_points.txt +0 -0
- {addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/top_level.txt +0 -0
addftool/process/__init__.py
CHANGED
@@ -11,6 +11,8 @@ def add_killer_args(subparsers):
     process_killer_parser.add_argument("--timeout", help="timeout of command", default=5, type=int)
     process_killer_parser.add_argument("--try_count", help="try count of command", default=3, type=int)
 
+    process_killer_parser.add_argument("--contain_arg", help="args of command", default="", type=str)
+
     process_killer_parser.add_argument("--rocm", help="kill process using rocm", action='store_true', default=False)
     process_killer_parser.add_argument("--cuda", help="kill process using cuda", action='store_true', default=False)
 
@@ -59,7 +61,7 @@ def kill_process(processes, timeout=5, try_count=3):
         exit(1)
 
 
-def find_and_kill_process(command="", contains=False, use_rocm=False, use_cuda=False, timeout=5, try_count=3, only_view=False):
+def find_and_kill_process(command="", contains=False, contain_arg="", use_rocm=False, use_cuda=False, timeout=5, try_count=3, only_view=False):
     do_not_do_anything = command is None or len(command) == 0
     if use_rocm:
         processes = get_process_using_rocm()
@@ -69,7 +71,7 @@ def find_and_kill_process(command="", contains=False, use_rocm=False, use_cuda=F
         print("Use top or htop to find the process you want to kill")
         return
     else:
-        processes = get_processes(command=command, contains=contains)
+        processes = get_processes(command=command, contains=contains, contain_arg=contain_arg)
 
     if only_view:
         print(f"Found {len(processes)} processes")
@@ -83,7 +85,8 @@ def find_and_kill_process(command="", contains=False, use_rocm=False, use_cuda=F
 
 
 def killer_main(args):
+    print(args)
     find_and_kill_process(
-        args.name, args.contains, use_rocm=args.rocm, use_cuda=args.cuda,
+        args.name, args.contains, args.contain_arg, use_rocm=args.rocm, use_cuda=args.cuda,
         timeout=args.timeout, try_count=args.try_count, only_view=args.view,
     )
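The new --contain_arg flag narrows the kill to processes whose command line carries a given argument. As a minimal usage sketch (the import path comes from this package, but the argument values are illustrative, and only_view=True is used so nothing is actually killed):

    from addftool.process import find_and_kill_process

    # Preview every `python` process whose command line mentions train.py;
    # only_view=True lists the matches instead of killing them.
    find_and_kill_process(
        command="python", contains=True, contain_arg="train.py",
        timeout=5, try_count=3, only_view=True,
    )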
addftool/process/utils.py
CHANGED
@@ -2,7 +2,7 @@ import psutil
 import subprocess
 
 
-def get_processes(command="", contains=False, pids=None):
+def get_processes(command="", contains=False, pids=None, contain_arg=""):
     """Get the PIDs and command lines of matching processes"""
 
     processes = []
@@ -21,6 +21,17 @@ def get_processes(command="", contains=False, pids=None):
                 continue
             if len(cmdline) > 1 and cmdline[1].endswith('addf'):
                 continue
+            if len(contain_arg) > 0:
+                if len(cmdline) <= 1:
+                    continue
+                flag = False
+                for arg in cmdline[1:]:
+                    if contain_arg in arg:
+                        flag = True
+                        break
+                if not flag:
+                    continue
+
             processes.append({'pid': pid, 'command': cmdline})
         except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
             pass
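The added filter keeps a process only if some argument after the executable contains contain_arg. A self-contained sketch of the same predicate, written with any() instead of the flag variable (behaviorally equivalent; the helper name is hypothetical):

    import psutil

    def cmdline_matches(contain_arg: str):
        """Yield (pid, cmdline) for processes with an argument containing contain_arg."""
        for proc in psutil.process_iter(attrs=["pid", "cmdline"]):
            cmdline = proc.info["cmdline"] or []
            # skip processes with no arguments, as the diff does
            if len(cmdline) > 1 and any(contain_arg in arg for arg in cmdline[1:]):
                yield proc.info["pid"], cmdline

    for pid, cmd in cmdline_matches("train.py"):
        print(pid, " ".join(cmd))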
addftool/sync.py
CHANGED
@@ -35,6 +35,8 @@ def add_args(parser):
     parser.add_argument("--md5_verify", action='store_true', default=False,
                         help="whether to verify the md5 of the file after sync, default is False.")
     parser.add_argument("--port", help="the port for torchrun, default is 29501", type=int, default=29501)
+    parser.add_argument("--torchrun_alias", type=str, default="torchrun",
+                        help="the alias of torchrun, default is torchrun. If you use torchrun, please set it to torchrun.")
     # distributed downloader from blob
     parser.add_argument("--donwload_nodes", help="download nodes, default is node-0", type=int, default=1)
     parser.add_argument("folder", help="the folder need to sync", type=str)
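--torchrun_alias substitutes an arbitrary launcher string into the remote command assembled later in this file, which is useful when torchrun is not on PATH or is wrapped. For example (values illustrative; python -m torch.distributed.run is the module form of torchrun):

    args.torchrun_alias = "torchrun"                         # the default
    args.torchrun_alias = "python -m torch.distributed.run"  # equivalent module form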
@@ -104,18 +106,22 @@ def sync_main(args):
 
     # divide the files into chunks for each node by file size
     sorted_files = sorted(file_size_list.items(), key=lambda x: x[1], reverse=True)
+    zero_files = []
 
     groups = [[] for _ in range(args.donwload_nodes)]
     for i, (file_name, file_size) in enumerate(sorted_files):
+        if file_size == 0:
+            zero_files.append(file_name)
+            continue
         groups[i % args.donwload_nodes].append(file_name)
 
     # create a temp folder to save the downloaded files
-
-    os.makedirs(
-    print(f"Temp config dir: {
+    local_temp_config_dir = tempfile.mktemp()
+    os.makedirs(local_temp_config_dir, exist_ok=True)
+    print(f"Temp config dir: {local_temp_config_dir}")
 
     for i, group in enumerate(groups):
-        group_file_path = os.path.join(
+        group_file_path = os.path.join(local_temp_config_dir, f"node_{i}.txt")
         total_size = 0
         with open(group_file_path, "w") as f:
             for file_name in group:
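Zero-byte files are now set aside instead of being round-robined to a download node; they are recreated locally later, so no transfer is spent on them. Because the sort is by size descending, the zero-byte entries land at the tail, so skipping them does not unbalance the rotation of the nonzero files. A minimal sketch of the partition (sizes illustrative):

    file_size_list = {"a.bin": 300, "b.bin": 200, "c.txt": 0, "d.bin": 100}
    download_nodes = 2

    sorted_files = sorted(file_size_list.items(), key=lambda x: x[1], reverse=True)
    zero_files = []
    groups = [[] for _ in range(download_nodes)]
    for i, (name, size) in enumerate(sorted_files):
        if size == 0:
            zero_files.append(name)   # recreated locally, never downloaded
            continue
        groups[i % download_nodes].append(name)

    print(groups)      # [['a.bin', 'd.bin'], ['b.bin']]
    print(zero_files)  # ['c.txt']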
@@ -123,6 +129,12 @@ def sync_main(args):
                 total_size += file_size_list[file_name]
         print(f"Node-{i} will download {len(group)} files, total size: {total_size} bytes")
 
+    with open(os.path.join(local_temp_config_dir, "zero_files.txt"), "w") as f:
+        for file_name in zero_files:
+            f.write(file_name + "\n")
+
+    print(f"Detect {len(zero_files)} files with size 0 bytes, they will be special handled.")
+
     with open(args.hostfile, "r") as f:
         host_list = []
         for line in f:
@@ -135,24 +147,31 @@
 
     print(f"Find {len(host_list)} hosts in hostfile: {args.hostfile}")
     connection_list = []
+
+    # avoid the temp_config_dir to be the same as remote_temp_config_dir
+    remote_temp_config_dir = tempfile.mktemp()
+    while remote_temp_config_dir == local_temp_config_dir:
+        remote_temp_config_dir = tempfile.mktemp()
+
     master_addr = get_ip_via_ssh(host_list[0])
     for i, host in enumerate(host_list):
         # copy this .py file to the remote host
         put_commands = []
-        put_commands.append((__file__, os.path.join(
+        put_commands.append((__file__, os.path.join(remote_temp_config_dir, "sync.py")))
         if i < args.donwload_nodes:
-            local_group_file = os.path.join(
-            put_commands.append((local_group_file, os.path.join(
+            local_group_file = os.path.join(local_temp_config_dir, f"node_{i}.txt")
+            put_commands.append((local_group_file, os.path.join(remote_temp_config_dir, f"node_{i}.txt")))
+        put_commands.append((os.path.join(local_temp_config_dir, "zero_files.txt"), os.path.join(remote_temp_config_dir, "zero_files.txt")))
 
         commnads = f"export SAS_TOKEN=\"{sas_token}\""
-        commnads += f" &&
-        commnads += f" {
+        commnads += f" && {args.torchrun_alias} --nproc_per_node=1 --nnodes={len(host_list)} --node_rank={i} --master_addr={master_addr} --master_port={args.port}"
+        commnads += f" {remote_temp_config_dir}/sync.py {args.folder} --tool {args.tool} --from_blob_url {args.from_blob_url}"
         if args.md5_verify:
             commnads += " --md5_verify"
         if i < args.donwload_nodes:
-            commnads += f" --download_index_file {
+            commnads += f" --download_index_file {remote_temp_config_dir}/node_{i}.txt"
 
-        connection_list.append(ConnectionWithCommand(host,
+        connection_list.append(ConnectionWithCommand(host, remote_temp_config_dir, put_commands, commnads))
 
     group = ThreadingGroup.from_connections(connection_list)
     group.run('echo "Hello"', hide=False)
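For node_rank 0 of a 4-host job, the loop above assembles a remote shell command along these lines (token, addresses, and paths are illustrative placeholders, not values from this diff):

    torchrun_alias, master_addr, port = "torchrun", "10.0.0.4", 29501
    remote_temp_config_dir, folder = "/tmp/tmpab12cd34", "/data/ckpt"

    cmd = 'export SAS_TOKEN="..."'
    cmd += (f" && {torchrun_alias} --nproc_per_node=1 --nnodes=4 --node_rank=0"
            f" --master_addr={master_addr} --master_port={port}")
    cmd += f" {remote_temp_config_dir}/sync.py {folder} --tool azcopy --from_blob_url <blob-url>"
    cmd += f" --download_index_file {remote_temp_config_dir}/node_0.txt"
    print(cmd)  # the string handed to ConnectionWithCommand for host 0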
@@ -167,7 +186,9 @@ def download_files_from_blob(queue, blob_url, sas_token, folder, download_files,
     print(f"Node-{node_rank} start downloading {len(download_files)} files from {blob_url} to {folder}")
     for file_name in download_files:
         file_path = os.path.join(folder, file_name)
-        os.
+        file_dir = os.path.dirname(file_path)
+        if not os.path.exists(file_dir):
+            os.makedirs(file_dir, exist_ok=True)
         for try_count in range(3):
             try:
                 download_status = subprocess.run(
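A small note on the pattern introduced here (and repeated in the two later hunks of this file): os.makedirs(path, exist_ok=True) is already a no-op when the directory exists, so the os.path.exists guard is redundant and the three added lines could collapse to one:

    os.makedirs(os.path.dirname(file_path), exist_ok=True)  # equivalent to the added block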
@@ -208,6 +229,9 @@ def sync_file_from_rank(rank, file_path, from_rank, md5_verify=False):
 
     dist.broadcast(tensor, src=from_rank)
     if rank != from_rank:
+        file_dir = os.path.dirname(file_path)
+        if not os.path.exists(file_dir):
+            os.makedirs(file_dir, exist_ok=True)
         with open(file_path, "wb") as f:
             tensor.cpu().numpy().tofile(f)
         if md5_verify:
@@ -283,9 +307,25 @@ def sync_worker(args):
         else:
             sync_file_from_rank(node_rank, "", global_status_code, md5_verify=args.md5_verify)
 
+    print(f"Node-{node_rank} finished downloading files, time taken: {time.time() - start_time:.2f}s")
     dist.barrier()
     download_process.join()
     destroy_process_group()
+
+    # current directory
+    zero_file = os.path.join(__file__, "zero_files.txt")
+    if os.path.exists(zero_file):
+        with open(zero_file, "r") as f:
+            zero_files = [line.strip() for line in f]
+        for zero_file_name in zero_files:
+            zero_file_path = os.path.join(args.folder, zero_file_name)
+            zero_file_dir = os.path.dirname(zero_file_path)
+            if not os.path.exists(zero_file_dir):
+                os.makedirs(zero_file_dir, exist_ok=True)
+            with open(zero_file_path, "wb") as f:
+                f.write(b"")
+        print(f"Node-{node_rank} handled {len(zero_files)} files with size 0 bytes, time taken: {time.time() - start_time:.2f}s")
+
 
     print(f"Node-{node_rank} finished syncing all files, time taken: {time.time() - start_time:.2f}s")
 
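One caveat in the zero-file branch above: os.path.join(__file__, "zero_files.txt") appends to the script file's own path (e.g. .../sync.py/zero_files.txt), which cannot exist, so os.path.exists returns False and the branch is skipped. The "current directory" comment suggests the script's directory was intended, since put_commands uploads zero_files.txt next to sync.py; a hedged sketch of that reading (an assumption, not code present in 0.1.9):

    # look for zero_files.txt next to this script, where put_commands places it
    zero_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "zero_files.txt")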
{addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
 addftool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 addftool/addf_portal.py,sha256=w2LgsoutfnrKhtrQAXouUMwLqnsp5ALlsBYUWg8n9NM,781
 addftool/blob.py,sha256=NZOItDyFUIdV1tfhJZJJBEzGy296CE5NCictTzP4OPc,8282
-addftool/sync.py,sha256=
+addftool/sync.py,sha256=ZpYxbM8uiPFrV7ODmOaM7asVPCWaxBixA-arVc-1kfs,14045
 addftool/tool.py,sha256=EuKQ2t2InN7yB-_oYLcdsA7vRqzRGTunwIxplUSqEG0,2054
 addftool/util.py,sha256=zlNLu8Be8cGIpNRqBw8_0q7nFxWlsJ9cToN62ohjdXE,2335
 addftool/deploy/__init__.py,sha256=tpyoTh3SqAQojPizsJDvQohu1Pcb3-w-DP5sO4-5lBM,1220
 addftool/deploy/azure.py,sha256=_o_9Eh8cVwLDAqvfyRYBtQRHs_Gul-nCs2ZXttwO1bk,1301
 addftool/deploy/ssh_server.py,sha256=f2T8fgwACVljPfdcimMywUjsFnLCWRde7iWPAILpRz8,5463
-addftool/process/__init__.py,sha256=
-addftool/process/utils.py,sha256=
+addftool/process/__init__.py,sha256=Dze8OrcyjQlAbPrjE_h8bMi8W4b3OJyZOjTucPrkJvM,3721
+addftool/process/utils.py,sha256=JldxnwanLJOgxaPgmCJh7SeBRaaj5rFxWWxh1hpsvbA,2609
 addftool/ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-addftool-0.1.
-addftool-0.1.
-addftool-0.1.
-addftool-0.1.
-addftool-0.1.
+addftool-0.1.9.dist-info/METADATA,sha256=VEfXNesNw0S0lZl2_hYrJhrUyRN9SpP1iT1aIhtgKbU,170
+addftool-0.1.9.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+addftool-0.1.9.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
+addftool-0.1.9.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
+addftool-0.1.9.dist-info/RECORD,,
{addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/WHEEL
File without changes
{addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/entry_points.txt
File without changes
{addftool-0.1.7.dist-info → addftool-0.1.9.dist-info}/top_level.txt
File without changes