addftool 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,8 @@ def add_killer_args(subparsers):
11
11
  process_killer_parser.add_argument("--timeout", help="timeout of command", default=5, type=int)
12
12
  process_killer_parser.add_argument("--try_count", help="try count of command", default=3, type=int)
13
13
 
14
+ process_killer_parser.add_argument("--contain_arg", help="args of command", default="", type=str)
15
+
14
16
  process_killer_parser.add_argument("--rocm", help="kill process using rocm", action='store_true', default=False)
15
17
  process_killer_parser.add_argument("--cuda", help="kill process using cuda", action='store_true', default=False)
16
18
 
@@ -59,7 +61,7 @@ def kill_process(processes, timeout=5, try_count=3):
59
61
  exit(1)
60
62
 
61
63
 
62
- def find_and_kill_process(command="", contains=False, use_rocm=False, use_cuda=False, timeout=5, try_count=3, only_view=False):
64
+ def find_and_kill_process(command="", contains=False, contain_arg="", use_rocm=False, use_cuda=False, timeout=5, try_count=3, only_view=False):
63
65
  do_not_do_anything = command is None or len(command) == 0
64
66
  if use_rocm:
65
67
  processes = get_process_using_rocm()
@@ -69,7 +71,7 @@ def find_and_kill_process(command="", contains=False, use_rocm=False, use_cuda=F
69
71
  print("Use top or htop to find the process you want to kill")
70
72
  return
71
73
  else:
72
- processes = get_processes(command=command, contains=contains)
74
+ processes = get_processes(command=command, contains=contains, contain_arg=contain_arg)
73
75
 
74
76
  if only_view:
75
77
  print(f"Found {len(processes)} processes")
@@ -83,7 +85,8 @@ def find_and_kill_process(command="", contains=False, use_rocm=False, use_cuda=F
83
85
 
84
86
 
85
87
  def killer_main(args):
88
+ print(args)
86
89
  find_and_kill_process(
87
- args.name, args.contains, use_rocm=args.rocm, use_cuda=args.cuda,
90
+ args.name, args.contains, args.contain_arg, use_rocm=args.rocm, use_cuda=args.cuda,
88
91
  timeout=args.timeout, try_count=args.try_count, only_view=args.view,
89
92
  )
addftool/process/utils.py CHANGED
@@ -2,7 +2,7 @@ import psutil
2
2
  import subprocess
3
3
 
4
4
 
5
- def get_processes(command="", contains=False, pids=None):
5
+ def get_processes(command="", contains=False, pids=None, contain_arg=""):
6
6
  """获取进程的PID和命令"""
7
7
 
8
8
  processes = []
@@ -21,6 +21,17 @@ def get_processes(command="", contains=False, pids=None):
21
21
  continue
22
22
  if len(cmdline) > 1 and cmdline[1].endswith('addf'):
23
23
  continue
24
+ if len(contain_arg) > 0:
25
+ if len(cmdline) <= 1:
26
+ continue
27
+ flag = False
28
+ for arg in cmdline[1:]:
29
+ if contain_arg in arg:
30
+ flag = True
31
+ break
32
+ if not flag:
33
+ continue
34
+
24
35
  processes.append({'pid': pid, 'command': cmdline})
25
36
  except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
26
37
  pass
addftool/sync.py CHANGED
@@ -35,6 +35,8 @@ def add_args(parser):
35
35
  parser.add_argument("--md5_verify", action='store_true', default=False,
36
36
  help="whether to verify the md5 of the file after sync, default is False.")
37
37
  parser.add_argument("--port", help="the port for torchrun, default is 29501", type=int, default=29501)
38
+ parser.add_argument("--torchrun_alias", type=str, default="torchrun",
39
+ help="the alias of torchrun, default is torchrun. If you use torchrun, please set it to torchrun.")
38
40
  # distributed downloader from blob
39
41
  parser.add_argument("--donwload_nodes", help="download nodes, default is node-0", type=int, default=1)
40
42
  parser.add_argument("folder", help="the folder need to sync", type=str)
@@ -104,18 +106,22 @@ def sync_main(args):
104
106
 
105
107
  # divide the files into chunks for each node by file size
106
108
  sorted_files = sorted(file_size_list.items(), key=lambda x: x[1], reverse=True)
109
+ zero_files = []
107
110
 
108
111
  groups = [[] for _ in range(args.donwload_nodes)]
109
112
  for i, (file_name, file_size) in enumerate(sorted_files):
113
+ if file_size == 0:
114
+ zero_files.append(file_name)
115
+ continue
110
116
  groups[i % args.donwload_nodes].append(file_name)
111
117
 
112
118
  # create a temp folder to save the downloaded files
113
- temp_config_dir = tempfile.mktemp()
114
- os.makedirs(temp_config_dir, exist_ok=True)
115
- print(f"Temp config dir: {temp_config_dir}")
119
+ local_temp_config_dir = tempfile.mktemp()
120
+ os.makedirs(local_temp_config_dir, exist_ok=True)
121
+ print(f"Temp config dir: {local_temp_config_dir}")
116
122
 
117
123
  for i, group in enumerate(groups):
118
- group_file_path = os.path.join(temp_config_dir, f"node_{i}.txt")
124
+ group_file_path = os.path.join(local_temp_config_dir, f"node_{i}.txt")
119
125
  total_size = 0
120
126
  with open(group_file_path, "w") as f:
121
127
  for file_name in group:
@@ -123,6 +129,12 @@ def sync_main(args):
123
129
  total_size += file_size_list[file_name]
124
130
  print(f"Node-{i} will download {len(group)} files, total size: {total_size} bytes")
125
131
 
132
+ with open(os.path.join(local_temp_config_dir, "zero_files.txt"), "w") as f:
133
+ for file_name in zero_files:
134
+ f.write(file_name + "\n")
135
+
136
+ print(f"Detect {len(zero_files)} files with size 0 bytes, they will be special handled.")
137
+
126
138
  with open(args.hostfile, "r") as f:
127
139
  host_list = []
128
140
  for line in f:
@@ -135,24 +147,31 @@ def sync_main(args):
135
147
 
136
148
  print(f"Find {len(host_list)} hosts in hostfile: {args.hostfile}")
137
149
  connection_list = []
150
+
151
+ # avoid the temp_config_dir to be the same as remote_temp_config_dir
152
+ remote_temp_config_dir = tempfile.mktemp()
153
+ while remote_temp_config_dir == local_temp_config_dir:
154
+ remote_temp_config_dir = tempfile.mktemp()
155
+
138
156
  master_addr = get_ip_via_ssh(host_list[0])
139
157
  for i, host in enumerate(host_list):
140
158
  # copy this .py file to the remote host
141
159
  put_commands = []
142
- put_commands.append((__file__, os.path.join(temp_config_dir, "sync.py")))
160
+ put_commands.append((__file__, os.path.join(remote_temp_config_dir, "sync.py")))
143
161
  if i < args.donwload_nodes:
144
- local_group_file = os.path.join(temp_config_dir, f"node_{i}.txt")
145
- put_commands.append((local_group_file, os.path.join(temp_config_dir, f"node_{i}.txt")))
162
+ local_group_file = os.path.join(local_temp_config_dir, f"node_{i}.txt")
163
+ put_commands.append((local_group_file, os.path.join(remote_temp_config_dir, f"node_{i}.txt")))
164
+ put_commands.append((os.path.join(local_temp_config_dir, "zero_files.txt"), os.path.join(remote_temp_config_dir, "zero_files.txt")))
146
165
 
147
166
  commnads = f"export SAS_TOKEN=\"{sas_token}\""
148
- commnads += f" && torchrun --nproc_per_node=1 --nnodes={len(host_list)} --node_rank={i} --master_addr={master_addr} --master_port={args.port}"
149
- commnads += f" {temp_config_dir}/sync.py {args.folder} --tool {args.tool} --from_blob_url {args.from_blob_url}"
167
+ commnads += f" && {args.torchrun_alias} --nproc_per_node=1 --nnodes={len(host_list)} --node_rank={i} --master_addr={master_addr} --master_port={args.port}"
168
+ commnads += f" {remote_temp_config_dir}/sync.py {args.folder} --tool {args.tool} --from_blob_url {args.from_blob_url}"
150
169
  if args.md5_verify:
151
170
  commnads += " --md5_verify"
152
171
  if i < args.donwload_nodes:
153
- commnads += f" --download_index_file {temp_config_dir}/node_{i}.txt"
172
+ commnads += f" --download_index_file {remote_temp_config_dir}/node_{i}.txt"
154
173
 
155
- connection_list.append(ConnectionWithCommand(host, temp_config_dir, put_commands, commnads))
174
+ connection_list.append(ConnectionWithCommand(host, remote_temp_config_dir, put_commands, commnads))
156
175
 
157
176
  group = ThreadingGroup.from_connections(connection_list)
158
177
  group.run('echo "Hello"', hide=False)
@@ -167,7 +186,9 @@ def download_files_from_blob(queue, blob_url, sas_token, folder, download_files,
167
186
  print(f"Node-{node_rank} start downloading {len(download_files)} files from {blob_url} to {folder}")
168
187
  for file_name in download_files:
169
188
  file_path = os.path.join(folder, file_name)
170
- os.makedirs(os.path.dirname(file_path), exist_ok=True)
189
+ file_dir = os.path.dirname(file_path)
190
+ if not os.path.exists(file_dir):
191
+ os.makedirs(file_dir, exist_ok=True)
171
192
  for try_count in range(3):
172
193
  try:
173
194
  download_status = subprocess.run(
@@ -208,6 +229,9 @@ def sync_file_from_rank(rank, file_path, from_rank, md5_verify=False):
208
229
 
209
230
  dist.broadcast(tensor, src=from_rank)
210
231
  if rank != from_rank:
232
+ file_dir = os.path.dirname(file_path)
233
+ if not os.path.exists(file_dir):
234
+ os.makedirs(file_dir, exist_ok=True)
211
235
  with open(file_path, "wb") as f:
212
236
  tensor.cpu().numpy().tofile(f)
213
237
  if md5_verify:
@@ -283,9 +307,25 @@ def sync_worker(args):
283
307
  else:
284
308
  sync_file_from_rank(node_rank, "", global_status_code, md5_verify=args.md5_verify)
285
309
 
310
+ print(f"Node-{node_rank} finished downloading files, time taken: {time.time() - start_time:.2f}s")
286
311
  dist.barrier()
287
312
  download_process.join()
288
313
  destroy_process_group()
314
+
315
+ # current directory
316
+ zero_file = os.path.join(__file__, "zero_files.txt")
317
+ if os.path.exists(zero_file):
318
+ with open(zero_file, "r") as f:
319
+ zero_files = [line.strip() for line in f]
320
+ for zero_file_name in zero_files:
321
+ zero_file_path = os.path.join(args.folder, zero_file_name)
322
+ zero_file_dir = os.path.dirname(zero_file_path)
323
+ if not os.path.exists(zero_file_dir):
324
+ os.makedirs(zero_file_dir, exist_ok=True)
325
+ with open(zero_file_path, "wb") as f:
326
+ f.write(b"")
327
+ print(f"Node-{node_rank} handled {len(zero_files)} files with size 0 bytes, time taken: {time.time() - start_time:.2f}s")
328
+
289
329
  print(f"Node-{node_rank} finished syncing all files, time taken: {time.time() - start_time:.2f}s")
290
330
 
291
331
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: addftool
3
- Version: 0.1.7
3
+ Version: 0.1.9
4
4
  Requires-Dist: cryptography
5
5
  Requires-Dist: requests
6
6
  Requires-Dist: PyYAML
@@ -1,17 +1,17 @@
1
1
  addftool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  addftool/addf_portal.py,sha256=w2LgsoutfnrKhtrQAXouUMwLqnsp5ALlsBYUWg8n9NM,781
3
3
  addftool/blob.py,sha256=NZOItDyFUIdV1tfhJZJJBEzGy296CE5NCictTzP4OPc,8282
4
- addftool/sync.py,sha256=ZFxDeHLYcoTftQUKqVeXJqpkHnqwPg1SvrVUm0kt9OI,12019
4
+ addftool/sync.py,sha256=ZpYxbM8uiPFrV7ODmOaM7asVPCWaxBixA-arVc-1kfs,14045
5
5
  addftool/tool.py,sha256=EuKQ2t2InN7yB-_oYLcdsA7vRqzRGTunwIxplUSqEG0,2054
6
6
  addftool/util.py,sha256=zlNLu8Be8cGIpNRqBw8_0q7nFxWlsJ9cToN62ohjdXE,2335
7
7
  addftool/deploy/__init__.py,sha256=tpyoTh3SqAQojPizsJDvQohu1Pcb3-w-DP5sO4-5lBM,1220
8
8
  addftool/deploy/azure.py,sha256=_o_9Eh8cVwLDAqvfyRYBtQRHs_Gul-nCs2ZXttwO1bk,1301
9
9
  addftool/deploy/ssh_server.py,sha256=f2T8fgwACVljPfdcimMywUjsFnLCWRde7iWPAILpRz8,5463
10
- addftool/process/__init__.py,sha256=gPdGsjMEET6crzOz4Iw5cmf6RR1toXGovydRXv8Uagk,3543
11
- addftool/process/utils.py,sha256=me4HqMz5OgRcQMUJmVhKdTJh4SW5BB-pd_lq7g8-UwE,2252
10
+ addftool/process/__init__.py,sha256=Dze8OrcyjQlAbPrjE_h8bMi8W4b3OJyZOjTucPrkJvM,3721
11
+ addftool/process/utils.py,sha256=JldxnwanLJOgxaPgmCJh7SeBRaaj5rFxWWxh1hpsvbA,2609
12
12
  addftool/ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- addftool-0.1.7.dist-info/METADATA,sha256=m-g6ENcdOOB7ufkIibSl5L8tYflejPWzlqorWwT2j1A,170
14
- addftool-0.1.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
15
- addftool-0.1.7.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
16
- addftool-0.1.7.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
17
- addftool-0.1.7.dist-info/RECORD,,
13
+ addftool-0.1.9.dist-info/METADATA,sha256=VEfXNesNw0S0lZl2_hYrJhrUyRN9SpP1iT1aIhtgKbU,170
14
+ addftool-0.1.9.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
15
+ addftool-0.1.9.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
16
+ addftool-0.1.9.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
17
+ addftool-0.1.9.dist-info/RECORD,,