addftool 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
addftool/sync.py CHANGED
@@ -35,6 +35,8 @@ def add_args(parser):
     parser.add_argument("--md5_verify", action='store_true', default=False,
                         help="whether to verify the md5 of the file after sync, default is False.")
     parser.add_argument("--port", help="the port for torchrun, default is 29501", type=int, default=29501)
+    parser.add_argument("--torchrun_alias", type=str, default="torchrun",
+                        help="the alias of torchrun, default is torchrun. If you use torchrun, please set it to torchrun.")
     # distributed downloader from blob
     parser.add_argument("--donwload_nodes", help="download nodes, default is node-0", type=int, default=1)
     parser.add_argument("folder", help="the folder need to sync", type=str)
@@ -104,18 +106,22 @@ def sync_main(args):
 
     # divide the files into chunks for each node by file size
     sorted_files = sorted(file_size_list.items(), key=lambda x: x[1], reverse=True)
+    zero_files = []
 
     groups = [[] for _ in range(args.donwload_nodes)]
     for i, (file_name, file_size) in enumerate(sorted_files):
+        if file_size == 0:
+            zero_files.append(file_name)
+            continue
         groups[i % args.donwload_nodes].append(file_name)
 
     # create a temp folder to save the downloaded files
-    temp_config_dir = tempfile.mktemp()
-    os.makedirs(temp_config_dir, exist_ok=True)
-    print(f"Temp config dir: {temp_config_dir}")
+    local_temp_config_dir = tempfile.mktemp()
+    os.makedirs(local_temp_config_dir, exist_ok=True)
+    print(f"Temp config dir: {local_temp_config_dir}")
 
     for i, group in enumerate(groups):
-        group_file_path = os.path.join(temp_config_dir, f"node_{i}.txt")
+        group_file_path = os.path.join(local_temp_config_dir, f"node_{i}.txt")
         total_size = 0
         with open(group_file_path, "w") as f:
             for file_name in group:
@@ -123,6 +129,12 @@ def sync_main(args):
                 total_size += file_size_list[file_name]
         print(f"Node-{i} will download {len(group)} files, total size: {total_size} bytes")
 
+    with open(os.path.join(local_temp_config_dir, "zero_files.txt"), "w") as f:
+        for file_name in zero_files:
+            f.write(file_name + "\n")
+
+    print(f"Detect {len(zero_files)} files with size 0 bytes, they will be special handled.")
+
     with open(args.hostfile, "r") as f:
         host_list = []
         for line in f:
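Taken together, the two hunks above change how sync_main partitions the work: files are sorted by size and dealt round-robin to the download nodes, while zero-byte files are pulled out into a separate index (zero_files.txt) and handled later without a download. A minimal sketch of that grouping with simplified names and made-up sizes:

file_size_list = {"a.bin": 300, "b.bin": 200, "c.bin": 0, "d.bin": 100}
download_nodes = 2  # stand-in for args.donwload_nodes

sorted_files = sorted(file_size_list.items(), key=lambda x: x[1], reverse=True)
zero_files = []
groups = [[] for _ in range(download_nodes)]
for i, (file_name, file_size) in enumerate(sorted_files):
    if file_size == 0:
        zero_files.append(file_name)   # set aside, written to zero_files.txt later
        continue
    groups[i % download_nodes].append(file_name)

print(groups)      # [['a.bin', 'd.bin'], ['b.bin']]
print(zero_files)  # ['c.bin']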
@@ -135,24 +147,31 @@ def sync_main(args):
 
     print(f"Find {len(host_list)} hosts in hostfile: {args.hostfile}")
     connection_list = []
+
+    # avoid the temp_config_dir to be the same as remote_temp_config_dir
+    remote_temp_config_dir = tempfile.mktemp()
+    while remote_temp_config_dir == local_temp_config_dir:
+        remote_temp_config_dir = tempfile.mktemp()
+
     master_addr = get_ip_via_ssh(host_list[0])
     for i, host in enumerate(host_list):
         # copy this .py file to the remote host
         put_commands = []
-        put_commands.append((__file__, os.path.join(temp_config_dir, "sync.py")))
+        put_commands.append((__file__, os.path.join(remote_temp_config_dir, "sync.py")))
         if i < args.donwload_nodes:
-            local_group_file = os.path.join(temp_config_dir, f"node_{i}.txt")
-            put_commands.append((local_group_file, os.path.join(temp_config_dir, f"node_{i}.txt")))
+            local_group_file = os.path.join(local_temp_config_dir, f"node_{i}.txt")
+            put_commands.append((local_group_file, os.path.join(remote_temp_config_dir, f"node_{i}.txt")))
+        put_commands.append((os.path.join(local_temp_config_dir, "zero_files.txt"), os.path.join(remote_temp_config_dir, "zero_files.txt")))
 
         commnads = f"export SAS_TOKEN=\"{sas_token}\""
-        commnads += f" && torchrun --nproc_per_node=1 --nnodes={len(host_list)} --node_rank={i} --master_addr={master_addr} --master_port={args.port}"
-        commnads += f" {temp_config_dir}/sync.py {args.folder} --tool {args.tool} --from_blob_url {args.from_blob_url}"
+        commnads += f" && {args.torchrun_alias} --nproc_per_node=1 --nnodes={len(host_list)} --node_rank={i} --master_addr={master_addr} --master_port={args.port}"
+        commnads += f" {remote_temp_config_dir}/sync.py {args.folder} --tool {args.tool} --from_blob_url {args.from_blob_url}"
         if args.md5_verify:
             commnads += " --md5_verify"
         if i < args.donwload_nodes:
-            commnads += f" --download_index_file {temp_config_dir}/node_{i}.txt"
+            commnads += f" --download_index_file {remote_temp_config_dir}/node_{i}.txt"
 
-        connection_list.append(ConnectionWithCommand(host, temp_config_dir, put_commands, commnads))
+        connection_list.append(ConnectionWithCommand(host, remote_temp_config_dir, put_commands, commnads))
 
     group = ThreadingGroup.from_connections(connection_list)
     group.run('echo "Hello"', hide=False)
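For orientation, this is roughly the per-host command assembled above, now using the local/remote temp-dir split and the configurable launcher alias. Every value below is a placeholder (SAS token, temp dir, hosts, tool name, blob URL are illustrative only):

sas_token = "<sas-token>"                   # placeholder secret
torchrun_alias = "torchrun"                 # or whatever --torchrun_alias was set to
remote_temp_config_dir = "/tmp/tmpabc123"   # hypothetical tempfile.mktemp() result
hosts, master_addr, port, i = ["node-0", "node-1"], "10.0.0.1", 29501, 0

cmd = f'export SAS_TOKEN="{sas_token}"'
cmd += (f" && {torchrun_alias} --nproc_per_node=1 --nnodes={len(hosts)}"
        f" --node_rank={i} --master_addr={master_addr} --master_port={port}")
cmd += f" {remote_temp_config_dir}/sync.py /data/target --tool <tool> --from_blob_url <blob-url>"
print(cmd)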
@@ -167,7 +186,9 @@ def download_files_from_blob(queue, blob_url, sas_token, folder, download_files,
     print(f"Node-{node_rank} start downloading {len(download_files)} files from {blob_url} to {folder}")
     for file_name in download_files:
         file_path = os.path.join(folder, file_name)
-        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        file_dir = os.path.dirname(file_path)
+        if not os.path.exists(file_dir):
+            os.makedirs(file_dir, exist_ok=True)
         for try_count in range(3):
             try:
                 download_status = subprocess.run(
@@ -208,6 +229,9 @@ def sync_file_from_rank(rank, file_path, from_rank, md5_verify=False):
 
     dist.broadcast(tensor, src=from_rank)
     if rank != from_rank:
+        file_dir = os.path.dirname(file_path)
+        if not os.path.exists(file_dir):
+            os.makedirs(file_dir, exist_ok=True)
         with open(file_path, "wb") as f:
             tensor.cpu().numpy().tofile(f)
         if md5_verify:
@@ -283,9 +307,25 @@ def sync_worker(args):
         else:
             sync_file_from_rank(node_rank, "", global_status_code, md5_verify=args.md5_verify)
 
+    print(f"Node-{node_rank} finished downloading files, time taken: {time.time() - start_time:.2f}s")
     dist.barrier()
     download_process.join()
     destroy_process_group()
+
+    # current directory
+    zero_file = os.path.join(__file__, "zero_files.txt")
+    if os.path.exists(zero_file):
+        with open(zero_file, "r") as f:
+            zero_files = [line.strip() for line in f]
+        for zero_file_name in zero_files:
+            zero_file_path = os.path.join(args.folder, zero_file_name)
+            zero_file_dir = os.path.dirname(zero_file_path)
+            if not os.path.exists(zero_file_dir):
+                os.makedirs(zero_file_dir, exist_ok=True)
+            with open(zero_file_path, "wb") as f:
+                f.write(b"")
+        print(f"Node-{node_rank} handled {len(zero_files)} files with size 0 bytes, time taken: {time.time() - start_time:.2f}s")
+
     print(f"Node-{node_rank} finished syncing all files, time taken: {time.time() - start_time:.2f}s")
 
 
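The zero-byte pass at the end of sync_worker recreates the empty files listed in zero_files.txt directly on each node instead of downloading or broadcasting them. A minimal sketch of that step, assuming the index file sits in the same directory as the running script (this sketch resolves the directory of __file__, whereas the shipped code joins onto __file__ itself) and using a placeholder target folder:

import os

index_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "zero_files.txt")
target_folder = "/data/target"   # placeholder for args.folder

if os.path.exists(index_path):
    with open(index_path) as f:
        zero_files = [line.strip() for line in f if line.strip()]
    for name in zero_files:
        path = os.path.join(target_folder, name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb"):
            pass   # create/truncate an empty file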
addftool-0.1.7.dist-info/METADATA → addftool-0.1.8.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: addftool
-Version: 0.1.7
+Version: 0.1.8
 Requires-Dist: cryptography
 Requires-Dist: requests
 Requires-Dist: PyYAML
addftool-0.1.7.dist-info/RECORD → addftool-0.1.8.dist-info/RECORD RENAMED
@@ -1,7 +1,7 @@
 addftool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 addftool/addf_portal.py,sha256=w2LgsoutfnrKhtrQAXouUMwLqnsp5ALlsBYUWg8n9NM,781
 addftool/blob.py,sha256=NZOItDyFUIdV1tfhJZJJBEzGy296CE5NCictTzP4OPc,8282
-addftool/sync.py,sha256=ZFxDeHLYcoTftQUKqVeXJqpkHnqwPg1SvrVUm0kt9OI,12019
+addftool/sync.py,sha256=ZpYxbM8uiPFrV7ODmOaM7asVPCWaxBixA-arVc-1kfs,14045
 addftool/tool.py,sha256=EuKQ2t2InN7yB-_oYLcdsA7vRqzRGTunwIxplUSqEG0,2054
 addftool/util.py,sha256=zlNLu8Be8cGIpNRqBw8_0q7nFxWlsJ9cToN62ohjdXE,2335
 addftool/deploy/__init__.py,sha256=tpyoTh3SqAQojPizsJDvQohu1Pcb3-w-DP5sO4-5lBM,1220
@@ -10,8 +10,8 @@ addftool/deploy/ssh_server.py,sha256=f2T8fgwACVljPfdcimMywUjsFnLCWRde7iWPAILpRz8
 addftool/process/__init__.py,sha256=gPdGsjMEET6crzOz4Iw5cmf6RR1toXGovydRXv8Uagk,3543
 addftool/process/utils.py,sha256=me4HqMz5OgRcQMUJmVhKdTJh4SW5BB-pd_lq7g8-UwE,2252
 addftool/ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-addftool-0.1.7.dist-info/METADATA,sha256=m-g6ENcdOOB7ufkIibSl5L8tYflejPWzlqorWwT2j1A,170
-addftool-0.1.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-addftool-0.1.7.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
-addftool-0.1.7.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
-addftool-0.1.7.dist-info/RECORD,,
+addftool-0.1.8.dist-info/METADATA,sha256=f50lOq51j55hNh2hnk6SdAni0E7MXHec81sBCOHZ_ro,170
+addftool-0.1.8.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+addftool-0.1.8.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
+addftool-0.1.8.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
+addftool-0.1.8.dist-info/RECORD,,