addftool 0.1.5-py3-none-any.whl → 0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- addftool/addf_portal.py +8 -0
- addftool/deploy/azure.py +1 -0
- addftool/process/__init__.py +1 -0
- addftool/sync.py +301 -0
- addftool/util.py +68 -5
- {addftool-0.1.5.dist-info → addftool-0.1.7.dist-info}/METADATA +2 -1
- addftool-0.1.7.dist-info/RECORD +17 -0
- addftool/sync/__init__.py +0 -42
- addftool-0.1.5.dist-info/RECORD +0 -17
- {addftool-0.1.5.dist-info → addftool-0.1.7.dist-info}/WHEEL +0 -0
- {addftool-0.1.5.dist-info → addftool-0.1.7.dist-info}/entry_points.txt +0 -0
- {addftool-0.1.5.dist-info → addftool-0.1.7.dist-info}/top_level.txt +0 -0
addftool/addf_portal.py
CHANGED

@@ -1,5 +1,7 @@
 import argparse
 from addftool.process import add_killer_args, killer_main
+from addftool.sync import add_sync_args, sync_main
+from addftool.deploy import add_deploy_args, deploy_main
 
 
 def get_args():
@@ -7,6 +9,8 @@ def get_args():
 
     subparsers = parser.add_subparsers(dest='command', help='Sub-command help')
     add_killer_args(subparsers)
+    add_sync_args(subparsers)
+    add_deploy_args(subparsers)
 
     return parser.parse_args()
 
@@ -15,6 +19,10 @@ def main():
     args = get_args()
     if args.command == "kill":
         killer_main(args)
+    elif args.command == "sync":
+        sync_main(args)
+    elif args.command == "deploy":
+        deploy_main(args)
     else:
         print("Unknown command: ", args.command)
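The portal change wires two new sub-commands into argparse's standard subparser-dispatch pattern: each feature module contributes an add_*_args(subparsers) registrar, and main() branches on args.command. A minimal self-contained sketch of that pattern, assuming nothing beyond the standard library (handle_sync is an illustrative stand-in for sync_main, not addftool code):

import argparse

def handle_sync(args):
    # stand-in for addftool's sync_main(args)
    print(f"would sync {args.folder}")

def get_args():
    parser = argparse.ArgumentParser(description="portal demo")
    subparsers = parser.add_subparsers(dest='command', help='Sub-command help')
    # each feature registers its own sub-parser, as add_sync_args(subparsers) does
    sync_parser = subparsers.add_parser('sync', help='sync a folder')
    sync_parser.add_argument("folder", type=str)
    return parser.parse_args()

def main():
    args = get_args()
    if args.command == "sync":
        handle_sync(args)
    else:
        print("Unknown command: ", args.command)

if __name__ == "__main__":
    main()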
addftool/deploy/azure.py
CHANGED

@@ -11,6 +11,7 @@ def deploy_azure(packages):
     command_prefix = "sudo " if need_sudo() else ""
     command = "dpkg -i /tmp/packages-microsoft-prod.deb"
     execute_command(command_prefix + command)
+    execute_command(command_prefix + "apt-get update")
 
     install_packages(packages)
 
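The added line fixes a real ordering issue: dpkg -i packages-microsoft-prod.deb only registers the Microsoft apt repository on the machine, and until apt-get update refreshes the package index, the install_packages call that follows cannot resolve anything published in that repository.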
addftool/process/__init__.py
CHANGED
addftool/sync.py
ADDED

@@ -0,0 +1,301 @@
+import os
+import sys
+import time
+import subprocess
+import hashlib
+import tempfile
+from multiprocessing import Queue, Process
+
+try:
+    from fabric import Connection, ThreadingGroup
+except ImportError:
+    Connection = object
+
+try:
+    import torch
+    import torch.distributed as dist
+    from torch.distributed import init_process_group, destroy_process_group
+    _torch_is_available = True
+except ImportError:
+    _torch_is_available = False
+
+
+def add_sync_args(subparsers):
+    deploy_parser = subparsers.add_parser('sync', help='download and sync folder from master node to other nodes')
+    add_args(deploy_parser)
+
+
+def add_args(parser):
+    parser.add_argument("--from_blob_url", help="download from blob url to master node before sync", type=str, default="")
+    parser.add_argument("--tool", help="tool name", type=str, default="torch_nccl", choices=["torch_nccl"])
+    parser.add_argument("--hostfile", help="host file, sync file from node-0 to others", type=str, default="")
+
+    parser.add_argument("--download_index_file", type=str, default="",
+                        help="the file to save the download index, should be generated by master node.")
+    parser.add_argument("--md5_verify", action='store_true', default=False,
+                        help="whether to verify the md5 of the file after sync, default is False.")
+    parser.add_argument("--port", help="the port for torchrun, default is 29501", type=int, default=29501)
+    # distributed downloader from blob
+    parser.add_argument("--donwload_nodes", help="download nodes, default is node-0", type=int, default=1)
+    parser.add_argument("folder", help="the folder need to sync", type=str)
+
+
+class ConnectionWithCommand(Connection):
+    def __init__(self, host, temp_config_dir, puts, command):
+        super().__init__(host)
+        self.command = command
+        self.puts = puts
+        self.temp_config_dir = temp_config_dir
+
+    def run(self, command, **kwargs):
+        super().run(f"mkdir -p {self.temp_config_dir}", **kwargs)
+        for src, dest in self.puts:
+            self.put(src, remote=dest)
+        super().run(self.command, **kwargs)
+        if command:
+            super().run(command, **kwargs)
+
+
+def get_ip_via_ssh(hostname):
+    if hostname == "localhost":
+        return "127.0.0.1"
+    try:
+        cmd = ["ssh", hostname, "hostname -I | awk '{print $1}'"]
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
+
+        if result.returncode == 0:
+            ip = result.stdout.strip()
+            return ip
+        else:
+            print(f"SSH {hostname} failed: {result.stderr}")
+            return None
+    except Exception as e:
+        print(f"Error executing SSH command on {hostname}: {e}")
+        return None
+
+
+def sync_main(args):
+    sas_token = os.environ.get("SAS_TOKEN")
+    if not sas_token:
+        raise ValueError("SAS_TOKEN environment variable is not set.")
+
+    try:
+        list_operation = subprocess.run(
+            ["azcopy", "list", args.from_blob_url + sas_token, "--machine-readable"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True
+        )
+        if list_operation.returncode != 0:
+            raise RuntimeError(f"Failed to list blob: {list_operation.stderr}")
+    except Exception as e:
+        raise RuntimeError(f"Error while listing blob: {e}")
+
+    file_size_list = {}
+    for line in list_operation.stdout.splitlines():
+        # print(line)
+        parts = line.split("; Content Length:")
+        if len(parts) != 2:
+            print(f"INFO: {line}")
+            continue
+        file_name = parts[0].strip()
+        file_size = int(parts[1])
+        file_size_list[file_name] = file_size
+
+    # divide the files into chunks for each node by file size
+    sorted_files = sorted(file_size_list.items(), key=lambda x: x[1], reverse=True)
+
+    groups = [[] for _ in range(args.donwload_nodes)]
+    for i, (file_name, file_size) in enumerate(sorted_files):
+        groups[i % args.donwload_nodes].append(file_name)
+
+    # create a temp folder to save the downloaded files
+    temp_config_dir = tempfile.mktemp()
+    os.makedirs(temp_config_dir, exist_ok=True)
+    print(f"Temp config dir: {temp_config_dir}")
+
+    for i, group in enumerate(groups):
+        group_file_path = os.path.join(temp_config_dir, f"node_{i}.txt")
+        total_size = 0
+        with open(group_file_path, "w") as f:
+            for file_name in group:
+                f.write(file_name + "\n")
+                total_size += file_size_list[file_name]
+        print(f"Node-{i} will download {len(group)} files, total size: {total_size} bytes")
+
+    with open(args.hostfile, "r") as f:
+        host_list = []
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith("#"):
+                host_list.append(line)
+
+    if len(host_list) < len(groups):
+        raise ValueError(f"Number of hosts in hostfile {len(host_list)} is less than number of download nodes {len(groups)}")
+
+    print(f"Find {len(host_list)} hosts in hostfile: {args.hostfile}")
+    connection_list = []
+    master_addr = get_ip_via_ssh(host_list[0])
+    for i, host in enumerate(host_list):
+        # copy this .py file to the remote host
+        put_commands = []
+        put_commands.append((__file__, os.path.join(temp_config_dir, "sync.py")))
+        if i < args.donwload_nodes:
+            local_group_file = os.path.join(temp_config_dir, f"node_{i}.txt")
+            put_commands.append((local_group_file, os.path.join(temp_config_dir, f"node_{i}.txt")))
+
+        commnads = f"export SAS_TOKEN=\"{sas_token}\""
+        commnads += f" && torchrun --nproc_per_node=1 --nnodes={len(host_list)} --node_rank={i} --master_addr={master_addr} --master_port={args.port}"
+        commnads += f" {temp_config_dir}/sync.py {args.folder} --tool {args.tool} --from_blob_url {args.from_blob_url}"
+        if args.md5_verify:
+            commnads += " --md5_verify"
+        if i < args.donwload_nodes:
+            commnads += f" --download_index_file {temp_config_dir}/node_{i}.txt"
+
+        connection_list.append(ConnectionWithCommand(host, temp_config_dir, put_commands, commnads))
+
+    group = ThreadingGroup.from_connections(connection_list)
+    group.run('echo "Hello"', hide=False)
+
+
+def download_files_from_blob(queue, blob_url, sas_token, folder, download_files, node_rank):
+    # This function should implement the logic to download files from blob storage
+    # using the provided blob_url and sas_token. The downloaded files should be
+    # saved in the specified folder.
+    if not blob_url.endswith("/"):
+        blob_url += "/"
+    print(f"Node-{node_rank} start downloading {len(download_files)} files from {blob_url} to {folder}")
+    for file_name in download_files:
+        file_path = os.path.join(folder, file_name)
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        for try_count in range(3):
+            try:
+                download_status = subprocess.run(
+                    ["azcopy", "copy", blob_url + file_name + sas_token, file_path],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    text=True
+                )
+                if download_status.returncode != 0:
+                    raise RuntimeError(f"Failed to download {file_name}: {download_status.stderr}")
+                print(f"Rank {node_rank}: Downloaded {file_name} successfully, from {blob_url} to {file_path}")
+                queue.put(file_path)
+                break
+            except Exception as e:
+                print(f"Rank {node_rank}: Download failed: {e}")
+
+
+def sync_file_from_rank(rank, file_path, from_rank, md5_verify=False):
+    if rank == from_rank:
+        with open(file_path, "rb") as f:
+            data = f.read()
+        num_bytes = len(data)
+        if md5_verify:
+            md5 = hashlib.md5()
+            md5.update(data)
+            md5_value = md5.hexdigest()
+        else:
+            md5_value = ""
+        obj_list = [file_path, num_bytes, md5_value]
+        dist.broadcast_object_list(obj_list, src=from_rank)
+        tensor = torch.frombuffer(data, dtype=torch.uint8)
+        tensor = tensor.cuda()
+    else:
+        obj_list = [0, "", ""]
+        dist.broadcast_object_list(obj_list, src=from_rank)
+        file_path, num_bytes, md5_value = obj_list
+        tensor = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
+
+    dist.broadcast(tensor, src=from_rank)
+    if rank != from_rank:
+        with open(file_path, "wb") as f:
+            tensor.cpu().numpy().tofile(f)
+        if md5_verify:
+            md5 = hashlib.md5()
+            md5.update(tensor.cpu().numpy())
+            md5_value_recv = md5.hexdigest()
+            if md5_value_recv != md5_value:
+                raise ValueError(f"MD5 mismatch for file {file_path}: {md5_value_recv} != {md5_value}")
+            else:
+                print(f"Node-{rank} verified file {file_path} with MD5: {md5_value_recv}")
+
+
+def sync_worker(args):
+    assert args.tool in ["torch_nccl"], f"tool {args.tool} is not supported"
+    if not _torch_is_available:
+        raise ImportError("Torch is not available. Please install torch to use this feature.")
+    start_time = time.time()
+
+    init_process_group(backend='nccl')
+    node_rank = int(os.environ['RANK'])
+    world_size = int(os.environ['WORLD_SIZE'])
+
+    print(f"rank {node_rank} start sync worker, args = {args}, nccl init time: {time.time() - start_time:.2f}s")
+
+    if world_size < 2:
+        raise ValueError("World size must be at least 2 for distributed download.")
+
+    download_queue = Queue()
+
+    download_files = []
+    transfered_files = set()
+    if args.download_index_file:
+        with open(args.download_index_file, "r") as f:
+            for line in f:
+                download_files.append(line.strip())
+
+    download_process = Process(
+        target=download_files_from_blob,
+        args=(download_queue, args.from_blob_url, os.environ["SAS_TOKEN"], args.folder, download_files, node_rank),
+    )
+    download_process.start()
+
+    last_download = None
+
+    while True:
+        if len(download_files) == len(transfered_files):
+            status_code = world_size + 1
+        elif last_download is not None:
+            status_code = node_rank
+        else:
+            try:
+                last_download = download_queue.get(timeout=1)
+                status_code = node_rank
+            except Exception as e:
+                status_code = world_size
+
+        global_status_code = torch.tensor(status_code).cuda()
+        dist.all_reduce(global_status_code, op=dist.ReduceOp.MIN)
+        global_status_code = global_status_code.item()
+
+        if global_status_code == world_size + 1:
+            print(f"Node-{node_rank} finished downloading all files, time taken: {time.time() - start_time:.2f}s")
+            break
+        elif global_status_code == world_size:
+            if node_rank == 0:
+                print(f"All nodes is waiting for other nodes to finish downloading...")
+            time.sleep(1)
+        elif global_status_code == node_rank:
+            print(f"Node-{node_rank} is downloaded {last_download}, prepare to broadcast it..., time taken: {time.time() - start_time:.2f}s")
+            sync_file_from_rank(node_rank, last_download, node_rank, md5_verify=args.md5_verify)
+            transfered_files.add(last_download)
+            last_download = None
+        else:
+            sync_file_from_rank(node_rank, "", global_status_code, md5_verify=args.md5_verify)
+
+    dist.barrier()
+    download_process.join()
+    destroy_process_group()
+    print(f"Node-{node_rank} finished syncing all files, time taken: {time.time() - start_time:.2f}s")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Addf's tool")
+    add_args(parser)
+    args = parser.parse_args()
+    if args.hostfile:
+        sync_main(args)
+    else:
+        sync_worker(args)
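The flow of the new module: sync_main runs on the operator's machine, lists the blob with azcopy, splits the file list across the first --donwload_nodes hosts, then uses fabric to launch sync_worker on every host under torchrun; whichever worker has downloaded a file broadcasts its bytes to all other ranks over NCCL, sending metadata via broadcast_object_list and the payload via broadcast. Below is a minimal sketch of just that broadcast step, reduced to the gloo backend and CPU tensors so it can run without GPUs (the module above uses nccl with CUDA tensors); it is an illustration of the technique, not addftool code:

import torch
import torch.distributed as dist

def main():
    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()

    if rank == 0:
        data = b"hello from rank 0"
        # metadata first, so receivers can size their buffers
        # (mirrors the [file_path, num_bytes, md5] list in sync_file_from_rank)
        meta = [len(data)]
        dist.broadcast_object_list(meta, src=0)
        tensor = torch.frombuffer(bytearray(data), dtype=torch.uint8)
    else:
        meta = [0]
        dist.broadcast_object_list(meta, src=0)
        tensor = torch.empty(meta[0], dtype=torch.uint8)

    # payload second: every rank participates in the same collective
    dist.broadcast(tensor, src=0)
    print(f"rank {rank}: {bytes(tensor.tolist())!r}")
    dist.destroy_process_group()

if __name__ == "__main__":
    main()

Launched with torchrun --nproc_per_node=2, rank 0 plays the downloader and the other rank receives the bytes, the same two-phase handshake sync_file_from_rank performs per file.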
addftool/util.py
CHANGED

@@ -1,12 +1,75 @@
+import os
 import subprocess
 
 
-def execute_command(command, to_file=None):
+def execute_command(command, to_file=None, only_stdout=True, hide=False):
     if to_file is not None:
         to_file.write(command + "\n")
         return None
     else:
-
-
-
-
+        if not hide:
+            print("Execute command: ", command)
+        result = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
+        result.wait()
+        print(f"Return code: {result.returncode}")
+        if result.stdout is not None:
+            stdout = result.stdout.read().decode()
+            print(f"Stdout: {stdout}")
+        else:
+            stdout = None
+        if only_stdout:
+            if not hide and stdout is not None:
+                print(stdout)
+            return stdout
+        if result.stderr is not None:
+            stderr = result.stderr.read().decode()
+            print(f"Stderr: {stderr}")
+        else:
+            stderr = None
+
+        return {'stdout': stdout, 'stderr': stderr, 'returncode': result.returncode}
+
+
+def need_sudo():
+    return os.name == 'posix' and os.getuid() != 0
+
+
+def is_running_in_docker():
+    return os.path.exists('/.dockerenv') or \
+        any('docker' in line for line in open('/proc/self/cgroup', 'r')) if os.path.exists('/proc/self/cgroup') else False or \
+        os.environ.get('container') == 'docker' or \
+        os.environ.get('DOCKER') == 'true' or \
+        os.environ.get('DOCKER_CONTAINER') == 'yes'
+
+
+def get_ubuntu_version():
+    with open("/etc/os-release") as f:
+        for line in f:
+            if line.startswith("VERSION_ID="):
+                version = line.split("=")[1].strip().strip('"')
+                return version
+    return "22.04"
+
+
+def check_package_installed(package):
+    command = f"dpkg -l | grep {package}"
+    result = execute_command(command)
+    if result is not None and package in result:
+        return True
+    return False
+
+
+def install_packages(package_list):
+    to_install = []
+    for package in package_list:
+        if check_package_installed(package):
+            print(f"{package} is already installed")
+            continue
+        to_install.append(package)
+
+    if len(to_install) > 0:
+        packages = " ".join(to_install)
+        command = f"apt-get install -y {packages}"
+        if need_sudo():
+            command = "sudo " + command
+        execute_command(command)
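Two quirks in the rewritten execute_command are worth flagging. The Popen call only pipes stdout, so result.stderr is always None and the 'stderr' field of the returned dict can never be filled; and reading result.stdout only after result.wait() can deadlock once a command's output exceeds the OS pipe buffer (the Python docs recommend communicate() or subprocess.run for exactly this reason). A minimal sketch of a variant that avoids both, using an illustrative name rather than addftool's API:

import subprocess

def run_command(command, hide=False):
    # capture_output=True pipes stdout and stderr, and run() drains both
    # while the child executes, so large output cannot fill the pipe buffer
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if not hide:
        print("Execute command: ", command)
        print(f"Return code: {result.returncode}")
    return {'stdout': result.stdout, 'stderr': result.stderr,
            'returncode': result.returncode}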
addftool-0.1.7.dist-info/RECORD
ADDED

@@ -0,0 +1,17 @@
+addftool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+addftool/addf_portal.py,sha256=w2LgsoutfnrKhtrQAXouUMwLqnsp5ALlsBYUWg8n9NM,781
+addftool/blob.py,sha256=NZOItDyFUIdV1tfhJZJJBEzGy296CE5NCictTzP4OPc,8282
+addftool/sync.py,sha256=ZFxDeHLYcoTftQUKqVeXJqpkHnqwPg1SvrVUm0kt9OI,12019
+addftool/tool.py,sha256=EuKQ2t2InN7yB-_oYLcdsA7vRqzRGTunwIxplUSqEG0,2054
+addftool/util.py,sha256=zlNLu8Be8cGIpNRqBw8_0q7nFxWlsJ9cToN62ohjdXE,2335
+addftool/deploy/__init__.py,sha256=tpyoTh3SqAQojPizsJDvQohu1Pcb3-w-DP5sO4-5lBM,1220
+addftool/deploy/azure.py,sha256=_o_9Eh8cVwLDAqvfyRYBtQRHs_Gul-nCs2ZXttwO1bk,1301
+addftool/deploy/ssh_server.py,sha256=f2T8fgwACVljPfdcimMywUjsFnLCWRde7iWPAILpRz8,5463
+addftool/process/__init__.py,sha256=gPdGsjMEET6crzOz4Iw5cmf6RR1toXGovydRXv8Uagk,3543
+addftool/process/utils.py,sha256=me4HqMz5OgRcQMUJmVhKdTJh4SW5BB-pd_lq7g8-UwE,2252
+addftool/ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+addftool-0.1.7.dist-info/METADATA,sha256=m-g6ENcdOOB7ufkIibSl5L8tYflejPWzlqorWwT2j1A,170
+addftool-0.1.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+addftool-0.1.7.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
+addftool-0.1.7.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
+addftool-0.1.7.dist-info/RECORD,,
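For readers unfamiliar with the format: each RECORD line is the standard wheel manifest triple path,sha256=<digest>,<size-in-bytes>, so the entries above double as a file listing for the 0.1.7 wheel (addftool/sync.py, for instance, ships at 12019 bytes).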
addftool/sync/__init__.py
DELETED

@@ -1,42 +0,0 @@
-import os
-
-
-def add_sync_args(subparsers):
-    process_killer_parser = subparsers.add_parser('sync', help='download and sync folder from master node to other nodes')
-
-    process_killer_parser.add_argument("--from_blob_url", help="download from blob url to master node before sync", type=str, default="")
-    process_killer_parser.add_argument("--sas_token", help="sas token for blob url", type=str, default="")
-    process_killer_parser.add_argument("--tool", help="tool name", type=str, default="torch_nccl", choices=["torch_nccl", "rsync"])
-    process_killer_parser.add_argument("--hostfile", help="host file, sync file from node-0 to others", type=str, default="")
-
-    # distributed downloader from blob
-    process_killer_parser.add_argument("--donwload_nodes", help="download nodes, default is node-0", type=int, default=1)
-
-    process_killer_parser.add_argument("folder", nargs='?', help="the folder need to sync", type=str, default="")
-
-
-def sync_main(args):
-    print(args)
-    exit(0)
-    if args.source == "" or args.target == "":
-        print("Please provide source and target folder")
-        return
-
-    # check if source is a folder
-    if not os.path.isdir(args.source):
-        print(f"Source {args.source} is not a folder")
-        return
-
-    # check if target is a folder
-    if not os.path.isdir(args.target):
-        print(f"Target {args.target} is not a folder")
-        return
-
-    # check if source and target are the same
-    if os.path.abspath(args.source) == os.path.abspath(args.target):
-        print(f"Source and target are the same")
-        return
-
-    # sync source to target
-    command = f"rsync -avz --delete {args.source} {args.target}"
-    os.system(command)
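Worth noting when comparing with the new addftool/sync.py: this old sync_main was effectively a stub, since it printed its arguments and called exit(0) before ever reaching the rsync logic, and that unreachable code referenced args.source and args.target, which add_sync_args never defined. The rewrite also moves the SAS token from a --sas_token flag to the SAS_TOKEN environment variable and drops the "rsync" tool choice in favor of torch_nccl only.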
addftool-0.1.5.dist-info/RECORD
DELETED

@@ -1,17 +0,0 @@
-addftool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-addftool/addf_portal.py,sha256=U52TdNcwWOEvv_C5r-guWYxn3ntzwYI2eBzJIE7IdcY,493
-addftool/blob.py,sha256=NZOItDyFUIdV1tfhJZJJBEzGy296CE5NCictTzP4OPc,8282
-addftool/tool.py,sha256=EuKQ2t2InN7yB-_oYLcdsA7vRqzRGTunwIxplUSqEG0,2054
-addftool/util.py,sha256=Q3A68vJDxgfeNiEFmk54HuMuworVndocXpSbVpvGMfc,362
-addftool/deploy/__init__.py,sha256=tpyoTh3SqAQojPizsJDvQohu1Pcb3-w-DP5sO4-5lBM,1220
-addftool/deploy/azure.py,sha256=UQR1hOEYUtsm2fbWBczsnEB_mh7yUuN2NDv3sgMMsac,1246
-addftool/deploy/ssh_server.py,sha256=f2T8fgwACVljPfdcimMywUjsFnLCWRde7iWPAILpRz8,5463
-addftool/process/__init__.py,sha256=OB-cZXP1jK7l8uN8nKfhg_bCX6Slz6DeeBpEodt-IK4,3515
-addftool/process/utils.py,sha256=me4HqMz5OgRcQMUJmVhKdTJh4SW5BB-pd_lq7g8-UwE,2252
-addftool/ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-addftool/sync/__init__.py,sha256=wOqFCOA51rFUttBjOO44W3Fc66mhX5ir2R89lsO6gR0,1702
-addftool-0.1.5.dist-info/METADATA,sha256=8fCVZ1r4rS685bprhRuQE5Le7MV3gu8dbNRkvFfo05w,148
-addftool-0.1.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-addftool-0.1.5.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
-addftool-0.1.5.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
-addftool-0.1.5.dist-info/RECORD,,
{addftool-0.1.5.dist-info → addftool-0.1.7.dist-info}/WHEEL
File without changes

{addftool-0.1.5.dist-info → addftool-0.1.7.dist-info}/entry_points.txt
File without changes

{addftool-0.1.5.dist-info → addftool-0.1.7.dist-info}/top_level.txt
File without changes