coiled 1.128.1.dev2__tar.gz → 1.128.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of coiled might be problematic. Click here for more details.
- {coiled-1.128.1.dev2 → coiled-1.128.2}/PKG-INFO +1 -1
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/mpi.py +67 -29
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/run.py +4 -4
- {coiled-1.128.1.dev2 → coiled-1.128.2}/.gitignore +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/LICENSE +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/README.md +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/__main__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/analytics.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/auth.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/batch.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/capture_environment.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/batch/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/batch/list.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/batch/logs.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/batch/run.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/batch/status.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/batch/util.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/batch/wait.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/azure_logs.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/better_logs.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/crud.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/get_address.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/list.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/logs.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/metrics.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/ssh.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/cluster/utils.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/config.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/core.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/curl.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/diagnostics.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/env.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/file.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/examples/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/examples/exit.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/examples/hello_world.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/examples/nyc_parquet.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/examples/pytorch.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/examples/xarray_nwm.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/hello.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/scripts/fill_ipython.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/scripts/nyc_parquet.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/scripts/pytorch.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/scripts/xarray_nwm.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/hello/utils.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/login.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/notebook/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/notebook/notebook.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/package_sync.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/prefect.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/prefect_serve.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/setup/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/setup/amp.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/setup/aws.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/setup/azure.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/setup/entry.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/setup/gcp.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/setup/prometheus.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/setup/util.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/sync.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cli/utils.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/cluster.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/coiled.yaml +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/compatibility.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/config.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/context.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/core.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/credentials/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/credentials/aws.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/credentials/google.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/errors.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/exceptions.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/extensions/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/extensions/prefect/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/extensions/prefect/runners.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/extensions/prefect/workers.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/filestore.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/function.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/plugins.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/prefect.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/pypi_conda_map.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/scan.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/software.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/software_utils.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/spans.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/spark.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/types.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/utils.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/cluster.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/cluster_comms.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/core.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/cwi_log_link.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/states.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/widgets/__init__.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/widgets/interface.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/widgets/rich.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/v2/widgets/util.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/coiled/websockets.py +0 -0
- {coiled-1.128.1.dev2 → coiled-1.128.2}/pyproject.toml +0 -0
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import os.path
|
|
2
|
+
import pathlib
|
|
2
3
|
import shlex
|
|
3
4
|
|
|
4
5
|
import click
|
|
@@ -9,7 +10,7 @@ import coiled
|
|
|
9
10
|
from .. import AWSOptions
|
|
10
11
|
from ..v2.cluster_comms import get_cluster_connection_info, get_comm_from_connection_info
|
|
11
12
|
from .cluster.utils import find_cluster
|
|
12
|
-
from .run import KeepaliveSession, get_ssh_connection,
|
|
13
|
+
from .run import KeepaliveSession, get_ssh_connection, upload_file
|
|
13
14
|
from .utils import CONTEXT_SETTINGS
|
|
14
15
|
|
|
15
16
|
|
|
@@ -99,34 +100,13 @@ def setup(
|
|
|
99
100
|
default=True,
|
|
100
101
|
type=bool,
|
|
101
102
|
)
|
|
103
|
+
@click.option("--upload", type=str, multiple=True)
|
|
102
104
|
@click.argument("command", nargs=-1, required=True)
|
|
103
|
-
def run(cluster, workspace, legate, include_head, command):
|
|
105
|
+
def run(cluster, workspace, legate, include_head, upload, command):
|
|
104
106
|
nodes = "$(cat workers | wc -w)"
|
|
105
107
|
|
|
106
108
|
command = list(command)
|
|
107
109
|
|
|
108
|
-
files = {}
|
|
109
|
-
for i, c in enumerate(command):
|
|
110
|
-
if os.path.exists(c):
|
|
111
|
-
remote_path = f"/scratch/batch/{os.path.basename(c)}"
|
|
112
|
-
command[i] = remote_path
|
|
113
|
-
with open(c) as f:
|
|
114
|
-
content = f.read()
|
|
115
|
-
files[remote_path] = content
|
|
116
|
-
|
|
117
|
-
if legate:
|
|
118
|
-
# TODO make "--gpus 1 --sysmem 2000 --fbmem 20000" configurable
|
|
119
|
-
wrapped_command = f"""
|
|
120
|
-
legate \
|
|
121
|
-
--gpus 1 --sysmem 2000 --fbmem 20000 \
|
|
122
|
-
--nodes {nodes} \
|
|
123
|
-
--launcher mpirun \
|
|
124
|
-
--launcher-extra ' --hostfile workers -x PATH ' \
|
|
125
|
-
{shlex.join(command)}
|
|
126
|
-
"""
|
|
127
|
-
else:
|
|
128
|
-
wrapped_command = f"mpirun --hostfile workers -x PATH {shlex.join(command)}"
|
|
129
|
-
|
|
130
110
|
with coiled.Cloud(workspace=workspace) as cloud:
|
|
131
111
|
cluster_info = find_cluster(cloud, cluster)
|
|
132
112
|
cluster_id = cluster_info["id"]
|
|
@@ -136,9 +116,11 @@ legate \
|
|
|
136
116
|
|
|
137
117
|
setup_mpi_ssh(connection, include_scheduler=include_head)
|
|
138
118
|
|
|
139
|
-
|
|
140
|
-
worker_connections = []
|
|
119
|
+
has_implicit_file = any(os.path.exists(c) for c in command)
|
|
141
120
|
|
|
121
|
+
if has_implicit_file or upload:
|
|
122
|
+
# get SSH connections to each of the workers (using scheduler as jump server)
|
|
123
|
+
worker_connections = []
|
|
142
124
|
for worker in cluster_info["workers"]:
|
|
143
125
|
if (
|
|
144
126
|
not worker.get("instance")
|
|
@@ -154,10 +136,66 @@ legate \
|
|
|
154
136
|
)
|
|
155
137
|
)
|
|
156
138
|
|
|
157
|
-
for
|
|
158
|
-
|
|
139
|
+
for idx, implicit_file in enumerate(command):
|
|
140
|
+
if os.path.exists(implicit_file) and os.path.isfile(implicit_file):
|
|
141
|
+
# this will preserve path structure relative to cwd
|
|
142
|
+
# so `coiled run python ./subdir/foo.py` will go to `/scratch/subdir/foo.py`
|
|
143
|
+
remote_path = upload_file(connection, implicit_file, remote_root="/scratch/batch")
|
|
144
|
+
print(f"Uploaded {implicit_file} to {remote_path}")
|
|
145
|
+
|
|
146
|
+
for conn in worker_connections:
|
|
147
|
+
upload_file(conn, implicit_file, remote_root="/scratch/batch")
|
|
148
|
+
|
|
149
|
+
# adjust command to reference path on VM
|
|
150
|
+
command[idx] = remote_path
|
|
151
|
+
|
|
152
|
+
files_to_upload = []
|
|
153
|
+
for f in upload:
|
|
154
|
+
path = pathlib.Path(f)
|
|
155
|
+
if not path.exists():
|
|
156
|
+
raise FileNotFoundError(f"Cannot find specified file {f}")
|
|
157
|
+
|
|
158
|
+
if path.is_file():
|
|
159
|
+
files_to_upload.append({"f": path})
|
|
160
|
+
elif path.is_dir():
|
|
161
|
+
# for paths outside cwd, parent_dir is used as the root so that path structure from there is preserved
|
|
162
|
+
parent_dir = pathlib.Path(path).parent
|
|
163
|
+
for subfile in path.rglob("*"):
|
|
164
|
+
if subfile.is_file():
|
|
165
|
+
files_to_upload.append({"f": subfile, "specified_root": parent_dir})
|
|
166
|
+
|
|
167
|
+
if files_to_upload:
|
|
168
|
+
print(
|
|
169
|
+
f"Uploading {len(files_to_upload)} file{'s' if len(files_to_upload) > 1 else ''} "
|
|
170
|
+
"from local machine to cloud VM..."
|
|
171
|
+
)
|
|
172
|
+
for i, file_to_upload in enumerate(files_to_upload):
|
|
173
|
+
try:
|
|
174
|
+
mb_size = file_to_upload["f"].stat().st_size / 1_000_000
|
|
175
|
+
if mb_size > 1:
|
|
176
|
+
print(f" {file_to_upload['f']} is {mb_size:.2f} MB, this may be slow to upload")
|
|
177
|
+
except Exception:
|
|
178
|
+
pass
|
|
179
|
+
remote_path = upload_file(connection, remote_root="/scratch/batch", **file_to_upload)
|
|
180
|
+
print(" ", remote_path)
|
|
159
181
|
for conn in worker_connections:
|
|
160
|
-
|
|
182
|
+
upload_file(conn, remote_root="/scratch/batch", **file_to_upload)
|
|
183
|
+
if i and (i % 20 == 0 or i + 1 == len(files_to_upload)):
|
|
184
|
+
print(f" {i + 1}/{len(files_to_upload)} files uploaded")
|
|
185
|
+
|
|
186
|
+
if legate:
|
|
187
|
+
# TODO make "--gpus 1 --sysmem 2000 --fbmem 20000" configurable
|
|
188
|
+
# wrapped_command = f"""
|
|
189
|
+
wrapped_command = f"""
|
|
190
|
+
legate \
|
|
191
|
+
--gpus 1 --sysmem 2000 --fbmem 20000 \
|
|
192
|
+
--nodes {nodes} \
|
|
193
|
+
--launcher mpirun \
|
|
194
|
+
--launcher-extra ' --hostfile workers -x PATH ' \
|
|
195
|
+
{shlex.join(command)}
|
|
196
|
+
"""
|
|
197
|
+
else:
|
|
198
|
+
wrapped_command = f"mpirun --hostfile workers -x PATH {shlex.join(command)}"
|
|
161
199
|
|
|
162
200
|
print(f"Running command:\n{wrapped_command}")
|
|
163
201
|
|
|
@@ -125,7 +125,7 @@ def write_files_into_container(connection, container_name: str, files: Dict[str,
|
|
|
125
125
|
)
|
|
126
126
|
|
|
127
127
|
|
|
128
|
-
def upload_file(connection, f, specified_root=None) -> str:
|
|
128
|
+
def upload_file(connection: fabric.Connection, f: str, specified_root=None, remote_root="/scratch") -> str:
|
|
129
129
|
cwd = os.path.abspath(os.path.curdir)
|
|
130
130
|
base = os.path.basename(f)
|
|
131
131
|
is_under_cwd = os.path.commonpath((os.path.abspath(f), cwd)) == cwd
|
|
@@ -151,16 +151,16 @@ def upload_file(connection, f, specified_root=None) -> str:
|
|
|
151
151
|
# For example, if user specified `--file /absolute/subdir/`, then preserve path structure relative
|
|
152
152
|
# to `/absolute/subdir/`, so `/absolute/subdir/foo/bar.txt` would go to `/scratch/subdir/foo/bar.txt`.
|
|
153
153
|
specified_path_dir = os.path.dirname(os.path.relpath(f, relative_to))
|
|
154
|
-
remote_dir = f"/scratch/{specified_path_dir}/"
|
|
154
|
+
remote_dir = f"{remote_root}/{specified_path_dir}/"
|
|
155
155
|
make_remote_dir(connection, remote_dir)
|
|
156
156
|
else:
|
|
157
|
-
remote_dir = "/scratch/"
|
|
157
|
+
remote_dir = f"{remote_root}/"
|
|
158
158
|
|
|
159
159
|
connection.put(f, remote_dir)
|
|
160
160
|
|
|
161
161
|
# we want path on Linux VM, which might not match os.path.join run client-side, so join path manually
|
|
162
162
|
# remote_dir should already end in "/"
|
|
163
|
-
return f"{remote_dir}{base}"
|
|
163
|
+
return f"{remote_dir}{base}".replace("//", "/")
|
|
164
164
|
|
|
165
165
|
|
|
166
166
|
def run_via_ssh(
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|