coiled 1.118.4.dev6-py3-none-any.whl → 1.129.3.dev10-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- coiled/batch.py +17 -1
- coiled/capture_environment.py +45 -77
- coiled/cli/batch/run.py +141 -10
- coiled/cli/batch/util.py +28 -0
- coiled/cli/batch/wait.py +57 -47
- coiled/cli/core.py +4 -0
- coiled/cli/curl.py +7 -2
- coiled/cli/file.py +116 -0
- coiled/cli/hello/hello.py +6 -5
- coiled/cli/mpi.py +252 -0
- coiled/cli/notebook/notebook.py +10 -0
- coiled/cli/run.py +53 -10
- coiled/cli/setup/aws.py +48 -12
- coiled/cli/setup/azure.py +50 -1
- coiled/context.py +2 -2
- coiled/credentials/google.py +1 -20
- coiled/filestore.py +458 -0
- coiled/plugins.py +3 -0
- coiled/pypi_conda_map.py +14 -0
- coiled/software_utils.py +140 -5
- coiled/spans.py +2 -0
- coiled/types.py +18 -1
- coiled/utils.py +65 -1
- coiled/v2/cluster.py +25 -3
- coiled/v2/cluster_comms.py +72 -0
- coiled/v2/core.py +7 -0
- {coiled-1.118.4.dev6.dist-info → coiled-1.129.3.dev10.dist-info}/METADATA +1 -1
- {coiled-1.118.4.dev6.dist-info → coiled-1.129.3.dev10.dist-info}/RECORD +31 -26
- {coiled-1.118.4.dev6.dist-info → coiled-1.129.3.dev10.dist-info}/WHEEL +1 -1
- {coiled-1.118.4.dev6.dist-info → coiled-1.129.3.dev10.dist-info}/entry_points.txt +0 -0
- {coiled-1.118.4.dev6.dist-info → coiled-1.129.3.dev10.dist-info}/licenses/LICENSE +0 -0
coiled/cli/curl.py
CHANGED
@@ -1,3 +1,4 @@
+from json import dumps as json_dumps
 from json import loads as json_loads
 
 import click

@@ -24,16 +25,20 @@ def curl(url: str, request, data, json, json_output):
     url = f"{cloud.server}{url}"
     response = sync_request(cloud, url, method=request, data=all_data, json=json, json_output=json_output)
 
-
+    if json_output:
+        print(json_dumps(response, indent=4))
+    else:
+        print(response)
 
 
-def sync_request(cloud, url, method, data, json: bool = False, json_output: bool = False):
+def sync_request(cloud, url, method, data=None, json: bool = False, json_output: bool = False):
     kwargs = {"method": method, "url": url}
 
     if json:
         kwargs["json"] = json_loads(data) if isinstance(data, str) else data
     else:
         kwargs["data"] = data
+
     response = cloud._sync(cloud._do_request, **kwargs)
     if response.status >= 400:
         print(f"{url} returned {response.status}")
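For context, a minimal sketch of calling the updated helper now that data defaults to None and json_output returns the parsed response; assumes an existing Coiled login, and the endpoint shown is a placeholder rather than something from this diff:

import coiled
from coiled.cli.curl import sync_request

with coiled.Cloud() as cloud:
    # GET with no request body; json_output=True asks for the parsed JSON response
    result = sync_request(
        cloud,
        url=f"{cloud.server}/api/v2/example-endpoint",  # hypothetical endpoint for illustration
        method="get",
        json_output=True,
    )
    print(result)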
coiled/cli/file.py
ADDED
@@ -0,0 +1,116 @@
+import click
+
+import coiled
+from coiled.filestore import FilestoreManager
+
+from .cluster.utils import find_cluster
+from .utils import CONTEXT_SETTINGS
+
+
+@click.command(
+    context_settings=CONTEXT_SETTINGS,
+)
+@click.argument("cluster", default="", required=False)
+@click.option(
+    "--workspace",
+    default=None,
+    help="Coiled workspace (uses default workspace if not specified).",
+)
+@click.option(
+    "--filestore",
+    default=None,
+    help="Name of filestore (optional).",
+)
+@click.option(
+    "--filter",
+    "name_includes",
+    default=None,
+    help="Filter on file paths and/or names to download (optional).",
+)
+@click.option("--into", default=".")
+def download(cluster, workspace, filestore, name_includes, into):
+    if filestore:
+        filestores = FilestoreManager.get_filestore(name=filestore) or []
+        if not filestores:
+            print(f"{filestore} filestore not found")
+
+        for fs in filestores:
+            coiled.filestore.download_from_filestore_with_ui(
+                fs=fs,
+                into=into,
+                name_includes=name_includes,
+            )
+
+    else:
+        with coiled.Cloud(workspace=workspace) as cloud:
+            cluster_info = find_cluster(cloud, cluster)
+            cluster_id = cluster_info["id"]
+            attachments = FilestoreManager.get_cluster_attachments(cluster_id)
+            if not attachments:
+                print(f"No filestore found for {cluster_info['name']} ({cluster_info['id']})")
+
+            # TODO (possible enhancement) if there are multiple output filestores, let user pick which to download
+            for attachment in attachments:
+                if attachment["output"]:
+                    coiled.filestore.download_from_filestore_with_ui(
+                        fs=attachment["filestore"],
+                        into=into,
+                        name_includes=name_includes,
+                    )
+
+
+@click.command(
+    context_settings=CONTEXT_SETTINGS,
+)
+@click.argument("cluster", default="", required=False)
+@click.option(
+    "--workspace",
+    default=None,
+    help="Coiled workspace (uses default workspace if not specified).",
+)
+@click.option(
+    "--filestore",
+    default=None,
+    help="Name of filestore (optional).",
+)
+@click.option(
+    "--filter",
+    "name_includes",
+    default=None,
+    help="Filter on file paths and/or names to download (optional).",
+)
+def list_files(cluster, workspace, filestore, name_includes):
+    if filestore:
+        filestores = FilestoreManager.get_filestore(name=filestore) or []
+        if not filestores:
+            print(f"{filestore} filestore not found")
+
+        for fs in filestores:
+            coiled.filestore.list_files_ui(
+                fs=fs,
+                name_includes=name_includes,
+            )
+
+    else:
+        with coiled.Cloud(workspace=workspace) as cloud:
+            cluster_info = find_cluster(cloud, cluster)
+            cluster_id = cluster_info["id"]
+            attachments = FilestoreManager.get_cluster_attachments(cluster_id)
+            if not attachments:
+                print(f"No filestore found for {cluster_info['name']} ({cluster_info['id']})")
+
+            # TODO (possible enhancement) if there are multiple output filestores, let user pick which to download
+            for attachment in attachments:
+                if attachment["output"]:
+                    coiled.filestore.list_files_ui(
+                        fs=attachment["filestore"],
+                        name_includes=name_includes,
+                    )
+
+
+@click.group(name="file", context_settings=CONTEXT_SETTINGS)
+def file_group(): ...
+
+
+file_group.add_command(download)
+file_group.add_command(list_files, "list")
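A small sketch of exercising the new command group locally with Click's test runner (assumes this wheel is installed; the commented-out invocation needs a Coiled account plus an existing cluster or filestore, so it is illustrative only):

from click.testing import CliRunner

from coiled.cli.file import file_group

runner = CliRunner()
# Show the subcommands this new file adds: "download" and "list"
print(runner.invoke(file_group, ["--help"]).output)

# Hypothetical real invocation against an output filestore attached to a cluster:
# runner.invoke(file_group, ["list", "my-cluster", "--filter", ".csv"])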
coiled/cli/hello/hello.py
CHANGED
@@ -1,8 +1,6 @@
 from __future__ import annotations
 
 import asyncio
-import json
-import subprocess
 import sys
 import time
 

@@ -18,6 +16,7 @@ import coiled
 from coiled.scan import scan_prefix
 from coiled.utils import login_if_required
 
+from ..curl import sync_request
 from .examples import examples
 from .utils import PRIMARY_COLOR, Panel, console, has_macos_system_python, log_interactions
 

@@ -41,8 +40,10 @@ def needs_login():
 
 
 def get_interactions():
-
-
+    with coiled.Cloud() as cloud:
+        return sync_request(
+            cloud, url=f"{cloud.server}/api/v2/interactions/user-interactions/hello", method="get", json_output=True
+        )
 
 
 def get_already_run_examples():

@@ -294,7 +295,7 @@ Choose any computation you'd like to run:
 Yee-haw you've done all my examples 🎉
 Now you can:
 - Try Coiled in your own use case
-- [
+- [Ask us questions](mailto:support@coiled.io)
 - Explore the [docs](https://docs.coiled.io?utm_source=coiled-hello&utm_medium=finished) to see all the other things Coiled can do
 """), # noqa
 border_style="green",
coiled/cli/mpi.py
ADDED
@@ -0,0 +1,252 @@
+import os.path
+import pathlib
+import shlex
+
+import click
+import fabric.connection
+
+import coiled
+
+from .. import AWSOptions
+from ..v2.cluster_comms import get_cluster_connection_info, get_comm_from_connection_info
+from .cluster.utils import find_cluster
+from .run import KeepaliveSession, get_ssh_connection, upload_file
+from .utils import CONTEXT_SETTINGS
+
+
+@click.command(
+    context_settings=CONTEXT_SETTINGS,
+)
+@click.option("--worker-nodes", default=1, type=int)
+@click.option(
+    "--include-head/--exclude-head",
+    default=True,
+    type=bool,
+    help="Include head (scheduler) node in placement group with EFA",
+)
+@click.option("--vm-type", default="g6.8xlarge", type=str)
+@click.option("--head-vm-type", default=None, type=str)
+@click.option("--worker-vm-type", default=None, type=str)
+@click.option("--pip", multiple=True, type=str, help="Packages to install with pip")
+@click.option("--apt", multiple=True, type=str, help="Packages to install with apt install")
+@click.option("--setup-script", default=None, type=str, help="Path to additional host setup script")
+@click.option("--cluster-timeout", default="1h", type=str, help="Maximum lifetime for cluster")
+@click.option(
+    "--idle-timeout",
+    default="10m",
+    type=str,
+    help="How long to wait when nothing is running before shutting down cluster",
+)
+def setup(
+    worker_nodes,
+    include_head,
+    vm_type,
+    head_vm_type,
+    worker_vm_type,
+    pip,
+    apt,
+    setup_script,
+    cluster_timeout,
+    idle_timeout,
+):
+    additional_host_setup = None
+    if setup_script:
+        with open(setup_script) as f:
+            additional_host_setup = f.read()
+    setup_script = get_host_setup_script(pip_install=pip, apt_install=apt, additional_setup=additional_host_setup)
+
+    efa_settings = (
+        {"use_placement_group": True, "use_efa": True}
+        if include_head
+        else {"use_worker_placement_group": True, "use_worker_efa": True}
+    )
+
+    cluster = coiled.Cluster(
+        n_workers=worker_nodes,
+        container="daskdev/dask:latest",
+        allow_ssh_from="me",
+        host_setup_script=setup_script,
+        backend_options=AWSOptions(**{**efa_settings, "ami_version": "DL"}),
+        scheduler_vm_types=[head_vm_type or vm_type],
+        worker_vm_types=[worker_vm_type or vm_type],
+        worker_disk_size="100GB",
+        scheduler_disk_size="100GB",
+        shutdown_on_close=False,
+        idle_timeout="520 weeks",  # don't use idle timeout
+        cluster_timeout=cluster_timeout,
+    )
+
+    cluster.set_keepalive(keepalive=idle_timeout)
+
+    print("Cluster created, installing software for MPI...")
+
+    with coiled.Cloud() as cloud:
+        connection = get_ssh_connection(cloud, cluster.cluster_id)
+
+    with KeepaliveSession(cluster=cluster, prefix="mpi-ssh-setup"):
+        setup_mpi_ssh(connection)
+
+    print("MPI is ready")
+
+
+@click.command(
+    context_settings=CONTEXT_SETTINGS,
+)
+@click.option("--cluster", default=None)
+@click.option("--workspace", default=None, type=str)
+@click.option("--legate", is_flag=True, default=False, type=bool)
+@click.option(
+    "--include-head/--exclude-head",
+    default=True,
+    type=bool,
+)
+@click.option("--upload", type=str, multiple=True)
+@click.argument("command", nargs=-1, required=True)
+def run(cluster, workspace, legate, include_head, upload, command):
+    nodes = "$(cat workers | wc -w)"
+
+    command = list(command)
+
+    with coiled.Cloud(workspace=workspace) as cloud:
+        cluster_info = find_cluster(cloud, cluster)
+        cluster_id = cluster_info["id"]
+        connection = get_ssh_connection(cloud, cluster_id)
+        address, security = get_cluster_connection_info(cluster_id, cloud)
+        comms = get_comm_from_connection_info(address, security)
+
+    setup_mpi_ssh(connection, include_scheduler=include_head)
+
+    has_implicit_file = any(os.path.exists(c) for c in command)
+
+    if has_implicit_file or upload:
+        # get SSH connections to each of the workers (using scheduler as jump server)
+        worker_connections = []
+        for worker in cluster_info["workers"]:
+            if (
+                not worker.get("instance")
+                or not worker["instance"].get("current_state")
+                or worker["instance"]["current_state"]["state"] != "ready"
+            ):
+                continue
+            worker_address = worker["instance"]["private_ip_address"]
+
+            worker_connections.append(
+                fabric.connection.Connection(
+                    worker_address, gateway=connection, user=connection.user, connect_kwargs=connection.connect_kwargs
+                )
+            )
+
+        for idx, implicit_file in enumerate(command):
+            if os.path.exists(implicit_file) and os.path.isfile(implicit_file):
+                # this will preserve path structure relative to cwd
+                # so `coiled run python ./subdir/foo.py` will go to `/scratch/subdir/foo.py`
+                remote_path = upload_file(connection, implicit_file, remote_root="/scratch/batch")
+                print(f"Uploaded {implicit_file} to {remote_path}")
+
+                for conn in worker_connections:
+                    upload_file(conn, implicit_file, remote_root="/scratch/batch")
+
+                # adjust command to reference path on VM
+                command[idx] = remote_path
+
+        files_to_upload = []
+        for f in upload:
+            path = pathlib.Path(f)
+            if not path.exists():
+                raise FileNotFoundError(f"Cannot find specified file {f}")
+
+            if path.is_file():
+                files_to_upload.append({"f": path})
+            elif path.is_dir():
+                # for paths outside cwd, parent_dir is used as the root so that path structure from there is preserved
+                parent_dir = pathlib.Path(path).parent
+                for subfile in path.rglob("*"):
+                    if subfile.is_file():
+                        files_to_upload.append({"f": subfile, "specified_root": parent_dir})
+
+        if files_to_upload:
+            print(
+                f"Uploading {len(files_to_upload)} file{'s' if len(files_to_upload) > 1 else ''} "
+                "from local machine to cloud VM..."
+            )
+            for i, file_to_upload in enumerate(files_to_upload):
+                try:
+                    mb_size = file_to_upload["f"].stat().st_size / 1_000_000
+                    if mb_size > 1:
+                        print(f"  {file_to_upload['f']} is {mb_size:.2f} MB, this may be slow to upload")
+                except Exception:
+                    pass
+                remote_path = upload_file(connection, remote_root="/scratch/batch", **file_to_upload)
+                print("  ", remote_path)
+                for conn in worker_connections:
+                    upload_file(conn, remote_root="/scratch/batch", **file_to_upload)
+                if i and (i % 20 == 0 or i + 1 == len(files_to_upload)):
+                    print(f"  {i + 1}/{len(files_to_upload)} files uploaded")
+
+    if legate:
+        # TODO make "--gpus 1 --sysmem 2000 --fbmem 20000" configurable
+        # wrapped_command = f"""
+        wrapped_command = f"""
+legate \
+--gpus 1 --sysmem 2000 --fbmem 20000 \
+--nodes {nodes} \
+--launcher mpirun \
+--launcher-extra ' --hostfile workers -x PATH ' \
+{shlex.join(command)}
+"""
+    else:
+        wrapped_command = f"mpirun --hostfile workers -x PATH {shlex.join(command)}"
+
+    print(f"Running command:\n{wrapped_command}")
+
+    with KeepaliveSession(cluster=None, comms=comms, prefix="mpi-ssh"):
+        connection.run(wrapped_command, hide=False, pty=True, warn=True, env={"PATH": "/tmp/host-user-venv/bin:$PATH"})
+
+    comms.close_comms()
+
+
+def setup_mpi_ssh(connection, include_scheduler=True):
+    add_scheduler_line = 'printf "\n127.0.0.1" >> workers' if include_scheduler else ""
+
+    setup_mpi = f"""
+/bin/coiled_agent list-worker-ips | sudo tee workers && sudo chown ubuntu workers
+ssh-keyscan -f workers -t ed25519 >> ~/.ssh/known_hosts
+{add_scheduler_line}
+
+# block until host setup script has finished, at least on schedule node
+until [ -f /tmp/host-setup-done ]
+do
+  sleep 5
+done
+"""
+
+    _ = connection.run(setup_mpi, hide=True, pty=False)
+
+
+def get_host_setup_script(venv_path="/tmp/host-user-venv", apt_install=None, pip_install=None, additional_setup=None):
+    apt_install = list(apt_install or [])
+    apt_install.extend(["openmpi-bin", "python3-pip", "python3-venv"])
+
+    pip_install = list(pip_install or [])
+
+    pip_install_line = f"{venv_path}/bin/python -m pip install {' '.join(pip_install)}" if pip_install else ""
+
+    return f"""
+sudo apt install {" ".join(apt_install)} -y
+
+mkdir {venv_path}
+python3 -m venv {venv_path}
+
+{pip_install_line}
+{additional_setup or ""}
+
+echo 'done' > /tmp/host-setup-done
+"""
+
+
+@click.group(name="mpi", context_settings=CONTEXT_SETTINGS)
+def mpi_group(): ...
+
+
+mpi_group.add_command(setup)
+mpi_group.add_command(run)
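As an illustration of what `coiled mpi setup` runs on each VM, a sketch that just renders the generated host setup script; the extra apt and pip package names below are example values, not taken from this diff:

from coiled.cli.mpi import get_host_setup_script

script = get_host_setup_script(
    apt_install=["libopenmpi-dev"],  # example extra apt package
    pip_install=["mpi4py"],          # example extra pip package
)
# The returned bash script apt-installs openmpi-bin/python3-pip/python3-venv plus the
# extras, creates a venv at /tmp/host-user-venv, runs any pip installs there, and
# finally writes /tmp/host-setup-done, which setup_mpi_ssh polls for before running MPI.
print(script)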
coiled/cli/notebook/notebook.py
CHANGED
@@ -237,6 +237,12 @@ def check_jupyter() -> bool:
     ),
     hidden=True,
 )
+@click.option(
+    "--private",
+    default=False,
+    is_flag=True,
+    help="Make this notebook private to you (other workspace members cannot access it).",
+)
 def start_notebook(
     name: str | None,
     account: str | None,

@@ -262,6 +268,7 @@ def start_notebook(
     mount_bucket: List[str] | None,
     host_setup_script: str | None,
     resumable: bool = False,
+    private: bool = False,
 ):
     """
     Launch or re-open a notebook session, with optional file syncing.

@@ -299,6 +306,7 @@ def start_notebook(
         mount_bucket=mount_bucket,
         host_setup_script=host_setup_script,
         resumable=resumable,
+        private=private,
     )
 
 

@@ -329,6 +337,7 @@ def _start_notebook(
     mount_bucket: List[str] | None = None,
     host_setup_script: str | None = None,
     resumable: bool = False,
+    private: bool = False,
 ) -> coiled.Cluster | None:
     """
     Launch or re-open a notebook session, with optional file syncing.

@@ -418,6 +427,7 @@ def _start_notebook(
         shutdown_on_close=False,
         unset_single_threading_variables=True,
         pause_on_exit=resumable,
+        private_to_creator=private,
     )
     info["cluster_id"] = cluster.cluster_id
 
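The new --private flag is simply forwarded to the notebook cluster as private_to_creator. A rough sketch of the equivalent direct call, assuming coiled.Cluster accepts private_to_creator as this diff implies (the name and other arguments are illustrative):

import coiled

cluster = coiled.Cluster(
    name="my-notebook",        # hypothetical name
    shutdown_on_close=False,
    private_to_creator=True,   # what the new --private flag sets
)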
coiled/cli/run.py
CHANGED
@@ -29,8 +29,11 @@ from coiled.utils import (
     unset_single_thread_defaults,
 )
 from coiled.v2.cluster import ClusterKwargs
+from coiled.v2.cluster_comms import use_comm_rpc
+from coiled.v2.core import Cloud
 from coiled.v2.widgets.rich import LightRichClusterWidget
 
+from ..filestore import FilestoreManager
 from .sync import SYNC_TARGET, start_sync, stop_sync
 from .utils import CONTEXT_SETTINGS
 

@@ -45,20 +48,37 @@ USER_CONTAINER_NAME = "tmp-user-1"
 
 
 class KeepaliveSession:
-    def __init__(self, cluster, prefix="", monitor_proc_activity=False):
+    def __init__(self, cluster, comms=None, prefix="", monitor_proc_activity=False):
         self.cluster = cluster
+        self.comms = comms
         self.monitor_proc_activity = monitor_proc_activity
         rand_uuid = short_random_string()
         self.session_id = f"{prefix}-{rand_uuid}" if prefix else rand_uuid
 
+        if self.comms:
+            self.cloud = Cloud.current(asynchronous=False)
+
     def __enter__(self):
         # keepalive session lets us use keepalive without dask client
-        self.cluster
-
-
+        if self.cluster:
+            self.cluster._call_scheduler_comm(
+                "coiled_add_keepalive_session", name=self.session_id, monitor_proc_activity=self.monitor_proc_activity
+            )
+        elif self.comms:
+            use_comm_rpc(
+                self.cloud,
+                self.comms,
+                "coiled_add_keepalive_session",
+                name=self.session_id,
+                monitor_proc_activity=self.monitor_proc_activity,
+            )
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.cluster
+        if self.cluster:
+            self.cluster._call_scheduler_comm("coiled_end_keepalive_session", name=self.session_id)
+        elif self.comms:
+            use_comm_rpc(self.cloud, self.comms, "coiled_end_keepalive_session", name=self.session_id)
+            self.cloud.close()
 
 
 def get_ssh_connection(cloud, cluster_id) -> fabric.connection.Connection:

@@ -105,7 +125,7 @@ def write_files_into_container(connection, container_name: str, files: Dict[str,
     )
 
 
-def upload_file(connection, f, specified_root=None) -> str:
+def upload_file(connection: fabric.Connection, f: str, specified_root=None, remote_root="/scratch") -> str:
     cwd = os.path.abspath(os.path.curdir)
     base = os.path.basename(f)
     is_under_cwd = os.path.commonpath((os.path.abspath(f), cwd)) == cwd

@@ -131,16 +151,16 @@ def upload_file(connection, f, specified_root=None) -> str:
         # For example, if user specified `--file /absolute/subdir/`, then preserve path structure relative
         # to `/absolute/subdir/`, so `/absolute/subdir/foo/bar.txt` would go to `/scratch/subdir/foo/bar.txt`.
         specified_path_dir = os.path.dirname(os.path.relpath(f, relative_to))
-        remote_dir = f"/
+        remote_dir = f"{remote_root}/{specified_path_dir}/"
         make_remote_dir(connection, remote_dir)
     else:
-        remote_dir = "/
+        remote_dir = f"{remote_root}/"
 
     connection.put(f, remote_dir)
 
     # we want path on Linux VM, which might not match os.path.join run client-side, so join path manually
     # remote_dir should already end in "/"
-    return f"{remote_dir}{base}"
+    return f"{remote_dir}{base}".replace("//", "/")
 
 
 def run_via_ssh(

@@ -238,7 +258,8 @@ def run_via_ssh(
 
     if container and "/uv:" in container and command_string.startswith("uv"):
         command_string = (
-            "(apt update && apt upgrade && apt install -y --no-install-recommends ca-certificates)
+            "(apt update -y && apt upgrade -y && apt install -y --no-install-recommends ca-certificates) "
+            "2>&1 > /dev/null\n"
             f"{command_string}"
         )
 

@@ -549,6 +570,13 @@ def get_entrypoint(connection, container_name) -> str:
     default=None,
     help="Non-default value for shm_size (for example, '3 GiB').",
 )
+@click.option(
+    "--filestore",
+    "filestore_names",
+    default=None,
+    multiple=True,
+    help="Name of filestore to attach; can be specified multiple times for multiple filestores.",
+)
 @click.argument("command", nargs=-1)
 def run(
     name: str | None,

@@ -579,6 +607,7 @@ def run(
     package_sync_strict,
     package_sync_conda_extras,
     docker_shm_size,
+    filestore_names,
     command,
 ):
     """

@@ -614,6 +643,7 @@ def run(
         package_sync_strict=package_sync_strict,
         package_sync_conda_extras=package_sync_conda_extras,
         docker_shm_size=docker_shm_size,
+        filestore_names=filestore_names,
     )
     sys.exit(info["exit_code"])
 

@@ -651,6 +681,7 @@ def start_run(
     package_sync_strict: bool = False,
     package_sync_conda_extras: List[str] | None = None,
     docker_shm_size: str | None = None,
+    filestore_names: list[str] | None = None,
 ):
     runtime_env_dict = dict_from_key_val_list(env)
     tags = dict_from_key_val_list(tag)

@@ -693,9 +724,20 @@ def start_run(
     # fail early if user specified `--file` that doesn't exist
     check_explicit_files(file)
 
+    filestores_to_attach = []
+
     try:
         with coiled.Cloud(workspace=workspace or account) as cloud:
             workspace = workspace or cloud.default_workspace
+
+            if filestore_names:
+                filestores = FilestoreManager.get_or_create_filestores(
+                    names=filestore_names,
+                    workspace=workspace,
+                    region=region,
+                )
+                filestores_to_attach.extend([{"id": fs["id"], "input": True, "output": True} for fs in filestores])
+
             with LightRichClusterWidget(
                 workspace=workspace,
                 title=f"Running [bold]{coiled.utils.join_command_parts(command)}[/bold]",

@@ -736,6 +778,7 @@ def start_run(
                 "package_sync_conda_extras": package_sync_conda_extras,
                 "backend_options": {"docker_shm_size": docker_shm_size} if docker_shm_size else None,
                 "unset_single_threading_variables": True,
+                "filestores_to_attach": filestores_to_attach,
             }
             cluster_kwargs["name"] = name or f"run-{short_random_string()}"
             cluster_kwargs["cloud"] = cloud
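To show how the reworked KeepaliveSession is meant to be used, a minimal sketch with a cluster object (the comms path used by `coiled mpi run` goes through use_comm_rpc instead); the cluster below is hypothetical:

import coiled

from coiled.cli.run import KeepaliveSession

cluster = coiled.Cluster(name="keepalive-demo", n_workers=1)  # hypothetical cluster
with KeepaliveSession(cluster=cluster, prefix="demo"):
    # While this block is open, the scheduler holds a keepalive session named
    # "demo-<random>", so the cluster is not treated as idle even without a Dask client.
    ...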