coiled 1.128.0__py3-none-any.whl → 1.128.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of coiled might be problematic. Click here for more details.
- coiled/cli/mpi.py +63 -15
- coiled/cli/run.py +24 -5
- coiled/v2/cluster_comms.py +72 -0
- {coiled-1.128.0.dist-info → coiled-1.128.1.dist-info}/METADATA +1 -1
- {coiled-1.128.0.dist-info → coiled-1.128.1.dist-info}/RECORD +8 -7
- {coiled-1.128.0.dist-info → coiled-1.128.1.dist-info}/WHEEL +0 -0
- {coiled-1.128.0.dist-info → coiled-1.128.1.dist-info}/entry_points.txt +0 -0
- {coiled-1.128.0.dist-info → coiled-1.128.1.dist-info}/licenses/LICENSE +0 -0
coiled/cli/mpi.py
CHANGED
|
@@ -6,8 +6,10 @@ import fabric.connection
|
|
|
6
6
|
|
|
7
7
|
import coiled
|
|
8
8
|
|
|
9
|
+
from .. import AWSOptions
|
|
10
|
+
from ..v2.cluster_comms import get_cluster_connection_info, get_comm_from_connection_info
|
|
9
11
|
from .cluster.utils import find_cluster
|
|
10
|
-
from .run import get_ssh_connection, write_via_ssh
|
|
12
|
+
from .run import KeepaliveSession, get_ssh_connection, write_via_ssh
|
|
11
13
|
from .utils import CONTEXT_SETTINGS
|
|
12
14
|
|
|
13
15
|
|
|
@@ -15,32 +17,73 @@ from .utils import CONTEXT_SETTINGS
|
|
|
15
17
|
context_settings=CONTEXT_SETTINGS,
|
|
16
18
|
)
|
|
17
19
|
@click.option("--worker-nodes", default=1, type=int)
|
|
20
|
+
@click.option(
|
|
21
|
+
"--include-head/--exclude-head",
|
|
22
|
+
default=True,
|
|
23
|
+
type=bool,
|
|
24
|
+
help="Include head (scheduler) node in placement group with EFA",
|
|
25
|
+
)
|
|
18
26
|
@click.option("--vm-type", default="g6.8xlarge", type=str)
|
|
19
|
-
@click.option("--
|
|
20
|
-
@click.option("--
|
|
21
|
-
|
|
22
|
-
|
|
27
|
+
@click.option("--head-vm-type", default=None, type=str)
|
|
28
|
+
@click.option("--worker-vm-type", default=None, type=str)
|
|
29
|
+
@click.option("--pip", multiple=True, type=str, help="Packages to install with pip")
|
|
30
|
+
@click.option("--apt", multiple=True, type=str, help="Packages to install with apt install")
|
|
31
|
+
@click.option("--setup-script", default=None, type=str, help="Path to additional host setup script")
|
|
32
|
+
@click.option("--cluster-timeout", default="1h", type=str, help="Maximum lifetime for cluster")
|
|
33
|
+
@click.option(
|
|
34
|
+
"--idle-timeout",
|
|
35
|
+
default="10m",
|
|
36
|
+
type=str,
|
|
37
|
+
help="How long to wait when nothing is running before shutting down cluster",
|
|
38
|
+
)
|
|
39
|
+
def setup(
|
|
40
|
+
worker_nodes,
|
|
41
|
+
include_head,
|
|
42
|
+
vm_type,
|
|
43
|
+
head_vm_type,
|
|
44
|
+
worker_vm_type,
|
|
45
|
+
pip,
|
|
46
|
+
apt,
|
|
47
|
+
setup_script,
|
|
48
|
+
cluster_timeout,
|
|
49
|
+
idle_timeout,
|
|
50
|
+
):
|
|
51
|
+
additional_host_setup = None
|
|
52
|
+
if setup_script:
|
|
53
|
+
with open(setup_script) as f:
|
|
54
|
+
additional_host_setup = f.read()
|
|
55
|
+
setup_script = get_host_setup_script(pip_install=pip, apt_install=apt, additional_setup=additional_host_setup)
|
|
56
|
+
|
|
57
|
+
efa_settings = (
|
|
58
|
+
{"use_placement_group": True, "use_efa": True}
|
|
59
|
+
if include_head
|
|
60
|
+
else {"use_worker_placement_group": True, "use_worker_efa": True}
|
|
61
|
+
)
|
|
23
62
|
|
|
24
63
|
cluster = coiled.Cluster(
|
|
25
64
|
n_workers=worker_nodes,
|
|
26
65
|
container="daskdev/dask:latest",
|
|
27
66
|
allow_ssh_from="me",
|
|
28
67
|
host_setup_script=setup_script,
|
|
29
|
-
backend_options={
|
|
30
|
-
scheduler_vm_types=[vm_type],
|
|
31
|
-
worker_vm_types=[vm_type],
|
|
68
|
+
backend_options=AWSOptions(**{**efa_settings, "ami_version": "DL"}),
|
|
69
|
+
scheduler_vm_types=[head_vm_type or vm_type],
|
|
70
|
+
worker_vm_types=[worker_vm_type or vm_type],
|
|
32
71
|
worker_disk_size="100GB",
|
|
33
72
|
scheduler_disk_size="100GB",
|
|
34
73
|
shutdown_on_close=False,
|
|
35
|
-
idle_timeout=
|
|
74
|
+
idle_timeout="520 weeks", # don't use idle timeout
|
|
75
|
+
cluster_timeout=cluster_timeout,
|
|
36
76
|
)
|
|
37
77
|
|
|
78
|
+
cluster.set_keepalive(keepalive=idle_timeout)
|
|
79
|
+
|
|
38
80
|
print("Cluster created, installing software for MPI...")
|
|
39
81
|
|
|
40
82
|
with coiled.Cloud() as cloud:
|
|
41
83
|
connection = get_ssh_connection(cloud, cluster.cluster_id)
|
|
42
84
|
|
|
43
|
-
|
|
85
|
+
with KeepaliveSession(cluster=cluster, prefix="mpi-ssh-setup"):
|
|
86
|
+
setup_mpi_ssh(connection)
|
|
44
87
|
|
|
45
88
|
print("MPI is ready")
|
|
46
89
|
|
|
@@ -88,6 +131,8 @@ legate \
|
|
|
88
131
|
cluster_info = find_cluster(cloud, cluster)
|
|
89
132
|
cluster_id = cluster_info["id"]
|
|
90
133
|
connection = get_ssh_connection(cloud, cluster_id)
|
|
134
|
+
address, security = get_cluster_connection_info(cluster_id, cloud)
|
|
135
|
+
comms = get_comm_from_connection_info(address, security)
|
|
91
136
|
|
|
92
137
|
setup_mpi_ssh(connection, include_scheduler=include_head)
|
|
93
138
|
|
|
@@ -116,8 +161,10 @@ legate \
|
|
|
116
161
|
|
|
117
162
|
print(f"Running command:\n{wrapped_command}")
|
|
118
163
|
|
|
119
|
-
|
|
120
|
-
|
|
164
|
+
with KeepaliveSession(cluster=None, comms=comms, prefix="mpi-ssh"):
|
|
165
|
+
connection.run(wrapped_command, hide=False, pty=True, warn=True, env={"PATH": "/tmp/host-user-venv/bin:$PATH"})
|
|
166
|
+
|
|
167
|
+
comms.close_comms()
|
|
121
168
|
|
|
122
169
|
|
|
123
170
|
def setup_mpi_ssh(connection, include_scheduler=True):
|
|
@@ -138,11 +185,11 @@ done
|
|
|
138
185
|
_ = connection.run(setup_mpi, hide=True, pty=False)
|
|
139
186
|
|
|
140
187
|
|
|
141
|
-
def get_host_setup_script(venv_path="/tmp/host-user-venv", apt_install=None, pip_install=None):
|
|
142
|
-
apt_install = apt_install or []
|
|
188
|
+
def get_host_setup_script(venv_path="/tmp/host-user-venv", apt_install=None, pip_install=None, additional_setup=None):
|
|
189
|
+
apt_install = list(apt_install or [])
|
|
143
190
|
apt_install.extend(["openmpi-bin", "python3-pip", "python3-venv"])
|
|
144
191
|
|
|
145
|
-
pip_install = pip_install or []
|
|
192
|
+
pip_install = list(pip_install or [])
|
|
146
193
|
|
|
147
194
|
pip_install_line = f"{venv_path}/bin/python -m pip install {' '.join(pip_install)}" if pip_install else ""
|
|
148
195
|
|
|
@@ -153,6 +200,7 @@ mkdir {venv_path}
|
|
|
153
200
|
python3 -m venv {venv_path}
|
|
154
201
|
|
|
155
202
|
{pip_install_line}
|
|
203
|
+
{additional_setup or ""}
|
|
156
204
|
|
|
157
205
|
echo 'done' > /tmp/host-setup-done
|
|
158
206
|
"""
|
coiled/cli/run.py
CHANGED
|
@@ -29,6 +29,8 @@ from coiled.utils import (
|
|
|
29
29
|
unset_single_thread_defaults,
|
|
30
30
|
)
|
|
31
31
|
from coiled.v2.cluster import ClusterKwargs
|
|
32
|
+
from coiled.v2.cluster_comms import use_comm_rpc
|
|
33
|
+
from coiled.v2.core import Cloud
|
|
32
34
|
from coiled.v2.widgets.rich import LightRichClusterWidget
|
|
33
35
|
|
|
34
36
|
from ..filestore import FilestoreManager
|
|
@@ -46,20 +48,37 @@ USER_CONTAINER_NAME = "tmp-user-1"
|
|
|
46
48
|
|
|
47
49
|
|
|
48
50
|
class KeepaliveSession:
|
|
49
|
-
def __init__(self, cluster, prefix="", monitor_proc_activity=False):
|
|
51
|
+
def __init__(self, cluster, comms=None, prefix="", monitor_proc_activity=False):
|
|
50
52
|
self.cluster = cluster
|
|
53
|
+
self.comms = comms
|
|
51
54
|
self.monitor_proc_activity = monitor_proc_activity
|
|
52
55
|
rand_uuid = short_random_string()
|
|
53
56
|
self.session_id = f"{prefix}-{rand_uuid}" if prefix else rand_uuid
|
|
54
57
|
|
|
58
|
+
if self.comms:
|
|
59
|
+
self.cloud = Cloud.current(asynchronous=False)
|
|
60
|
+
|
|
55
61
|
def __enter__(self):
|
|
56
62
|
# keepalive session lets us use keepalive without dask client
|
|
57
|
-
self.cluster
|
|
58
|
-
|
|
59
|
-
|
|
63
|
+
if self.cluster:
|
|
64
|
+
self.cluster._call_scheduler_comm(
|
|
65
|
+
"coiled_add_keepalive_session", name=self.session_id, monitor_proc_activity=self.monitor_proc_activity
|
|
66
|
+
)
|
|
67
|
+
elif self.comms:
|
|
68
|
+
use_comm_rpc(
|
|
69
|
+
self.cloud,
|
|
70
|
+
self.comms,
|
|
71
|
+
"coiled_add_keepalive_session",
|
|
72
|
+
name=self.session_id,
|
|
73
|
+
monitor_proc_activity=self.monitor_proc_activity,
|
|
74
|
+
)
|
|
60
75
|
|
|
61
76
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
62
|
-
self.cluster
|
|
77
|
+
if self.cluster:
|
|
78
|
+
self.cluster._call_scheduler_comm("coiled_end_keepalive_session", name=self.session_id)
|
|
79
|
+
elif self.comms:
|
|
80
|
+
use_comm_rpc(self.cloud, self.comms, "coiled_end_keepalive_session", name=self.session_id)
|
|
81
|
+
self.cloud.close()
|
|
63
82
|
|
|
64
83
|
|
|
65
84
|
def get_ssh_connection(cloud, cluster_id) -> fabric.connection.Connection:
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from coiled.v2.states import ProcessStateEnum
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_cluster_connection_info(
    cluster_id: int,
    cloud,
    *,
    use_scheduler_public_ip: bool = True,
) -> tuple[str, dict]:
    """
    Collect the scheduler address and TLS details for an already-running cluster.

    The cluster details are fetched through ``cloud``. The result is a pair
    ``(address_to_use, security_info)``; ``security_info`` is a dict with the
    TLS key and cert, the dashboard address, and the public, private and
    selected ``tls://`` scheduler addresses.

    The public address is selected only when the cluster was configured to
    give the scheduler a public IP *and* ``use_scheduler_public_ip`` is true;
    otherwise the private address is used.

    Raises ``RuntimeError`` if the scheduler is not in the ``started`` state,
    or if the public address is selected but no public IP is recorded.

    (This is a workaround: it would be nicer if coiled.Cluster could be told
    to retrieve an existing cluster rather than create one.)
    """
    details = cloud._get_cluster_details_synced(cluster_id=cluster_id)  # type: ignore
    scheduler = details["scheduler"]

    # Refuse to hand out connection info for a scheduler that is not up.
    scheduler_state = scheduler["current_state"]["state"]
    if ProcessStateEnum(scheduler_state) != ProcessStateEnum.started:
        raise RuntimeError(f"Cannot get security info for cluster {cluster_id}, scheduler state is {scheduler_state}")

    instance = scheduler["instance"]
    public_ip = instance["public_ip_address"]
    private_ip = instance["private_ip_address"]

    options = details["cluster_options"]
    tls_cert = options["tls_cert"]
    tls_key = options["tls_key"]

    scheduler_port = details["scheduler_port"]
    dashboard_address = scheduler["dashboard_address"]
    give_scheduler_public_ip = details["cluster_infra"]["give_scheduler_public_ip"]

    private_address = f"tls://{private_ip}:{scheduler_port}"
    public_address = f"tls://{public_ip}:{scheduler_port}"

    if give_scheduler_public_ip and use_scheduler_public_ip:
        if not public_ip:
            raise RuntimeError(
                "Your Coiled client is configured to use the public IP address, but the scheduler VM does not "
                "have a public IP address."
            )
        address_to_use = public_address
    else:
        address_to_use = private_address

    security_info = dict(
        tls_key=tls_key,
        tls_cert=tls_cert,
        dashboard_address=dashboard_address,
        public_address=public_address,
        private_address=private_address,
        address_to_use=address_to_use,
    )
    return address_to_use, security_info
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_comm_from_connection_info(address, security):
    """Build an RPC handle for ``address`` secured with the given TLS material.

    ``security`` must provide ``"tls_key"`` and ``"tls_cert"`` entries; they are
    wrapped in a ``GatewaySecurity`` whose ``"client"`` connection arguments are
    passed to ``distributed.rpc``.
    """
    # Imported lazily, as in the original, so importing this module stays cheap.
    from distributed import rpc

    from coiled.utils import GatewaySecurity

    tls_key = security["tls_key"]
    tls_cert = security["tls_cert"]
    client_args = GatewaySecurity(tls_key, tls_cert).get_connection_args("client")
    return rpc(address, connection_args=client_args)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def use_comm_rpc(cloud, comm, function, **kwargs):
    """Invoke the RPC handler named ``function`` on ``comm`` and wait for it.

    The handler is looked up with ``getattr(comm, function)``, called with
    ``**kwargs`` and awaited inside a coroutine that is handed to
    ``cloud._sync``.

    Returns whatever ``cloud._sync`` returns. That is presumably the handler's
    response; confirm against ``Cloud._sync``. Earlier versions discarded the
    response and always returned ``None``, so existing callers that ignore the
    result are unaffected.
    """

    async def _invoke():
        # Hand the handler's reply back instead of dropping it.
        return await getattr(comm, function)(**kwargs)

    return cloud._sync(_invoke)
|
|
@@ -33,11 +33,11 @@ coiled/cli/diagnostics.py,sha256=1jIeue7xLOaf7LQFsNc6NmO5yU1jqmPFpKZSKjGN4rs,394
|
|
|
33
33
|
coiled/cli/env.py,sha256=NHh7ZSq9yfongkpFqzon1eLhnH1FwToVvkKFIhqXRBE,6932
|
|
34
34
|
coiled/cli/file.py,sha256=fJmOG3YhxpxXokGYu90wpjdwkJpp1XVqPJ_iveb5ShA,3623
|
|
35
35
|
coiled/cli/login.py,sha256=cByVXmMsfGEuY2TkYU_Y8zq1zVTUHAxOe_wpw2uHsBs,2242
|
|
36
|
-
coiled/cli/mpi.py,sha256=
|
|
36
|
+
coiled/cli/mpi.py,sha256=_i5GPTu4kE1kc-DQ1JLixIEpVVNJ0suwyLUgjp-iUww,6793
|
|
37
37
|
coiled/cli/package_sync.py,sha256=lABDY20yjfLYGfPlQu8ugI-Q8doY4JtN8_0nb9PkcT4,4101
|
|
38
38
|
coiled/cli/prefect.py,sha256=T-SSFey4jlA_jpEI0DqAhVIPwlt2GvBFogEqYCwwevI,302
|
|
39
39
|
coiled/cli/prefect_serve.py,sha256=gemq6YOVbnBoq4k3tSaU2gFJR3aMSxXLNxH6jB8V3n8,4378
|
|
40
|
-
coiled/cli/run.py,sha256=
|
|
40
|
+
coiled/cli/run.py,sha256=CtUmIarGMBC4n2xpbplStTrGGgGZiP5mTTKXOoEN_yo,31600
|
|
41
41
|
coiled/cli/sync.py,sha256=S5PzB9GSPJn3HvviOMLKVbo4ET46FlPwLYK_7sRyonQ,9726
|
|
42
42
|
coiled/cli/utils.py,sha256=cp7ToFGRpUKi6iNL6BbLjzgrgeTYSX_C55lYhaKWHHA,3479
|
|
43
43
|
coiled/cli/batch/__init__.py,sha256=539CnfnqqcW7ndSufTS-Ie5FGZiElMYxE0Ptu70wo8M,660
|
|
@@ -89,6 +89,7 @@ coiled/extensions/prefect/runners.py,sha256=AcaGS1637TnqFPKnjmmLHpdzjwAsxBLDKrOF
|
|
|
89
89
|
coiled/extensions/prefect/workers.py,sha256=Z2VuAhTm5AjWEKyCniMZrTxqtkn3uJp3sO3bFeR2Rr0,1642
|
|
90
90
|
coiled/v2/__init__.py,sha256=KaCULaAqatcsYbTbj_SQtTLocbSKZa-uQXiyCICKFRM,805
|
|
91
91
|
coiled/v2/cluster.py,sha256=hGs5_SVdrh2zKWpd_8RNPC7LbNd1F6ggEgAj9fCEDm8,148134
|
|
92
|
+
coiled/v2/cluster_comms.py,sha256=UcJWLeZlc68S0uaNd9lLKbF5uaDhYqqkdTsA0CBXYRI,2643
|
|
92
93
|
coiled/v2/core.py,sha256=Bf5A_rzK3tuUqqMVAgN5vix-tX_F8AEWR2pICnG3YcA,71615
|
|
93
94
|
coiled/v2/cwi_log_link.py,sha256=d4k6wRYhcdDVdhWYZIX6WL1g0lscXY0yq__H1sPUNWk,1883
|
|
94
95
|
coiled/v2/states.py,sha256=VduyWuf6rByG_wg5AXTxZpe88cCTSdIa4HrPjk1jBcA,9031
|
|
@@ -96,8 +97,8 @@ coiled/v2/widgets/__init__.py,sha256=Bt3GHTTyri-kFUaqGRVydDM-sCg5NdNujDg2RyvgV8U
|
|
|
96
97
|
coiled/v2/widgets/interface.py,sha256=YeMQ5qdRbbpM04x9qIg2LE1xwxyRxFbdDYnkrwHazPk,301
|
|
97
98
|
coiled/v2/widgets/rich.py,sha256=3rU5-yso92NdeEh3uSvEE-GwPNyp6i0Nb5PE5czXCik,28974
|
|
98
99
|
coiled/v2/widgets/util.py,sha256=Y8qpGqwNzqfCzgyRFRy7vcscBoXqop-Upi4HLPpXLgg,3120
|
|
99
|
-
coiled-1.128.
|
|
100
|
-
coiled-1.128.
|
|
101
|
-
coiled-1.128.
|
|
102
|
-
coiled-1.128.
|
|
103
|
-
coiled-1.128.
|
|
100
|
+
coiled-1.128.1.dist-info/METADATA,sha256=ju4vAo9MSxF8AdwdzeX8wZX2IdsCIrARVEafamEoUbo,2176
|
|
101
|
+
coiled-1.128.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
102
|
+
coiled-1.128.1.dist-info/entry_points.txt,sha256=C8dz1ST_bTlTO-kNvuHBJQma9PyJPotg0S4xpPt5aHY,47
|
|
103
|
+
coiled-1.128.1.dist-info/licenses/LICENSE,sha256=ZPwVR73Biwm3sK6vR54djCrhaRiM4cAD2zvOQZV8Xis,3859
|
|
104
|
+
coiled-1.128.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|