coiled 1.128.0__py3-none-any.whl → 1.128.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of coiled might be problematic. Click here for more details.

coiled/cli/mpi.py CHANGED
@@ -6,8 +6,10 @@ import fabric.connection
6
6
 
7
7
  import coiled
8
8
 
9
+ from .. import AWSOptions
10
+ from ..v2.cluster_comms import get_cluster_connection_info, get_comm_from_connection_info
9
11
  from .cluster.utils import find_cluster
10
- from .run import get_ssh_connection, write_via_ssh
12
+ from .run import KeepaliveSession, get_ssh_connection, write_via_ssh
11
13
  from .utils import CONTEXT_SETTINGS
12
14
 
13
15
 
@@ -15,32 +17,73 @@ from .utils import CONTEXT_SETTINGS
15
17
  context_settings=CONTEXT_SETTINGS,
16
18
  )
17
19
  @click.option("--worker-nodes", default=1, type=int)
20
+ @click.option(
21
+ "--include-head/--exclude-head",
22
+ default=True,
23
+ type=bool,
24
+ help="Include head (scheduler) node in placement group with EFA",
25
+ )
18
26
  @click.option("--vm-type", default="g6.8xlarge", type=str)
19
- @click.option("--pip", multiple=True, type=str)
20
- @click.option("--idle-timeout", default=None, type=str)
21
- def setup(worker_nodes, vm_type, pip, idle_timeout):
22
- setup_script = get_host_setup_script(pip_install=pip)
27
+ @click.option("--head-vm-type", default=None, type=str)
28
+ @click.option("--worker-vm-type", default=None, type=str)
29
+ @click.option("--pip", multiple=True, type=str, help="Packages to install with pip")
30
+ @click.option("--apt", multiple=True, type=str, help="Packages to install with apt install")
31
+ @click.option("--setup-script", default=None, type=str, help="Path to additional host setup script")
32
+ @click.option("--cluster-timeout", default="1h", type=str, help="Maximum lifetime for cluster")
33
+ @click.option(
34
+ "--idle-timeout",
35
+ default="10m",
36
+ type=str,
37
+ help="How long to wait when nothing is running before shutting down cluster",
38
+ )
39
+ def setup(
40
+ worker_nodes,
41
+ include_head,
42
+ vm_type,
43
+ head_vm_type,
44
+ worker_vm_type,
45
+ pip,
46
+ apt,
47
+ setup_script,
48
+ cluster_timeout,
49
+ idle_timeout,
50
+ ):
51
+ additional_host_setup = None
52
+ if setup_script:
53
+ with open(setup_script) as f:
54
+ additional_host_setup = f.read()
55
+ setup_script = get_host_setup_script(pip_install=pip, apt_install=apt, additional_setup=additional_host_setup)
56
+
57
+ efa_settings = (
58
+ {"use_placement_group": True, "use_efa": True}
59
+ if include_head
60
+ else {"use_worker_placement_group": True, "use_worker_efa": True}
61
+ )
23
62
 
24
63
  cluster = coiled.Cluster(
25
64
  n_workers=worker_nodes,
26
65
  container="daskdev/dask:latest",
27
66
  allow_ssh_from="me",
28
67
  host_setup_script=setup_script,
29
- backend_options={"use_placement_group": True, "use_efa": True, "ami_version": "DL"},
30
- scheduler_vm_types=[vm_type],
31
- worker_vm_types=[vm_type],
68
+ backend_options=AWSOptions(**{**efa_settings, "ami_version": "DL"}),
69
+ scheduler_vm_types=[head_vm_type or vm_type],
70
+ worker_vm_types=[worker_vm_type or vm_type],
32
71
  worker_disk_size="100GB",
33
72
  scheduler_disk_size="100GB",
34
73
  shutdown_on_close=False,
35
- idle_timeout=idle_timeout,
74
+ idle_timeout="520 weeks", # don't use idle timeout
75
+ cluster_timeout=cluster_timeout,
36
76
  )
37
77
 
78
+ cluster.set_keepalive(keepalive=idle_timeout)
79
+
38
80
  print("Cluster created, installing software for MPI...")
39
81
 
40
82
  with coiled.Cloud() as cloud:
41
83
  connection = get_ssh_connection(cloud, cluster.cluster_id)
42
84
 
43
- setup_mpi_ssh(connection)
85
+ with KeepaliveSession(cluster=cluster, prefix="mpi-ssh-setup"):
86
+ setup_mpi_ssh(connection)
44
87
 
45
88
  print("MPI is ready")
46
89
 
@@ -88,6 +131,8 @@ legate \
88
131
  cluster_info = find_cluster(cloud, cluster)
89
132
  cluster_id = cluster_info["id"]
90
133
  connection = get_ssh_connection(cloud, cluster_id)
134
+ address, security = get_cluster_connection_info(cluster_id, cloud)
135
+ comms = get_comm_from_connection_info(address, security)
91
136
 
92
137
  setup_mpi_ssh(connection, include_scheduler=include_head)
93
138
 
@@ -116,8 +161,10 @@ legate \
116
161
 
117
162
  print(f"Running command:\n{wrapped_command}")
118
163
 
119
- # TODO keepalive session so this will interact correctly with idle timeout / keepalive
120
- connection.run(wrapped_command, hide=False, pty=True, warn=True, env={"PATH": "/tmp/host-user-venv/bin:$PATH"})
164
+ with KeepaliveSession(cluster=None, comms=comms, prefix="mpi-ssh"):
165
+ connection.run(wrapped_command, hide=False, pty=True, warn=True, env={"PATH": "/tmp/host-user-venv/bin:$PATH"})
166
+
167
+ comms.close_comms()
121
168
 
122
169
 
123
170
  def setup_mpi_ssh(connection, include_scheduler=True):
@@ -138,11 +185,11 @@ done
138
185
  _ = connection.run(setup_mpi, hide=True, pty=False)
139
186
 
140
187
 
141
- def get_host_setup_script(venv_path="/tmp/host-user-venv", apt_install=None, pip_install=None):
142
- apt_install = apt_install or []
188
+ def get_host_setup_script(venv_path="/tmp/host-user-venv", apt_install=None, pip_install=None, additional_setup=None):
189
+ apt_install = list(apt_install or [])
143
190
  apt_install.extend(["openmpi-bin", "python3-pip", "python3-venv"])
144
191
 
145
- pip_install = pip_install or []
192
+ pip_install = list(pip_install or [])
146
193
 
147
194
  pip_install_line = f"{venv_path}/bin/python -m pip install {' '.join(pip_install)}" if pip_install else ""
148
195
 
@@ -153,6 +200,7 @@ mkdir {venv_path}
153
200
  python3 -m venv {venv_path}
154
201
 
155
202
  {pip_install_line}
203
+ {additional_setup or ""}
156
204
 
157
205
  echo 'done' > /tmp/host-setup-done
158
206
  """
coiled/cli/run.py CHANGED
@@ -29,6 +29,8 @@ from coiled.utils import (
29
29
  unset_single_thread_defaults,
30
30
  )
31
31
  from coiled.v2.cluster import ClusterKwargs
32
+ from coiled.v2.cluster_comms import use_comm_rpc
33
+ from coiled.v2.core import Cloud
32
34
  from coiled.v2.widgets.rich import LightRichClusterWidget
33
35
 
34
36
  from ..filestore import FilestoreManager
@@ -46,20 +48,37 @@ USER_CONTAINER_NAME = "tmp-user-1"
46
48
 
47
49
 
48
50
  class KeepaliveSession:
49
- def __init__(self, cluster, prefix="", monitor_proc_activity=False):
51
+ def __init__(self, cluster, comms=None, prefix="", monitor_proc_activity=False):
50
52
  self.cluster = cluster
53
+ self.comms = comms
51
54
  self.monitor_proc_activity = monitor_proc_activity
52
55
  rand_uuid = short_random_string()
53
56
  self.session_id = f"{prefix}-{rand_uuid}" if prefix else rand_uuid
54
57
 
58
+ if self.comms:
59
+ self.cloud = Cloud.current(asynchronous=False)
60
+
55
61
  def __enter__(self):
56
62
  # keepalive session lets us use keepalive without dask client
57
- self.cluster._call_scheduler_comm(
58
- "coiled_add_keepalive_session", name=self.session_id, monitor_proc_activity=self.monitor_proc_activity
59
- )
63
+ if self.cluster:
64
+ self.cluster._call_scheduler_comm(
65
+ "coiled_add_keepalive_session", name=self.session_id, monitor_proc_activity=self.monitor_proc_activity
66
+ )
67
+ elif self.comms:
68
+ use_comm_rpc(
69
+ self.cloud,
70
+ self.comms,
71
+ "coiled_add_keepalive_session",
72
+ name=self.session_id,
73
+ monitor_proc_activity=self.monitor_proc_activity,
74
+ )
60
75
 
61
76
  def __exit__(self, exc_type, exc_val, exc_tb):
62
- self.cluster._call_scheduler_comm("coiled_end_keepalive_session", name=self.session_id)
77
+ if self.cluster:
78
+ self.cluster._call_scheduler_comm("coiled_end_keepalive_session", name=self.session_id)
79
+ elif self.comms:
80
+ use_comm_rpc(self.cloud, self.comms, "coiled_end_keepalive_session", name=self.session_id)
81
+ self.cloud.close()
63
82
 
64
83
 
65
84
  def get_ssh_connection(cloud, cluster_id) -> fabric.connection.Connection:
coiled/v2/cluster_comms.py ADDED
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ from coiled.v2.states import ProcessStateEnum
4
+
5
+
6
+ def get_cluster_connection_info(
7
+ cluster_id: int,
8
+ cloud,
9
+ *,
10
+ use_scheduler_public_ip: bool = True,
11
+ ) -> tuple[str, dict]:
12
+ """
13
+ Get the comms info we need to connect to Dask in a running cluster.
14
+
15
+ (This is a bit of a hack. It would be nicer to have a way to tell coiled.Cluster not to
16
+ create, just retrieve. But Cluster is a bit hard to deal with... )
17
+ """
18
+
19
+ cluster_info = cloud._get_cluster_details_synced(cluster_id=cluster_id) # type: ignore
20
+
21
+ if ProcessStateEnum(cluster_info["scheduler"]["current_state"]["state"]) != ProcessStateEnum.started:
22
+ scheduler_state = cluster_info["scheduler"]["current_state"]["state"]
23
+ raise RuntimeError(f"Cannot get security info for cluster {cluster_id}, scheduler state is {scheduler_state}")
24
+
25
+ public_ip = cluster_info["scheduler"]["instance"]["public_ip_address"]
26
+ private_ip = cluster_info["scheduler"]["instance"]["private_ip_address"]
27
+ tls_cert = cluster_info["cluster_options"]["tls_cert"]
28
+ tls_key = cluster_info["cluster_options"]["tls_key"]
29
+ scheduler_port = cluster_info["scheduler_port"]
30
+ dashboard_address = cluster_info["scheduler"]["dashboard_address"]
31
+ give_scheduler_public_ip = cluster_info["cluster_infra"]["give_scheduler_public_ip"]
32
+
33
+ private_address = f"tls://{private_ip}:{scheduler_port}"
34
+ public_address = f"tls://{public_ip}:{scheduler_port}"
35
+
36
+ use_public_address = give_scheduler_public_ip and use_scheduler_public_ip
37
+ if use_public_address:
38
+ if not public_ip:
39
+ raise RuntimeError(
40
+ "Your Coiled client is configured to use the public IP address, but the scheduler VM does not "
41
+ "have a public IP address."
42
+ )
43
+ address_to_use = public_address
44
+ else:
45
+ address_to_use = private_address
46
+
47
+ security_info = {
48
+ "tls_key": tls_key,
49
+ "tls_cert": tls_cert,
50
+ "dashboard_address": dashboard_address,
51
+ "public_address": public_address,
52
+ "private_address": private_address,
53
+ "address_to_use": address_to_use,
54
+ }
55
+
56
+ return address_to_use, security_info
57
+
58
+
59
+ def get_comm_from_connection_info(address, security):
60
+ from distributed import rpc
61
+
62
+ from coiled.utils import GatewaySecurity
63
+
64
+ security_obj = GatewaySecurity(security["tls_key"], security["tls_cert"])
65
+ return rpc(address, connection_args=security_obj.get_connection_args("client"))
66
+
67
+
68
+ def use_comm_rpc(cloud, comm, function, **kwargs):
69
+ async def foo():
70
+ await getattr(comm, function)(**kwargs)
71
+
72
+ cloud._sync(foo)
{coiled-1.128.0.dist-info → coiled-1.128.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coiled
3
- Version: 1.128.0
3
+ Version: 1.128.1
4
4
  Summary: Python client for coiled.io dask clusters
5
5
  Project-URL: Homepage, https://coiled.io
6
6
  Maintainer-email: Coiled <info@coiled.io>
{coiled-1.128.0.dist-info → coiled-1.128.1.dist-info}/RECORD CHANGED
@@ -33,11 +33,11 @@ coiled/cli/diagnostics.py,sha256=1jIeue7xLOaf7LQFsNc6NmO5yU1jqmPFpKZSKjGN4rs,394
33
33
  coiled/cli/env.py,sha256=NHh7ZSq9yfongkpFqzon1eLhnH1FwToVvkKFIhqXRBE,6932
34
34
  coiled/cli/file.py,sha256=fJmOG3YhxpxXokGYu90wpjdwkJpp1XVqPJ_iveb5ShA,3623
35
35
  coiled/cli/login.py,sha256=cByVXmMsfGEuY2TkYU_Y8zq1zVTUHAxOe_wpw2uHsBs,2242
36
- coiled/cli/mpi.py,sha256=37yPngCYcAbYn35PNVO6-SA3YY6Ik8KVe5blmF3yK4g,5048
36
+ coiled/cli/mpi.py,sha256=_i5GPTu4kE1kc-DQ1JLixIEpVVNJ0suwyLUgjp-iUww,6793
37
37
  coiled/cli/package_sync.py,sha256=lABDY20yjfLYGfPlQu8ugI-Q8doY4JtN8_0nb9PkcT4,4101
38
38
  coiled/cli/prefect.py,sha256=T-SSFey4jlA_jpEI0DqAhVIPwlt2GvBFogEqYCwwevI,302
39
39
  coiled/cli/prefect_serve.py,sha256=gemq6YOVbnBoq4k3tSaU2gFJR3aMSxXLNxH6jB8V3n8,4378
40
- coiled/cli/run.py,sha256=IHy-7a9RCUkNQauugqZYij_B-qMiMEk1gCJSf_EHBdc,30898
40
+ coiled/cli/run.py,sha256=CtUmIarGMBC4n2xpbplStTrGGgGZiP5mTTKXOoEN_yo,31600
41
41
  coiled/cli/sync.py,sha256=S5PzB9GSPJn3HvviOMLKVbo4ET46FlPwLYK_7sRyonQ,9726
42
42
  coiled/cli/utils.py,sha256=cp7ToFGRpUKi6iNL6BbLjzgrgeTYSX_C55lYhaKWHHA,3479
43
43
  coiled/cli/batch/__init__.py,sha256=539CnfnqqcW7ndSufTS-Ie5FGZiElMYxE0Ptu70wo8M,660
@@ -89,6 +89,7 @@ coiled/extensions/prefect/runners.py,sha256=AcaGS1637TnqFPKnjmmLHpdzjwAsxBLDKrOF
89
89
  coiled/extensions/prefect/workers.py,sha256=Z2VuAhTm5AjWEKyCniMZrTxqtkn3uJp3sO3bFeR2Rr0,1642
90
90
  coiled/v2/__init__.py,sha256=KaCULaAqatcsYbTbj_SQtTLocbSKZa-uQXiyCICKFRM,805
91
91
  coiled/v2/cluster.py,sha256=hGs5_SVdrh2zKWpd_8RNPC7LbNd1F6ggEgAj9fCEDm8,148134
92
+ coiled/v2/cluster_comms.py,sha256=UcJWLeZlc68S0uaNd9lLKbF5uaDhYqqkdTsA0CBXYRI,2643
92
93
  coiled/v2/core.py,sha256=Bf5A_rzK3tuUqqMVAgN5vix-tX_F8AEWR2pICnG3YcA,71615
93
94
  coiled/v2/cwi_log_link.py,sha256=d4k6wRYhcdDVdhWYZIX6WL1g0lscXY0yq__H1sPUNWk,1883
94
95
  coiled/v2/states.py,sha256=VduyWuf6rByG_wg5AXTxZpe88cCTSdIa4HrPjk1jBcA,9031
@@ -96,8 +97,8 @@ coiled/v2/widgets/__init__.py,sha256=Bt3GHTTyri-kFUaqGRVydDM-sCg5NdNujDg2RyvgV8U
96
97
  coiled/v2/widgets/interface.py,sha256=YeMQ5qdRbbpM04x9qIg2LE1xwxyRxFbdDYnkrwHazPk,301
97
98
  coiled/v2/widgets/rich.py,sha256=3rU5-yso92NdeEh3uSvEE-GwPNyp6i0Nb5PE5czXCik,28974
98
99
  coiled/v2/widgets/util.py,sha256=Y8qpGqwNzqfCzgyRFRy7vcscBoXqop-Upi4HLPpXLgg,3120
99
- coiled-1.128.0.dist-info/METADATA,sha256=3Vk38O2A51tSoz3OfVKlWQArARO9aRTTLn_rvGNBBS8,2176
100
- coiled-1.128.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
101
- coiled-1.128.0.dist-info/entry_points.txt,sha256=C8dz1ST_bTlTO-kNvuHBJQma9PyJPotg0S4xpPt5aHY,47
102
- coiled-1.128.0.dist-info/licenses/LICENSE,sha256=ZPwVR73Biwm3sK6vR54djCrhaRiM4cAD2zvOQZV8Xis,3859
103
- coiled-1.128.0.dist-info/RECORD,,
100
+ coiled-1.128.1.dist-info/METADATA,sha256=ju4vAo9MSxF8AdwdzeX8wZX2IdsCIrARVEafamEoUbo,2176
101
+ coiled-1.128.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
102
+ coiled-1.128.1.dist-info/entry_points.txt,sha256=C8dz1ST_bTlTO-kNvuHBJQma9PyJPotg0S4xpPt5aHY,47
103
+ coiled-1.128.1.dist-info/licenses/LICENSE,sha256=ZPwVR73Biwm3sK6vR54djCrhaRiM4cAD2zvOQZV8Xis,3859
104
+ coiled-1.128.1.dist-info/RECORD,,