coiled 1.127.1.dev19__tar.gz → 1.127.1.dev21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of coiled might be problematic. Click here for more details.

Files changed (103)
  1. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/PKG-INFO +1 -1
  2. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/batch.py +2 -0
  3. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/batch/run.py +1 -0
  4. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/core.py +2 -0
  5. coiled-1.127.1.dev21/coiled/cli/mpi.py +166 -0
  6. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/types.py +4 -0
  7. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/cluster.py +2 -2
  8. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/.gitignore +0 -0
  9. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/LICENSE +0 -0
  10. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/README.md +0 -0
  11. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/__init__.py +0 -0
  12. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/__main__.py +0 -0
  13. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/analytics.py +0 -0
  14. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/auth.py +0 -0
  15. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/capture_environment.py +0 -0
  16. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/__init__.py +0 -0
  17. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/batch/__init__.py +0 -0
  18. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/batch/list.py +0 -0
  19. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/batch/logs.py +0 -0
  20. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/batch/status.py +0 -0
  21. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/batch/util.py +0 -0
  22. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/batch/wait.py +0 -0
  23. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/__init__.py +0 -0
  24. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/azure_logs.py +0 -0
  25. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/better_logs.py +0 -0
  26. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/crud.py +0 -0
  27. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/get_address.py +0 -0
  28. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/list.py +0 -0
  29. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/logs.py +0 -0
  30. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/metrics.py +0 -0
  31. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/ssh.py +0 -0
  32. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/cluster/utils.py +0 -0
  33. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/config.py +0 -0
  34. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/curl.py +0 -0
  35. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/diagnostics.py +0 -0
  36. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/env.py +0 -0
  37. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/file.py +0 -0
  38. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/__init__.py +0 -0
  39. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/examples/__init__.py +0 -0
  40. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/examples/exit.py +0 -0
  41. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/examples/hello_world.py +0 -0
  42. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/examples/nyc_parquet.py +0 -0
  43. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/examples/pytorch.py +0 -0
  44. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/examples/xarray_nwm.py +0 -0
  45. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/hello.py +0 -0
  46. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/scripts/fill_ipython.py +0 -0
  47. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/scripts/nyc_parquet.py +0 -0
  48. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/scripts/pytorch.py +0 -0
  49. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/scripts/xarray_nwm.py +0 -0
  50. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/hello/utils.py +0 -0
  51. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/login.py +0 -0
  52. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/notebook/__init__.py +0 -0
  53. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/notebook/notebook.py +0 -0
  54. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/package_sync.py +0 -0
  55. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/prefect.py +0 -0
  56. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/prefect_serve.py +0 -0
  57. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/run.py +0 -0
  58. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/setup/__init__.py +0 -0
  59. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/setup/amp.py +0 -0
  60. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/setup/aws.py +0 -0
  61. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/setup/azure.py +0 -0
  62. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/setup/entry.py +0 -0
  63. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/setup/gcp.py +0 -0
  64. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/setup/prometheus.py +0 -0
  65. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/setup/util.py +0 -0
  66. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/sync.py +0 -0
  67. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cli/utils.py +0 -0
  68. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/cluster.py +0 -0
  69. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/coiled.yaml +0 -0
  70. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/compatibility.py +0 -0
  71. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/config.py +0 -0
  72. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/context.py +0 -0
  73. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/core.py +0 -0
  74. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/credentials/__init__.py +0 -0
  75. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/credentials/aws.py +0 -0
  76. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/credentials/google.py +0 -0
  77. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/errors.py +0 -0
  78. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/exceptions.py +0 -0
  79. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/extensions/__init__.py +0 -0
  80. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/extensions/prefect/__init__.py +0 -0
  81. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/extensions/prefect/runners.py +0 -0
  82. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/extensions/prefect/workers.py +0 -0
  83. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/filestore.py +0 -0
  84. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/function.py +0 -0
  85. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/plugins.py +0 -0
  86. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/prefect.py +0 -0
  87. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/pypi_conda_map.py +0 -0
  88. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/scan.py +0 -0
  89. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/software.py +0 -0
  90. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/software_utils.py +0 -0
  91. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/spans.py +0 -0
  92. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/spark.py +0 -0
  93. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/utils.py +0 -0
  94. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/__init__.py +0 -0
  95. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/core.py +0 -0
  96. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/cwi_log_link.py +0 -0
  97. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/states.py +0 -0
  98. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/widgets/__init__.py +0 -0
  99. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/widgets/interface.py +0 -0
  100. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/widgets/rich.py +0 -0
  101. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/v2/widgets/util.py +0 -0
  102. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/coiled/websockets.py +0 -0
  103. {coiled-1.127.1.dev19 → coiled-1.127.1.dev21}/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coiled
3
- Version: 1.127.1.dev19
3
+ Version: 1.127.1.dev21
4
4
  Summary: Python client for coiled.io dask clusters
5
5
  Project-URL: Homepage, https://coiled.io
6
6
  Maintainer-email: Coiled <info@coiled.io>
@@ -18,6 +18,7 @@ def run(
18
18
  software: str | None = None,
19
19
  container: str | None = None,
20
20
  run_on_host: bool | None = None,
21
+ cluster_kwargs: dict | None = None,
21
22
  env: list | dict | None = None,
22
23
  secret_env: list | dict | None = None,
23
24
  tag: list | dict | None = None,
@@ -86,6 +87,7 @@ def run(
86
87
  software=software,
87
88
  container=container,
88
89
  run_on_host=run_on_host,
90
+ cluster_kwargs=cluster_kwargs,
89
91
  env=env,
90
92
  secret_env=secret_env,
91
93
  tag=tag,
@@ -804,6 +804,7 @@ def _batch_run(default_kwargs, logger=None, from_cli=False, **kwargs) -> dict:
804
804
  "package_sync_ignore": kwargs.get("package_sync_ignore"),
805
805
  "allow_cross_zone": True if kwargs["allow_cross_zone"] is None else kwargs["allow_cross_zone"],
806
806
  "scheduler_sidecars": scheduler_sidecars,
807
+ **(kwargs.get("cluster_kwargs") or {}),
807
808
  }
808
809
 
809
810
  # when task will run on scheduler, give it the same VM specs as worker node
@@ -10,6 +10,7 @@ from .env import env
10
10
  from .file import file_group
11
11
  from .hello import hello
12
12
  from .login import login
13
+ from .mpi import mpi_group
13
14
  from .notebook import notebook_group
14
15
  from .package_sync import package_sync
15
16
  from .prefect import prefect
@@ -42,3 +43,4 @@ cli.add_command(better_logs, "logs")
42
43
  cli.add_command(hello)
43
44
  cli.add_command(hello, "quickstart")
44
45
  cli.add_command(file_group)
46
+ cli.add_command(mpi_group, "mpi")
@@ -0,0 +1,166 @@
1
+ import os.path
2
+ import shlex
3
+
4
+ import click
5
+ import fabric.connection
6
+
7
+ import coiled
8
+
9
+ from .cluster.utils import find_cluster
10
+ from .run import get_ssh_connection, write_via_ssh
11
+ from .utils import CONTEXT_SETTINGS
12
+
13
+
14
+ @click.command(
15
+ context_settings=CONTEXT_SETTINGS,
16
+ )
17
+ @click.option("--worker-nodes", default=1, type=int)
18
+ @click.option("--vm-type", default="g6.8xlarge", type=str)
19
+ @click.option("--pip", multiple=True, type=str)
20
+ @click.option("--idle-timeout", default=None, type=str)
21
+ def setup(worker_nodes, vm_type, pip, idle_timeout):
22
+ setup_script = get_host_setup_script(pip_install=pip)
23
+
24
+ cluster = coiled.Cluster(
25
+ n_workers=worker_nodes,
26
+ container="daskdev/dask:latest",
27
+ allow_ssh_from="me",
28
+ host_setup_script=setup_script,
29
+ backend_options={"use_placement_group": True, "use_efa": True, "ami_version": "DL"},
30
+ scheduler_vm_types=[vm_type],
31
+ worker_vm_types=[vm_type],
32
+ worker_disk_size="100GB",
33
+ scheduler_disk_size="100GB",
34
+ shutdown_on_close=False,
35
+ idle_timeout=idle_timeout,
36
+ )
37
+
38
+ print("Cluster created, installing software for MPI...")
39
+
40
+ with coiled.Cloud() as cloud:
41
+ connection = get_ssh_connection(cloud, cluster.cluster_id)
42
+
43
+ setup_mpi_ssh(connection)
44
+
45
+ print("MPI is ready")
46
+
47
+
48
+ @click.command(
49
+ context_settings=CONTEXT_SETTINGS,
50
+ )
51
+ @click.option("--cluster", default=None)
52
+ @click.option("--workspace", default=None, type=str)
53
+ @click.option("--legate", is_flag=True, default=False, type=bool)
54
+ @click.option(
55
+ "--include-head/--exclude-head",
56
+ default=True,
57
+ type=bool,
58
+ )
59
+ @click.argument("command", nargs=-1, required=True)
60
+ def run(cluster, workspace, legate, include_head, command):
61
+ nodes = "$(cat workers | wc -w)"
62
+
63
+ command = list(command)
64
+
65
+ files = {}
66
+ for i, c in enumerate(command):
67
+ if os.path.exists(c):
68
+ remote_path = f"/scratch/batch/{os.path.basename(c)}"
69
+ command[i] = remote_path
70
+ with open(c) as f:
71
+ content = f.read()
72
+ files[remote_path] = content
73
+
74
+ if legate:
75
+ # TODO make "--gpus 1 --sysmem 2000 --fbmem 20000" configurable
76
+ wrapped_command = f"""
77
+ legate \
78
+ --gpus 1 --sysmem 2000 --fbmem 20000 \
79
+ --nodes {nodes} \
80
+ --launcher mpirun \
81
+ --launcher-extra ' --hostfile workers -x PATH ' \
82
+ {shlex.join(command)}
83
+ """
84
+ else:
85
+ wrapped_command = f"mpirun --hostfile workers -x PATH {shlex.join(command)}"
86
+
87
+ with coiled.Cloud(workspace=workspace) as cloud:
88
+ cluster_info = find_cluster(cloud, cluster)
89
+ cluster_id = cluster_info["id"]
90
+ connection = get_ssh_connection(cloud, cluster_id)
91
+
92
+ setup_mpi_ssh(connection, include_scheduler=include_head)
93
+
94
+ if files:
95
+ worker_connections = []
96
+
97
+ for worker in cluster_info["workers"]:
98
+ if (
99
+ not worker.get("instance")
100
+ or not worker["instance"].get("current_state")
101
+ or worker["instance"]["current_state"]["state"] != "ready"
102
+ ):
103
+ continue
104
+ worker_address = worker["instance"]["private_ip_address"]
105
+
106
+ worker_connections.append(
107
+ fabric.connection.Connection(
108
+ worker_address, gateway=connection, user=connection.user, connect_kwargs=connection.connect_kwargs
109
+ )
110
+ )
111
+
112
+ for path, content in files.items():
113
+ write_via_ssh(connection, content=content, path=path)
114
+ for conn in worker_connections:
115
+ write_via_ssh(conn, content=content, path=path) # , mode=0o555
116
+
117
+ print(f"Running command:\n{wrapped_command}")
118
+
119
+ # TODO keepalive session so this will interact correctly with idle timeout / keepalive
120
+ connection.run(wrapped_command, hide=False, pty=True, warn=True, env={"PATH": "/tmp/host-user-venv/bin:$PATH"})
121
+
122
+
123
+ def setup_mpi_ssh(connection, include_scheduler=True):
124
+ add_scheduler_line = 'printf "\n127.0.0.1" >> workers' if include_scheduler else ""
125
+
126
+ setup_mpi = f"""
127
+ /bin/coiled_agent list-worker-ips | sudo tee workers && sudo chown ubuntu workers
128
+ ssh-keyscan -f workers -t ed25519 >> ~/.ssh/known_hosts
129
+ {add_scheduler_line}
130
+
131
+ # block until host setup script has finished, at least on schedule node
132
+ until [ -f /tmp/host-setup-done ]
133
+ do
134
+ sleep 5
135
+ done
136
+ """
137
+
138
+ _ = connection.run(setup_mpi, hide=True, pty=False)
139
+
140
+
141
+ def get_host_setup_script(venv_path="/tmp/host-user-venv", apt_install=None, pip_install=None):
142
+ apt_install = apt_install or []
143
+ apt_install.extend(["openmpi-bin", "python3-pip", "python3-venv"])
144
+
145
+ pip_install = pip_install or []
146
+
147
+ pip_install_line = f"{venv_path}/bin/python -m pip install {' '.join(pip_install)}" if pip_install else ""
148
+
149
+ return f"""
150
+ sudo apt install {" ".join(apt_install)} -y
151
+
152
+ mkdir {venv_path}
153
+ python3 -m venv {venv_path}
154
+
155
+ {pip_install_line}
156
+
157
+ echo 'done' > /tmp/host-setup-done
158
+ """
159
+
160
+
161
+ @click.group(name="mpi", context_settings=CONTEXT_SETTINGS)
162
+ def mpi_group(): ...
163
+
164
+
165
+ mpi_group.add_command(setup)
166
+ mpi_group.add_command(run)
@@ -442,6 +442,9 @@ class AWSOptions(BackendOptions, total=False):
442
442
  Only some instance types are supported.
443
443
  use_worker_efa
444
444
  Attach Elastic Fabric Adaptor only on cluster workers, not the scheduler.
445
+ ami_version
446
+ Use non-default type of AMI.
447
+ Supported options include "DL" for the Deep Learning Base OSS Nvidia Driver GPU AMI.
445
448
  """
446
449
 
447
450
  keypair_name: Optional[str]
@@ -449,6 +452,7 @@ class AWSOptions(BackendOptions, total=False):
449
452
  use_worker_placement_group: Optional[bool]
450
453
  use_efa: Optional[bool]
451
454
  use_worker_efa: Optional[bool]
455
+ ami_version: Optional[str]
452
456
 
453
457
 
454
458
  class GCPOptions(BackendOptions, total=False):
@@ -498,7 +498,7 @@ class Cluster(DistributedCluster, Generic[IsAsynchronous]):
498
498
  worker_vm_types: list | None = None,
499
499
  worker_cpu: Union[int, List[int]] | None = None,
500
500
  worker_memory: Union[str, List[str]] | None = None,
501
- worker_disk_size: Union[int, str] | None = None,
501
+ worker_disk_size: int | str | None = None,
502
502
  worker_disk_throughput: int | None = None,
503
503
  worker_disk_config: dict | None = None,
504
504
  worker_gpu: Union[int, bool] | None = None,
@@ -507,7 +507,7 @@ class Cluster(DistributedCluster, Generic[IsAsynchronous]):
507
507
  scheduler_vm_types: list | None = None,
508
508
  scheduler_cpu: Union[int, List[int]] | None = None,
509
509
  scheduler_memory: Union[str, List[str]] | None = None,
510
- scheduler_disk_size: int | None = None,
510
+ scheduler_disk_size: int | str | None = None,
511
511
  scheduler_disk_config: dict | None = None,
512
512
  scheduler_gpu: bool | None = None,
513
513
  asynchronous: bool = False,
File without changes
File without changes