skypilot-nightly 1.0.0.dev20251203-py3-none-any.whl → 1.0.0.dev20251210-py3-none-any.whl

This diff compares the contents of publicly available package versions as released to their public registries. It is provided for informational purposes only.
Files changed (136)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +478 -0
  4. sky/backends/backend_utils.py +45 -4
  5. sky/backends/cloud_vm_ray_backend.py +32 -33
  6. sky/backends/task_codegen.py +340 -2
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/kubernetes_catalog.py +12 -4
  9. sky/catalog/slurm_catalog.py +243 -0
  10. sky/check.py +14 -3
  11. sky/client/cli/command.py +329 -22
  12. sky/client/sdk.py +56 -2
  13. sky/clouds/__init__.py +2 -0
  14. sky/clouds/cloud.py +7 -0
  15. sky/clouds/slurm.py +578 -0
  16. sky/clouds/ssh.py +2 -1
  17. sky/clouds/vast.py +10 -0
  18. sky/core.py +128 -36
  19. sky/dashboard/out/404.html +1 -1
  20. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
  35. sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
  42. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  43. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  44. sky/dashboard/out/clusters/[cluster].html +1 -1
  45. sky/dashboard/out/clusters.html +1 -1
  46. sky/dashboard/out/config.html +1 -1
  47. sky/dashboard/out/index.html +1 -1
  48. sky/dashboard/out/infra/[context].html +1 -1
  49. sky/dashboard/out/infra.html +1 -1
  50. sky/dashboard/out/jobs/[job].html +1 -1
  51. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  52. sky/dashboard/out/jobs.html +1 -1
  53. sky/dashboard/out/plugins/[...slug].html +1 -0
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +16 -2
  60. sky/global_user_state.py +3 -3
  61. sky/models.py +2 -0
  62. sky/optimizer.py +6 -5
  63. sky/provision/__init__.py +1 -0
  64. sky/provision/common.py +20 -0
  65. sky/provision/docker_utils.py +15 -2
  66. sky/provision/kubernetes/utils.py +42 -6
  67. sky/provision/provisioner.py +15 -6
  68. sky/provision/slurm/__init__.py +12 -0
  69. sky/provision/slurm/config.py +13 -0
  70. sky/provision/slurm/instance.py +572 -0
  71. sky/provision/slurm/utils.py +583 -0
  72. sky/provision/vast/instance.py +4 -1
  73. sky/provision/vast/utils.py +10 -6
  74. sky/serve/server/impl.py +1 -1
  75. sky/server/constants.py +1 -1
  76. sky/server/plugins.py +222 -0
  77. sky/server/requests/executor.py +5 -2
  78. sky/server/requests/payloads.py +12 -1
  79. sky/server/requests/request_names.py +2 -0
  80. sky/server/requests/requests.py +5 -1
  81. sky/server/requests/serializers/encoders.py +17 -0
  82. sky/server/requests/serializers/return_value_serializers.py +60 -0
  83. sky/server/server.py +78 -8
  84. sky/server/server_utils.py +30 -0
  85. sky/setup_files/dependencies.py +2 -0
  86. sky/skylet/attempt_skylet.py +13 -3
  87. sky/skylet/constants.py +34 -9
  88. sky/skylet/events.py +10 -4
  89. sky/skylet/executor/__init__.py +1 -0
  90. sky/skylet/executor/slurm.py +189 -0
  91. sky/skylet/job_lib.py +2 -1
  92. sky/skylet/log_lib.py +22 -6
  93. sky/skylet/log_lib.pyi +8 -6
  94. sky/skylet/skylet.py +5 -1
  95. sky/skylet/subprocess_daemon.py +2 -1
  96. sky/ssh_node_pools/constants.py +12 -0
  97. sky/ssh_node_pools/core.py +40 -3
  98. sky/ssh_node_pools/deploy/__init__.py +4 -0
  99. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  100. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  101. sky/ssh_node_pools/deploy/utils.py +173 -0
  102. sky/ssh_node_pools/server.py +11 -13
  103. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  104. sky/templates/kubernetes-ray.yml.j2 +8 -0
  105. sky/templates/slurm-ray.yml.j2 +85 -0
  106. sky/templates/vast-ray.yml.j2 +1 -0
  107. sky/users/model.conf +1 -1
  108. sky/users/permission.py +24 -1
  109. sky/users/rbac.py +31 -3
  110. sky/utils/annotations.py +108 -8
  111. sky/utils/command_runner.py +197 -5
  112. sky/utils/command_runner.pyi +27 -4
  113. sky/utils/common_utils.py +18 -3
  114. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  115. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  116. sky/utils/schemas.py +31 -0
  117. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
  118. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
  119. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  126. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  127. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  128. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  129. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  130. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  131. /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
  132. /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
  133. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/provision/slurm/instance.py (new file)
@@ -0,0 +1,572 @@
+ """Slurm instance provisioning."""
+
+ import tempfile
+ import textwrap
+ import time
+ from typing import Any, cast, Dict, List, Optional, Tuple
+
+ from sky import sky_logging
+ from sky import skypilot_config
+ from sky.adaptors import slurm
+ from sky.provision import common
+ from sky.provision import constants
+ from sky.provision.slurm import utils as slurm_utils
+ from sky.utils import command_runner
+ from sky.utils import common_utils
+ from sky.utils import status_lib
+ from sky.utils import subprocess_utils
+ from sky.utils import timeline
+
+ logger = sky_logging.init_logger(__name__)
+
+ # TODO(kevin): This assumes $HOME is in a shared filesystem.
+ # We should probably make it configurable, and add a check
+ # during sky check.
+ SHARED_ROOT_SKY_DIRECTORY = '~/.sky_clusters'
+ PROVISION_SCRIPTS_DIRECTORY_NAME = '.sky_provision'
+ PROVISION_SCRIPTS_DIRECTORY = f'~/{PROVISION_SCRIPTS_DIRECTORY_NAME}'
+
+ POLL_INTERVAL_SECONDS = 2
+ # Default KillWait is 30 seconds, so we add some buffer time here.
+ _JOB_TERMINATION_TIMEOUT_SECONDS = 60
+ _SKY_DIR_CREATION_TIMEOUT_SECONDS = 30
+
+
+ def _sky_cluster_home_dir(cluster_name_on_cloud: str) -> str:
+     """Returns the SkyPilot cluster's home directory path on the Slurm cluster.
+
+     This path is assumed to be on a shared NFS mount accessible by all nodes.
+     To support clusters with non-NFS home directories, we would need to let
+     users specify an NFS-backed "working directory" or use a different
+     coordination mechanism.
+     """
+     return f'{SHARED_ROOT_SKY_DIRECTORY}/{cluster_name_on_cloud}'
+
+
+ def _sbatch_provision_script_path(filename: str) -> str:
+     """Returns the path to the sbatch provision script on the login node."""
+     # Put sbatch script in $HOME instead of /tmp as there can be
+     # multiple login nodes, and different SSH connections
+     # can land on different login nodes.
+     return f'{PROVISION_SCRIPTS_DIRECTORY}/{filename}'
+
+
+ def _skypilot_runtime_dir(cluster_name_on_cloud: str) -> str:
+     """Returns the SkyPilot runtime directory path on the Slurm cluster."""
+     return f'/tmp/{cluster_name_on_cloud}'
+
+
+ @timeline.event
+ def _create_virtual_instance(
+         region: str, cluster_name_on_cloud: str,
+         config: common.ProvisionConfig) -> common.ProvisionRecord:
+     """Creates a Slurm virtual instance from the config.
+
+     A Slurm virtual instance is created by submitting a long-running
+     job with sbatch, to mimic a cloud VM.
+     """
+     provider_config = config.provider_config
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+     partition = slurm_utils.get_partition_from_config(provider_config)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+     )
+
+     # COMPLETING state occurs when a job is being terminated - during this
+     # phase, slurmd sends SIGTERM to tasks, waits for KillWait period, sends
+     # SIGKILL if needed, runs epilog scripts, and notifies slurmctld. This
+     # typically happens when a previous job with the same name is being
+     # cancelled or has finished. Jobs can get stuck in COMPLETING if epilog
+     # scripts hang or tasks don't respond to signals, so we wait with a
+     # timeout.
+     completing_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['completing'],
+     )
+     start_time = time.time()
+     while (completing_jobs and
+            time.time() - start_time < _JOB_TERMINATION_TIMEOUT_SECONDS):
+         logger.debug(f'Found {len(completing_jobs)} completing jobs. '
+                      f'Waiting for them to finish: {completing_jobs}')
+         time.sleep(POLL_INTERVAL_SECONDS)
+         completing_jobs = client.query_jobs(
+             cluster_name_on_cloud,
+             ['completing'],
+         )
+     if completing_jobs:
+         # TODO(kevin): Automatically handle this, following the suggestions in
+         # https://slurm.schedmd.com/troubleshoot.html#completing
+         raise RuntimeError(f'Found {len(completing_jobs)} jobs still in '
+                            'completing state after '
+                            f'{_JOB_TERMINATION_TIMEOUT_SECONDS}s. '
+                            'This is typically due to non-killable processes '
+                            'associated with the job.')
+
+     # Check if job already exists
+     existing_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['pending', 'running'],
+     )
+
+     # Get provision_timeout from config. If not specified, use None,
+     # which will use the default timeout specified in the Slurm adaptor.
+     provision_timeout = skypilot_config.get_effective_region_config(
+         cloud='slurm',
+         region=region,
+         keys=('provision_timeout',),
+         default_value=None)
+
+     if existing_jobs:
+         assert len(existing_jobs) == 1, (
+             f'Multiple jobs found with name {cluster_name_on_cloud}: '
+             f'{existing_jobs}')
+
+         job_id = existing_jobs[0]
+         logger.debug(f'Job with name {cluster_name_on_cloud} already exists '
+                      f'(JOBID: {job_id})')
+
+         # Wait for nodes to be allocated (job might be in PENDING state)
+         nodes, _ = client.get_job_nodes(job_id,
+                                         wait=True,
+                                         timeout=provision_timeout)
+         return common.ProvisionRecord(provider_name='slurm',
+                                       region=region,
+                                       zone=partition,
+                                       cluster_name=cluster_name_on_cloud,
+                                       head_instance_id=slurm_utils.instance_id(
+                                           job_id, nodes[0]),
+                                       resumed_instance_ids=[],
+                                       created_instance_ids=[])
+
+     resources = config.node_config
+
+     # Note: By default Slurm terminates the entire job allocation if any node
+     # fails in its range of allocated nodes.
+     # In the future we can consider running sbatch with --no-kill to not
+     # automatically terminate a job if one of the nodes it has been
+     # allocated fails.
+     num_nodes = config.count
+
+     accelerator_type = resources.get('accelerator_type')
+     accelerator_count_raw = resources.get('accelerator_count')
+     try:
+         accelerator_count = int(
+             accelerator_count_raw) if accelerator_count_raw is not None else 0
+     except (TypeError, ValueError):
+         accelerator_count = 0
+
+     skypilot_runtime_dir = _skypilot_runtime_dir(cluster_name_on_cloud)
+     sky_home_dir = _sky_cluster_home_dir(cluster_name_on_cloud)
+     ready_signal = f'{sky_home_dir}/.sky_sbatch_ready'
+
+     # Build the sbatch script
+     gpu_directive = ''
+     if (accelerator_type is not None and accelerator_type.upper() != 'NONE' and
+             accelerator_count > 0):
+         gpu_directive = (f'#SBATCH --gres=gpu:{accelerator_type.lower()}:'
+                          f'{accelerator_count}')
+
+     # By default stdout and stderr will be written to $HOME/slurm-%j.out
+     # (because we invoke sbatch from $HOME). Redirect elsewhere to not pollute
+     # the home directory.
+     provision_script = textwrap.dedent(f"""\
+         #!/bin/bash
+         #SBATCH --job-name={cluster_name_on_cloud}
+         #SBATCH --output={PROVISION_SCRIPTS_DIRECTORY_NAME}/slurm-%j.out
+         #SBATCH --error={PROVISION_SCRIPTS_DIRECTORY_NAME}/slurm-%j.out
+         #SBATCH --nodes={num_nodes}
+         #SBATCH --wait-all-nodes=1
+         # Let the job be terminated rather than requeued implicitly.
+         #SBATCH --no-requeue
+         #SBATCH --cpus-per-task={int(resources["cpus"])}
+         #SBATCH --mem={int(resources["memory"])}G
+         {gpu_directive}
+
+         # Cleanup function to remove cluster dirs on job termination.
+         cleanup() {{
+             # The Skylet is daemonized, so it is not automatically terminated when
+             # the Slurm job is terminated, we need to kill it manually.
+             echo "Terminating Skylet..."
+             if [ -f "{skypilot_runtime_dir}/.sky/skylet_pid" ]; then
+                 kill $(cat "{skypilot_runtime_dir}/.sky/skylet_pid") 2>/dev/null || true
+             fi
+             echo "Cleaning up sky directories..."
+             # Clean up sky runtime directory on each node.
+             # NOTE: We can do this because --nodes for both this srun and the
+             # sbatch is the same number. Otherwise, there are no guarantees
+             # that this srun will run on the same subset of nodes as the srun
+             # that created the sky directories.
+             srun --nodes={num_nodes} rm -rf {skypilot_runtime_dir}
+             rm -rf {sky_home_dir}
+         }}
+         trap cleanup TERM
+
+         # Create sky home directory for the cluster.
+         mkdir -p {sky_home_dir}
+         # Create sky runtime directory on each node.
+         srun --nodes={num_nodes} mkdir -p {skypilot_runtime_dir}
+         # Suppress login messages.
+         touch {sky_home_dir}/.hushlogin
+         # Signal that the sbatch script has completed setup.
+         touch {ready_signal}
+         sleep infinity
+         """)
+
+     # To bootstrap things, we need to do it with SSHCommandRunner first.
+     # SlurmCommandRunner is for after the virtual instances are created.
+     login_node_runner = command_runner.SSHCommandRunner(
+         (ssh_host, ssh_port),
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+     )
+
+     cmd = f'mkdir -p {PROVISION_SCRIPTS_DIRECTORY}'
+     rc, stdout, stderr = login_node_runner.run(cmd,
+                                                require_outputs=True,
+                                                stream_logs=False)
+     subprocess_utils.handle_returncode(
+         rc,
+         cmd,
+         'Failed to create provision scripts directory on login node.',
+         stderr=f'{stdout}\n{stderr}')
+     # Rsync the provision script to the login node
+     with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=True) as f:
+         f.write(provision_script)
+         f.flush()
+         src_path = f.name
+         tgt_path = _sbatch_provision_script_path(f'{cluster_name_on_cloud}.sh')
+         login_node_runner.rsync(src_path, tgt_path, up=True, stream_logs=False)
+
+     job_id = client.submit_job(partition, cluster_name_on_cloud, tgt_path)
+     logger.debug(f'Successfully submitted Slurm job {job_id} to partition '
+                  f'{partition} for cluster {cluster_name_on_cloud} '
+                  f'with {num_nodes} nodes')
+
+     nodes, _ = client.get_job_nodes(job_id,
+                                     wait=True,
+                                     timeout=provision_timeout)
+     created_instance_ids = [
+         slurm_utils.instance_id(job_id, node) for node in nodes
+     ]
+
+     # Wait for the sbatch script to create the cluster's sky directories,
+     # to avoid a race condition where post-provision commands try to
+     # access the directories before they are created.
+     ready_check_cmd = (f'end=$((SECONDS+{_SKY_DIR_CREATION_TIMEOUT_SECONDS})); '
+                        f'while [ ! -f {ready_signal} ]; do '
+                        'if (( SECONDS >= end )); then '
+                        'exit 1; fi; '
+                        'sleep 0.5; '
+                        'done')
+     rc, stdout, stderr = login_node_runner.run(ready_check_cmd,
+                                                require_outputs=True,
+                                                stream_logs=False)
+     subprocess_utils.handle_returncode(
+         rc,
+         ready_check_cmd,
+         'Failed to verify sky directories creation.',
+         stderr=f'{stdout}\n{stderr}')
+
+     return common.ProvisionRecord(provider_name='slurm',
+                                   region=region,
+                                   zone=partition,
+                                   cluster_name=cluster_name_on_cloud,
+                                   head_instance_id=created_instance_ids[0],
+                                   resumed_instance_ids=[],
+                                   created_instance_ids=created_instance_ids)
+
+
+ @common_utils.retry
+ def query_instances(
+     cluster_name: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     non_terminated_only: bool = True,
+     retry_if_missing: bool = False,
+ ) -> Dict[str, Tuple[Optional[status_lib.ClusterStatus], Optional[str]]]:
+     """See sky/provision/__init__.py"""
+     del cluster_name, retry_if_missing  # Unused for Slurm
+     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+     )
+
+     # Map Slurm job states to SkyPilot ClusterStatus
+     # Slurm states:
+     # https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
+     # TODO(kevin): Include more states here.
+     status_map = {
+         'pending': status_lib.ClusterStatus.INIT,
+         'running': status_lib.ClusterStatus.UP,
+         'completing': status_lib.ClusterStatus.UP,
+         'completed': None,
+         'cancelled': None,
+         # NOTE: Jobs that get cancelled (from sky down) will go to failed state
+         # with the reason 'NonZeroExitCode' and remain in the squeue output for
+         # a while.
+         'failed': None,
+         'node_fail': None,
+     }
+
+     statuses: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
+                               Optional[str]]] = {}
+     for state, sky_status in status_map.items():
+         jobs = client.query_jobs(
+             cluster_name_on_cloud,
+             [state],
+         )
+
+         for job_id in jobs:
+             if state in ('pending', 'failed', 'node_fail', 'cancelled',
+                          'completed'):
+                 reason = client.get_job_reason(job_id)
+                 if non_terminated_only and sky_status is None:
+                     # TODO(kevin): For better UX, we should also find out
+                     # which node(s) exactly that failed if it's a node_fail
+                     # state.
+                     logger.debug(f'Job {job_id} is terminated, but '
+                                  'query_instances is called with '
+                                  f'non_terminated_only=True. State: {state}, '
+                                  f'Reason: {reason}')
+                     continue
+                 statuses[job_id] = (sky_status, reason)
+             else:
+                 nodes, _ = client.get_job_nodes(job_id, wait=False)
+                 for node in nodes:
+                     instance_id = slurm_utils.instance_id(job_id, node)
+                     statuses[instance_id] = (sky_status, None)
+
+     # TODO(kevin): Query sacct too to get more historical job info.
+     # squeue only includes completed jobs that finished in the last
+     # MinJobAge seconds (default 300s). Or could be earlier if it
+     # reaches MaxJobCount first (default 10_000).
+
+     return statuses
+
+
+ def run_instances(
+         region: str,
+         cluster_name: str,  # pylint: disable=unused-argument
+         cluster_name_on_cloud: str,
+         config: common.ProvisionConfig) -> common.ProvisionRecord:
+     """Run instances for the given cluster (Slurm in this case)."""
+     return _create_virtual_instance(region, cluster_name_on_cloud, config)
+
+
+ def wait_instances(region: str, cluster_name_on_cloud: str,
+                    state: Optional[status_lib.ClusterStatus]) -> None:
+     """See sky/provision/__init__.py"""
+     del region, cluster_name_on_cloud, state
+     # We already wait for the instances to be running in run_instances.
+     # So we don't need to wait here.
+
+
+ def get_cluster_info(
+         region: str,
+         cluster_name_on_cloud: str,
+         provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+     del region
+     assert provider_config is not None, cluster_name_on_cloud
+
+     # The SSH host is the remote machine running slurmctld daemon.
+     # Cross-cluster operations are supported by interacting with
+     # the current controller. For details, please refer to
+     # https://slurm.schedmd.com/multi_cluster.html.
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+     )
+
+     # Find running job for this cluster
+     running_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['running'],
+     )
+
+     if not running_jobs:
+         # No running jobs found - cluster may be in pending or terminated state
+         return common.ClusterInfo(
+             instances={},
+             head_instance_id=None,
+             ssh_user=ssh_user,
+             provider_name='slurm',
+             provider_config=provider_config,
+         )
+     assert len(running_jobs) == 1, (
+         f'Multiple running jobs found for cluster {cluster_name_on_cloud}: '
+         f'{running_jobs}')
+
+     job_id = running_jobs[0]
+     # Running jobs should already have nodes allocated, so don't wait
+     nodes, node_ips = client.get_job_nodes(job_id, wait=False)
+
+     instances = {
+         f'{slurm_utils.instance_id(job_id, node)}': [
+             common.InstanceInfo(
+                 instance_id=slurm_utils.instance_id(job_id, node),
+                 internal_ip=node_ip,
+                 external_ip=ssh_host,
+                 ssh_port=ssh_port,
+                 tags={
+                     constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud,
+                     'job_id': job_id,
+                     'node': node,
+                 },
+             )
+         ] for node, node_ip in zip(nodes, node_ips)
+     }
+
+     return common.ClusterInfo(
+         instances=instances,
+         head_instance_id=slurm_utils.instance_id(job_id, nodes[0]),
+         ssh_user=ssh_user,
+         provider_name='slurm',
+         provider_config=provider_config,
+     )
+
+
+ def stop_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """Keep the Slurm virtual instances running."""
+     raise NotImplementedError()
+
+
+ def terminate_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     assert provider_config is not None, cluster_name_on_cloud
+
+     if worker_only:
+         logger.warning(
+             'worker_only=True is not supported for Slurm, this is a no-op.')
+         return
+
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_private_key = ssh_config_dict['private_key']
+     # Check if we are running inside a Slurm job (Only happens with autodown,
+     # where the Skylet will invoke terminate_instances on the remote cluster),
+     # where we assume SSH between nodes have been set up on each node's
+     # ssh config.
+     # TODO(kevin): Validate this assumption. Another way would be to
+     # mount the private key to the remote cluster, like we do with
+     # other clouds' API keys.
+     if slurm_utils.is_inside_slurm_job():
+         logger.debug('Running inside a Slurm job, using machine\'s ssh config')
+         ssh_private_key = None
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_private_key,
+         ssh_proxy_command=ssh_proxy_command,
+     )
+     client.cancel_jobs_by_name(
+         cluster_name_on_cloud,
+         signal='TERM',
+         full=True,
+     )
+
+
+ def open_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     del cluster_name_on_cloud, ports, provider_config
+     pass
+
+
+ def cleanup_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     del cluster_name_on_cloud, ports, provider_config
+     pass
+
+
+ def get_command_runners(
+     cluster_info: common.ClusterInfo,
+     **credentials: Dict[str, Any],
+ ) -> List[command_runner.SlurmCommandRunner]:
+     """Get a command runner for the given cluster."""
+     assert cluster_info.provider_config is not None, cluster_info
+
+     if cluster_info.head_instance_id is None:
+         # No running job found
+         return []
+
+     head_instance = cluster_info.get_head_instance()
+     assert head_instance is not None, 'Head instance not found'
+     cluster_name_on_cloud = head_instance.tags.get(
+         constants.TAG_SKYPILOT_CLUSTER_NAME, None)
+     assert cluster_name_on_cloud is not None, cluster_info
+
+     # There can only be one InstanceInfo per instance_id.
+     instances = [
+         instance_infos[0] for instance_infos in cluster_info.instances.values()
+     ]
+
+     # Note: For Slurm, the external IP for all instances is the same,
+     # it is the login node's. The internal IP is the private IP of the node.
+     ssh_user = cast(str, credentials.pop('ssh_user'))
+     ssh_private_key = cast(str, credentials.pop('ssh_private_key'))
+     runners = [
+         command_runner.SlurmCommandRunner(
+             (instance_info.external_ip or '', instance_info.ssh_port),
+             ssh_user,
+             ssh_private_key,
+             sky_dir=_sky_cluster_home_dir(cluster_name_on_cloud),
+             skypilot_runtime_dir=_skypilot_runtime_dir(cluster_name_on_cloud),
+             job_id=instance_info.tags['job_id'],
+             slurm_node=instance_info.tags['node'],
+             **credentials) for instance_info in instances
+     ]
+
+     return runners