skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +478 -0
  4. sky/backends/backend_utils.py +45 -4
  5. sky/backends/cloud_vm_ray_backend.py +32 -33
  6. sky/backends/task_codegen.py +340 -2
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/kubernetes_catalog.py +12 -4
  9. sky/catalog/slurm_catalog.py +243 -0
  10. sky/check.py +14 -3
  11. sky/client/cli/command.py +329 -22
  12. sky/client/sdk.py +56 -2
  13. sky/clouds/__init__.py +2 -0
  14. sky/clouds/cloud.py +7 -0
  15. sky/clouds/slurm.py +578 -0
  16. sky/clouds/ssh.py +2 -1
  17. sky/clouds/vast.py +10 -0
  18. sky/core.py +128 -36
  19. sky/dashboard/out/404.html +1 -1
  20. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
  35. sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
  42. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  43. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  44. sky/dashboard/out/clusters/[cluster].html +1 -1
  45. sky/dashboard/out/clusters.html +1 -1
  46. sky/dashboard/out/config.html +1 -1
  47. sky/dashboard/out/index.html +1 -1
  48. sky/dashboard/out/infra/[context].html +1 -1
  49. sky/dashboard/out/infra.html +1 -1
  50. sky/dashboard/out/jobs/[job].html +1 -1
  51. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  52. sky/dashboard/out/jobs.html +1 -1
  53. sky/dashboard/out/plugins/[...slug].html +1 -0
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +16 -2
  60. sky/global_user_state.py +3 -3
  61. sky/models.py +2 -0
  62. sky/optimizer.py +6 -5
  63. sky/provision/__init__.py +1 -0
  64. sky/provision/common.py +20 -0
  65. sky/provision/docker_utils.py +15 -2
  66. sky/provision/kubernetes/utils.py +42 -6
  67. sky/provision/provisioner.py +15 -6
  68. sky/provision/slurm/__init__.py +12 -0
  69. sky/provision/slurm/config.py +13 -0
  70. sky/provision/slurm/instance.py +572 -0
  71. sky/provision/slurm/utils.py +583 -0
  72. sky/provision/vast/instance.py +4 -1
  73. sky/provision/vast/utils.py +10 -6
  74. sky/serve/server/impl.py +1 -1
  75. sky/server/constants.py +1 -1
  76. sky/server/plugins.py +222 -0
  77. sky/server/requests/executor.py +5 -2
  78. sky/server/requests/payloads.py +12 -1
  79. sky/server/requests/request_names.py +2 -0
  80. sky/server/requests/requests.py +5 -1
  81. sky/server/requests/serializers/encoders.py +17 -0
  82. sky/server/requests/serializers/return_value_serializers.py +60 -0
  83. sky/server/server.py +78 -8
  84. sky/server/server_utils.py +30 -0
  85. sky/setup_files/dependencies.py +2 -0
  86. sky/skylet/attempt_skylet.py +13 -3
  87. sky/skylet/constants.py +34 -9
  88. sky/skylet/events.py +10 -4
  89. sky/skylet/executor/__init__.py +1 -0
  90. sky/skylet/executor/slurm.py +189 -0
  91. sky/skylet/job_lib.py +2 -1
  92. sky/skylet/log_lib.py +22 -6
  93. sky/skylet/log_lib.pyi +8 -6
  94. sky/skylet/skylet.py +5 -1
  95. sky/skylet/subprocess_daemon.py +2 -1
  96. sky/ssh_node_pools/constants.py +12 -0
  97. sky/ssh_node_pools/core.py +40 -3
  98. sky/ssh_node_pools/deploy/__init__.py +4 -0
  99. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  100. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  101. sky/ssh_node_pools/deploy/utils.py +173 -0
  102. sky/ssh_node_pools/server.py +11 -13
  103. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  104. sky/templates/kubernetes-ray.yml.j2 +8 -0
  105. sky/templates/slurm-ray.yml.j2 +85 -0
  106. sky/templates/vast-ray.yml.j2 +1 -0
  107. sky/users/model.conf +1 -1
  108. sky/users/permission.py +24 -1
  109. sky/users/rbac.py +31 -3
  110. sky/utils/annotations.py +108 -8
  111. sky/utils/command_runner.py +197 -5
  112. sky/utils/command_runner.pyi +27 -4
  113. sky/utils/common_utils.py +18 -3
  114. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  115. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  116. sky/utils/schemas.py +31 -0
  117. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
  118. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
  119. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  126. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  127. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  128. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  129. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  130. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  131. /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
  132. /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
  133. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py CHANGED
@@ -25,6 +25,7 @@ SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
  # them be in $HOME makes it more convenient.
  SKY_LOGS_DIRECTORY = '~/sky_logs'
  SKY_REMOTE_WORKDIR = '~/sky_workdir'
+ SKY_TEMPLATES_DIRECTORY = '~/sky_templates'
  SKY_IGNORE_FILE = '.skyignore'
  GIT_IGNORE_FILE = '.gitignore'

@@ -67,10 +68,23 @@ SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
  # #!/opt/conda/bin/python3
  SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
                 f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
+
+ # Use $(which env) to find env, falling back to /usr/bin/env if which is
+ # unavailable. This works around a Slurm quirk where srun's execvp() doesn't
+ # check execute permissions, failing when $HOME/.local/bin/env (non-executable,
+ # from uv installation) shadows /usr/bin/env.
+ SKY_SLURM_UNSET_PYTHONPATH = ('$(which env 2>/dev/null || echo /usr/bin/env) '
+                               '-u PYTHONPATH')
+ SKY_SLURM_PYTHON_CMD = (f'{SKY_SLURM_UNSET_PYTHONPATH} '
+                         f'$({SKY_GET_PYTHON_PATH_CMD})')
+
  # Separate env for SkyPilot runtime dependencies.
  SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
  SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
  ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
+ # Place the conda root in the runtime directory, as installing to $HOME
+ # on an NFS takes too long (1-2m slower).
+ SKY_CONDA_ROOT = f'{SKY_RUNTIME_DIR}/miniconda3'
  # uv is used for venv and pip, much faster than python implementations.
  SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
  # set UV_SYSTEM_PYTHON to false in case the
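The Slurm quirk described in the comment above is easiest to see as an execvp()-style PATH walk that skips the execute-permission check. A minimal illustrative sketch, not part of the package; the paths are hypothetical:

# Sketch: resolve `env` the way a permissive execvp() lookup would,
# i.e. without testing the execute bit.
import os

def first_env_on_path() -> str:
    for d in os.environ.get('PATH', '').split(os.pathsep):
        candidate = os.path.join(d, 'env')
        if os.path.exists(candidate):  # note: no os.access(candidate, os.X_OK)
            return candidate
    return '/usr/bin/env'

# With ~/.local/bin ahead of /usr/bin and a non-executable ~/.local/bin/env
# left behind by a uv install, this returns the unusable shadow copy --
# hence the `$(which env ...)` workaround above.
print(first_env_on_path())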
@@ -162,6 +176,10 @@ DISABLE_GPU_ECC_COMMAND = (
      '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
      '|| true; ')

+ SETUP_SKY_DIRS_COMMANDS = (f'mkdir -p ~/sky_workdir && '
+                            f'mkdir -p ~/.sky/sky_app && '
+                            f'mkdir -p {SKY_RUNTIME_DIR}/.sky;')
+
  # Install conda on the remote cluster if it is not already installed.
  # We use conda with python 3.10 to be consistent across multiple clouds with
  # best effort.
@@ -178,8 +196,9 @@ CONDA_INSTALLATION_COMMANDS = (
      # because for some images, conda is already installed, but not initialized.
      # In this case, we need to initialize conda and set auto_activate_base to
      # true.
-     '{ bash Miniconda3-Linux.sh -b || true; '
-     'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
+     '{ '
+     f'bash Miniconda3-Linux.sh -b -p "{SKY_CONDA_ROOT}" || true; '
+     f'eval "$({SKY_CONDA_ROOT}/bin/conda shell.bash hook)" && conda init && '
      # Caller should replace {conda_auto_activate} with either true or false.
      'conda config --set auto_activate_base {conda_auto_activate} && '
      'conda activate base; }; '
@@ -222,7 +241,7 @@ _sky_version = str(version.parse(sky.__version__))
  RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
  RAY_INSTALLATION_COMMANDS = (
      f'{SKY_UV_INSTALL_CMD};'
-     'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
+     f'{SETUP_SKY_DIRS_COMMANDS}'
      # Print the PATH in provision.log to help debug PATH issues.
      'echo PATH=$PATH; '
      # Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
@@ -256,7 +275,7 @@ RAY_INSTALLATION_COMMANDS = (
      #
      # Here, we add ~/.local/bin to the end of the PATH to make sure the issues
      # mentioned above are resolved.
-     'export PATH=$PATH:$HOME/.local/bin; '
+     f'export PATH=$PATH:{SKY_RUNTIME_DIR}/.local/bin; '
      # Writes ray path to file if it does not exist or the file is empty.
      f'[ -s {SKY_RAY_PATH_FILE} ] || '
      f'{{ {SKY_UV_RUN_CMD} '
@@ -264,18 +283,23 @@ RAY_INSTALLATION_COMMANDS = (

  # Copy SkyPilot templates from the installed wheel to ~/sky_templates.
  # This must run after the skypilot wheel is installed.
+ # Note: We remove ~/sky_templates first to avoid import conflicts where Python
+ # would import from ~/sky_templates instead of site-packages (because
+ # sky_templates itself is a package), leading to src == dst error when
+ # launching on an existing cluster.
  COPY_SKYPILOT_TEMPLATES_COMMANDS = (
+     f'rm -rf {SKY_TEMPLATES_DIRECTORY}; '
      f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
      f'{SKY_PYTHON_CMD} -c \''
      'import sky_templates, shutil, os; '
      'src = os.path.dirname(sky_templates.__file__); '
-     'dst = os.path.expanduser(\"~/sky_templates\"); '
+     f'dst = os.path.expanduser(\"{SKY_TEMPLATES_DIRECTORY}\"); '
      'print(f\"Copying templates from {src} to {dst}...\"); '
-     'shutil.copytree(src, dst, dirs_exist_ok=True); '
+     'shutil.copytree(src, dst); '
      'print(f\"Templates copied successfully\")\'; '
      # Make scripts executable.
-     'find ~/sky_templates -type f ! -name "*.py" ! -name "*.md" '
-     '-exec chmod +x {} \\; ')
+     f'find {SKY_TEMPLATES_DIRECTORY} -type f ! -name "*.py" ! -name "*.md" '
+     '-exec chmod +x {} + ; ')

  SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
      f'{SKY_UV_INSTALL_CMD};'
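For readability, here is the inline `python -c` payload from COPY_SKYPILOT_TEMPLATES_COMMANDS above, unrolled into a standalone script (same statements as the diff; it assumes a node where the sky_templates wheel is installed). Note also that find's `-exec ... {} +` batches all matched files into a single chmod invocation, where the previous `{} \;` forked chmod once per file.

import os
import shutil

import sky_templates

src = os.path.dirname(sky_templates.__file__)
dst = os.path.expanduser('~/sky_templates')  # SKY_TEMPLATES_DIRECTORY
print(f'Copying templates from {src} to {dst}...')
# dst was already removed with `rm -rf`, so a plain copytree suffices and
# now fails loudly instead of silently copying a package onto itself.
shutil.copytree(src, dst)
print('Templates copied successfully')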
@@ -438,6 +462,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
      ('gcp', 'enable_gvnic'),
      ('gcp', 'enable_gpu_direct'),
      ('gcp', 'placement_policy'),
+     ('vast', 'secure_only'),
      ('active_workspace',),
  ]
  # When overriding the SkyPilot configs on the API server with the client one,
@@ -532,7 +557,7 @@ CATALOG_SCHEMA_VERSION = 'v8'
  CATALOG_DIR = '~/.sky/catalogs'
  ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
                'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
-               'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
+               'paperspace', 'primeintellect', 'do', 'nebius', 'ssh', 'slurm',
                'hyperbolic', 'seeweb', 'shadeform')
  # END constants used for service catalog.

sky/skylet/events.py CHANGED
@@ -236,7 +236,7 @@ class AutostopEvent(SkyletEvent):
                  RAY_PROVISIONER_SKYPILOT_TERMINATOR):
              logger.info('Using new provisioner to stop the cluster.')
              self._stop_cluster_with_new_provisioner(autostop_config, config,
-                                                     provider_name)
+                                                     provider_name, cloud)
              return
          logger.info('Not using new provisioner to stop the cluster. '
                      f'Cloud of this cluster: {provider_name}')
@@ -314,7 +314,8 @@ class AutostopEvent(SkyletEvent):
          raise NotImplementedError

      def _stop_cluster_with_new_provisioner(self, autostop_config,
-                                            cluster_config, provider_name):
+                                            cluster_config, provider_name,
+                                            cloud):
          # pylint: disable=import-outside-toplevel
          from sky import provision as provision_lib
          autostop_lib.set_autostopping_started()
@@ -334,8 +335,13 @@ class AutostopEvent(SkyletEvent):

          # Stop the ray autoscaler to avoid scaling up, during
          # stopping/terminating of the cluster.
-         logger.info('Stopping the ray cluster.')
-         subprocess.run(f'{constants.SKY_RAY_CMD} stop', shell=True, check=True)
+         if not cloud.uses_ray():
+             logger.info('Skipping ray stop as cloud does not use Ray.')
+         else:
+             logger.info('Stopping the ray cluster.')
+             subprocess.run(f'{constants.SKY_RAY_CMD} stop',
+                            shell=True,
+                            check=True)

          operation_fn = provision_lib.stop_instances
          if autostop_config.down:
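The `cloud.uses_ray()` gate consumed above presumably corresponds to the small `sky/clouds/cloud.py` addition in this release (+7 lines, not shown in this diff). A plausible, purely illustrative shape of that hook:

# Hypothetical sketch of uses_ray(); the real definitions live in
# sky/clouds/cloud.py and sky/clouds/slurm.py.
class Cloud:

    def uses_ray(self) -> bool:
        """Whether clusters on this cloud run the Ray runtime."""
        return True


class Slurm(Cloud):

    def uses_ray(self) -> bool:
        # Slurm clusters run tasks via the srun-based executor instead of
        # a Ray cluster, so there is no Ray process to stop on autostop.
        return False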
sky/skylet/executor/__init__.py ADDED
@@ -0,0 +1 @@
+ """Task Executors"""
sky/skylet/executor/slurm.py ADDED
@@ -0,0 +1,189 @@
+ """Slurm distributed task executor for SkyPilot.
+
+ This module is invoked on each Slurm compute node via:
+     srun python -m sky.skylet.executor.slurm --script=... --log-dir=...
+ """
+ import argparse
+ import json
+ import os
+ import pathlib
+ import socket
+ import subprocess
+ import sys
+ import time
+
+ import colorama
+
+ from sky.skylet.log_lib import run_bash_command_with_log
+
+
+ def _get_ip_address() -> str:
+     """Get the IP address of the current node."""
+     ip_result = subprocess.run(['hostname', '-I'],
+                                capture_output=True,
+                                text=True,
+                                check=False)
+     return ip_result.stdout.strip().split(
+     )[0] if ip_result.returncode == 0 else 'unknown'
+
+
+ def _get_job_node_ips() -> str:
+     """Get IPs of all nodes in the current Slurm job."""
+     nodelist = os.environ.get('SLURM_JOB_NODELIST', '')
+     assert nodelist, 'SLURM_JOB_NODELIST is not set'
+
+     # Expand compressed nodelist (e.g., "node[1-3,5]"
+     # -> "node1\nnode2\nnode3\nnode5")
+     result = subprocess.run(['scontrol', 'show', 'hostnames', nodelist],
+                             capture_output=True,
+                             text=True,
+                             check=False)
+     if result.returncode != 0:
+         raise RuntimeError(f'Failed to get hostnames for: {nodelist}')
+
+     hostnames = result.stdout.strip().split('\n')
+     ips = []
+     for hostname in hostnames:
+         try:
+             ip = socket.gethostbyname(hostname)
+             ips.append(ip)
+         except socket.gaierror as e:
+             raise RuntimeError('Failed to get IP for hostname: '
+                                f'{hostname}') from e
+
+     return '\n'.join(ips)
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description='SkyPilot Slurm task runner for distributed execution')
+     parser.add_argument('--script', help='User script (inline, shell-quoted)')
+     parser.add_argument('--script-path',
+                         help='Path to script file (if too long for inline)')
+     parser.add_argument('--env-vars',
+                         default='{}',
+                         help='JSON-encoded environment variables')
+     parser.add_argument('--log-dir',
+                         required=True,
+                         help='Directory for log files')
+     parser.add_argument('--cluster-num-nodes',
+                         type=int,
+                         required=True,
+                         help='Total number of nodes in the cluster')
+     parser.add_argument('--cluster-ips',
+                         required=True,
+                         help='Comma-separated list of cluster node IPs')
+     parser.add_argument('--task-name',
+                         default=None,
+                         help='Task name for single-node log prefix')
+     parser.add_argument(
+         '--is-setup',
+         action='store_true',
+         help=
+         'Whether this is a setup command (affects logging prefix and filename)')
+     parser.add_argument('--alloc-signal-file',
+                         help='Path to allocation signal file')
+     parser.add_argument('--setup-done-signal-file',
+                         help='Path to setup-done signal file')
+     args = parser.parse_args()
+
+     assert args.script is not None or args.script_path is not None, (
+         'Either '
+         '--script or --script-path must be provided')
+
+     # Task rank, different from index of the node in the cluster.
+     rank = int(os.environ['SLURM_PROCID'])
+     num_nodes = int(os.environ.get('SLURM_NNODES', 1))
+     is_single_node_cluster = (args.cluster_num_nodes == 1)
+
+     # Determine node index from IP (like Ray's cluster_ips_to_node_id)
+     cluster_ips = args.cluster_ips.split(',')
+     ip_addr = _get_ip_address()
+     try:
+         node_idx = cluster_ips.index(ip_addr)
+     except ValueError as e:
+         raise RuntimeError(f'IP address {ip_addr} not found in '
+                            f'cluster IPs: {cluster_ips}') from e
+     node_name = 'head' if node_idx == 0 else f'worker{node_idx}'
+
+     # Log files are written to a shared filesystem, so each node must use a
+     # unique filename to avoid collisions.
+     if args.is_setup:
+         # TODO(kevin): This is inconsistent with other clouds, where it is
+         # simply called 'setup.log'. On Slurm that is obviously not possible,
+         # since the ~/sky_logs directory is shared by all nodes, so
+         # 'setup.log' will be overwritten by other nodes.
+         # Perhaps we should apply this naming convention to other clouds.
+         log_filename = f'setup-{node_name}.log'
+     elif is_single_node_cluster:
+         log_filename = 'run.log'
+     else:
+         log_filename = f'{rank}-{node_name}.log'
+     log_path = os.path.join(args.log_dir, log_filename)
+
+     if args.script_path:
+         with open(args.script_path, 'r', encoding='utf-8') as f:
+             script = f.read()
+     else:
+         script = args.script
+
+     # Parse env vars and add SKYPILOT environment variables
+     env_vars = json.loads(args.env_vars)
+     if not args.is_setup:
+         # For setup, env vars are set in CloudVmRayBackend._setup.
+         env_vars['SKYPILOT_NODE_RANK'] = str(rank)
+         env_vars['SKYPILOT_NUM_NODES'] = str(num_nodes)
+         env_vars['SKYPILOT_NODE_IPS'] = _get_job_node_ips()
+
+     # Signal file coordination for setup/run synchronization
+     # Rank 0 touches the allocation signal to indicate resources acquired
+     if args.alloc_signal_file is not None and rank == 0:
+         pathlib.Path(args.alloc_signal_file).touch()
+
+     # Wait for setup to complete.
+     while args.setup_done_signal_file is not None and not os.path.exists(
+             args.setup_done_signal_file):
+         time.sleep(0.1)
+
+     # Build log prefix
+     # For setup on head: (setup pid={pid})
+     # For setup on workers: (setup pid={pid}, ip=1.2.3.4)
+     # For single-node cluster: (task_name, pid={pid})
+     # For multi-node on head: (head, rank=0, pid={pid})
+     # For multi-node on workers: (worker1, rank=1, pid={pid}, ip=1.2.3.4)
+     # The {pid} placeholder will be replaced by run_with_log
+     if args.is_setup:
+         # Setup prefix: head (node_idx=0) shows no IP, workers show IP
+         if node_idx == 0:
+             prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}})'
+                       f'{colorama.Style.RESET_ALL} ')
+         else:
+             prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}}, ip={ip_addr})'
+                       f'{colorama.Style.RESET_ALL} ')
+     elif is_single_node_cluster:
+         # Single-node cluster: use task name
+         name_str = args.task_name if args.task_name else 'task'
+         prefix = (f'{colorama.Fore.CYAN}({name_str}, pid={{pid}})'
+                   f'{colorama.Style.RESET_ALL} ')
+     else:
+         # Multi-node cluster: head (node_idx=0) shows no IP, workers show IP
+         if node_idx == 0:
+             prefix = (
+                 f'{colorama.Fore.CYAN}({node_name}, rank={rank}, pid={{pid}})'
+                 f'{colorama.Style.RESET_ALL} ')
+         else:
+             prefix = (f'{colorama.Fore.CYAN}'
+                       f'({node_name}, rank={rank}, pid={{pid}}, ip={ip_addr})'
+                       f'{colorama.Style.RESET_ALL} ')
+
+     returncode = run_bash_command_with_log(script,
+                                            log_path,
+                                            env_vars=env_vars,
+                                            stream_logs=True,
+                                            streaming_prefix=prefix)
+
+     sys.exit(returncode)
+
+
+ if __name__ == '__main__':
+     main()
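For orientation, a hedged sketch of how a driver might assemble the srun invocation for this module. Only the module path and flag names are taken from the file above; the srun options, log directory, and IPs are illustrative assumptions, not copied from sky/backends/task_codegen.py:

# Illustrative driver-side construction of the executor command.
import json
import shlex

script = 'echo "hello from rank $SKYPILOT_NODE_RANK"'
cmd = (
    'srun --ntasks-per-node=1 '  # one executor task per node (assumption)
    'python -m sky.skylet.executor.slurm '
    f'--script={shlex.quote(script)} '
    f'--env-vars={shlex.quote(json.dumps({"FOO": "bar"}))} '
    '--log-dir=~/sky_logs/sky-2025-12-10-00-00-00-000000 '
    '--cluster-num-nodes=2 '
    '--cluster-ips=10.0.0.1,10.0.0.2')
print(cmd)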
sky/skylet/job_lib.py CHANGED
@@ -1273,4 +1273,5 @@ class JobLibCodeGen:
      def _build(cls, code: List[str]) -> str:
          code = cls._PREFIX + code
          code = ';'.join(code)
-         return f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}'
+         return (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
+                 f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}')
sky/skylet/log_lib.py CHANGED
@@ -172,7 +172,7 @@ def run_with_log(
      streaming_prefix: Optional[str] = None,
      log_cmd: bool = False,
      **kwargs,
- ) -> Union[int, Tuple[int, str, str]]:
+ ) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
      """Runs a command and logs its output to a file.

      Args:
@@ -183,6 +183,8 @@ def run_with_log(
          process_stream: Whether to post-process the stdout/stderr of the
              command, such as replacing or skipping lines on the fly. If
              enabled, lines are printed only when '\r' or '\n' is found.
+         streaming_prefix: Optional prefix for each log line. Can contain {pid}
+             placeholder which will be replaced with the subprocess PID.

      Returns the returncode or returncode, stdout and stderr of the command.
      Note that the stdout and stderr is already decoded.
@@ -228,6 +230,13 @@ def run_with_log(
              # For backward compatibility, do not specify use_kill_pg by
              # default.
              subprocess_utils.kill_process_daemon(proc.pid)
+
+         # Format streaming_prefix with subprocess PID if it contains {pid}
+         formatted_streaming_prefix = streaming_prefix
+         if streaming_prefix and '{pid}' in streaming_prefix:
+             formatted_streaming_prefix = streaming_prefix.format(
+                 pid=proc.pid)
+
          stdout = ''
          stderr = ''
          stdout_stream_handler = None
@@ -256,7 +265,7 @@ def run_with_log(
                  line_processor=line_processor,
                  # Replace CRLF when the output is logged to driver by ray.
                  replace_crlf=with_ray,
-                 streaming_prefix=streaming_prefix,
+                 streaming_prefix=formatted_streaming_prefix,
              )
              stdout_stream_handler = functools.partial(
                  _handle_io_stream,
@@ -349,7 +358,8 @@ def run_bash_command_with_log(bash_command: str,
                                log_path: str,
                                env_vars: Optional[Dict[str, str]] = None,
                                stream_logs: bool = False,
-                               with_ray: bool = False):
+                               with_ray: bool = False,
+                               streaming_prefix: Optional[str] = None):
      with tempfile.NamedTemporaryFile('w', prefix='sky_app_',
                                       delete=False) as fp:
          bash_command = make_task_bash_script(bash_command, env_vars=env_vars)
@@ -364,6 +374,7 @@ def run_bash_command_with_log(bash_command: str,
          log_path,
          stream_logs=stream_logs,
          with_ray=with_ray,
+         streaming_prefix=streaming_prefix,
          shell=True)


@@ -372,9 +383,14 @@ def run_bash_command_with_log_and_return_pid(
          log_path: str,
          env_vars: Optional[Dict[str, str]] = None,
          stream_logs: bool = False,
-         with_ray: bool = False):
-     return_code = run_bash_command_with_log(bash_command, log_path, env_vars,
-                                             stream_logs, with_ray)
+         with_ray: bool = False,
+         streaming_prefix: Optional[str] = None):
+     return_code = run_bash_command_with_log(bash_command,
+                                             log_path,
+                                             env_vars,
+                                             stream_logs,
+                                             with_ray,
+                                             streaming_prefix=streaming_prefix)
      return {'return_code': return_code, 'pid': os.getpid()}
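A hedged usage sketch of the new streaming_prefix parameter (function name, parameters, and {pid} substitution are from the diff above; the command, log path, and prefix text are illustrative):

from sky.skylet import log_lib

# Each streamed line gets the prefix, with {pid} substituted by run_with_log
# once the subprocess PID is known.
rc = log_lib.run_bash_command_with_log(
    'echo hello',
    log_path='/tmp/run.log',
    stream_logs=True,
    streaming_prefix='(worker1, rank=1, pid={pid}) ',
)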
sky/skylet/log_lib.pyi CHANGED
@@ -68,7 +68,7 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-                 ray_job_id: Optional[str] = ...,
+                 log_cmd: bool = ...,
                  **kwargs) -> int:
      ...

@@ -87,7 +87,7 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-                 ray_job_id: Optional[str] = ...,
+                 log_cmd: bool = ...,
                  **kwargs) -> Tuple[int, str, str]:
      ...

@@ -106,8 +106,8 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-                 ray_job_id: Optional[str] = ...,
-                 **kwargs) -> Union[int, Tuple[int, str, str]]:
+                 log_cmd: bool = ...,
+                 **kwargs) -> Tuple[int, int]:
      ...


@@ -125,7 +125,8 @@ def run_bash_command_with_log(bash_command: str,
                               log_path: str,
                               env_vars: Optional[Dict[str, str]] = ...,
                               stream_logs: bool = ...,
-                              with_ray: bool = ...):
+                              with_ray: bool = ...,
+                              streaming_prefix: Optional[str] = ...) -> int:
      ...


@@ -134,7 +135,8 @@ def run_bash_command_with_log_and_return_pid(
          log_path: str,
          env_vars: Optional[Dict[str, str]] = ...,
          stream_logs: bool = ...,
-         with_ray: bool = ...):
+         with_ray: bool = ...,
+         streaming_prefix: Optional[str] = ...) -> Dict[str, Union[int, str]]:
      ...

sky/skylet/skylet.py CHANGED
@@ -48,8 +48,12 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
      # putting it here for visibility.
      # TODO(kevin): Determine the optimal max number of threads.
      max_workers = min(32, (os.cpu_count() or 1) + 4)
+     # There's only a single skylet process per cluster, so disable
+     # SO_REUSEPORT to raise an error if the port is already in use.
+     options = (('grpc.so_reuseport', 0),)
      server = grpc.server(
-         concurrent.futures.ThreadPoolExecutor(max_workers=max_workers))
+         concurrent.futures.ThreadPoolExecutor(max_workers=max_workers),
+         options=options)

      autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
          services.AutostopServiceImpl(), server)
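A small standalone sketch of the behavior this option buys (not from the package): with SO_REUSEPORT disabled, a second bind to the same port fails instead of silently sharing it. In gRPC's Python API, add_insecure_port returns the bound port, or 0 on failure:

from concurrent import futures

import grpc

opts = (('grpc.so_reuseport', 0),)
s1 = grpc.server(futures.ThreadPoolExecutor(max_workers=1), options=opts)
assert s1.add_insecure_port('127.0.0.1:50051') == 50051
s2 = grpc.server(futures.ThreadPoolExecutor(max_workers=1), options=opts)
assert s2.add_insecure_port('127.0.0.1:50051') == 0  # bind fails, returns 0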
sky/skylet/subprocess_daemon.py CHANGED
@@ -110,7 +110,8 @@ def kill_process_tree(process: psutil.Process,


  def main():
-     # daemonize()
+     daemonize()
+
      parser = argparse.ArgumentParser()
      parser.add_argument('--parent-pid', type=int, required=True)
      parser.add_argument('--proc-pid', type=int, required=True)
sky/ssh_node_pools/constants.py ADDED
@@ -0,0 +1,12 @@
+ """Constants for SSH Node Pools"""
+ # pylint: disable=line-too-long
+ import os
+
+ DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
+ SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
+ NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
+ NODE_POOLS_KEY_DIR = os.path.expanduser('~/.sky/ssh_keys')
+ DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
+
+ # TODO (kyuds): make this configurable?
+ K3S_TOKEN = 'mytoken'  # Any string can be used as the token
sky/ssh_node_pools/core.py CHANGED
@@ -1,10 +1,15 @@
  """SSH Node Pool management core functionality."""
  import os
  from pathlib import Path
- from typing import Any, Dict, List
+ from typing import Any, Dict, List, Optional, Tuple

  import yaml

+ from sky import clouds
+ from sky.ssh_node_pools import constants
+ from sky.ssh_node_pools import deploy
+ from sky.usage import usage_lib
+ from sky.utils import common_utils
  from sky.utils import yaml_utils


@@ -12,8 +17,8 @@ class SSHNodePoolManager:
      """Manager for SSH Node Pool configurations."""

      def __init__(self):
-         self.config_path = Path.home() / '.sky' / 'ssh_node_pools.yaml'
-         self.keys_dir = Path.home() / '.sky' / 'ssh_keys'
+         self.config_path = Path(constants.DEFAULT_SSH_NODE_POOLS_PATH)
+         self.keys_dir = Path(constants.NODE_POOLS_KEY_DIR)
          self.keys_dir.mkdir(parents=True, exist_ok=True)

      def get_all_pools(self) -> Dict[str, Any]:
@@ -133,3 +138,35 @@ def list_ssh_keys() -> List[str]:
      """List available SSH keys."""
      manager = SSHNodePoolManager()
      return manager.list_ssh_keys()
+
+
+ @usage_lib.entrypoint
+ def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
+     """Deploys or tears down a Kubernetes cluster on SSH targets.
+
+     Args:
+         infra: Name of the cluster configuration in ssh_node_pools.yaml.
+             If None, the first cluster in the file is used.
+         cleanup: If True, clean up the cluster instead of deploying.
+     """
+     deploy.run(cleanup=cleanup, infra=infra)
+
+
+ @usage_lib.entrypoint
+ def ssh_status(context_name: str) -> Tuple[bool, str]:
+     """Check the status of an SSH Node Pool context.
+
+     Args:
+         context_name: The SSH context name (e.g., 'ssh-my-cluster')
+
+     Returns:
+         Tuple[bool, str]: (is_ready, reason)
+         - is_ready: True if the SSH Node Pool is ready, False otherwise
+         - reason: Explanation of the status
+     """
+     try:
+         is_ready, reason = clouds.SSH.check_single_context(context_name)
+         return is_ready, reason
+     except Exception as e:  # pylint: disable=broad-except
+         return False, ('Failed to check SSH context: '
+                        f'{common_utils.format_exception(e)}')
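An illustrative call into the new core API (function names and signatures are from the diff above; the pool and context names are made up):

from sky.ssh_node_pools import core

# Deploy a Kubernetes cluster on the hosts of a configured SSH Node Pool.
core.ssh_up(infra='my-cluster')

# Check readiness of the resulting SSH context.
ready, reason = core.ssh_status('ssh-my-cluster')
print(ready, reason)

# Tear the cluster back down.
core.ssh_up(infra='my-cluster', cleanup=True)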
sky/ssh_node_pools/deploy/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """Module for Deploying SSH Node Pools"""
+ from sky.ssh_node_pools.deploy.deploy import run
+
+ __all__ = ['run']