skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/setup_files/MANIFEST.in CHANGED
@@ -15,6 +15,7 @@ include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
 include sky/templates/*
 include sky/utils/kubernetes/*
+include sky/ssh_node_pools/deploy/tunnel/*
 include sky/server/html/*
 recursive-include sky/dashboard/out *
 include sky/users/*.conf
sky/setup_files/dependencies.py CHANGED
@@ -144,9 +144,11 @@ aws_dependencies = [
     'awscli>=1.27.10',
     'botocore>=1.29.10',
     'boto3>=1.26.1',
-    # NOTE: required by awscli. To avoid ray automatically installing
-    # the latest version.
-    'colorama < 0.4.5',
+    # NOTE: colorama is a dependency of awscli. We pin it to match the
+    # version constraint in awscli (<0.4.7) to prevent potential conflicts
+    # with other packages like ray, which might otherwise install a newer
+    # version.
+    'colorama<0.4.7',
 ]
 
 # Kubernetes 32.0.0 has an authentication bug:
@@ -204,12 +206,21 @@ cloud_dependencies: Dict[str, List[str]] = {
     'ssh': kubernetes_dependencies,
     # For the container registry auth api. Reference:
     # https://github.com/runpod/runpod-python/releases/tag/1.6.1
-    # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
-    # stdlib provides tomllib; on lower versions we depend on tomli explicitly.
-    # Instead of installing tomli conditionally, we install it explicitly.
-    # This is because the conditional installation of tomli does not work
-    # with controller package installation code.
-    'runpod': ['runpod>=1.6.1', 'tomli'],
+    'runpod': [
+        # For the container registry auth api. Reference:
+        # https://github.com/runpod/runpod-python/releases/tag/1.6.1
+        'runpod>=1.6.1',
+        # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python
+        # 3.11+ stdlib provides tomllib; on lower versions we depend on tomli
+        # explicitly. Instead of installing tomli conditionally, we install it
+        # explicitly. This is because the conditional installation of tomli
+        # does not work with controller package installation code.
+        'tomli',
+        # runpod installs aiodns (via aiohttp[speedups]), which is incompatible
+        # with pycares 5.0.0 due to deprecations.
+        # See https://github.com/aio-libs/aiodns/issues/214
+        'pycares<5',
+    ],
     'fluidstack': [],  # No dependencies needed for fluidstack
     'cudo': ['cudo-compute>=0.1.10'],
     'paperspace': [],  # No dependencies needed for paperspace
@@ -235,7 +246,7 @@ cloud_dependencies: Dict[str, List[str]] = {
     'hyperbolic': [],  # No dependencies needed for hyperbolic
     'seeweb': ['ecsapi==0.4.0'],
     'shadeform': [],  # No dependencies needed for shadeform
-    'slurm': [],  # No dependencies needed for slurm
+    'slurm': ['python-hostlist'],
 }
 
 # Calculate which clouds should be included in the [all] installation.
sky/sky_logging.py CHANGED
@@ -15,7 +15,8 @@ from sky.utils import env_options
 from sky.utils import rich_utils
 
 # UX: Should we show logging prefixes and some extra information in optimizer?
-_FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
+_FORMAT = ('%(levelname).1s %(asctime)s.%(msecs)03d PID=%(process)d '
+           '%(filename)s:%(lineno)d] %(message)s')
 _DATE_FORMAT = '%m-%d %H:%M:%S'
 _SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
 
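For reference, a standalone sketch (not part of the diff; the file name and message are made up) showing what the new format produces, with milliseconds and the emitting PID added:

import logging

_FORMAT = ('%(levelname).1s %(asctime)s.%(msecs)03d PID=%(process)d '
           '%(filename)s:%(lineno)d] %(message)s')
_DATE_FORMAT = '%m-%d %H:%M:%S'

logging.basicConfig(format=_FORMAT, datefmt=_DATE_FORMAT, level=logging.INFO)
logging.info('provisioning started')
# Example output (PID and timestamp will vary):
# I 01-12 14:03:07.512 PID=4242 demo.py:9] provisioning started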
sky/skylet/constants.py CHANGED
@@ -20,6 +20,7 @@ SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
 # os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
 # '.sky/jobs.db')
 SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
+SKY_CLUSTER_NAME_ENV_VAR_KEY = 'SKY_CLUSTER_NAME'
 # We keep sky_logs and sky_workdir in $HOME, because
 # these are artifacts that users can access, and having
 # them be in $HOME makes it more convenient.
@@ -46,7 +47,19 @@ SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
 SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
 SKY_REMOTE_RAY_VERSION = '2.9.3'
 
-SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
+# To avoid the user image causing issues with the SkyPilot runtime, we run
+# SkyPilot commands with the following prefix:
+# 1. env -u PYTHONPATH: unset PYTHONPATH to avoid any package specified in
+# PYTHONPATH interfering with the SkyPilot runtime.
+# 2. env -C $HOME: set the execution directory to $HOME to avoid the case when
+# a user's WORKDIR in Dockerfile is a Python site-packages directory. Python
+# adds CWD to the beginning of sys.path, so if WORKDIR contains packages (e.g.,
+# compiled for a different Python version), imports will fail with errors like
+# "ModuleNotFoundError: No module named 'rpds.rpds'".
+#
+# TODO(zhwu): Switch -C $HOME to PYTHONSAFEPATH=1, once we move our runtime to
+# Python 3.11 for a more robust setup.
+SKY_UNSET_PYTHONPATH_AND_SET_CWD = 'env -u PYTHONPATH -C $HOME'
 # We store the absolute path of the python executable (/opt/conda/bin/python3)
 # in this file, so that any future internal commands that need to use python
 # can use this path. This is useful for the case where the user has a custom
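A minimal sketch (not SkyPilot code; assumes GNU coreutils `env`, which has supported -C/--chdir since 8.28) of what this prefix achieves: PYTHONPATH is unset and the interpreter starts from a neutral directory, so the CWD entry that Python prepends to sys.path cannot shadow the real site-packages:

import subprocess

# Launch python3 with PYTHONPATH unset and the working directory forced to
# /tmp (a stand-in for $HOME), mirroring 'env -u PYTHONPATH -C $HOME'.
result = subprocess.run(
    ['env', '-u', 'PYTHONPATH', '-C', '/tmp', 'python3', '-c',
     'import os, sys; print(os.getcwd(), sys.path[:2])'],
    capture_output=True, text=True, check=True)
print(result.stdout)  # CWD is /tmp, so a stale WORKDIR never leads sys.path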
@@ -58,7 +71,8 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
                            f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
                            'which python3')
 # Python executable, e.g., /opt/conda/bin/python3
-SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
+SKY_PYTHON_CMD = (f'{SKY_UNSET_PYTHONPATH_AND_SET_CWD} '
+                  f'$({SKY_GET_PYTHON_PATH_CMD})')
 # Prefer SKY_UV_PIP_CMD, which is faster.
 # TODO(cooperc): remove remaining usage (GCP TPU setup).
 SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -91,7 +105,7 @@ SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
 # user provided docker image set it to true.
 # unset PYTHONPATH in case the user provided docker image set it.
 SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
-              f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
+              f'{SKY_UNSET_PYTHONPATH_AND_SET_CWD} {SKY_UV_INSTALL_DIR}/uv')
 # This won't reinstall uv if it's already installed, so it's safe to re-run.
 SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
                       'curl -LsSf https://astral.sh/uv/install.sh '
@@ -130,7 +144,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '27'
+SKYLET_VERSION = '29'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
@@ -462,7 +476,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('gcp', 'enable_gvnic'),
     ('gcp', 'enable_gpu_direct'),
     ('gcp', 'placement_policy'),
-    ('vast', 'secure_only'),
+    ('vast', 'datacenter_only'),
     ('active_workspace',),
 ]
 # When overriding the SkyPilot configs on the API server with the client one,
@@ -523,6 +537,9 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
 OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
 IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
+# Environment variable that is set to 'true' if rolling update strategy is
+# enabled for the API server deployment.
+SKYPILOT_ROLLING_UPDATE_ENABLED = 'SKYPILOT_ROLLING_UPDATE_ENABLED'
 
 SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
     f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
sky/skylet/executor/slurm.py CHANGED
@@ -19,12 +19,10 @@ from sky.skylet.log_lib import run_bash_command_with_log
 
 def _get_ip_address() -> str:
     """Get the IP address of the current node."""
-    ip_result = subprocess.run(['hostname', '-I'],
-                               capture_output=True,
-                               text=True,
-                               check=False)
-    return ip_result.stdout.strip().split(
-    )[0] if ip_result.returncode == 0 else 'unknown'
+    # Use socket.gethostbyname to be consistent with _get_job_node_ips(),
+    # which resolves hostnames the same way. Using `hostname -I` can return
+    # Docker bridge IPs (172.17.x.x) first, causing IP mismatch errors.
+    return socket.gethostbyname(socket.gethostname())
 
 
 def _get_job_node_ips() -> str:
sky/skylet/job_lib.py CHANGED
@@ -66,6 +66,7 @@ class JobInfoLoc(enum.IntEnum):
     PID = 9
     LOG_PATH = 10
     METADATA = 11
+    EXIT_CODES = 12
 
 
 def create_table(cursor, conn):
@@ -124,6 +125,8 @@ def create_table(cursor, conn):
                                  'metadata',
                                  'TEXT DEFAULT \'{}\'',
                                  value_to_replace_existing_entries='{}')
+    db_utils.add_column_to_table(cursor, conn, 'jobs', 'exit_codes',
+                                 'TEXT DEFAULT NULL')
    conn.commit()
 
 
@@ -388,10 +391,16 @@ def add_job(job_name: str,
     assert _DB is not None
     job_submitted_at = time.time()
     # job_id will autoincrement with the null value
-    _DB.cursor.execute(
-        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
-        (job_name, username, job_submitted_at, JobStatus.INIT.value,
-         run_timestamp, None, resources_str, metadata))
+    if int(constants.SKYLET_VERSION) >= 28:
+        _DB.cursor.execute(
+            'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?, null)',  # pylint: disable=line-too-long
+            (job_name, username, job_submitted_at, JobStatus.INIT.value,
+             run_timestamp, None, resources_str, metadata))
+    else:
+        _DB.cursor.execute(
+            'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',  # pylint: disable=line-too-long
+            (job_name, username, job_submitted_at, JobStatus.INIT.value,
+             run_timestamp, None, resources_str, metadata))
     _DB.conn.commit()
     rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
                               (run_timestamp,))
@@ -468,6 +477,41 @@ def set_status(job_id: int, status: JobStatus) -> None:
         _set_status_no_lock(job_id, status)
 
 
+@init_db
+def set_exit_codes(job_id: int, exit_codes: List[int]) -> None:
+    """Set exit codes for a job as comma-separated string.
+
+    Args:
+        job_id: The job ID to update.
+        exit_codes: A list of exit codes to store.
+    """
+    assert _DB is not None
+    exit_codes_str = ','.join(str(code) for code in exit_codes)
+    with filelock.FileLock(_get_lock_path(job_id)):
+        _DB.cursor.execute('UPDATE jobs SET exit_codes=(?) WHERE job_id=(?)',
+                           (exit_codes_str, job_id))
+        _DB.conn.commit()
+
+
+@init_db
+def get_exit_codes(job_id: int) -> Optional[List[int]]:
+    """Get exit codes for a job from comma-separated string.
+
+    Args:
+        job_id: The job ID to retrieve exit codes for.
+
+    Returns:
+        A list of exit codes, or None if not found.
+    """
+    assert _DB is not None
+    rows = _DB.cursor.execute('SELECT exit_codes FROM jobs WHERE job_id=(?)',
+                              (job_id,))
+    row = rows.fetchone()
+    if row is None or row[0] is None:
+        return None
+    return [int(code) for code in row[0].split(',')]
+
+
 @init_db
 def set_job_started(job_id: int) -> None:
     # TODO(mraheja): remove pylint disabling when filelock version updated.
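The storage format added above is a plain comma-separated TEXT column; a quick sketch of the round-trip performed by set_exit_codes/get_exit_codes:

# Encode a list of exit codes for storage, then decode it back.
exit_codes = [0, 137, 1]
encoded = ','.join(str(code) for code in exit_codes)   # -> '0,137,1'
decoded = [int(code) for code in encoded.split(',')]   # -> [0, 137, 1]
assert decoded == exit_codes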
@@ -506,6 +550,20 @@ def get_status(job_id: int) -> Optional[JobStatus]:
         return get_status_no_lock(job_id)
 
 
+def wait_for_job_completion(job_id: int, poll_interval: float = 1.0) -> None:
+    """Wait for a job to reach a terminal state.
+
+    Args:
+        job_id: The job ID to wait for.
+        poll_interval: How often to poll the job status in seconds.
+    """
+    while True:
+        status = get_status(job_id)
+        if status is None or status.is_terminal():
+            break
+        time.sleep(poll_interval)
+
+
 @init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
     return message_utils.encode_payload(get_statuses(job_ids))
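Usage is straightforward; a small sketch (hypothetical job id, assuming an initialized jobs DB on the cluster) of blocking on the new helper:

# Block until job 7 reaches a terminal state, polling every 2 seconds.
job_lib.wait_for_job_completion(7, poll_interval=2.0)
print(f'Job 7 finished with status: {job_lib.get_status(7)}')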
@@ -674,6 +732,14 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
             'pid': row[JobInfoLoc.PID.value],
             'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
         })
+        if int(constants.SKYLET_VERSION) >= 28:
+            exit_code_str = row[JobInfoLoc.EXIT_CODES.value]
+            if not isinstance(exit_code_str, str):
+                records[-1]['exit_codes'] = None
+            else:
+                records[-1]['exit_codes'] = ([
+                    int(code) for code in exit_code_str.split(',')
+                ])
     return records
 
 
@@ -1152,6 +1218,15 @@ class JobLibCodeGen:
         ]
         return cls._build(code)
 
+    @classmethod
+    def wait_for_job(cls, job_id: int) -> str:
+        code = [
+            # TODO(kevin): backward compatibility, remove in 0.13.0.
+            (f'job_lib.wait_for_job_completion({job_id!r}) if '
+             'hasattr(job_lib, "wait_for_job_completion") else None'),
+        ]
+        return cls._build(code)
+
     @classmethod
     def update_status(cls) -> str:
         code = ['job_lib.update_status()']
@@ -1269,6 +1344,16 @@ class JobLibCodeGen:
         ]
         return cls._build(code)
 
+    @classmethod
+    def get_job_exit_codes(cls, job_id: Optional[int] = None) -> str:
+        """Generate shell command to retrieve exit codes."""
+        code = [
+            f'job_id = {job_id} if {job_id} is not None else job_lib.get_latest_job_id()',  # pylint: disable=line-too-long
+            'exit_codes = job_lib.get_exit_codes(job_id) if job_id is not None and int(constants.SKYLET_VERSION) >= 28 else {}',  # pylint: disable=line-too-long
+            'print(exit_codes, flush=True)',
+        ]
+        return cls._build(code)
+
     @classmethod
     def _build(cls, code: List[str]) -> str:
         code = cls._PREFIX + code
sky/skylet/services.py CHANGED
@@ -197,12 +197,11 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
             f.write(request.codegen)
         os.chmod(script_path, 0o755)
 
-        cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
         job_submit_cmd = (
             # JOB_CMD_IDENTIFIER is used for identifying the process
             # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
-            f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
+            f'{constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f' > {remote_log_path} 2>&1')
@@ -387,6 +386,21 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
         except Exception as e:  # pylint: disable=broad-except
             context.abort(grpc.StatusCode.INTERNAL, str(e))
 
+    def GetJobExitCodes(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobExitCodesRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.GetJobExitCodesResponse:
+        try:
+            job_id = request.job_id if request.HasField(
+                'job_id') else job_lib.get_latest_job_id()
+            exit_codes: Optional[List[int]] = None
+            if job_id:
+                exit_codes_list = job_lib.get_exit_codes(job_id)
+                exit_codes = exit_codes_list if exit_codes_list else []
+            return jobsv1_pb2.GetJobExitCodesResponse(exit_codes=exit_codes)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
 
 class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                             ):
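For context, a hedged sketch of calling the new RPC from a client. The endpoint address is hypothetical, and JobsServiceStub is the stub class that gRPC codegen pairs with the JobsServiceServicer above:

import grpc

from sky.schemas.generated import jobsv1_pb2
from sky.schemas.generated import jobsv1_pb2_grpc

channel = grpc.insecure_channel('localhost:50051')  # hypothetical endpoint
stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
# job_id is optional; omitting it falls back to the latest job server-side.
response = stub.GetJobExitCodes(jobsv1_pb2.GetJobExitCodesRequest(job_id=42))
print(list(response.exit_codes))  # e.g., [0] for a successful job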
@@ -488,7 +502,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                 entrypoint=job.get('entrypoint'),
                 metadata=converted_metadata,
                 pool=job.get('pool'),
-                pool_hash=job.get('pool_hash'))
+                pool_hash=job.get('pool_hash'),
+                links=job.get('links'))
             jobs_info.append(job_info)
 
         return managed_jobsv1_pb2.GetJobTableResponse(
sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh ADDED
@@ -0,0 +1,62 @@
+#!/bin/bash
+# cleanup-tunnel.sh - Script to clean up SSH tunnels for a Kubernetes context
+
+# Usage: cleanup-tunnel.sh CONTEXT_NAME
+
+CONTEXT="${1:-default}"
+TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
+PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
+LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
+LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
+
+# Get the port from kubeconfig if available
+KUBE_PORT=$(kubectl config view --minify --context="$CONTEXT" -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null | grep -o ":[0-9]\+" | tr -d ":" || echo "")
+
+if [[ -z "$KUBE_PORT" ]]; then
+  # Default to 6443 if we can't determine the port
+  KUBE_PORT=6443
+  echo "$(date): Could not determine port from kubeconfig, using default port $KUBE_PORT" >> "$LOG_FILE"
+else
+  echo "$(date): Found port $KUBE_PORT in kubeconfig for context $CONTEXT" >> "$LOG_FILE"
+fi
+
+# Check if PID file exists
+if [[ -f "$PID_FILE" ]]; then
+  OLD_PID=$(cat "$PID_FILE")
+
+  # Log the cleanup attempt
+  echo "$(date): Attempting to clean up tunnel for context $CONTEXT (PID: $OLD_PID, Port: $KUBE_PORT)" >> "$LOG_FILE"
+
+  # Try to kill the process
+  if kill -0 "$OLD_PID" 2>/dev/null; then
+    # Process exists, kill it
+    kill "$OLD_PID" 2>/dev/null
+
+    # Wait a moment and check if it's really gone
+    sleep 1
+    if kill -0 "$OLD_PID" 2>/dev/null; then
+      # Still running, force kill
+      kill -9 "$OLD_PID" 2>/dev/null
+      echo "$(date): Forcefully terminated tunnel process $OLD_PID" >> "$LOG_FILE"
+    else
+      echo "$(date): Successfully terminated tunnel process $OLD_PID" >> "$LOG_FILE"
+    fi
+  else
+    echo "$(date): No running process found with PID $OLD_PID" >> "$LOG_FILE"
+  fi
+
+  # Remove PID file
+  rm -f "$PID_FILE"
+else
+  echo "$(date): No PID file found for context $CONTEXT. Nothing to clean up." >> "$LOG_FILE"
+fi
+
+# Clean up lock file if it exists
+rm -f "$LOCK_FILE"
+
+# Check if port is still in use
+if nc -z localhost "$KUBE_PORT" 2>/dev/null; then
+  echo "$(date): Warning: Port $KUBE_PORT is still in use after cleanup. Another process might be using it." >> "$LOG_FILE"
+fi
+
+echo "$(date): Cleanup complete for context $CONTEXT" >> "$LOG_FILE"