skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/backends/task_codegen.py CHANGED
@@ -4,6 +4,7 @@ import copy
  import inspect
  import json
  import math
+ import os
  import textwrap
  from typing import Dict, List, Optional, Tuple
 
@@ -146,6 +147,7 @@ class TaskCodeGen:
  if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
  [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
  [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
+ FLUSH_START_TIME=$(date +%s)
  flushed=0
  # extra second on top of --vfs-cache-poll-interval to
  # avoid race condition between rclone log line creation and this check.
@@ -158,13 +160,32 @@ class TaskCodeGen:
  exitcode=0
  tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
  if [ $exitcode -ne 0 ]; then
- echo "skypilot: cached mount is still uploading to remote"
+ ELAPSED=$(($(date +%s) - FLUSH_START_TIME))
+ # Extract the last vfs cache status line to show what we're waiting for
+ CACHE_STATUS=$(tac $file | grep "vfs cache: cleaned:" -m 1 | sed 's/.*vfs cache: cleaned: //' 2>/dev/null)
+ # Extract currently uploading files from recent log lines (show up to 2 files)
+ UPLOADING_FILES=$(tac $file | head -30 | grep -E "queuing for upload" | head -2 | sed 's/.*INFO : //' | sed 's/: vfs cache:.*//' | tr '\\n' ',' | sed 's/,$//' | sed 's/,/, /g' 2>/dev/null)
+ # Build status message with available info
+ if [ -n "$CACHE_STATUS" ] && [ -n "$UPLOADING_FILES" ]; then
+ echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}] uploading: ${{UPLOADING_FILES}}"
+ elif [ -n "$CACHE_STATUS" ]; then
+ echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}]"
+ else
+ # Fallback: show last non-empty line from log
+ LAST_LINE=$(tac $file | grep -v "^$" | head -1 | sed 's/.*INFO : //' | sed 's/.*ERROR : //' | sed 's/.*NOTICE: //' 2>/dev/null)
+ if [ -n "$LAST_LINE" ]; then
+ echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) ${{LAST_LINE}}"
+ else
+ echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s)"
+ fi
+ fi
  flushed=0
  break
  fi
  done
  done
- echo "skypilot: cached mount uploaded complete"
+ TOTAL_FLUSH_TIME=$(($(date +%s) - FLUSH_START_TIME))
+ echo "skypilot: cached mount upload complete (took ${{TOTAL_FLUSH_TIME}}s)"
  fi""")
 
  def add_prologue(self, job_id: int) -> None:
@@ -181,8 +202,8 @@ class TaskCodeGen:
  resources_dict: Dict[str, float],
  stable_cluster_internal_ips: List[str],
  env_vars: Dict[str, str],
+ log_dir: str,
  setup_cmd: Optional[str] = None,
- setup_log_path: Optional[str] = None,
  ) -> None:
  """Generates code to set up the task on each node.
 
@@ -213,6 +234,9 @@ class TaskCodeGen:
  self._code += [
  textwrap.dedent(f"""\
  if sum(returncodes) != 0:
+ # Save exit codes to job metadata for potential recovery logic
+ if int(constants.SKYLET_VERSION) >= 28:
+ job_lib.set_exit_codes({self.job_id!r}, returncodes)
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
  # Schedule the next pending job immediately to make the job
  # scheduling more efficient.
@@ -379,13 +403,15 @@ class RayCodeGen(TaskCodeGen):
  resources_dict: Dict[str, float],
  stable_cluster_internal_ips: List[str],
  env_vars: Dict[str, str],
+ log_dir: str,
  setup_cmd: Optional[str] = None,
- setup_log_path: Optional[str] = None,
  ) -> None:
  assert self._has_prologue, ('Call add_prologue() before '
  'add_setup().')
  self._has_setup = True
 
+ setup_log_path = os.path.join(log_dir, 'setup.log')
+
  bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
  # Set CPU to avoid ray hanging the resources allocation
  # for remote functions, since the task will request 1 CPU
@@ -480,6 +506,8 @@ class RayCodeGen(TaskCodeGen):
  msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
  msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
  print(msg, flush=True)
+ if int(constants.SKYLET_VERSION) >= 28:
+ job_lib.set_exit_codes({self.job_id!r}, setup_returncodes)
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
  # This waits for all streaming logs to finish.
  time.sleep(1)
@@ -631,3 +659,351 @@ class RayCodeGen(TaskCodeGen):
  """Generates code that waits for all tasks, then exits."""
  self._code.append('returncodes, _ = get_or_fail(futures, pg)')
  super().add_epilogue()
+
+
+ class SlurmCodeGen(TaskCodeGen):
+ """Code generator for task execution on Slurm using native srun."""
+
+ def __init__(self, slurm_job_id: str):
+ """Initialize SlurmCodeGen
+
+ Args:
+ slurm_job_id: The Slurm job ID, i.e. SLURM_JOB_ID
+ """
+ super().__init__()
+ self._slurm_job_id = slurm_job_id
+
+ def add_prologue(self, job_id: int) -> None:
+ assert not self._has_prologue, 'add_prologue() called twice?'
+ self._has_prologue = True
+ self.job_id = job_id
+
+ self._add_common_imports()
+
+ self._code.append(
+ textwrap.dedent("""\
+ import colorama
+ import copy
+ import json
+ import multiprocessing
+ import signal
+ import threading
+ from sky.backends import backend_utils
+ """))
+ self._add_skylet_imports()
+
+ self._add_constants()
+
+ self._add_logging_functions()
+
+ self._code.append(
+ textwrap.dedent(f"""\
+ def _cancel_slurm_job_steps():
+ slurm_job_id = {self._slurm_job_id!r}
+ assert slurm_job_id is not None, 'SLURM_JOB_ID is not set'
+ try:
+ # Query steps for this job: squeue -s -j JOBID -h -o "%i %j"
+ # Output format: "JOBID.STEPID STEPNAME"
+ # TODO(kevin): This assumes that compute node is able
+ # to run client commands against the controller.
+ # Validate this assumption.
+ result = subprocess.run(
+ ['squeue', '-s', '-j', slurm_job_id, '-h', '-o', '%i %j'],
+ capture_output=True, text=True, check=False)
+ for line in result.stdout.strip().split('\\n'):
+ if not line:
+ continue
+ parts = line.split()
+ assert len(parts) >= 2, 'Expected at least 2 parts'
+ step_id, step_name = parts[0], parts[1]
+ if step_name == f'sky-{self.job_id}':
+ subprocess.run(['scancel', step_id],
+ check=False, capture_output=True)
+ except Exception as e:
+ print(f'Error in _cancel_slurm_job_steps: {{e}}', flush=True)
+ pass
+
+ def _slurm_cleanup_handler(signum, _frame):
+ _cancel_slurm_job_steps()
+ # Re-raise to let default handler terminate.
+ signal.signal(signum, signal.SIG_DFL)
+ os.kill(os.getpid(), signum)
+
+ signal.signal(signal.SIGTERM, _slurm_cleanup_handler)
+ """))
+
+ self._code += [
+ 'autostop_lib.set_last_active_time_to_now()',
+ f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
+ ]
+
+ self._setup_cmd: Optional[str] = None
+ self._setup_envs: Optional[Dict[str, str]] = None
+ self._setup_log_dir: Optional[str] = None
+ self._setup_num_nodes: Optional[int] = None
+
+ def add_setup(
+ self,
+ num_nodes: int,
+ resources_dict: Dict[str, float],
+ stable_cluster_internal_ips: List[str],
+ env_vars: Dict[str, str],
+ log_dir: str,
+ setup_cmd: Optional[str] = None,
+ ) -> None:
+ assert self._has_prologue, ('Call add_prologue() before add_setup().')
+ self._has_setup = True
+ self._cluster_num_nodes = len(stable_cluster_internal_ips)
+ self._stable_cluster_ips = stable_cluster_internal_ips
+
+ self._add_waiting_for_resources_msg(num_nodes)
+
+ # Store setup information for use in add_task().
+ if setup_cmd is not None:
+ setup_envs = env_vars.copy()
+ setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
+ self._setup_cmd = setup_cmd
+ self._setup_envs = setup_envs
+ self._setup_log_dir = log_dir
+ self._setup_num_nodes = num_nodes
+
+ def add_task(
+ self,
+ num_nodes: int,
+ bash_script: Optional[str],
+ task_name: Optional[str],
+ resources_dict: Dict[str, float],
+ log_dir: str,
+ env_vars: Optional[Dict[str, str]] = None,
+ ) -> None:
+ """Generates code for invoking a bash command
+ using srun within sbatch allocation.
+ """
+ assert self._has_setup, 'Call add_setup() before add_task().'
+ env_vars = env_vars or {}
+ task_name = task_name if task_name is not None else 'task'
+
+ acc_name, acc_count = self._get_accelerator_details(resources_dict)
+ num_gpus = 0
+ if (acc_name is not None and
+ not accelerator_registry.is_schedulable_non_gpu_accelerator(
+ acc_name)):
+ num_gpus = int(math.ceil(acc_count))
+
+ # Slurm does not support fractional CPUs.
+ task_cpu_demand = int(math.ceil(resources_dict.pop('CPU')))
+
+ sky_env_vars_dict_str = [
+ textwrap.dedent(f"""\
+ sky_env_vars_dict = {{}}
+ sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
+ """)
+ ]
+
+ if env_vars:
+ sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
+ for k, v in env_vars.items())
+ sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
+
+ rclone_flush_script = self._get_rclone_flush_script()
+ streaming_msg = self._get_job_started_msg()
+ has_setup_cmd = self._setup_cmd is not None
+
+ self._code += [
+ sky_env_vars_dict_str,
+ textwrap.dedent(f"""\
+ script = {bash_script!r}
+ if script is None:
+ script = ''
+ rclone_flush_script = {rclone_flush_script!r}
+
+ if script or {has_setup_cmd!r}:
+ script += rclone_flush_script
+ sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {num_gpus}
+
+ # Signal files for setup/run synchronization:
+ # 1. alloc_signal_file: srun has acquired allocation
+ # 2. setup_done_signal_file: Driver has finished setup, run can proceed
+ #
+ # Signal files are stored in home directory, which is
+ # assumed to be on a shared NFS mount accessible by all nodes.
+ # To support clusters with non-NFS home directories, we would
+ # need to let users specify an NFS-backed "working directory"
+ # or use a different coordination mechanism.
+ alloc_signal_file = f'~/.sky_alloc_{self._slurm_job_id}_{self.job_id}'
+ alloc_signal_file = os.path.expanduser(alloc_signal_file)
+ setup_done_signal_file = f'~/.sky_setup_done_{self._slurm_job_id}_{self.job_id}'
+ setup_done_signal_file = os.path.expanduser(setup_done_signal_file)
+
+ # Start exclusive srun in a thread to reserve allocation (similar to ray.get(pg.ready()))
+ gpu_arg = f'--gpus-per-node={num_gpus}' if {num_gpus} > 0 else ''
+
+ def build_task_runner_cmd(user_script, extra_flags, log_dir, env_vars_dict,
+ task_name=None, is_setup=False,
+ alloc_signal=None, setup_done_signal=None):
+ env_vars_json = json.dumps(env_vars_dict)
+
+ log_dir = shlex.quote(log_dir)
+ env_vars = shlex.quote(env_vars_json)
+ cluster_ips = shlex.quote(",".join({self._stable_cluster_ips!r}))
+
+ runner_args = f'--log-dir={{log_dir}} --env-vars={{env_vars}} --cluster-num-nodes={self._cluster_num_nodes} --cluster-ips={{cluster_ips}}'
+
+ if task_name is not None:
+ runner_args += f' --task-name={{shlex.quote(task_name)}}'
+
+ if is_setup:
+ runner_args += ' --is-setup'
+
+ if alloc_signal is not None:
+ runner_args += f' --alloc-signal-file={{shlex.quote(alloc_signal)}}'
+
+ if setup_done_signal is not None:
+ runner_args += f' --setup-done-signal-file={{shlex.quote(setup_done_signal)}}'
+
+ script_path = None
+ prefix = 'sky_setup_' if is_setup else 'sky_task_'
+ if backend_utils.is_command_length_over_limit(user_script):
+ with tempfile.NamedTemporaryFile('w', prefix=prefix, suffix='.sh', delete=False) as f:
+ f.write(user_script)
+ script_path = f.name
+ runner_args += f' --script-path={{shlex.quote(script_path)}}'
+ else:
+ runner_args += f' --script={{shlex.quote(user_script)}}'
+
+ # Use /usr/bin/env explicitly to work around a Slurm quirk where
+ # srun's execvp() doesn't check execute permissions, failing when
+ # $HOME/.local/bin/env (non-executable, from uv installation)
+ # shadows /usr/bin/env.
+ job_suffix = '-setup' if is_setup else ''
+ # Unset SLURM_* environment variables before running srun.
+ # When this srun runs inside another srun (from
+ # SlurmCommandRunner.run), inherited variables like
+ # SLURM_CPU_BIND, SLURM_NNODES, and SLURM_NODELIST constrain
+ # the inner srun to the parent step's allocation. This causes
+ # "CPU binding outside of job step allocation" errors.
+ # Unsetting all SLURM_* variables allows this srun to access the full job
+ # allocation. See:
+ # https://support.schedmd.com/show_bug.cgi?id=14298
+ # https://github.com/huggingface/datatrove/issues/248
+ srun_cmd = (
+ "unset $(env | awk -F= '/^SLURM_/ {{print $1}}') && "
+ f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
+ f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
+ f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
+ )
+ return srun_cmd, script_path
+
+ def run_thread_func():
+ # This blocks until Slurm allocates resources (--exclusive)
+ # --mem=0 to match RayCodeGen's behavior where we don't explicitly request memory.
+ run_flags = f'--nodes={num_nodes} --cpus-per-task={task_cpu_demand} --mem=0 {{gpu_arg}} --exclusive'
+ srun_cmd, task_script_path = build_task_runner_cmd(
+ script, run_flags, {log_dir!r}, sky_env_vars_dict,
+ task_name={task_name!r},
+ alloc_signal=alloc_signal_file,
+ setup_done_signal=setup_done_signal_file
+ )
+
+ proc = subprocess.Popen(srun_cmd, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True)
+ for line in proc.stdout:
+ print(line, end='', flush=True)
+ proc.wait()
+
+ if task_script_path is not None:
+ os.remove(task_script_path)
+ return {{'return_code': proc.returncode, 'pid': proc.pid}}
+
+ run_thread_result = {{'result': None}}
+ def run_thread_wrapper():
+ run_thread_result['result'] = run_thread_func()
+
+ run_thread = threading.Thread(target=run_thread_wrapper)
+ run_thread.start()
+
+ # Wait for allocation signal from inside srun
+ while not os.path.exists(alloc_signal_file):
+ if not run_thread.is_alive():
+ # srun failed before creating the signal file.
+ run_thread.join()
+ result = run_thread_result['result']
+ returncode = int(result.get('return_code', 1))
+ pid = result.get('pid', os.getpid())
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{returncode}} (pid={{pid}}).'
+ msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ returncodes = [returncode]
+ if int(constants.SKYLET_VERSION) >= 28:
+ job_lib.set_exit_codes({self.job_id!r}, returncodes)
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ sys.exit(1)
+ time.sleep(0.1)
+
+ print({streaming_msg!r}, flush=True)
+
+ if {has_setup_cmd!r}:
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+ # The schedule_step should be called after the job status is set to
+ # non-PENDING, otherwise, the scheduler will think the current job
+ # is not submitted yet, and skip the scheduling step.
+ job_lib.scheduler.schedule_step()
+
+ # --overlap as we have already secured allocation with the srun for the run section,
+ # and otherwise this srun would get blocked and deadlock.
+ setup_flags = f'--overlap --nodes={self._setup_num_nodes}'
+ setup_srun, setup_script_path = build_task_runner_cmd(
+ {self._setup_cmd!r}, setup_flags, {self._setup_log_dir!r}, {self._setup_envs!r},
+ is_setup=True
+ )
+
+ # Run setup srun directly, streaming output to driver stdout
+ setup_proc = subprocess.Popen(setup_srun, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True)
+ for line in setup_proc.stdout:
+ print(line, end='', flush=True)
+ setup_proc.wait()
+
+ if setup_script_path is not None:
+ os.remove(setup_script_path)
+
+ setup_returncode = setup_proc.returncode
+ if setup_returncode != 0:
+ setup_pid = setup_proc.pid
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{setup_returncode}} (pid={{setup_pid}}).'
+ msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ # Cancel the srun spawned by run_thread_func.
+ _cancel_slurm_job_steps()
+ sys.exit(1)
+
+ job_lib.set_job_started({self.job_id!r})
+ if not {has_setup_cmd!r}:
+ # Need to call schedule_step() to make sure the scheduler
+ # schedule the next pending job.
+ job_lib.scheduler.schedule_step()
+
+ # Signal run thread to proceed.
+ pathlib.Path(setup_done_signal_file).touch()
+
+ # Wait for run thread to complete.
+ run_thread.join()
+ result = run_thread_result['result']
+
+ # Cleanup signal files
+ if os.path.exists(alloc_signal_file):
+ os.remove(alloc_signal_file)
+ if os.path.exists(setup_done_signal_file):
+ os.remove(setup_done_signal_file)
+
+ returncodes = [int(result.get('return_code', 1))]
+ else:
+ returncodes = [0]
+ """),
+ ]
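
The SlurmCodeGen driver code added above coordinates its setup and run steps through two signal files on a shared filesystem: a thread launches the blocking exclusive srun for the run step, the remote side touches an allocation file once srun holds the allocation, the driver then runs setup (with --overlap), and finally touches a second file to release the run step. The sketch below illustrates only that coordination pattern; it uses a temporary directory instead of the NFS-shared home directory, a plain thread instead of srun, and hypothetical helper names, so it is an illustration rather than SkyPilot code.

import os
import pathlib
import tempfile
import threading
import time


def wait_for_file(path: str, timeout: float = 30.0) -> None:
    """Block until `path` exists (stand-in for waiting on a signal file)."""
    deadline = time.time() + timeout
    while not os.path.exists(path):
        if time.time() > deadline:
            raise TimeoutError(f'Timed out waiting for {path}')
        time.sleep(0.1)


def main() -> None:
    tmpdir = tempfile.mkdtemp()
    alloc_signal = os.path.join(tmpdir, 'alloc')            # "srun got its allocation"
    setup_done_signal = os.path.join(tmpdir, 'setup_done')  # "driver finished setup"

    def run_thread_func() -> None:
        # In the generated code this is the blocking `srun --exclusive ...`;
        # here we just simulate acquiring the allocation.
        pathlib.Path(alloc_signal).touch()
        # The run step waits for the driver's setup to finish before
        # executing the user's command.
        wait_for_file(setup_done_signal)
        print('run: executing user task')

    run_thread = threading.Thread(target=run_thread_func)
    run_thread.start()

    # Driver side: wait until the allocation is held, then run setup
    # (sharing the allocation via --overlap in the real code).
    wait_for_file(alloc_signal)
    print('driver: allocation acquired, running setup')

    # Release the run step and wait for it to finish.
    pathlib.Path(setup_done_signal).touch()
    run_thread.join()


if __name__ == '__main__':
    main()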
sky/catalog/__init__.py CHANGED
@@ -127,12 +127,9 @@ def list_accelerator_realtime(
  case_sensitive: bool = True,
  ) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]:
  """Lists all accelerators offered by Sky with their realtime availability.
-
  Realtime availability is the total number of accelerators in the cluster
  and number of accelerators available at the time of the call.
-
  Used for fixed size cluster settings, such as Kubernetes.
-
  Returns:
  A tuple of three dictionaries mapping canonical accelerator names to:
  - A list of available counts. (e.g., [1, 2, 4])
sky/catalog/data_fetchers/fetch_gcp.py CHANGED
@@ -189,6 +189,9 @@ SERIES_TO_DESCRIPTION = {
  'c2': 'Compute optimized',
  'c2d': 'C2D AMD Instance',
  'c3': 'C3 Instance',
+ 'c3d': 'C3D Instance',
+ 'c4': 'C4 Instance',
+ 'c4d': 'C4D Instance',
  'e2': 'E2 Instance',
  'f1': 'Micro Instance with burstable CPU',
  'g1': 'Small Instance with 1 VCPU',
@@ -376,8 +379,13 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
  is_cpu = True
  elif resource_group == 'RAM':
  is_memory = True
+ elif resource_group == 'LocalSSD':
+ # Ignore local SSD pricing for now, as we do not include disk
+ # pricing for instances for now.
+ # TODO(zhwu): Handle local SSD pricing.
+ pass
  else:
- assert resource_group == 'N1Standard'
+ assert resource_group == 'N1Standard', (resource_group, sku)
  if 'Core' in description:
  is_cpu = True
  elif 'Ram' in description:
sky/catalog/data_fetchers/fetch_nebius.py CHANGED
@@ -180,7 +180,7 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
  presets (List[PresetInfo]): A list of PresetInfo objects to write.
  output_file (str): The path to the output CSV file.
  """
- os.makedirs(os.path.dirname(output_file))
+ os.makedirs(os.path.dirname(output_file), exist_ok=True)
  # Set up the CSV writer to output to stdout
  with open(output_file, 'w', encoding='utf-8') as out:
  header = [
sky/catalog/data_fetchers/fetch_vast.py CHANGED
@@ -50,7 +50,7 @@ if __name__ == '__main__':
  ('num_gpus', 'AcceleratorCount'), ('cpu_cores', 'vCPUs'),
  ('cpu_ram', 'MemoryGiB'), ('gpu_name', 'GpuInfo'),
  ('search.totalHour', 'Price'), ('min_bid', 'SpotPrice'),
- ('geolocation', 'Region'))
+ ('geolocation', 'Region'), ('hosting_type', 'HostingType'))
 
  # Vast has a wide variety of machines, some of
  # which will have less diskspace and network
@@ -138,7 +138,9 @@ if __name__ == '__main__':
 
  maxBid = max([x.get('SpotPrice') for x in toList])
  for instance in toList:
- stub = f'{instance["InstanceType"]} {instance["Region"][-2:]}'
+ hosting_type = instance.get('HostingType', 0)
+ stub = (f'{instance["InstanceType"]} '
+ f'{instance["Region"][-2:]} {hosting_type}')
  if stub in seen:
  printstub = f'{stub}#print'
  if printstub not in seen:
sky/catalog/kubernetes_catalog.py CHANGED
@@ -204,6 +204,9 @@ def _list_accelerators(
  min_quantity_filter = quantity_filter if quantity_filter else 1
 
  for node in nodes:
+ # Check if node is ready
+ node_is_ready = node.is_ready()
+
  for key in keys:
  if key in node.metadata.labels:
  accelerator_name = lf.get_accelerator_from_label_value(
@@ -260,6 +263,15 @@ def _list_accelerators(
  total_accelerators_capacity[
  accelerator_name] += quantized_count
 
+ # Initialize the total_accelerators_available to make sure the
+ # key exists in the dictionary.
+ total_accelerators_available[accelerator_name] = (
+ total_accelerators_available.get(accelerator_name, 0))
+
+ # Skip availability counting for not-ready nodes
+ if not node_is_ready:
+ continue
+
  if error_on_get_allocated_gpu_qty_by_node:
  # If we can't get the allocated GPU quantity by each node,
  # we can't get the GPU usage.
@@ -268,10 +280,6 @@ def _list_accelerators(
 
  allocated_qty = allocated_qty_by_node[node.metadata.name]
  accelerators_available = accelerator_count - allocated_qty
- # Initialize the total_accelerators_available to make sure the
- # key exists in the dictionary.
- total_accelerators_available[accelerator_name] = (
- total_accelerators_available.get(accelerator_name, 0))
 
  if accelerators_available >= min_quantity_filter:
  quantized_availability = min_quantity_filter * (
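
The kubernetes_catalog.py change above reorders the availability accounting: every labeled accelerator now gets an entry in total_accelerators_available (initialized to 0) before any node can be skipped, and nodes that are not ready contribute capacity but no availability. Below is a minimal sketch of that accounting order, using made-up node records in place of the Kubernetes API objects and node.is_ready().

# Hypothetical node records for illustration only.
nodes = [
    {'name': 'gpu-a', 'gpu': 'H100', 'capacity': 8, 'allocated': 2, 'ready': True},
    {'name': 'gpu-b', 'gpu': 'H100', 'capacity': 8, 'allocated': 0, 'ready': False},
]

total_capacity = {}
total_available = {}
for node in nodes:
    acc = node['gpu']
    total_capacity[acc] = total_capacity.get(acc, 0) + node['capacity']
    # Always create the availability key first, so a not-ready node still
    # shows up with capacity but zero availability.
    total_available[acc] = total_available.get(acc, 0)
    if not node['ready']:
        continue  # not-ready nodes add capacity but no availability
    total_available[acc] += node['capacity'] - node['allocated']

print(total_capacity)   # {'H100': 16}
print(total_available)  # {'H100': 6}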
sky/catalog/seeweb_catalog.py CHANGED
@@ -7,22 +7,33 @@ query instance types and pricing information for Seeweb.
  import typing
  from typing import Dict, List, Optional, Tuple
 
- import pandas as pd
-
+ from sky.adaptors import common as adaptors_common
  from sky.catalog import common
  from sky.utils import resources_utils
  from sky.utils import ux_utils
 
  if typing.TYPE_CHECKING:
+ import pandas as pd
+
  from sky.clouds import cloud
+ else:
+ pd = adaptors_common.LazyImport('pandas')
 
  _PULL_FREQUENCY_HOURS = 8
- _df = common.read_catalog('seeweb/vms.csv',
- pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+ _df = None
+
+
+ def _get_df():
+ """Get the dataframe, loading it lazily if needed."""
+ global _df
+ if _df is None:
+ _df = common.read_catalog('seeweb/vms.csv',
+ pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+ return _df
 
 
  def instance_type_exists(instance_type: str) -> bool:
- result = common.instance_type_exists_impl(_df, instance_type)
+ result = common.instance_type_exists_impl(_get_df(), instance_type)
  return result
 
 
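The seeweb_catalog.py hunk above defers both the pandas import (via adaptors_common.LazyImport) and the catalog read (via _get_df()) until first use, so importing the module no longer loads the CSV. A rough sketch of the same lazy, module-level caching pattern follows; the CSV loader, path, and names are placeholders, not SkyPilot APIs.

import csv
from typing import List, Optional

_rows: Optional[List[dict]] = None


def _get_rows(path: str = 'vms.csv') -> List[dict]:
    """Load the catalog on first use and cache it at module level."""
    global _rows
    if _rows is None:
        with open(path, newline='', encoding='utf-8') as f:
            _rows = list(csv.DictReader(f))
    return _rows


def instance_type_exists(instance_type: str) -> bool:
    # Every accessor goes through _get_rows() instead of a module-level load.
    return any(r.get('InstanceType') == instance_type for r in _get_rows())
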
@@ -33,7 +44,7 @@ def validate_region_zone(
  with ux_utils.print_exception_no_traceback():
  raise ValueError('Seeweb does not support zones.')
 
- result = common.validate_region_zone_impl('Seeweb', _df, region, zone)
+ result = common.validate_region_zone_impl('Seeweb', _get_df(), region, zone)
  return result
 
 
@@ -46,14 +57,15 @@ def get_hourly_cost(instance_type: str,
  with ux_utils.print_exception_no_traceback():
  raise ValueError('Seeweb does not support zones.')
 
- result = common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
- zone)
+ result = common.get_hourly_cost_impl(_get_df(), instance_type, use_spot,
+ region, zone)
  return result
 
 
  def get_vcpus_mem_from_instance_type(
  instance_type: str) -> Tuple[Optional[float], Optional[float]]:
- result = common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
+ result = common.get_vcpus_mem_from_instance_type_impl(
+ _get_df(), instance_type)
  return result
 
 
@@ -64,7 +76,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
  region: Optional[str] = None,
  zone: Optional[str] = None) -> Optional[str]:
  del disk_tier # unused
- result = common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory,
+ result = common.get_instance_type_for_cpus_mem_impl(_get_df(), cpus, memory,
  region, zone)
  return result
 
@@ -72,7 +84,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
  def get_accelerators_from_instance_type(
  instance_type: str) -> Optional[Dict[str, int]]:
  # Filter the dataframe for the specific instance type
- df_filtered = _df[_df['InstanceType'] == instance_type]
+ df = _get_df()
+ df_filtered = df[df['InstanceType'] == instance_type]
  if df_filtered.empty:
  return None
 
@@ -114,7 +127,7 @@ def get_instance_type_for_accelerator(
  with ux_utils.print_exception_no_traceback():
  raise ValueError('Seeweb does not support zones.')
 
- result = common.get_instance_type_for_accelerator_impl(df=_df,
+ result = common.get_instance_type_for_accelerator_impl(df=_get_df(),
  acc_name=acc_name,
  acc_count=acc_count,
  cpus=cpus,
@@ -126,7 +139,7 @@ def get_instance_type_for_accelerator(
 
 
  def regions() -> List['cloud.Region']:
- result = common.get_region_zones(_df, use_spot=False)
+ result = common.get_region_zones(_get_df(), use_spot=False)
  return result
 
 
@@ -135,7 +148,8 @@ def get_region_zones_for_instance_type(instance_type: str,
  ) -> List['cloud.Region']:
  """Returns a list of regions for a given instance type."""
  # Filter the dataframe for the specific instance type
- df_filtered = _df[_df['InstanceType'] == instance_type]
+ df = _get_df()
+ df_filtered = df[df['InstanceType'] == instance_type]
  if df_filtered.empty:
  return []
 
@@ -174,7 +188,8 @@ def list_accelerators(
  require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
  """Lists accelerators offered in Seeweb."""
  # Filter out rows with empty or null regions (indicating unavailability)
- df_filtered = _df.dropna(subset=['Region'])
+ df = _get_df()
+ df_filtered = df.dropna(subset=['Region'])
  df_filtered = df_filtered[df_filtered['Region'].str.strip() != '']
 
  result = common.list_accelerators_impl('Seeweb', df_filtered, gpus_only,