skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -81,6 +81,7 @@ from sky.utils import timeline
  from sky.utils import ux_utils
  from sky.utils import volume as volume_lib
  from sky.utils import yaml_utils
+ from sky.utils.plugin_extensions import ExternalFailureSource

  if typing.TYPE_CHECKING:
  import grpc
@@ -915,8 +916,10 @@ class RetryingVmProvisioner(object):
  elif to_provision.region is not None and to_provision.cloud is not None:
  # For public clouds, provision.region is always set.
  if clouds.SSH().is_same_cloud(to_provision.cloud):
+ ssh_node_pool_name = common_utils.removeprefix(
+ to_provision.region, 'ssh-')
  message += (
- f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+ f'in SSH Node Pool ({ssh_node_pool_name}) '
  f'for {requested_resources}. The SSH Node Pool may not '
  'have enough resources.')
  elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
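Note on the change above: Python's str.lstrip() strips any of the characters in its argument rather than a literal prefix, so lstrip("ssh-") can also eat leading letters of the pool name itself; the hunk therefore switches to the project's removeprefix helper. A minimal illustration using the stdlib str.removeprefix (Python 3.9+), shown only to explain the behavior difference:

    # lstrip('ssh-') removes any leading 's', 'h', or '-' characters,
    # so the pool name 'staging' loses its leading 's'.
    assert 'ssh-staging'.lstrip('ssh-') == 'taging'
    # removeprefix() removes only the exact 'ssh-' prefix.
    assert 'ssh-staging'.removeprefix('ssh-') == 'staging'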
@@ -1176,7 +1179,9 @@ class RetryingVmProvisioner(object):
  if isinstance(to_provision.cloud, clouds.Kubernetes):
  suffix = '.'
  if region.name.startswith('ssh-'):
- suffix = f' ({region.name.lstrip("ssh-")})'
+ ssh_node_pool_name = common_utils.removeprefix(
+ region.name, 'ssh-')
+ suffix = f' ({ssh_node_pool_name})'
  logger.info(
  ux_utils.starting_message(
  f'Launching{controller_str} on '
@@ -2732,6 +2737,13 @@ class SkyletClient:
  ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
  return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)

+ def get_job_exit_codes(
+ self,
+ request: 'jobsv1_pb2.GetJobExitCodesRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobExitCodesResponse':
+ return self._jobs_stub.GetJobExitCodes(request, timeout=timeout)
+
  def tail_logs(
  self,
  request: 'jobsv1_pb2.TailLogsRequest',
@@ -3040,6 +3052,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  'sky api status -v | grep '
  f'{cluster_name}'))

+ def _maybe_clear_external_cluster_failures(
+ self, cluster_name: str,
+ prev_cluster_status: Optional[status_lib.ClusterStatus]) -> None:
+ """Clear any existing cluster failures when reusing a cluster.
+
+ Clear any existing cluster failures when reusing a cluster. This ensures
+ that when a cluster failure is detected (causing the cluster to be
+ marked as INIT), the user can recover the cluster via `sky start` or
+ `sky launch` and clear the failure.
+ """
+ if prev_cluster_status is not None:
+ failures = ExternalFailureSource.clear(cluster_name=cluster_name)
+ if failures:
+ failure_details = [f'"{f["failure_mode"]}"' for f in failures]
+ plural = 's' if len(failures) > 1 else ''
+ logger.info(f'{colorama.Style.DIM}Cleared {len(failures)} '
+ f'existing cluster failure{plural} for cluster '
+ f'{cluster_name!r}: {", ".join(failure_details)}'
+ f'{colorama.Style.RESET_ALL}')
+
  def _locked_provision(
  self,
  lock_id: str,
@@ -3070,6 +3102,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  to_provision_config.num_nodes, to_provision_config.resources)
  usage_lib.messages.usage.update_cluster_status(prev_cluster_status)

+ self._maybe_clear_external_cluster_failures(cluster_name,
+ prev_cluster_status)
+
  # TODO(suquark): once we have sky on PyPI, we should directly
  # install sky from PyPI.
  # NOTE: can take ~2s.
@@ -3428,7 +3463,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  ssh_user=handle.ssh_user,
  docker_user=handle.docker_user)
  cluster_utils.SSHConfigHelper.add_cluster(
- handle.cluster_name, handle.cached_external_ips, auth_config,
+ handle.cluster_name, handle.cluster_name_on_cloud,
+ handle.cached_external_ips, auth_config,
  handle.cached_external_ssh_ports, handle.docker_user,
  handle.ssh_user)
@@ -3769,20 +3805,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  up=True,
  stream_logs=False)

- cd = f'cd {SKY_REMOTE_WORKDIR}'
- mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
- f'touch {remote_log_path}')
+ mkdir_code = f'mkdir -p {remote_log_dir} && touch {remote_log_path}'
  encoded_script = shlex.quote(codegen)
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
  job_submit_cmd = (
  # JOB_CMD_IDENTIFIER is used for identifying the process
  # retrieved with pid is the same driver process.
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
- f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
+ f'{constants.SKY_PYTHON_CMD} -u {script_path}'
  # Do not use &>, which is not POSIX and may not work.
  # Note that the order of ">filename 2>&1" matters.
  f'> {remote_log_path} 2>&1')
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
+
+ # For Slurm, we need to wait for the job to complete before exiting,
+ # because Slurm's proctrack/cgroup kills all processes when the srun
+ # job step ends, including child processes launched as a separate
+ # process group.
+ # So this keeps srun alive so the job driver process that was spawned
+ # (and runs in the background) by job_lib.JobScheduler.schedule_step()
+ # does not get killed.
+ # Note: proctrack/cgroup is enabled by default on Nebius' Managed
+ # Soperator.
+ is_slurm = isinstance(handle.launched_resources.cloud, clouds.Slurm)
+ if is_slurm:
+ wait_code = job_lib.JobLibCodeGen.wait_for_job(job_id)
+ code = code + ' && ' + wait_code
+
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])

  # Should also be ealier than is_command_length_over_limit
@@ -3867,10 +3916,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)

- returncode, stdout, stderr = self.run_on_head(handle,
- job_submit_cmd,
- stream_logs=False,
- require_outputs=True)
+ # For Slurm, run in background so that SSH returns immediately.
+ # This is needed because we add the wait_for_job code above which
+ # makes the command block until the job completes.
+ returncode, stdout, stderr = self.run_on_head(
+ handle,
+ job_submit_cmd,
+ stream_logs=False,
+ require_outputs=True,
+ run_in_background=is_slurm)
  # Happens when someone calls `sky exec` but remote is outdated for
  # running a job. Necessitating calling `sky launch`.
  backend_utils.check_stale_runtime_on_remote(returncode, stderr,
@@ -3887,11 +3941,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  _dump_code_to_file(codegen)
  job_submit_cmd = f'{mkdir_code} && {code}'
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+ # See comment above for why run_in_background=is_slurm.
  returncode, stdout, stderr = self.run_on_head(
  handle,
  job_submit_cmd,
  stream_logs=False,
- require_outputs=True)
+ require_outputs=True,
+ run_in_background=is_slurm)

  subprocess_utils.handle_returncode(
  returncode,
@@ -4950,6 +5006,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  ports_cleaned_up = True
  except exceptions.PortDoesNotExistError:
  logger.debug('Ports do not exist. Skipping cleanup.')
+ ports_cleaned_up = True
  except Exception as e: # pylint: disable=broad-except
  if purge:
  msg = common_utils.format_exception(e, use_bracket=True)
@@ -5022,11 +5079,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  config['provider'],
  non_terminated_only=False)

- unexpected_node_state: Optional[Tuple[str, str]] = None
+ unexpected_nodes = []
  for node_id, node_status_tuple in node_status_dict.items():
  node_status, reason = node_status_tuple
- reason = '' if reason is None else f' ({reason})'
- logger.debug(f'{node_id} status: {node_status}{reason}')
+ reason_str = '' if reason is None else f' ({reason})'
+ logger.debug(f'{node_id} status: {node_status}{reason_str}')
  # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
  # between "stopping/stopped" and "terminating/terminated",
  # so we allow for either status instead of casing on
@@ -5034,19 +5091,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  if node_status not in [
  None, status_lib.ClusterStatus.STOPPED
  ]:
- unexpected_node_state = (node_id, node_status)
- break
+ unexpected_nodes.append((node_id, node_status, reason))

- if unexpected_node_state is None:
+ if not unexpected_nodes:
  break

  attempts += 1
  if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
  time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
  else:
- (node_id, node_status) = unexpected_node_state
- raise RuntimeError(f'Instance {node_id} in unexpected '
- f'state {node_status}.')
+ unexpected_nodes_str = '\n'.join([
+ f' - {node_id}: {node_status}' +
+ (f' ({reason})' if reason else '')
+ for node_id, node_status, reason in unexpected_nodes
+ ])
+ raise RuntimeError(f'Instances in unexpected state:\n'
+ f'{unexpected_nodes_str}')

  # If cluster_yaml is None, the cluster should ensured to be terminated,
  # so we don't need to do the double check.
@@ -5333,6 +5393,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  assert handle is not None
  # Cluster already exists.
  self.check_resources_fit_cluster(handle, task)
+
  # Use the existing cluster.
  assert handle.launched_resources is not None, (cluster_name, handle)
  # Take a random resource in order to get resource info that applies
@@ -5384,27 +5445,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  for resource in task.resources:
  assert (resource.cluster_config_overrides ==
  one_task_resource.cluster_config_overrides)
- if isinstance(to_provision.cloud, clouds.Kubernetes):
+
+ cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+ cluster_name)
+ cluster_yaml_obj = (yaml_utils.safe_load(cluster_yaml_str)
+ if cluster_yaml_str is not None else None)
+
+ def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+ return (yaml_obj.get('available_node_types',
+ {}).get('ray_head_default',
+ {}).get('node_config', {}))
+
+ if isinstance(to_provision.cloud,
+ clouds.Kubernetes) and cluster_yaml_obj is not None:
  # Warn users if the Kubernetes pod config is different
  # from the existing cluster.
- cluster_yaml_str = global_user_state.get_cluster_yaml_str(
- cluster_name)
- actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
  desired_cluster_yaml_obj = (
  kubernetes_utils.combine_pod_config_fields_and_metadata(
- actual_cluster_yaml_obj,
+ cluster_yaml_obj,
  cluster_config_overrides=one_task_resource.
  cluster_config_overrides,
  cloud=to_provision.cloud,
  context=to_provision.region))

- def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
- return (yaml_obj.get('available_node_types',
- {}).get('ray_head_default',
- {}).get('node_config', {}))
-
  if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
- actual_cluster_yaml_obj):
+ cluster_yaml_obj):
  # pylint: disable=line-too-long
  logger.warning(
  f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
@@ -5415,6 +5480,101 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  f' • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
  f'{colorama.Style.RESET_ALL}')

+ # Check for volume mount warnings
+ if task.volume_mounts:
+ # Get existing cluster's volume mounts from cluster yaml
+ existing_volume_names = set()
+ try:
+ if cluster_yaml_obj is not None:
+ # Extract volume names from existing cluster
+ node_config = _get_pod_config(cluster_yaml_obj)
+
+ if isinstance(to_provision.cloud, clouds.Kubernetes):
+ # Check for K8s-style persistent volumes
+ # (spec.volumes)
+ # See sky/templates/kubernetes-ray.yml.j2.
+ volumes = node_config.get('spec',
+ {}).get('volumes', [])
+ for vol in volumes:
+ # Volume from PVC has structure:
+ # - name: <volume_name>
+ # persistentVolumeClaim:
+ # claimName: <volume_name_on_cloud>
+ if 'persistentVolumeClaim' in vol:
+ pvc = vol.get('persistentVolumeClaim', {})
+ # Use claimName (volume_name_on_cloud) to
+ # be consistent with RunPod.
+ vol_name_on_cloud = pvc.get('claimName')
+ if vol_name_on_cloud:
+ existing_volume_names.add(
+ vol_name_on_cloud)
+
+ # Check for K8s ephemeral volumes
+ # See sky/templates/kubernetes-ray.yml.j2.
+ provider_config = cluster_yaml_obj.get(
+ 'provider', {})
+ ephemeral_specs = provider_config.get(
+ 'ephemeral_volume_specs', [])
+ for spec in ephemeral_specs:
+ # For ephemeral volumes, we check the mount
+ # path.
+ mount_path = spec.get('path')
+ if mount_path:
+ existing_volume_names.add(mount_path)
+
+ elif isinstance(to_provision.cloud, clouds.RunPod):
+ # Check for custom VolumeMounts config
+ # (e.g. RunPod)
+ # See sky/templates/runpod-ray.yml.j2.
+ volume_mounts_config = node_config.get(
+ 'VolumeMounts', [])
+ for vol_mount in volume_mounts_config:
+ vol_name = vol_mount.get('VolumeNameOnCloud')
+ if vol_name:
+ existing_volume_names.add(vol_name)
+ except Exception as e: # pylint: disable=broad-except
+ # If we can't get the existing volume mounts, log debug
+ # and skip the warning check
+ logger.debug(f'Failed to check existing volume mounts: {e}',
+ exc_info=True)
+
+ # Check if task has new volumes not in existing cluster
+ new_ephemeral_volumes = []
+ new_persistent_volumes = []
+ for volume_mount in task.volume_mounts:
+ # Compare using volume_name for user-facing name
+ if volume_mount.is_ephemeral:
+ if volume_mount.path not in existing_volume_names:
+ new_ephemeral_volumes.append(volume_mount.path)
+ elif (volume_mount.volume_name not in existing_volume_names
+ and volume_mount.volume_config.name_on_cloud
+ not in existing_volume_names):
+ new_persistent_volumes.append(volume_mount.volume_name)
+
+ if new_ephemeral_volumes or new_persistent_volumes:
+ msg_parts = []
+ if new_ephemeral_volumes:
+ msg_parts.append(f'new ephemeral volume(s) with path '
+ f'{", ".join(new_ephemeral_volumes)}')
+ if new_persistent_volumes:
+ msg_parts.append(
+ f'new volume(s) {", ".join(new_persistent_volumes)}'
+ )
+
+ volume_msg = ' and '.join(msg_parts)
+ # Capitalize the first letter of the message
+ volume_msg = volume_msg[0].upper() + volume_msg[1:]
+
+ logger.warning(
+ f'{colorama.Fore.YELLOW}WARNING: {volume_msg} '
+ f'specified in task but not '
+ f'mounted to existing cluster "{cluster_name}". '
+ f'These volumes will not be mounted to the cluster. '
+ f'To mount new volumes, either:\n'
+ f' • Use a new cluster, or\n'
+ f' • Terminate and recreate this cluster'
+ f'{colorama.Style.RESET_ALL}')
+
  return RetryingVmProvisioner.ToProvisionConfig(
  cluster_name,
  to_provision,
@@ -147,6 +147,7 @@ class TaskCodeGen:
  if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
  [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
  [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
+ FLUSH_START_TIME=$(date +%s)
  flushed=0
  # extra second on top of --vfs-cache-poll-interval to
  # avoid race condition between rclone log line creation and this check.
@@ -159,13 +160,32 @@ class TaskCodeGen:
  exitcode=0
  tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
  if [ $exitcode -ne 0 ]; then
- echo "skypilot: cached mount is still uploading to remote"
+ ELAPSED=$(($(date +%s) - FLUSH_START_TIME))
+ # Extract the last vfs cache status line to show what we're waiting for
+ CACHE_STATUS=$(tac $file | grep "vfs cache: cleaned:" -m 1 | sed 's/.*vfs cache: cleaned: //' 2>/dev/null)
+ # Extract currently uploading files from recent log lines (show up to 2 files)
+ UPLOADING_FILES=$(tac $file | head -30 | grep -E "queuing for upload" | head -2 | sed 's/.*INFO : //' | sed 's/: vfs cache:.*//' | tr '\\n' ',' | sed 's/,$//' | sed 's/,/, /g' 2>/dev/null)
+ # Build status message with available info
+ if [ -n "$CACHE_STATUS" ] && [ -n "$UPLOADING_FILES" ]; then
+ echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}] uploading: ${{UPLOADING_FILES}}"
+ elif [ -n "$CACHE_STATUS" ]; then
+ echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}]"
+ else
+ # Fallback: show last non-empty line from log
+ LAST_LINE=$(tac $file | grep -v "^$" | head -1 | sed 's/.*INFO : //' | sed 's/.*ERROR : //' | sed 's/.*NOTICE: //' 2>/dev/null)
+ if [ -n "$LAST_LINE" ]; then
+ echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) ${{LAST_LINE}}"
+ else
+ echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s)"
+ fi
+ fi
  flushed=0
  break
  fi
  done
  done
- echo "skypilot: cached mount uploaded complete"
+ TOTAL_FLUSH_TIME=$(($(date +%s) - FLUSH_START_TIME))
+ echo "skypilot: cached mount upload complete (took ${{TOTAL_FLUSH_TIME}}s)"
  fi""")

  def add_prologue(self, job_id: int) -> None:
@@ -214,6 +234,9 @@ class TaskCodeGen:
  self._code += [
  textwrap.dedent(f"""\
  if sum(returncodes) != 0:
+ # Save exit codes to job metadata for potential recovery logic
+ if int(constants.SKYLET_VERSION) >= 28:
+ job_lib.set_exit_codes({self.job_id!r}, returncodes)
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
  # Schedule the next pending job immediately to make the job
  # scheduling more efficient.
@@ -483,6 +506,8 @@ class RayCodeGen(TaskCodeGen):
  msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
  msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
  print(msg, flush=True)
+ if int(constants.SKYLET_VERSION) >= 28:
+ job_lib.set_exit_codes({self.job_id!r}, setup_returncodes)
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
  # This waits for all streaming logs to finish.
  time.sleep(1)
@@ -851,7 +876,18 @@ class SlurmCodeGen(TaskCodeGen):
  # $HOME/.local/bin/env (non-executable, from uv installation)
  # shadows /usr/bin/env.
  job_suffix = '-setup' if is_setup else ''
+ # Unset SLURM_* environment variables before running srun.
+ # When this srun runs inside another srun (from
+ # SlurmCommandRunner.run), inherited variables like
+ # SLURM_CPU_BIND, SLURM_NNODES, and SLURM_NODELIST constrain
+ # the inner srun to the parent step's allocation. This causes
+ # "CPU binding outside of job step allocation" errors.
+ # Unsetting all SLURM_* variables allows this srun to access the full job
+ # allocation. See:
+ # https://support.schedmd.com/show_bug.cgi?id=14298
+ # https://github.com/huggingface/datatrove/issues/248
  srun_cmd = (
+ "unset $(env | awk -F= '/^SLURM_/ {{print $1}}') && "
  f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
  f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
  f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
@@ -900,6 +936,8 @@ class SlurmCodeGen(TaskCodeGen):
  msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
  print(msg, flush=True)
  returncodes = [returncode]
+ if int(constants.SKYLET_VERSION) >= 28:
+ job_lib.set_exit_codes({self.job_id!r}, returncodes)
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
  sys.exit(1)
  time.sleep(0.1)
@@ -189,6 +189,9 @@ SERIES_TO_DESCRIPTION = {
  'c2': 'Compute optimized',
  'c2d': 'C2D AMD Instance',
  'c3': 'C3 Instance',
+ 'c3d': 'C3D Instance',
+ 'c4': 'C4 Instance',
+ 'c4d': 'C4D Instance',
  'e2': 'E2 Instance',
  'f1': 'Micro Instance with burstable CPU',
  'g1': 'Small Instance with 1 VCPU',
@@ -376,8 +379,13 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
  is_cpu = True
  elif resource_group == 'RAM':
  is_memory = True
+ elif resource_group == 'LocalSSD':
+ # Ignore local SSD pricing for now, as we do not include disk
+ # pricing for instances for now.
+ # TODO(zhwu): Handle local SSD pricing.
+ pass
  else:
- assert resource_group == 'N1Standard'
+ assert resource_group == 'N1Standard', (resource_group, sku)
  if 'Core' in description:
  is_cpu = True
  elif 'Ram' in description:
@@ -180,7 +180,7 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
  presets (List[PresetInfo]): A list of PresetInfo objects to write.
  output_file (str): The path to the output CSV file.
  """
- os.makedirs(os.path.dirname(output_file))
+ os.makedirs(os.path.dirname(output_file), exist_ok=True)
  # Set up the CSV writer to output to stdout
  with open(output_file, 'w', encoding='utf-8') as out:
  header = [
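Note on the exist_ok change above: os.makedirs() raises FileExistsError when the target directory already exists unless exist_ok=True is passed, so re-running the fetcher against an existing output directory previously failed. A minimal sketch (the path name is only an example):

    import os

    out_dir = '/tmp/nebius_catalog'      # hypothetical output directory
    os.makedirs(out_dir, exist_ok=True)  # safe on first run and on reruns
    # Without exist_ok=True, a second call would raise FileExistsError:
    # os.makedirs(out_dir)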
@@ -50,7 +50,7 @@ if __name__ == '__main__':
  ('num_gpus', 'AcceleratorCount'), ('cpu_cores', 'vCPUs'),
  ('cpu_ram', 'MemoryGiB'), ('gpu_name', 'GpuInfo'),
  ('search.totalHour', 'Price'), ('min_bid', 'SpotPrice'),
- ('geolocation', 'Region'))
+ ('geolocation', 'Region'), ('hosting_type', 'HostingType'))

  # Vast has a wide variety of machines, some of
  # which will have less diskspace and network
@@ -138,7 +138,9 @@ if __name__ == '__main__':
  maxBid = max([x.get('SpotPrice') for x in toList])
  for instance in toList:
- stub = f'{instance["InstanceType"]} {instance["Region"][-2:]}'
+ hosting_type = instance.get('HostingType', 0)
+ stub = (f'{instance["InstanceType"]} '
+ f'{instance["Region"][-2:]} {hosting_type}')
  if stub in seen:
  printstub = f'{stub}#print'
  if printstub not in seen:
@@ -7,22 +7,33 @@ query instance types and pricing information for Seeweb.
  import typing
  from typing import Dict, List, Optional, Tuple

- import pandas as pd
-
+ from sky.adaptors import common as adaptors_common
  from sky.catalog import common
  from sky.utils import resources_utils
  from sky.utils import ux_utils

  if typing.TYPE_CHECKING:
+ import pandas as pd
+
  from sky.clouds import cloud
+ else:
+ pd = adaptors_common.LazyImport('pandas')

  _PULL_FREQUENCY_HOURS = 8
- _df = common.read_catalog('seeweb/vms.csv',
- pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+ _df = None
+
+
+ def _get_df():
+ """Get the dataframe, loading it lazily if needed."""
+ global _df
+ if _df is None:
+ _df = common.read_catalog('seeweb/vms.csv',
+ pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+ return _df


  def instance_type_exists(instance_type: str) -> bool:
- result = common.instance_type_exists_impl(_df, instance_type)
+ result = common.instance_type_exists_impl(_get_df(), instance_type)
  return result
@@ -33,7 +44,7 @@ def validate_region_zone(
  with ux_utils.print_exception_no_traceback():
  raise ValueError('Seeweb does not support zones.')

- result = common.validate_region_zone_impl('Seeweb', _df, region, zone)
+ result = common.validate_region_zone_impl('Seeweb', _get_df(), region, zone)
  return result
@@ -46,14 +57,15 @@ def get_hourly_cost(instance_type: str,
  with ux_utils.print_exception_no_traceback():
  raise ValueError('Seeweb does not support zones.')

- result = common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
- zone)
+ result = common.get_hourly_cost_impl(_get_df(), instance_type, use_spot,
+ region, zone)
  return result


  def get_vcpus_mem_from_instance_type(
  instance_type: str) -> Tuple[Optional[float], Optional[float]]:
- result = common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
+ result = common.get_vcpus_mem_from_instance_type_impl(
+ _get_df(), instance_type)
  return result
@@ -64,7 +76,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
  region: Optional[str] = None,
  zone: Optional[str] = None) -> Optional[str]:
  del disk_tier # unused
- result = common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory,
+ result = common.get_instance_type_for_cpus_mem_impl(_get_df(), cpus, memory,
  region, zone)
  return result
@@ -72,7 +84,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
  def get_accelerators_from_instance_type(
  instance_type: str) -> Optional[Dict[str, int]]:
  # Filter the dataframe for the specific instance type
- df_filtered = _df[_df['InstanceType'] == instance_type]
+ df = _get_df()
+ df_filtered = df[df['InstanceType'] == instance_type]
  if df_filtered.empty:
  return None
@@ -114,7 +127,7 @@ def get_instance_type_for_accelerator(
  with ux_utils.print_exception_no_traceback():
  raise ValueError('Seeweb does not support zones.')

- result = common.get_instance_type_for_accelerator_impl(df=_df,
+ result = common.get_instance_type_for_accelerator_impl(df=_get_df(),
  acc_name=acc_name,
  acc_count=acc_count,
  cpus=cpus,
@@ -126,7 +139,7 @@ def get_instance_type_for_accelerator(
  def regions() -> List['cloud.Region']:
- result = common.get_region_zones(_df, use_spot=False)
+ result = common.get_region_zones(_get_df(), use_spot=False)
  return result
@@ -135,7 +148,8 @@ def get_region_zones_for_instance_type(instance_type: str,
  ) -> List['cloud.Region']:
  """Returns a list of regions for a given instance type."""
  # Filter the dataframe for the specific instance type
- df_filtered = _df[_df['InstanceType'] == instance_type]
+ df = _get_df()
+ df_filtered = df[df['InstanceType'] == instance_type]
  if df_filtered.empty:
  return []
@@ -174,7 +188,8 @@ def list_accelerators(
  require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
  """Lists accelerators offered in Seeweb."""
  # Filter out rows with empty or null regions (indicating unavailability)
- df_filtered = _df.dropna(subset=['Region'])
+ df = _get_df()
+ df_filtered = df.dropna(subset=['Region'])
  df_filtered = df_filtered[df_filtered['Region'].str.strip() != '']

  result = common.list_accelerators_impl('Seeweb', df_filtered, gpus_only,
@@ -7,12 +7,15 @@ and can be used to query instance types and pricing information for Shadeform.
  import typing
  from typing import Dict, List, Optional, Tuple, Union

- import pandas as pd
-
+ from sky.adaptors import common as adaptors_common
  from sky.catalog import common

  if typing.TYPE_CHECKING:
+ import pandas as pd
+
  from sky.clouds import cloud
+ else:
+ pd = adaptors_common.LazyImport('pandas')

  # We'll use dynamic fetching, so no static CSV file to load
  _df = None
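Both catalog modules above now defer the pandas import (and, for Seeweb, the catalog read) to first use, so importing the module stays cheap. adaptors_common.LazyImport is SkyPilot's own helper; the following is only a rough, self-contained sketch of the same deferred-import idea, not the project's actual implementation:

    import importlib
    import typing

    class _LazyModule:
        # Hypothetical stand-in: the real module is imported on first attribute access.
        def __init__(self, name: str):
            self._name = name
            self._module = None

        def __getattr__(self, attr: str):
            if self._module is None:
                self._module = importlib.import_module(self._name)
            return getattr(self._module, attr)

    if typing.TYPE_CHECKING:
        import pandas as pd  # full types for static checkers only
    else:
        pd = _LazyModule('pandas')  # pandas is loaded on first pd.<attr> access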