skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,15 @@
1
1
  """Lambda Cloud."""
2
- import json
3
2
  import typing
4
- from typing import Dict, Iterator, List, Optional, Tuple
3
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
4
 
6
5
  import requests
7
6
 
8
7
  from sky import clouds
9
- from sky import status_lib
10
8
  from sky.clouds import service_catalog
11
- from sky.clouds.utils import lambda_utils
9
+ from sky.provision.lambda_cloud import lambda_utils
10
+ from sky.utils import registry
12
11
  from sky.utils import resources_utils
12
+ from sky.utils import status_lib
13
13
 
14
14
  if typing.TYPE_CHECKING:
15
15
  # Renaming to avoid shadowing variables.
@@ -21,7 +21,7 @@ _CREDENTIAL_FILES = [
21
21
  ]
22
22
 
23
23
 
24
- @clouds.CLOUD_REGISTRY.register
24
+ @registry.CLOUD_REGISTRY.register
25
25
  class Lambda(clouds.Cloud):
26
26
  """Lambda Labs GPU Cloud."""
27
27
 
@@ -37,10 +37,6 @@ class Lambda(clouds.Cloud):
37
37
  _CLOUD_UNSUPPORTED_FEATURES = {
38
38
  clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.',
39
39
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.',
40
- clouds.CloudImplementationFeatures.DOCKER_IMAGE: (
41
- f'Docker image is currently not supported on {_REPR}. '
42
- 'You can try running docker command inside the `run` section in task.yaml.'
43
- ),
44
40
  clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
45
41
  clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
46
42
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
@@ -48,6 +44,9 @@ class Lambda(clouds.Cloud):
48
44
  clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
49
45
  }
50
46
 
47
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
48
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
49
+
51
50
  @classmethod
52
51
  def _unsupported_features_for_resources(
53
52
  cls, resources: 'resources_lib.Resources'
@@ -123,10 +122,10 @@ class Lambda(clouds.Cloud):
123
122
 
124
123
  @classmethod
125
124
  def get_default_instance_type(
126
- cls,
127
- cpus: Optional[str] = None,
128
- memory: Optional[str] = None,
129
- disk_tier: Optional[resources_utils.DiskTier] = None
125
+ cls,
126
+ cpus: Optional[str] = None,
127
+ memory: Optional[str] = None,
128
+ disk_tier: Optional['resources_utils.DiskTier'] = None
130
129
  ) -> Optional[str]:
131
130
  return service_catalog.get_default_instance_type(cpus=cpus,
132
131
  memory=memory,
@@ -137,7 +136,7 @@ class Lambda(clouds.Cloud):
137
136
  def get_accelerators_from_instance_type(
138
137
  cls,
139
138
  instance_type: str,
140
- ) -> Optional[Dict[str, int]]:
139
+ ) -> Optional[Dict[str, Union[int, float]]]:
141
140
  return service_catalog.get_accelerators_from_instance_type(
142
141
  instance_type, clouds='lambda')
143
142
 
@@ -156,34 +155,43 @@ class Lambda(clouds.Cloud):
156
155
  def make_deploy_resources_variables(
157
156
  self,
158
157
  resources: 'resources_lib.Resources',
159
- cluster_name_on_cloud: str,
158
+ cluster_name: 'resources_utils.ClusterName',
160
159
  region: 'clouds.Region',
161
160
  zones: Optional[List['clouds.Zone']],
161
+ num_nodes: int,
162
162
  dryrun: bool = False) -> Dict[str, Optional[str]]:
163
- del cluster_name_on_cloud, dryrun # Unused.
163
+ del cluster_name, dryrun # Unused.
164
164
  assert zones is None, 'Lambda does not support zones.'
165
165
 
166
166
  r = resources
167
167
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
168
- if acc_dict is not None:
169
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
170
- else:
171
- custom_resources = None
168
+ custom_resources = resources_utils.make_ray_custom_resources_str(
169
+ acc_dict)
172
170
 
173
- return {
171
+ resources_vars = {
174
172
  'instance_type': resources.instance_type,
175
173
  'custom_resources': custom_resources,
176
174
  'region': region.name,
177
175
  }
178
176
 
177
+ if acc_dict is not None:
178
+ # Lambda cloud's docker runtime information does not contain
179
+ # 'nvidia-container-runtime', causing no GPU option is added to
180
+ # the docker run command. We patch this by adding it here.
181
+ resources_vars['docker_run_options'] = ['--gpus all']
182
+
183
+ return resources_vars
184
+
179
185
  def _get_feasible_launchable_resources(
180
186
  self, resources: 'resources_lib.Resources'
181
- ) -> Tuple[List['resources_lib.Resources'], List[str]]:
187
+ ) -> 'resources_utils.FeasibleResources':
182
188
  if resources.instance_type is not None:
183
189
  assert resources.is_launchable(), resources
184
190
  # Accelerators are part of the instance type in Lambda Cloud
185
191
  resources = resources.copy(accelerators=None)
186
- return ([resources], [])
192
+ # TODO: Add hints to all return values in this method to help
193
+ # users understand why the resources are not launchable.
194
+ return resources_utils.FeasibleResources([resources], [], None)
187
195
 
188
196
  def _make(instance_list):
189
197
  resource_list = []
@@ -209,9 +217,10 @@ class Lambda(clouds.Cloud):
209
217
  memory=resources.memory,
210
218
  disk_tier=resources.disk_tier)
211
219
  if default_instance_type is None:
212
- return ([], [])
220
+ return resources_utils.FeasibleResources([], [], None)
213
221
  else:
214
- return (_make([default_instance_type]), [])
222
+ return resources_utils.FeasibleResources(
223
+ _make([default_instance_type]), [], None)
215
224
 
216
225
  assert len(accelerators) == 1, resources
217
226
  acc, acc_count = list(accelerators.items())[0]
@@ -226,8 +235,10 @@ class Lambda(clouds.Cloud):
226
235
  zone=resources.zone,
227
236
  clouds='lambda')
228
237
  if instance_list is None:
229
- return ([], fuzzy_candidate_list)
230
- return (_make(instance_list), fuzzy_candidate_list)
238
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
239
+ None)
240
+ return resources_utils.FeasibleResources(_make(instance_list),
241
+ fuzzy_candidate_list, None)
231
242
 
232
243
  @classmethod
233
244
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
@@ -253,8 +264,8 @@ class Lambda(clouds.Cloud):
253
264
  }
254
265
 
255
266
  @classmethod
256
- def get_current_user_identity(cls) -> Optional[List[str]]:
257
- # TODO(ewzeng): Implement get_current_user_identity for Lambda
267
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
268
+ # TODO(ewzeng): Implement get_user_identities for Lambda
258
269
  return None
259
270
 
260
271
  def instance_type_exists(self, instance_type: str) -> bool:
sky/clouds/nebius.py ADDED
@@ -0,0 +1,297 @@
1
+ """ Nebius Cloud. """
2
+ import logging
3
+ import typing
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
+
6
+ from sky import clouds
7
+ from sky.adaptors import nebius
8
+ from sky.clouds import service_catalog
9
+ from sky.utils import registry
10
+ from sky.utils import resources_utils
11
+
12
+ if typing.TYPE_CHECKING:
13
+ from sky import resources as resources_lib
14
+
15
+ _CREDENTIAL_FILES = [
16
+ # credential files for Nebius
17
+ nebius.NEBIUS_TENANT_ID_FILENAME,
18
+ nebius.NEBIUS_IAM_TOKEN_FILENAME,
19
+ nebius.NEBIUS_PROJECT_ID_FILENAME,
20
+ ]
21
+
22
+
23
+ @registry.CLOUD_REGISTRY.register
24
+ class Nebius(clouds.Cloud):
25
+ """Nebius GPU Cloud"""
26
+ _REPR = 'Nebius'
27
+ _CLOUD_UNSUPPORTED_FEATURES = {
28
+ clouds.CloudImplementationFeatures.AUTO_TERMINATE:
29
+ ('Autodown and Autostop not supported. Can\'t delete disk.'),
30
+ # Autostop functionality can be implemented, but currently,
31
+ # there is only a single flag for both autostop and autodown.
32
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE:
33
+ ('Spot is not supported, as Nebius API does not implement spot.'),
34
+ clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
35
+ (f'Migrating disk is currently not supported on {_REPR}.'),
36
+ clouds.CloudImplementationFeatures.DOCKER_IMAGE:
37
+ (f'Docker image is currently not supported on {_REPR}. '
38
+ 'You can try running docker command inside the '
39
+ '`run` section in task.yaml.'),
40
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
41
+ (f'Custom disk tier is currently not supported on {_REPR}.'),
42
+ }
43
+ # Nebius maximum instance name length defined as <= 63 as a hostname length
44
+ # 63 - 8 - 5 = 50 characters since
45
+ # we add 4 character from UUID to make uniq `-xxxx`
46
+ # our provisioner adds additional `-worker`.
47
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 50
48
+ _regions: List[clouds.Region] = []
49
+
50
+ # Using the latest SkyPilot provisioner API to provision and check status.
51
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
52
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
53
+
54
+ @classmethod
55
+ def _unsupported_features_for_resources(
56
+ cls, resources: 'resources_lib.Resources'
57
+ ) -> Dict[clouds.CloudImplementationFeatures, str]:
58
+ del resources # unused
59
+ return cls._CLOUD_UNSUPPORTED_FEATURES
60
+
61
+ @classmethod
62
+ def _max_cluster_name_length(cls) -> Optional[int]:
63
+ return cls._MAX_CLUSTER_NAME_LEN_LIMIT
64
+
65
+ @classmethod
66
+ def regions_with_offering(cls, instance_type: str,
67
+ accelerators: Optional[Dict[str, int]],
68
+ use_spot: bool, region: Optional[str],
69
+ zone: Optional[str]) -> List[clouds.Region]:
70
+ assert zone is None, 'Nebius does not support zones.'
71
+ del accelerators, zone # unused
72
+ if use_spot:
73
+ return []
74
+ regions = service_catalog.get_region_zones_for_instance_type(
75
+ instance_type, use_spot, 'nebius')
76
+
77
+ if region is not None:
78
+ regions = [r for r in regions if r.name == region]
79
+ return regions
80
+
81
+ @classmethod
82
+ def get_vcpus_mem_from_instance_type(
83
+ cls,
84
+ instance_type: str,
85
+ ) -> Tuple[Optional[float], Optional[float]]:
86
+ return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
87
+ clouds='nebius')
88
+
89
+ @classmethod
90
+ def zones_provision_loop(
91
+ cls,
92
+ *,
93
+ region: str,
94
+ num_nodes: int,
95
+ instance_type: str,
96
+ accelerators: Optional[Dict[str, int]] = None,
97
+ use_spot: bool = False,
98
+ ) -> Iterator[None]:
99
+ del num_nodes # unused
100
+ regions = cls.regions_with_offering(instance_type,
101
+ accelerators,
102
+ use_spot,
103
+ region=region,
104
+ zone=None)
105
+ for r in regions:
106
+ assert r.zones is None, r
107
+ yield r.zones
108
+
109
+ def instance_type_to_hourly_cost(self,
110
+ instance_type: str,
111
+ use_spot: bool,
112
+ region: Optional[str] = None,
113
+ zone: Optional[str] = None) -> float:
114
+ return service_catalog.get_hourly_cost(instance_type,
115
+ use_spot=use_spot,
116
+ region=region,
117
+ zone=zone,
118
+ clouds='nebius')
119
+
120
+ def accelerators_to_hourly_cost(self,
121
+ accelerators: Dict[str, int],
122
+ use_spot: bool,
123
+ region: Optional[str] = None,
124
+ zone: Optional[str] = None) -> float:
125
+ """Returns the hourly cost of the accelerators, in dollars/hour."""
126
+ del accelerators, use_spot, region, zone # unused
127
+ return 0.0
128
+
129
+ def get_egress_cost(self, num_gigabytes: float) -> float:
130
+ return 0.0
131
+
132
+ def __repr__(self):
133
+ return self._REPR
134
+
135
+ def is_same_cloud(self, other: clouds.Cloud) -> bool:
136
+ # Returns true if the two clouds are the same cloud type.
137
+ return isinstance(other, Nebius)
138
+
139
+ @classmethod
140
+ def get_default_instance_type(
141
+ cls,
142
+ cpus: Optional[str] = None,
143
+ memory: Optional[str] = None,
144
+ disk_tier: Optional[resources_utils.DiskTier] = None
145
+ ) -> Optional[str]:
146
+ """Returns the default instance type for Nebius."""
147
+ return service_catalog.get_default_instance_type(cpus=cpus,
148
+ memory=memory,
149
+ disk_tier=disk_tier,
150
+ clouds='nebius')
151
+
152
+ @classmethod
153
+ def get_accelerators_from_instance_type(
154
+ cls,
155
+ instance_type: str,
156
+ ) -> Optional[Dict[str, Union[int, float]]]:
157
+ return service_catalog.get_accelerators_from_instance_type(
158
+ instance_type, clouds='nebius')
159
+
160
+ @classmethod
161
+ def get_zone_shell_cmd(cls) -> Optional[str]:
162
+ return None
163
+
164
+ def make_deploy_resources_variables(
165
+ self,
166
+ resources: 'resources_lib.Resources',
167
+ cluster_name: resources_utils.ClusterName,
168
+ region: 'clouds.Region',
169
+ zones: Optional[List['clouds.Zone']],
170
+ num_nodes: int,
171
+ dryrun: bool = False) -> Dict[str, Optional[str]]:
172
+ del dryrun, cluster_name
173
+ assert zones is None, ('Nebius does not support zones', zones)
174
+
175
+ r = resources
176
+ acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
177
+ custom_resources = resources_utils.make_ray_custom_resources_str(
178
+ acc_dict)
179
+ platform, _ = resources.instance_type.split('_')
180
+
181
+ if platform in ('cpu-d3', 'cpu-e2'):
182
+ image_family = 'ubuntu22.04-driverless'
183
+ elif platform in ('gpu-h100-sxm', 'gpu-h200-sxm', 'gpu-l40s-a'):
184
+ image_family = 'ubuntu22.04-cuda12'
185
+ else:
186
+ raise RuntimeError('Unsupported instance type for Nebius cloud:'
187
+ f' {resources.instance_type}')
188
+ return {
189
+ 'instance_type': resources.instance_type,
190
+ 'custom_resources': custom_resources,
191
+ 'region': region.name,
192
+ 'image_id': image_family,
193
+ # Nebius does not support specific zones.
194
+ 'zones': None,
195
+ }
196
+
197
+ def _get_feasible_launchable_resources(
198
+ self, resources: 'resources_lib.Resources'
199
+ ) -> 'resources_utils.FeasibleResources':
200
+ """Returns a list of feasible resources for the given resources."""
201
+ if resources.instance_type is not None:
202
+ assert resources.is_launchable(), resources
203
+ resources = resources.copy(accelerators=None)
204
+ return resources_utils.FeasibleResources([resources], [], None)
205
+
206
+ def _make(instance_list):
207
+ resource_list = []
208
+ for instance_type in instance_list:
209
+ r = resources.copy(
210
+ cloud=Nebius(),
211
+ instance_type=instance_type,
212
+ accelerators=None,
213
+ cpus=None,
214
+ )
215
+ resource_list.append(r)
216
+ return resource_list
217
+
218
+ # Currently, handle a filter on accelerators only.
219
+ accelerators = resources.accelerators
220
+ if accelerators is None:
221
+ # Return a default instance type
222
+ default_instance_type = Nebius.get_default_instance_type(
223
+ cpus=resources.cpus,
224
+ memory=resources.memory,
225
+ disk_tier=resources.disk_tier)
226
+ if default_instance_type is None:
227
+ # TODO: Add hints to all return values in this method to help
228
+ # users understand why the resources are not launchable.
229
+ return resources_utils.FeasibleResources([], [], None)
230
+ else:
231
+ return resources_utils.FeasibleResources(
232
+ _make([default_instance_type]), [], None)
233
+
234
+ assert len(accelerators) == 1, resources
235
+ acc, acc_count = list(accelerators.items())[0]
236
+ (instance_list, fuzzy_candidate_list
237
+ ) = service_catalog.get_instance_type_for_accelerator(
238
+ acc,
239
+ acc_count,
240
+ use_spot=resources.use_spot,
241
+ cpus=resources.cpus,
242
+ region=resources.region,
243
+ zone=resources.zone,
244
+ clouds='nebius')
245
+ if instance_list is None:
246
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
247
+ None)
248
+ return resources_utils.FeasibleResources(_make(instance_list),
249
+ fuzzy_candidate_list, None)
250
+
251
+ @classmethod
252
+ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
253
+ """ Verify that the user has valid credentials for Nebius. """
254
+ logging.debug('Nebius cloud check credentials')
255
+ token = nebius.get_iam_token()
256
+ token_msg = (' Credentials can be set up by running: \n'\
257
+ f' $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n') # pylint: disable=line-too-long
258
+ tenant_msg = (' Copy your tenat ID from the web console and save it to file \n' # pylint: disable=line-too-long
259
+ f' $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n' # pylint: disable=line-too-long
260
+ ' Or if you have 1 tenant you can run:\n' # pylint: disable=line-too-long
261
+ f' $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n') # pylint: disable=line-too-long
262
+ if token is None:
263
+ return False, f'{token_msg}'
264
+ sdk = nebius.sdk()
265
+ tenant_id = nebius.get_tenant_id()
266
+ if tenant_id is None:
267
+ return False, f'{tenant_msg}'
268
+ try:
269
+ service = nebius.iam().ProjectServiceClient(sdk)
270
+ service.list(
271
+ nebius.iam().ListProjectsRequest(parent_id=tenant_id)).wait()
272
+ except nebius.request_error() as e:
273
+ return False, (
274
+ f'{e.status} \n' # First line is indented by 4 spaces
275
+ f'{token_msg}'
276
+ f'{tenant_msg}')
277
+ return True, None
278
+
279
+ def get_credential_file_mounts(self) -> Dict[str, str]:
280
+ return {
281
+ f'~/.nebius/{filename}': f'~/.nebius/{filename}'
282
+ for filename in _CREDENTIAL_FILES
283
+ }
284
+
285
+ @classmethod
286
+ def get_current_user_identity(cls) -> Optional[List[str]]:
287
+ # NOTE: used for very advanced SkyPilot functionality
288
+ # Can implement later if desired
289
+ return None
290
+
291
+ def instance_type_exists(self, instance_type: str) -> bool:
292
+ return service_catalog.instance_type_exists(instance_type, 'nebius')
293
+
294
+ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
295
+ return service_catalog.validate_region_zone(region,
296
+ zone,
297
+ clouds='nebius')