skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/do.py ADDED
@@ -0,0 +1,313 @@
1
+ """ Digital Ocean Cloud. """
2
+
3
+ import json
4
+ import typing
5
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
+
7
+ from sky import clouds
8
+ from sky.adaptors import do
9
+ from sky.clouds import service_catalog
10
+ from sky.provision.do import utils as do_utils
11
+ from sky.utils import registry
12
+ from sky.utils import resources_utils
13
+
14
+ if typing.TYPE_CHECKING:
15
+ from sky import resources as resources_lib
16
+
17
+ _CREDENTIAL_FILE = 'config.yaml'
18
+
19
+
20
@registry.CLOUD_REGISTRY.register(aliases=['digitalocean'])
class DO(clouds.Cloud):
    """Digital Ocean Cloud.

    Catalog-backed queries (regions, prices, instance types) are delegated
    to `service_catalog` with `clouds='DO'`; credential checks and image
    lookups go through the client returned by `do_utils.client()`.
    """

    _REPR = 'DO'
    # Features this cloud never supports, mapped to the reason shown to users.
    _CLOUD_UNSUPPORTED_FEATURES = {
        clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
            'Migrating '
            f'disk is not supported in {_REPR}.',
        clouds.CloudImplementationFeatures.SPOT_INSTANCE:
            'Spot instances are '
            f'not supported in {_REPR}.',
        clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
            'Custom disk tiers'
            f' is not supported in {_REPR}.',
    }
    # DO maximum node name length defined as <= 255
    # https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
    # 255 - 8 = 247 characters since
    # our provisioner adds additional `-worker`.
    _MAX_CLUSTER_NAME_LEN_LIMIT = 247
    _regions: List[clouds.Region] = []

    # Using the latest SkyPilot provisioner API to provision and check status.
    PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
    STATUS_VERSION = clouds.StatusVersion.SKYPILOT

    @classmethod
    def _unsupported_features_for_resources(
        cls, resources: 'resources_lib.Resources'
    ) -> Dict[clouds.CloudImplementationFeatures, str]:
        """The features not supported based on the resources provided.

        This method is used by check_features_are_supported() to check if the
        cloud implementation supports all the requested features.

        Returns:
            A dict of {feature: reason} for the features not supported by the
            cloud implementation.
        """
        del resources  # unused: the unsupported set is the same for all.
        return cls._CLOUD_UNSUPPORTED_FEATURES

    @classmethod
    def _max_cluster_name_length(cls) -> Optional[int]:
        """Returns the maximum cluster name length allowed for DO."""
        return cls._MAX_CLUSTER_NAME_LEN_LIMIT

    @classmethod
    def regions_with_offering(
        cls,
        instance_type: str,
        accelerators: Optional[Dict[str, int]],
        use_spot: bool,
        region: Optional[str],
        zone: Optional[str],
    ) -> List[clouds.Region]:
        """Returns the regions that offer `instance_type`.

        Args:
            instance_type: The instance type to look up in the catalog.
            accelerators: Unused.
            use_spot: If True, an empty list is returned.
            region: If given, only the region with this name is kept.
            zone: Must be None.

        Returns:
            The matching regions; possibly empty.
        """
        assert zone is None, 'DO does not support zones.'
        del accelerators, zone  # unused
        if use_spot:
            return []
        regions = service_catalog.get_region_zones_for_instance_type(
            instance_type, use_spot, 'DO')
        if region is not None:
            regions = [r for r in regions if r.name == region]
        return regions

    @classmethod
    def get_vcpus_mem_from_instance_type(
        cls,
        instance_type: str,
    ) -> Tuple[Optional[float], Optional[float]]:
        """Returns the catalog's (vCPUs, memory) for `instance_type`."""
        return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
                                                                clouds='DO')

    @classmethod
    def zones_provision_loop(
        cls,
        *,
        region: str,
        num_nodes: int,
        instance_type: str,
        accelerators: Optional[Dict[str, int]] = None,
        use_spot: bool = False,
    ) -> Iterator[None]:
        """Yields once per region offering `instance_type` in `region`.

        Every yielded value is None: each matching region is asserted to
        have no zones, and its (None) `zones` attribute is what is yielded.
        """
        del num_nodes  # unused
        regions = cls.regions_with_offering(instance_type,
                                            accelerators,
                                            use_spot,
                                            region=region,
                                            zone=None)
        for r in regions:
            assert r.zones is None, r
            yield r.zones

    def instance_type_to_hourly_cost(
        self,
        instance_type: str,
        use_spot: bool,
        region: Optional[str] = None,
        zone: Optional[str] = None,
    ) -> float:
        """Returns the catalog hourly cost of `instance_type`."""
        return service_catalog.get_hourly_cost(
            instance_type,
            use_spot=use_spot,
            region=region,
            zone=zone,
            clouds='DO',
        )

    def accelerators_to_hourly_cost(
        self,
        accelerators: Dict[str, int],
        use_spot: bool,
        region: Optional[str] = None,
        zone: Optional[str] = None,
    ) -> float:
        """Returns the hourly cost of the accelerators, in dollars/hour."""
        # The accelerator price is included in the instance price.
        del accelerators, use_spot, region, zone  # unused
        return 0.0

    def get_egress_cost(self, num_gigabytes: float) -> float:
        """Returns 0.0; no egress cost is modeled for DO."""
        return 0.0

    def __repr__(self):
        return self._REPR

    @classmethod
    def get_default_instance_type(
        cls,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        disk_tier: Optional[resources_utils.DiskTier] = None,
    ) -> Optional[str]:
        """Returns the default instance type for DO."""
        return service_catalog.get_default_instance_type(cpus=cpus,
                                                         memory=memory,
                                                         disk_tier=disk_tier,
                                                         clouds='DO')

    @classmethod
    def get_accelerators_from_instance_type(
            cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
        """Returns the catalog's accelerators for `instance_type`, if any."""
        return service_catalog.get_accelerators_from_instance_type(
            instance_type, clouds='DO')

    @classmethod
    def get_zone_shell_cmd(cls) -> Optional[str]:
        """Returns None; no zone shell command is provided for DO."""
        return None

    def make_deploy_resources_variables(
            self,
            resources: 'resources_lib.Resources',
            cluster_name: resources_utils.ClusterName,
            region: 'clouds.Region',
            zones: Optional[List['clouds.Zone']],
            num_nodes: int,
            dryrun: bool = False) -> Dict[str, Optional[str]]:
        """Builds the variables describing the resources to deploy.

        Args:
            resources: The resources to deploy.
            cluster_name: Unused.
            region: The target region; its name is emitted as `region`.
            zones: Unused.
            num_nodes: Unused.
            dryrun: Unused.

        Returns:
            A dict with `instance_type`, `custom_resources` and `region`,
            plus `image_id` when a non-docker image is specified.
        """
        del zones, dryrun, cluster_name, num_nodes  # unused

        acc_dict = self.get_accelerators_from_instance_type(
            resources.instance_type)
        # NOTE(review): other clouds in this release appear to build this
        # string via resources_utils.make_ray_custom_resources_str(acc_dict);
        # consider switching for consistency once its behavior is confirmed.
        if acc_dict is not None:
            custom_resources = json.dumps(acc_dict, separators=(',', ':'))
        else:
            custom_resources = None

        image_id = None
        if (resources.image_id is not None and
                resources.extract_docker_image() is None):
            # `image_id` is keyed by region name; a None key is used when
            # present, otherwise the target region must have an entry.
            if None in resources.image_id:
                image_id = resources.image_id[None]
            else:
                assert region.name in resources.image_id
                image_id = resources.image_id[region.name]
        return {
            'instance_type': resources.instance_type,
            'custom_resources': custom_resources,
            'region': region.name,
            # Only emit `image_id` when one was resolved.
            **({
                'image_id': image_id
            } if image_id else {})
        }

    def _get_feasible_launchable_resources(
        self, resources: 'resources_lib.Resources'
    ) -> resources_utils.FeasibleResources:
        """Returns a list of feasible resources for the given resources."""
        if resources.use_spot:
            # TODO: Add hints to all return values in this method to help
            # users understand why the resources are not launchable.
            return resources_utils.FeasibleResources([], [], None)
        if resources.instance_type is not None:
            assert resources.is_launchable(), resources
            resources = resources.copy(accelerators=None)
            return resources_utils.FeasibleResources([resources], [], None)

        def _make(instance_list):
            """Copies `resources` once per instance type, pinned to DO."""
            resource_list = []
            for instance_type in instance_list:
                r = resources.copy(
                    cloud=DO(),
                    instance_type=instance_type,
                    accelerators=None,
                    cpus=None,
                )
                resource_list.append(r)
            return resource_list

        # Currently, handle a filter on accelerators only.
        accelerators = resources.accelerators
        if accelerators is None:
            # Return a default instance type
            default_instance_type = DO.get_default_instance_type(
                cpus=resources.cpus,
                memory=resources.memory,
                disk_tier=resources.disk_tier)
            if default_instance_type is None:
                return resources_utils.FeasibleResources([], [], None)
            else:
                return resources_utils.FeasibleResources(
                    _make([default_instance_type]), [], None)

        assert len(accelerators) == 1, resources
        acc, acc_count = list(accelerators.items())[0]
        (instance_list, fuzzy_candidate_list) = (
            service_catalog.get_instance_type_for_accelerator(
                acc,
                acc_count,
                use_spot=resources.use_spot,
                cpus=resources.cpus,
                memory=resources.memory,
                region=resources.region,
                zone=resources.zone,
                clouds='DO',
            ))
        if instance_list is None:
            return resources_utils.FeasibleResources([], fuzzy_candidate_list,
                                                     None)
        return resources_utils.FeasibleResources(_make(instance_list),
                                                 fuzzy_candidate_list, None)

    @classmethod
    def check_credentials(cls) -> Tuple[bool, Optional[str]]:
        """Verify that the user has valid credentials for DO.

        Returns:
            (True, None) on success; otherwise (False, reason).
        """

        try:
            # Fails with ImportError when the DO dependency is unavailable.
            do.exceptions()
        except ImportError as err:
            return False, str(err)

        try:
            # Listing droplets exercises the API client with the
            # configured credentials.
            do_utils.client().droplets.list()
        except do.exceptions().HttpResponseError as err:
            return False, str(err)
        except do_utils.DigitalOceanError as err:
            return False, str(err)

        return True, None

    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Returns the credential file mount, or {} if no client is available.

        The mapping is from `~/.config/doctl/config.yaml` to
        `do_utils.CREDENTIALS_PATH`.
        """
        try:
            # Only called to confirm a client can be created; the
            # CREDENTIALS_PATH lookup stays inside the try as in the original.
            do_utils.client()
            return {
                f'~/.config/doctl/{_CREDENTIAL_FILE}': do_utils.CREDENTIALS_PATH
            }
        except do_utils.DigitalOceanError:
            return {}

    @classmethod
    def get_current_user_identity(cls) -> Optional[List[str]]:
        """Returns None; user identity is not implemented for DO."""
        # NOTE: used for very advanced SkyPilot functionality
        # Can implement later if desired
        # NOTE(review): other clouds in this release expose this hook as
        # `get_user_identities`; confirm which name the base class calls.
        return None

    @classmethod
    def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
        """Returns the size of image `image_id` in gigabytes.

        Args:
            image_id: The DO image to look up.
            region: Unused.

        Raises:
            do_utils.DigitalOceanError: If the request fails with an HTTP
                error, or the response lacks the image size.
        """
        del region  # unused
        try:
            response = do_utils.client().images.get(image_id=image_id)
            return response['image']['size_gigabytes']
        except do.exceptions().HttpResponseError as err:
            # `response` is never bound when the request itself raises, so
            # the message must name `image_id` (referencing `response` here
            # would raise UnboundLocalError and mask the real failure).
            raise do_utils.DigitalOceanError(
                'HTTP error while retrieving size of '
                f'image_id {image_id}: {err.error.message}') from err
        except KeyError as err:
            raise do_utils.DigitalOceanError(
                f'No image_id `{image_id}` found') from err

    def instance_type_exists(self, instance_type: str) -> bool:
        """Returns whether `instance_type` exists in the DO catalog."""
        return service_catalog.instance_type_exists(instance_type, 'DO')

    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
        """Validates `region`/`zone` against the DO catalog."""
        return service_catalog.validate_region_zone(region, zone, clouds='DO')
sky/clouds/fluidstack.py CHANGED
@@ -1,28 +1,28 @@
1
1
  """Fluidstack Cloud."""
2
- import json
3
2
  import os
4
3
  import typing
5
- from typing import Dict, Iterator, List, Optional, Tuple
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  import requests
8
7
 
9
8
  from sky import clouds
10
- from sky import status_lib
11
9
  from sky.clouds import service_catalog
12
10
  from sky.provision.fluidstack import fluidstack_utils
11
+ from sky.utils import registry
12
+ from sky.utils import resources_utils
13
+ from sky.utils import status_lib
13
14
  from sky.utils.resources_utils import DiskTier
14
15
 
15
16
  _CREDENTIAL_FILES = [
16
17
  # credential files for FluidStack,
17
- fluidstack_utils.FLUIDSTACK_API_KEY_PATH,
18
- fluidstack_utils.FLUIDSTACK_API_TOKEN_PATH,
18
+ fluidstack_utils.FLUIDSTACK_API_KEY_PATH
19
19
  ]
20
20
  if typing.TYPE_CHECKING:
21
21
  # Renaming to avoid shadowing variables.
22
22
  from sky import resources as resources_lib
23
23
 
24
24
 
25
- @clouds.CLOUD_REGISTRY.register
25
+ @registry.CLOUD_REGISTRY.register
26
26
  class Fluidstack(clouds.Cloud):
27
27
  """FluidStack GPU Cloud."""
28
28
 
@@ -155,7 +155,7 @@ class Fluidstack(clouds.Cloud):
155
155
  def get_accelerators_from_instance_type(
156
156
  cls,
157
157
  instance_type: str,
158
- ) -> Optional[Dict[str, int]]:
158
+ ) -> Optional[Dict[str, Union[int, float]]]:
159
159
  return service_catalog.get_accelerators_from_instance_type(
160
160
  instance_type, clouds='fluidstack')
161
161
 
@@ -174,9 +174,10 @@ class Fluidstack(clouds.Cloud):
174
174
  def make_deploy_resources_variables(
175
175
  self,
176
176
  resources: 'resources_lib.Resources',
177
- cluster_name_on_cloud: str,
177
+ cluster_name: resources_utils.ClusterName,
178
178
  region: clouds.Region,
179
179
  zones: Optional[List[clouds.Zone]],
180
+ num_nodes: int,
180
181
  dryrun: bool = False,
181
182
  ) -> Dict[str, Optional[str]]:
182
183
 
@@ -184,24 +185,14 @@ class Fluidstack(clouds.Cloud):
184
185
 
185
186
  r = resources
186
187
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
187
- if acc_dict is not None:
188
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
189
- else:
190
- custom_resources = None
191
- cuda_installation_commands = """
192
- sudo wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -O /usr/local/cuda-keyring_1.1-1_all.deb;
193
- sudo dpkg -i /usr/local/cuda-keyring_1.1-1_all.deb;
194
- sudo apt-get update;
195
- sudo apt-get -y install cuda-toolkit-12-3;
196
- sudo apt-get install -y cuda-drivers;
197
- sudo apt-get install -y python3-pip;
198
- nvidia-smi || sudo reboot;"""
188
+ custom_resources = resources_utils.make_ray_custom_resources_str(
189
+ acc_dict)
190
+
199
191
  return {
200
192
  'instance_type': resources.instance_type,
201
193
  'custom_resources': custom_resources,
202
194
  'region': region.name,
203
- 'fluidstack_username': self.default_username(region.name),
204
- 'cuda_installation_commands': cuda_installation_commands,
195
+ 'fluidstack_username': 'ubuntu',
205
196
  }
206
197
 
207
198
  def _get_feasible_launchable_resources(
@@ -210,7 +201,9 @@ class Fluidstack(clouds.Cloud):
210
201
  assert resources.is_launchable(), resources
211
202
  # Accelerators are part of the instance type in Fluidstack Cloud
212
203
  resources = resources.copy(accelerators=None)
213
- return ([resources], [])
204
+ # TODO: Add hints to all return values in this method to help
205
+ # users understand why the resources are not launchable.
206
+ return resources_utils.FeasibleResources([resources], [], None)
214
207
 
215
208
  def _make(instance_list):
216
209
  resource_list = []
@@ -238,9 +231,10 @@ class Fluidstack(clouds.Cloud):
238
231
  memory=resources.memory,
239
232
  disk_tier=resources.disk_tier)
240
233
  if default_instance_type is None:
241
- return ([], [])
234
+ return resources_utils.FeasibleResources([], [], None)
242
235
  else:
243
- return (_make([default_instance_type]), [])
236
+ return resources_utils.FeasibleResources(
237
+ _make([default_instance_type]), [], None)
244
238
 
245
239
  assert len(accelerators) == 1, resources
246
240
  acc, acc_count = list(accelerators.items())[0]
@@ -255,8 +249,10 @@ class Fluidstack(clouds.Cloud):
255
249
  zone=resources.zone,
256
250
  clouds='fluidstack')
257
251
  if instance_list is None:
258
- return ([], fuzzy_candidate_list)
259
- return (_make(instance_list), fuzzy_candidate_list)
252
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
253
+ None)
254
+ return resources_utils.FeasibleResources(_make(instance_list),
255
+ fuzzy_candidate_list, None)
260
256
 
261
257
  @classmethod
262
258
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
@@ -264,17 +260,26 @@ class Fluidstack(clouds.Cloud):
264
260
  try:
265
261
  assert os.path.exists(
266
262
  os.path.expanduser(fluidstack_utils.FLUIDSTACK_API_KEY_PATH))
267
- assert os.path.exists(
268
- os.path.expanduser(fluidstack_utils.FLUIDSTACK_API_TOKEN_PATH))
263
+
264
+ with open(os.path.expanduser(
265
+ fluidstack_utils.FLUIDSTACK_API_KEY_PATH),
266
+ encoding='UTF-8') as f:
267
+ api_key = f.read().strip()
268
+ if not api_key.startswith('api_key'):
269
+ return False, ('Invalid FluidStack API key format. '
270
+ 'To configure credentials, go to:\n '
271
+ ' https://dashboard.fluidstack.io \n '
272
+ 'to obtain an API key, '
273
+ 'then add save the contents '
274
+ 'to ~/.fluidstack/api_key \n')
269
275
  except AssertionError:
270
- return False, (
271
- 'Failed to access FluidStack Cloud'
272
- ' with credentials. '
273
- 'To configure credentials, go to:\n '
274
- ' https://console.fluidstack.io \n '
275
- 'to obtain an API key and API Token, '
276
- 'then add save the contents '
277
- 'to ~/.fluidstack/api_key and ~/.fluidstack/api_token \n')
276
+ return False, ('Failed to access FluidStack Cloud'
277
+ ' with credentials. '
278
+ 'To configure credentials, go to:\n '
279
+ ' https://dashboard.fluidstack.io \n '
280
+ 'to obtain an API key, '
281
+ 'then add save the contents '
282
+ 'to ~/.fluidstack/api_key \n')
278
283
  except requests.exceptions.ConnectionError:
279
284
  return False, ('Failed to verify FluidStack Cloud credentials. '
280
285
  'Check your network connection '
@@ -285,8 +290,8 @@ class Fluidstack(clouds.Cloud):
285
290
  return {filename: filename for filename in _CREDENTIAL_FILES}
286
291
 
287
292
  @classmethod
288
- def get_current_user_identity(cls) -> Optional[List[str]]:
289
- # TODO(mjibril): Implement get_current_user_identity for Fluidstack
293
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
294
+ # TODO(mjibril): Implement get_active_user_identity for Fluidstack
290
295
  return None
291
296
 
292
297
  def instance_type_exists(self, instance_type: str) -> bool:
@@ -297,21 +302,6 @@ class Fluidstack(clouds.Cloud):
297
302
  zone,
298
303
  clouds='fluidstack')
299
304
 
300
- @classmethod
301
- def default_username(cls, region: str) -> str:
302
- return {
303
- 'norway_2_eu': 'ubuntu',
304
- 'calgary_1_canada': 'ubuntu',
305
- 'norway_3_eu': 'ubuntu',
306
- 'norway_4_eu': 'ubuntu',
307
- 'india_2': 'root',
308
- 'nevada_1_usa': 'fsuser',
309
- 'generic_1_canada': 'ubuntu',
310
- 'iceland_1_eu': 'ubuntu',
311
- 'new_york_1_usa': 'fsuser',
312
- 'illinois_1_usa': 'fsuser'
313
- }.get(region, 'ubuntu')
314
-
315
305
  @classmethod
316
306
  def query_status(
317
307
  cls,