skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,287 @@
1
+ """DigitalOcean instance provisioning."""
2
+
3
+ import time
4
+ from typing import Any, Dict, List, Optional
5
+ import uuid
6
+
7
+ from sky import sky_logging
8
+ from sky.provision import common
9
+ from sky.provision.do import constants
10
+ from sky.provision.do import utils
11
+ from sky.utils import status_lib
12
+
13
# Number of status polls that fit into a 60 second budget, given the
# provider's polling interval.
MAX_POLLS = 60 // constants.POLL_INTERVAL
# Powering instances on or off can take several minutes, so those operations
# get an 8x larger poll budget.
MAX_POLLS_FOR_UP_OR_STOP = MAX_POLLS * 8

logger = sky_logging.init_logger(__name__)
19
+
20
+
21
+ def _get_head_instance(
22
+ instances: Dict[str, Dict[str, Any]]) -> Optional[Dict[str, Any]]:
23
+ for instance_name, instance_meta in instances.items():
24
+ if instance_name.endswith('-head'):
25
+ return instance_meta
26
+ return None
27
+
28
+
29
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Runs instances for the given cluster.

    Steps, in order: wait until no instance of the cluster is in `new`
    status, power on every instance that is `off`, then create instances
    until `config.count` of them are `active`.

    Args:
        region: Region passed to `utils.create_instance` and recorded in the
            returned provision record.
        cluster_name_on_cloud: Cluster name used to look up the instances.
        config: Provision configuration; `config.count` is the required
            number of instances.

    Returns:
        A `common.ProvisionRecord` naming the head instance and listing the
        resumed and newly created instances.

    Raises:
        RuntimeError: If the cluster has more instances than `config.count`,
            or if restarting or creating instances does not complete within
            `MAX_POLLS_FOR_UP_OR_STOP` polls.
    """

    pending_status = ['new']
    # Snapshot taken before any waiting or restarting. It is reported as the
    # resumed instances when no new instance has to be created.
    newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
                                                     pending_status + ['off'])
    # Wait until no instance is left in a pending status.
    while True:
        instances = utils.filter_instances(cluster_name_on_cloud,
                                           pending_status)
        if not instances:
            break
        instance_statuses = [
            instance['status'] for instance in instances.values()
        ]
        logger.info(f'Waiting for {len(instances)} instances to be ready: '
                    f'{instance_statuses}')
        time.sleep(constants.POLL_INTERVAL)

    exist_instances = utils.filter_instances(cluster_name_on_cloud,
                                             status_filters=pending_status +
                                             ['active', 'off'])
    if len(exist_instances) > config.count:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(exist_instances)} nodes, but {config.count} are required.')

    # Power on every stopped instance, then poll until none is `off`.
    stopped_instances = utils.filter_instances(cluster_name_on_cloud,
                                               status_filters=['off'])
    for instance in stopped_instances.values():
        utils.start_instance(instance)
    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
        instances = utils.filter_instances(cluster_name_on_cloud, ['off'])
        if not instances:
            break
        num_stopped_instances = len(stopped_instances)
        num_restarted_instances = num_stopped_instances - len(instances)
        logger.info(
            f'Waiting for {num_restarted_instances}/{num_stopped_instances} '
            'stopped instances to be restarted.')
        time.sleep(constants.POLL_INTERVAL)
    else:
        msg = ('run_instances: Failed to restart all '
               'instances possibly due to capacity issue.')
        logger.warning(msg)
        raise RuntimeError(msg)

    exist_instances = utils.filter_instances(cluster_name_on_cloud,
                                             status_filters=['active'])
    head_instance = _get_head_instance(exist_instances)
    to_start_count = config.count - len(exist_instances)
    if to_start_count < 0:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(exist_instances)} nodes, but {config.count} are required.')
    if to_start_count == 0:
        if head_instance is None:
            # No active instance carries the `-head` suffix: promote the
            # first active instance by renaming it.
            promoted = list(exist_instances.values())[0]
            new_head_name = (
                f'{cluster_name_on_cloud}-{uuid.uuid4().hex[:4]}-head')
            utils.rename_instance(promoted, new_head_name)
            # The metadata fetched above still holds the old name; report
            # the requested new name as the head instance id.
            head_instance = {**promoted, 'name': new_head_name}
        assert head_instance is not None, ('`head_instance` should not be None')
        logger.info(f'Cluster {cluster_name_on_cloud} already has '
                    f'{len(exist_instances)} nodes, no need to start more.')
        return common.ProvisionRecord(
            provider_name='do',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=None,
            head_instance_id=head_instance['name'],
            resumed_instance_ids=list(newly_started_instances.keys()),
            created_instance_ids=[],
        )

    created_instances: List[Dict[str, Any]] = []
    for _ in range(to_start_count):
        # The first instance created while no head exists becomes the head.
        instance_type = 'head' if head_instance is None else 'worker'
        instance = utils.create_instance(
            region=region,
            cluster_name_on_cloud=cluster_name_on_cloud,
            instance_type=instance_type,
            config=config)
        logger.info(f'Launched instance {instance["name"]}.')
        created_instances.append(instance)
        if head_instance is None:
            head_instance = instance

    # Wait for instances to be ready.
    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
        instances = utils.filter_instances(cluster_name_on_cloud,
                                           status_filters=['active'])
        logger.info('Waiting for instances to be ready: '
                    f'({len(instances)}/{config.count}).')
        if len(instances) == config.count:
            break

        time.sleep(constants.POLL_INTERVAL)
    else:
        # Failed to launch config.count of instances after max retries
        msg = 'run_instances: Failed to create the instances'
        logger.warning(msg)
        raise RuntimeError(msg)
    assert head_instance is not None, 'head_instance should not be None'
    return common.ProvisionRecord(
        provider_name='do',
        cluster_name=cluster_name_on_cloud,
        region=region,
        zone=None,
        head_instance_id=head_instance['name'],
        resumed_instance_ids=list(stopped_instances.keys()),
        created_instance_ids=[
            instance['name'] for instance in created_instances
        ],
    )
143
+
144
+
145
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Does nothing; all arguments are ignored.

    `run_instances` already waits for the instances to be ready, so there is
    nothing left to wait for here.
    """
    # All arguments are intentionally unused.
    del region
    del cluster_name_on_cloud
    del state
149
+
150
+
151
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Requests a stop for the cluster's instances and waits until they stop.

    Args:
        cluster_name_on_cloud: Cluster name used to look up the instances.
        provider_config: Unused.
        worker_only: If True, instances whose name ends with '-head' are
            left untouched.

    Raises:
        RuntimeError: If some targeted instance is not in `off` status after
            `MAX_POLLS_FOR_UP_OR_STOP` polls.
    """
    del provider_config  # unused
    all_instances = utils.filter_instances(cluster_name_on_cloud,
                                           status_filters=None)

    # Request a stop on every targeted instance, remembering its name so the
    # wait below tracks exactly those instances. Counting all `off` instances
    # instead would let an already-stopped head satisfy the wait early.
    target_names: List[str] = []
    for instance_name, instance_meta in all_instances.items():
        if worker_only and instance_name.endswith('-head'):
            continue
        utils.stop_instance(instance_meta)
        target_names.append(instance_name)

    # Wait for the targeted instances to stop.
    pending = target_names
    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
        stopped = utils.filter_instances(cluster_name_on_cloud, ['off'])
        pending = [name for name in target_names if name not in stopped]
        if not pending:
            break
        time.sleep(constants.POLL_INTERVAL)
    else:
        raise RuntimeError(f'Maximum number of polls: '
                           f'{MAX_POLLS_FOR_UP_OR_STOP} reached. '
                           f'Instances {pending} are still not in '
                           'STOPPED status.')
179
+
180
+
181
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Destroys the cluster's instances and waits for them to disappear.

    Args:
        cluster_name_on_cloud: Cluster name used to look up the instances.
        provider_config: Unused.
        worker_only: If True, instances whose name ends with '-head' are
            kept, and one remaining instance is tolerated while waiting.

    Raises:
        RuntimeError: If instances are still listed after
            `MAX_POLLS_FOR_UP_OR_STOP` polls.
    """
    del provider_config  # unused
    found = utils.filter_instances(cluster_name_on_cloud, status_filters=None)
    for name, meta in found.items():
        logger.debug(f'Terminating instance {name}')
        is_head = name.endswith('-head')
        if not (worker_only and is_head):
            utils.down_instance(meta)

    # With `worker_only`, a single remaining instance is acceptable.
    allowed_remaining = 1 if worker_only else 0
    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
        remaining = utils.filter_instances(cluster_name_on_cloud,
                                           status_filters=None)
        if len(remaining) <= allowed_remaining:
            return
        time.sleep(constants.POLL_INTERVAL)

    msg = ('Failed to delete all instances')
    logger.warning(msg)
    raise RuntimeError(msg)
206
+
207
+
208
def get_cluster_info(
    region: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
) -> common.ClusterInfo:
    """Collects the active instances of a cluster into a `ClusterInfo`.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster name used to look up the instances.
        provider_config: Passed through to the returned `ClusterInfo`.

    Returns:
        A `common.ClusterInfo` with one `InstanceInfo` per active instance.
        The public IPv4 address is used for both the internal and the
        external IP.

    Raises:
        RuntimeError: If an active instance has no public IPv4 address.
        AssertionError: If no active instance name ends with '-head'.
    """
    del region  # unused
    running_instances = utils.filter_instances(cluster_name_on_cloud,
                                               ['active'])
    instances: Dict[str, List[common.InstanceInfo]] = {}
    head_instance: Optional[str] = None
    for instance_name, instance_meta in running_instances.items():
        if instance_name.endswith('-head'):
            head_instance = instance_name
        # Resolve the address per instance so that a missing public network
        # never falls back to the IP found for a previous instance.
        instance_ip = next((net['ip_address']
                            for net in instance_meta['networks']['v4']
                            if net['type'] == 'public'), None)
        if instance_ip is None:
            raise RuntimeError(
                f'Instance {instance_name} has no public IPv4 address.')
        instances[instance_name] = [
            common.InstanceInfo(
                instance_id=instance_meta['name'],
                internal_ip=instance_ip,
                external_ip=instance_ip,
                ssh_port=22,
                tags={},
            )
        ]

    assert head_instance is not None, 'no head instance found'
    return common.ClusterInfo(
        instances=instances,
        head_instance_id=head_instance,
        provider_name='do',
        provider_config=provider_config,
    )
242
+
243
+
244
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
    """Maps each instance of the cluster to a `ClusterStatus`.

    Args:
        cluster_name_on_cloud: Cluster name used to look up the instances.
        provider_config: Must not be None; otherwise unused here.
        non_terminated_only: Ignored, because terminated instances are not
            retrieved by the API.

    Returns:
        Mapping from instance name to cluster status.
    """
    # terminated instances are not retrieved by the
    # API making `non_terminated_only` argument moot.
    del non_terminated_only
    assert provider_config is not None, (cluster_name_on_cloud, provider_config)

    # Instance status string -> cluster status.
    do_status_to_cluster_status = {
        'new': status_lib.ClusterStatus.INIT,
        'archive': status_lib.ClusterStatus.INIT,
        'active': status_lib.ClusterStatus.UP,
        'off': status_lib.ClusterStatus.STOPPED,
    }
    found = utils.filter_instances(cluster_name_on_cloud, status_filters=None)
    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {
        meta['name']: do_status_to_cluster_status[meta['status']]
        for meta in found.values()
    }
    return statuses
268
+
269
+
270
def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Logs that no ports need to be opened; changes nothing."""
    del cluster_name_on_cloud, provider_config  # unused
    skip_message = (
        f'Skip opening ports {ports} for DigitalOcean instances, as all '
        'ports are open by default.')
    logger.debug(skip_message)
280
+
281
+
282
def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Does nothing; all arguments are ignored."""
    del cluster_name_on_cloud
    del ports
    del provider_config
@@ -0,0 +1,301 @@
1
+ """DigitalOcean API client wrapper for SkyPilot.
2
+
3
+ Example usage of `pydo` client library was mostly taken from here:
4
+ https://github.com/digitalocean/pydo/blob/main/examples/poc_droplets_volumes_sshkeys.py
5
+ """
6
+
7
+ import copy
8
+ import os
9
+ from typing import Any, Dict, List, Optional
10
+ import urllib
11
+ import uuid
12
+
13
+ from sky import sky_logging
14
+ from sky.adaptors import do
15
+ from sky.provision import common
16
+ from sky.provision import constants as provision_constants
17
+ from sky.provision.do import constants
18
+ from sky.utils import common_utils
19
+
20
logger = sky_logging.init_logger(__name__)

# Candidate locations of the `doctl` configuration file, checked in order.
POSSIBLE_CREDENTIALS_PATHS = [
    # OS X
    os.path.expanduser('~/Library/Application Support/doctl/config.yaml'),
    # Linux
    os.path.expanduser(
        os.path.join(os.getenv('XDG_CONFIG_HOME', '~/.config/'),
                     'doctl/config.yaml')),
]
INITIAL_BACKOFF_SECONDS = 10
MAX_BACKOFF_FACTOR = 10
MAX_ATTEMPTS = 6
# Name under which the SSH key is registered when it has to be created.
SSH_KEY_NAME_ON_DO = f'sky-key-{common_utils.get_user_hash()}'

# Default credentials path; `_init_client` overwrites it with the path it
# actually finds.
CREDENTIALS_PATH = '~/.config/doctl/config.yaml'
# Lazily initialised API client (see `client`).
_client = None
# Cached SSH key record (see `ssh_key_id`).
_ssh_key_id = None
37
+
38
+
39
class DigitalOceanError(Exception):
    """Error raised by this module's DigitalOcean client helpers."""
41
+
42
+
43
def _init_client():
    """Creates the module-level API client from the `doctl` config file.

    Looks for a config file in `POSSIBLE_CREDENTIALS_PATHS` (the last
    existing path wins) and stores it in `CREDENTIALS_PATH`. The top-level
    `access-token` is tried first, then every token under `auth-contexts`.
    A token is accepted when listing droplets with it succeeds.

    Returns:
        The client built from the first accepted token; it is also stored in
        the module-level `_client`.

    Raises:
        DigitalOceanError: If no config file exists or no token is accepted.
    """
    global _client, CREDENTIALS_PATH
    assert _client is None
    CREDENTIALS_PATH = None
    credentials_found = 0
    for path in POSSIBLE_CREDENTIALS_PATHS:
        if os.path.exists(path):
            CREDENTIALS_PATH = path
            credentials_found += 1
            logger.debug(f'Digital Ocean credential path found at {path}')
    if credentials_found > 1:
        logger.debug('more than 1 credential file found')
    if CREDENTIALS_PATH is None:
        raise DigitalOceanError(
            'no credentials file found from '
            f'the following paths {POSSIBLE_CREDENTIALS_PATHS}')

    # attempt default context
    credentials = common_utils.read_yaml(CREDENTIALS_PATH)
    default_token = credentials.get('access-token', None)
    if default_token is not None:
        try:
            test_client = do.pydo.Client(token=default_token)
            test_client.droplets.list()
            logger.debug('trying `default` context')
            _client = test_client
            return _client
        except do.exceptions().HttpResponseError:
            pass

    # Fall back to the named contexts; a missing section counts as empty.
    auth_contexts = credentials.get('auth-contexts', None) or {}
    for context, api_token in auth_contexts.items():
        try:
            test_client = do.pydo.Client(token=api_token)
            test_client.droplets.list()
            logger.debug(f'using {context} context')
            _client = test_client
            return _client
        except do.exceptions().HttpResponseError:
            continue

    # No token worked: fail here instead of handing back a None client.
    raise DigitalOceanError('no valid api tokens found try '
                            'setting a new API token with `doctl auth init`')
89
+
90
+
91
def client():
    """Returns the module-level API client, creating it on first use."""
    global _client
    if _client is not None:
        return _client
    _client = _init_client()
    return _client
96
+
97
+
98
def ssh_key_id(public_key: str):
    """Returns the SSH key record matching `public_key`, creating it if needed.

    The record is cached in the module-level `_ssh_key_id`; once it is set,
    it is returned without looking at `public_key` again.

    Args:
        public_key: Public key text compared against each listed key's
            'public_key' field.

    Returns:
        The SSH key record: either the matching listed key or the newly
        created one.

    Raises:
        DigitalOceanError: If listing the SSH keys fails.
    """
    global _ssh_key_id
    if _ssh_key_id is not None:
        return _ssh_key_id

    page = 1
    while True:
        try:
            resp = client().ssh_keys.list(per_page=50, page=page)
            for candidate in resp['ssh_keys']:
                if candidate['public_key'] == public_key:
                    _ssh_key_id = candidate
                    return _ssh_key_id
        except do.exceptions().HttpResponseError as err:
            raise DigitalOceanError(
                f'Error: {err.status_code} {err.reason}: '
                f'{err.error.message}') from err

        # Follow the 'next' link, if any, to find the next page number.
        links = resp['links']
        if not ('pages' in links and 'next' in links['pages']):
            break
        next_url = urllib.parse.urlparse(links['pages']['next'])
        page = int(urllib.parse.parse_qs(next_url.query)['page'][0])

    # No listed key matched: register the key under the SkyPilot key name.
    request = {
        'public_key': public_key,
        'name': SSH_KEY_NAME_ON_DO,
    }
    _ssh_key_id = client().ssh_keys.create(body=request)['ssh_key']
    return _ssh_key_id
129
+
130
+
131
def _create_volume(request: Dict[str, Any]) -> Dict[str, Any]:
    """Creates a volume from `request` and returns the 'volume' record.

    Raises:
        DigitalOceanError: If the API call fails.
    """
    try:
        return client().volumes.create(body=request)['volume']
    except do.exceptions().HttpResponseError as err:
        raise DigitalOceanError(
            f'Error: {err.status_code} {err.reason}: {err.error.message}'
        ) from err
141
+
142
+
143
def _create_droplet(request: Dict[str, Any]) -> Dict[str, Any]:
    """Creates a droplet from `request` and returns its fetched record.

    The droplet is fetched again by id after creation, and the 'droplet'
    entry of that second response is what gets returned.

    Raises:
        DigitalOceanError: If either API call fails.
    """
    try:
        created = client().droplets.create(body=request)
        new_id = created['droplet']['id']

        fetched = client().droplets.get(new_id)
        return fetched['droplet']
    except do.exceptions().HttpResponseError as err:
        raise DigitalOceanError(
            f'Error: {err.status_code} {err.reason}: {err.error.message}'
        ) from err
155
+
156
+
157
def create_instance(region: str, cluster_name_on_cloud: str, instance_type: str,
                    config: common.ProvisionConfig) -> Dict[str, Any]:
    """Creates an instance, creates a volume, and attaches the volume to it.

    Args:
        region (str): region for both the instance and the volume
        cluster_name_on_cloud (str): cluster name; used in the tags and as
            the prefix of the generated instance name
        instance_type (str): suffix of the generated instance name
        config (common.ProvisionConfig): provisioner configuration

    Returns:
        Dict[str, Any]: instance metadata

    Raises:
        DigitalOceanError: if one of the API calls fails
    """
    # sort tags by key to support deterministic unit test stubbing
    user_tags = dict(sorted(copy.deepcopy(config.tags).items()))
    tag_map = {
        'Name': cluster_name_on_cloud,
        provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
        provision_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud,
    }
    # User tags override the defaults above on key collisions.
    tag_map.update(user_tags)
    tags = [f'{key}:{value}' for key, value in tag_map.items()]

    size = config.node_config['InstanceType']
    default_image = constants.GPU_IMAGES.get(size, 'gpu-h100x1-base')
    image_id = config.node_config['ImageId']
    if image_id is None:
        image_id = default_image

    name_suffix = uuid.uuid4().hex[:4]
    instance_name = f'{cluster_name_on_cloud}-{name_suffix}-{instance_type}'

    key_record = ssh_key_id(config.authentication_config['ssh_public_key'])
    instance = _create_droplet({
        'name': instance_name,
        'region': region,
        'size': config.node_config['InstanceType'],
        'image': image_id,
        'ssh_keys': [key_record['fingerprint']],
        'tags': tags,
    })

    # The volume shares the instance's name, region and tags.
    volume = _create_volume({
        'size_gigabytes': config.node_config['DiskSize'],
        'name': instance_name,
        'region': region,
        'filesystem_type': 'ext4',
        'tags': tags
    })

    attach_request = {'type': 'attach', 'droplet_id': instance['id']}
    try:
        client().volume_actions.post_by_id(volume['id'], attach_request)
    except do.exceptions().HttpResponseError as err:
        raise DigitalOceanError(
            f'Error: {err.status_code} {err.reason}: {err.error.message}'
        ) from err
    logger.debug(f'{instance_name} created')
    return instance
217
+
218
+
219
def start_instance(instance: Dict[str, Any]):
    """Posts a 'power_on' action for the instance's id.

    Raises:
        DigitalOceanError: If the API call fails.
    """
    power_on_action = {'type': 'power_on'}
    try:
        client().droplet_actions.post(droplet_id=instance['id'],
                                      body=power_on_action)
    except do.exceptions().HttpResponseError as err:
        raise DigitalOceanError(
            f'Error: {err.status_code} {err.reason}: {err.error.message}'
        ) from err
227
+
228
+
229
def stop_instance(instance: Dict[str, Any]):
    """Posts a 'shutdown' action for the instance's id.

    Raises:
        DigitalOceanError: If the API call fails.
    """
    shutdown_action = {'type': 'shutdown'}
    try:
        client().droplet_actions.post(droplet_id=instance['id'],
                                      body=shutdown_action)
    except do.exceptions().HttpResponseError as err:
        raise DigitalOceanError(
            f'Error: {err.status_code} {err.reason}: {err.error.message}'
        ) from err
239
+
240
+
241
def down_instance(instance: Dict[str, Any]):
    """Destroys the instance together with its associated resources.

    An error whose message says that a destroy is already in progress is
    ignored.

    Raises:
        DigitalOceanError: If the API call fails for any other reason.
    """
    # We use dangerous destroy to atomically delete
    # block storage and instance for autodown
    try:
        client().droplets.destroy_with_associated_resources_dangerous(
            droplet_id=instance['id'], x_dangerous=True)
    except do.exceptions().HttpResponseError as err:
        already_destroying = (
            'a destroy is already in progress' in err.error.message)
        if not already_destroying:
            raise DigitalOceanError(
                f'Error: {err.status_code} {err.reason}: {err.error.message}'
            ) from err
253
+
254
+
255
def rename_instance(instance: Dict[str, Any], new_name: str):
    """Posts a 'rename' action that renames the instance to `new_name`.

    Args:
        instance: Instance metadata; only its 'id' is used.
        new_name: The new instance name.

    Raises:
        DigitalOceanError: If the API call fails.
    """
    try:
        # Submit the rename through the same `droplet_actions.post` call
        # that `start_instance` and `stop_instance` use for their actions.
        client().droplet_actions.post(droplet_id=instance['id'],
                                      body={
                                          'type': 'rename',
                                          'name': new_name
                                      })
    except do.exceptions().HttpResponseError as err:
        raise DigitalOceanError(
            f'Error: {err.status_code} {err.reason}: {err.error.message}'
        ) from err
266
+
267
+
268
def filter_instances(
        cluster_name_on_cloud: str,
        status_filters: Optional[List[str]] = None) -> Dict[str, Any]:
    """Lists the cluster's instances, optionally filtered by status.

    Args:
        cluster_name_on_cloud: Cluster name; instances are looked up by the
            SkyPilot cluster-name tag carrying this value.
        status_filters: Statuses to keep. None keeps every instance.

    Returns:
        Dict mapping instance name to instance metadata.

    Raises:
        DigitalOceanError: If listing the instances fails.
    """
    cluster_tag = (f'{provision_constants.TAG_SKYPILOT_CLUSTER_NAME}:'
                   f'{cluster_name_on_cloud}')
    matches: Dict[str, Any] = {}
    page = 1
    while True:
        try:
            resp = client().droplets.list(tag_name=cluster_tag,
                                          per_page=50,
                                          page=page)
            for droplet in resp['droplets']:
                keep = (status_filters is None or
                        droplet['status'] in status_filters)
                if keep:
                    matches[droplet['name']] = droplet
        except do.exceptions().HttpResponseError as err:
            raise DigitalOceanError(
                f'Error: {err.status_code} {err.reason}: {err.error.message}'
            ) from err

        # Follow the 'next' link, if any, to find the next page number.
        links = resp['links']
        if not ('pages' in links and 'next' in links['pages']):
            break
        next_url = urllib.parse.urlparse(links['pages']['next'])
        page = int(urllib.parse.parse_qs(next_url.query)['page'][0])
    return matches