skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,285 @@
1
+ """Nebius instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from sky import sky_logging
6
+ from sky.provision import common
7
+ from sky.provision.nebius import utils
8
+ from sky.utils import common_utils
9
+ from sky.utils import status_lib
10
+ from sky.utils import ux_utils
11
+
12
+ PENDING_STATUS = ['STARTING', 'DELETING', 'STOPPING']
13
+
14
+ MAX_RETRIES_TO_LAUNCH = 120 # Maximum number of retries
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+
19
def _filter_instances(region: str,
                      cluster_name_on_cloud: str,
                      status_filters: Optional[List[str]],
                      head_only: bool = False) -> Dict[str, Any]:
    """Return this cluster's instances, keyed by instance id.

    Args:
        region: Region whose project is searched.
        cluster_name_on_cloud: Cluster name prefix used to match instances.
        status_filters: Keep only instances whose status is listed; None
            keeps every status.
        head_only: When True, drop instances whose name ends in '-worker'.
    """
    project_id = utils.get_project_by_region(region)
    prefix = f'{cluster_name_on_cloud}-'
    selected: Dict[str, Any] = {}
    for instance_id, instance in utils.list_instances(project_id).items():
        if (status_filters is not None and
                instance['status'] not in status_filters):
            continue
        name = instance['name']
        # Only instances named '<cluster>-...' belong to this cluster.
        if not name or not name.startswith(prefix):
            continue
        if head_only and name.endswith('-worker'):
            continue
        selected[instance_id] = instance
    return selected
38
+
39
+
40
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
41
+ head_instance_id = None
42
+ for inst_id, inst in instances.items():
43
+ if inst['name'].endswith('-head'):
44
+ head_instance_id = inst_id
45
+ break
46
+ return head_instance_id
47
+
48
+
49
def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
    """Block until no cluster instance is in a transitional state.

    Polls every utils.POLL_INTERVAL seconds, up to MAX_RETRIES_TO_LAUNCH
    attempts.

    Raises:
        TimeoutError: if instances are still pending after all retries.
    """
    for attempt in range(MAX_RETRIES_TO_LAUNCH):
        pending = _filter_instances(region, cluster_name_on_cloud,
                                    PENDING_STATUS)
        if not pending:
            return
        logger.info(f'Waiting for {len(pending)} instances to be ready '
                    f'(Attempt {attempt + 1}/{MAX_RETRIES_TO_LAUNCH}).')
        time.sleep(utils.POLL_INTERVAL)
    raise TimeoutError(f'Exceeded maximum retries '
                       f'({MAX_RETRIES_TO_LAUNCH * utils.POLL_INTERVAL}'
                       f' seconds) while waiting for instances'
                       f' to be ready.')
66
+
67
+
68
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Runs instances for the given cluster."""
    # Never act while instances are still in a transitional state.
    _wait_until_no_pending(region, cluster_name_on_cloud)
    running_instances = _filter_instances(region, cluster_name_on_cloud,
                                          ['RUNNING'])
    head_instance_id = _get_head_instance_id(running_instances)
    to_start_count = config.count - len(running_instances)
    if to_start_count < 0:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(running_instances)} nodes, but {config.count} are required.')
    if to_start_count == 0:
        # Fully provisioned already -- just sanity-check the head exists.
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head node.')
        logger.info(f'Cluster {cluster_name_on_cloud} already has '
                    f'{len(running_instances)} nodes, no need to start more.')
        return common.ProvisionRecord(provider_name='nebius',
                                      cluster_name=cluster_name_on_cloud,
                                      region=region,
                                      zone=None,
                                      head_instance_id=head_instance_id,
                                      resumed_instance_ids=[],
                                      created_instance_ids=[])

    created_instance_ids: List[str] = []
    resumed_instance_ids: List[str] = []
    stopped_instances = _filter_instances(region, cluster_name_on_cloud,
                                          ['STOPPED'])
    if config.resume_stopped_nodes and len(stopped_instances) > to_start_count:
        raise RuntimeError(
            'The number of running/stopped/stopping instances combined '
            f'({len(stopped_instances) + len(running_instances)}) in '
            f'cluster "{cluster_name_on_cloud}" is greater than the '
            f'number requested by the user ({config.count}). '
            'This is likely a resource leak. '
            'Use "sky down" to terminate the cluster.')

    # Prefer resuming stopped nodes over launching new ones.
    for stopped_instance_id, stopped_info in stopped_instances.items():
        if to_start_count <= 0:
            break
        try:
            utils.start(stopped_instance_id)
            resumed_instance_ids.append(stopped_instance_id)
            to_start_count -= 1
            if stopped_info['name'].endswith('-head'):
                head_instance_id = stopped_instance_id
        except Exception as e:  # pylint: disable=broad-except
            logger.warning(f'Start instance error: {e}')
            raise
        time.sleep(utils.POLL_INTERVAL)  # to avoid fake STOPPED status
        logger.info(f'Started instance {stopped_instance_id}.')

    # Launch whatever is still missing; the first node becomes the head.
    while to_start_count > 0:
        node_type = 'head' if head_instance_id is None else 'worker'
        try:
            # InstanceType is encoded as '<platform>_<preset>'.
            platform, preset = config.node_config['InstanceType'].split('_')
            instance_id = utils.launch(
                cluster_name_on_cloud=cluster_name_on_cloud,
                node_type=node_type,
                platform=platform,
                preset=preset,
                region=region,
                image_family=config.node_config['ImageId'],
                disk_size=config.node_config['DiskSize'],
                user_data=config.node_config['UserData'])
        except Exception as e:  # pylint: disable=broad-except
            logger.warning(f'run_instances error: {e}')
            raise
        logger.info(f'Launched instance {instance_id}.')
        created_instance_ids.append(instance_id)
        if head_instance_id is None:
            head_instance_id = instance_id
        to_start_count -= 1
    assert head_instance_id is not None, 'head_instance_id should not be None'
    return common.ProvisionRecord(provider_name='nebius',
                                  cluster_name=cluster_name_on_cloud,
                                  region=region,
                                  zone=None,
                                  head_instance_id=head_instance_id,
                                  resumed_instance_ids=resumed_instance_ids,
                                  created_instance_ids=created_instance_ids)
151
+
152
+
153
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Wait until the cluster's instances are consistent with *state*.

    Blocks until no instance is pending, then verifies that no instance
    contradicts the expected cluster state (a stopped instance for UP, a
    running instance for STOPPED).

    Raises:
        RuntimeError: if an instance contradicts the expected state.
    """
    _wait_until_no_pending(region, cluster_name_on_cloud)
    if state is not None:
        if state == status_lib.ClusterStatus.UP:
            stopped_instances = _filter_instances(region, cluster_name_on_cloud,
                                                  ['STOPPED'])
            if stopped_instances:
                raise RuntimeError(
                    f'Cluster {cluster_name_on_cloud} is in UP state, but '
                    f'{len(stopped_instances)} instances are stopped.')
        if state == status_lib.ClusterStatus.STOPPED:
            # BUG FIX: the filter was misspelled 'RUNNIG', so running
            # instances were never detected here and the check below was
            # effectively dead.
            running_instances = _filter_instances(region, cluster_name_on_cloud,
                                                  ['RUNNING'])

            if running_instances:
                raise RuntimeError(
                    f'Cluster {cluster_name_on_cloud} is in STOPPED state, but '
                    f'{len(running_instances)} instances are running.')
172
+
173
+
174
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Stop all RUNNING instances of the cluster.

    Args:
        cluster_name_on_cloud: Cluster whose instances are stopped.
        provider_config: Must contain the 'region' key.
        worker_only: When True, leave the head node running.
    """
    assert provider_config is not None
    exist_instances = _filter_instances(provider_config['region'],
                                        cluster_name_on_cloud, ['RUNNING'])
    for instance_id, instance in exist_instances.items():
        # BUG FIX: previously the *instance id* (the dict key) was tested
        # with endswith('-head'); only the instance *name* carries that
        # suffix (see terminate_instances), so worker_only also stopped
        # the head node.
        if worker_only and instance['name'].endswith('-head'):
            continue
        utils.stop(instance_id)
186
+
187
+
188
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """See sky/provision/__init__.py"""

    assert provider_config is not None
    region = provider_config['region']
    instances = _filter_instances(region,
                                  cluster_name_on_cloud,
                                  status_filters=None)
    for inst_id, inst in instances.items():
        logger.debug(f'Terminating instance {inst_id}: {inst}')
        # Keep the head node alive when only workers are torn down.
        if worker_only and inst['name'].endswith('-head'):
            continue
        try:
            utils.remove(inst_id)
        except Exception as e:  # pylint: disable=broad-except
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(
                    f'Failed to terminate instance {inst_id}: '
                    f'{common_utils.format_exception(e, use_bracket=False)}'
                ) from e
    # Also drop the (possibly existing) GPU cluster object for this cluster.
    utils.delete_cluster(cluster_name_on_cloud, region)
212
+
213
+
214
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Build a ClusterInfo snapshot from the currently RUNNING instances.

    Waits for transitional states to settle first; asserts a head node
    exists among the running instances.
    """
    _wait_until_no_pending(region, cluster_name_on_cloud)
    running_instances = _filter_instances(region, cluster_name_on_cloud,
                                          ['RUNNING'])
    head_instance_id = None
    instances: Dict[str, List[common.InstanceInfo]] = {}
    for instance_id, instance_info in running_instances.items():
        if instance_info['name'].endswith('-head'):
            head_instance_id = instance_id
        instances[instance_id] = [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip=instance_info['internal_ip'],
                external_ip=instance_info['external_ip'],
                tags={},
            )
        ]
    assert head_instance_id is not None
    return common.ClusterInfo(
        instances=instances,
        head_instance_id=head_instance_id,
        provider_name='nebius',
        provider_config=provider_config,
    )
241
+
242
+
243
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
    """See sky/provision/__init__.py"""
    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
    instances = _filter_instances(provider_config['region'],
                                  cluster_name_on_cloud, None)

    status_map = {
        'STARTING': status_lib.ClusterStatus.INIT,
        'RUNNING': status_lib.ClusterStatus.UP,
        'STOPPED': status_lib.ClusterStatus.STOPPED,
        'STOPPING': status_lib.ClusterStatus.STOPPED,
        'DELETING': status_lib.ClusterStatus.STOPPED,
    }
    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
    for inst_id, inst in instances.items():
        # BUG FIX: a direct status_map[...] lookup raised KeyError for any
        # provider status not in the map and made the `status is None`
        # filter below unreachable. `.get()` maps unknown states to None
        # (treated as terminated), which the filter then honors.
        status = status_map.get(inst['status'])
        if non_terminated_only and status is None:
            continue
        statuses[inst_id] = status
    return statuses
267
+
268
+
269
def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """See sky/provision/__init__.py"""
    # Nothing to configure: Nebius instances are reachable on all ports.
    logger.debug(f'Skip opening ports {ports} for Nebius instances, as all '
                 'ports are open by default.')
    del cluster_name_on_cloud, provider_config, ports
278
+
279
+
280
def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """No-op: all ports are open by default on Nebius, nothing to clean up."""
    del cluster_name_on_cloud, ports, provider_config  # Unused.
@@ -0,0 +1,318 @@
1
+ """Nebius library wrapper for SkyPilot."""
2
+ import time
3
+ from typing import Any, Dict
4
+ import uuid
5
+
6
+ from sky import sky_logging
7
+ from sky.adaptors import nebius
8
+ from sky.utils import common_utils
9
+
10
+ logger = sky_logging.init_logger(__name__)
11
+
12
+ POLL_INTERVAL = 5
13
+
14
+
15
def retry(func):
    """Decorator retrying *func* on transient Nebius query errors.

    Retries up to 3 times (sleeping POLL_INTERVAL seconds between
    attempts) before re-raising the QueryError.
    """

    def wrapper(*args, **kwargs):
        """Wrapper for retrying a function."""
        cnt = 0
        while True:
            try:
                return func(*args, **kwargs)
            except nebius.nebius.error.QueryError as e:
                if cnt >= 3:
                    raise
                logger.warning('Retrying for exception: '
                               f'{common_utils.format_exception(e)}.')
                time.sleep(POLL_INTERVAL)
                # BUG FIX: the attempt counter was never incremented, so
                # the `cnt >= 3` guard never fired and a persistently
                # failing call retried forever.
                cnt += 1

    return wrapper
32
+
33
+
34
def get_project_by_region(region: str) -> str:
    """Resolve the Nebius project id that serves *region*.

    There is exactly one project per region and no API field exposes a
    project's region, so the region is deduced from a marker embedded in
    the project id: project-e00xxxxxxxxxxxxxx, where characters 8-10 are
    'e00' for eu-north1 and 'e01' for eu-west1.
    """
    service = nebius.iam().ProjectServiceClient(nebius.sdk())
    projects = service.list(nebius.iam().ListProjectsRequest(
        parent_id=nebius.get_tenant_id())).wait()
    region_ids = {'eu-north1': 'e00', 'eu-west1': 'e01'}
    # TODO(SalikovAlex): fix when info about region will be in projects list
    # Currently, Nebius cloud supports 2 regions, enumerated manually above.
    # Reference: https://docs.nebius.com/overview/regions

    # Honor an explicitly configured project id when it matches the region.
    preferable_project_id = nebius.get_project_id()
    if preferable_project_id is not None:
        if preferable_project_id[8:11] == region_ids[region]:
            return preferable_project_id
        logger.warning(
            f'Can\'t use customized NEBIUS_PROJECT_ID ({preferable_project_id})'
            f' for region {region}. Please check if the project ID is correct.')
    # Otherwise scan the tenant's projects for one carrying the marker.
    for project in projects.items:
        if project.metadata.id[8:11] == region_ids[region]:
            return project.metadata.id
    raise Exception(f'No project found for region "{region}".')
64
+
65
+
66
def get_or_create_gpu_cluster(name: str, region: str) -> str:
    """Creates a GPU cluster.
    When creating a GPU cluster, select an InfiniBand fabric for it:

    fabric-2, fabric-3 or fabric-4 for projects in the eu-north1 region.
    fabric-5 for projects in the eu-west1 region.

    https://docs.nebius.com/compute/clusters/gpu
    """
    project_id = get_project_by_region(region)
    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
    try:
        # Reuse the cluster if one with this name already exists.
        existing = service.get_by_name(nebius.nebius_common().GetByNameRequest(
            parent_id=project_id,
            name=name,
        )).wait()
        return existing.metadata.id
    except nebius.request_error() as no_cluster_found_error:
        fabric = {'eu-north1': 'fabric-4', 'eu-west1': 'fabric-5'}.get(region)
        if fabric is None:
            raise RuntimeError(
                f'Unsupported region {region}.') from no_cluster_found_error
        created = service.create(nebius.compute().CreateGpuClusterRequest(
            metadata=nebius.nebius_common().ResourceMetadata(
                parent_id=project_id,
                name=name,
            ),
            spec=nebius.compute().GpuClusterSpec(
                infiniband_fabric=fabric))).wait()
        return created.resource_id
100
+
101
+
102
def delete_cluster(name: str, region: str) -> None:
    """Delete a GPU cluster."""
    project_id = get_project_by_region(region)
    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
    try:
        # Look the cluster up by name; a missing cluster raises below.
        found = service.get_by_name(nebius.nebius_common().GetByNameRequest(
            parent_id=project_id,
            name=name,
        )).wait()
        cluster_id = found.metadata.id
        logger.debug(f'Found GPU Cluster : {cluster_id}.')
        service.delete(
            nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
        logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
    except nebius.request_error():
        # Nothing with this name exists -- deletion is a no-op.
        logger.debug('GPU Cluster does not exist.')
118
+
119
+
120
def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
    """Lists instances associated with API key."""
    service = nebius.compute().InstanceServiceClient(nebius.sdk())
    listing = service.list(
        nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()

    instance_dict: Dict[str, Dict[str, Any]] = {}
    for instance in listing.items:
        info: Dict[str, Any] = {
            'status': instance.status.state.name,
            'name': instance.metadata.name,
        }
        nics = instance.status.network_interfaces
        if nics:
            # Addresses come back in CIDR form; keep only the IP part.
            info['external_ip'] = (
                nics[0].public_ip_address.address.split('/')[0])
            info['internal_ip'] = nics[0].ip_address.address.split('/')[0]
        instance_dict[instance.metadata.id] = info

    return instance_dict
141
+
142
+
143
def stop(instance_id: str) -> None:
    """Stop the instance and block until it reports STOPPED.

    Raises:
        TimeoutError: if the instance is not STOPPED after the retry budget.
    """
    service = nebius.compute().InstanceServiceClient(nebius.sdk())
    service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
    for _ in range(nebius.MAX_RETRIES_TO_INSTANCE_STOP):
        poll_service = nebius.compute().InstanceServiceClient(nebius.sdk())
        instance = poll_service.get(nebius.compute().GetInstanceRequest(
            id=instance_id,)).wait()
        if instance.status.state.name == 'STOPPED':
            return
        time.sleep(POLL_INTERVAL)
        logger.debug(f'Waiting for instance {instance_id} stopping.')
    raise TimeoutError(
        f'Exceeded maximum retries '
        f'({nebius.MAX_RETRIES_TO_INSTANCE_STOP * POLL_INTERVAL}'
        f' seconds) while waiting for instance {instance_id}'
        f' to be stopped.')
163
+
164
+
165
def start(instance_id: str) -> None:
    """Start the instance and block until it reports RUNNING.

    Raises:
        TimeoutError: if the instance is not RUNNING after the retry budget.
    """
    service = nebius.compute().InstanceServiceClient(nebius.sdk())
    service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
    for _ in range(nebius.MAX_RETRIES_TO_INSTANCE_START):
        poll_service = nebius.compute().InstanceServiceClient(nebius.sdk())
        instance = poll_service.get(nebius.compute().GetInstanceRequest(
            id=instance_id,)).wait()
        if instance.status.state.name == 'RUNNING':
            return
        time.sleep(POLL_INTERVAL)
        logger.debug(f'Waiting for instance {instance_id} starting.')
    raise TimeoutError(
        f'Exceeded maximum retries '
        f'({nebius.MAX_RETRIES_TO_INSTANCE_START * POLL_INTERVAL}'
        f' seconds) while waiting for instance {instance_id}'
        f' to be ready.')
185
+
186
+
187
+ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
188
+ preset: str, region: str, image_family: str, disk_size: int,
189
+ user_data: str) -> str:
190
+ # Each node must have a unique name to avoid conflicts between
191
+ # multiple worker VMs. To ensure uniqueness, a UUID is appended
192
+ # to the node name.
193
+ instance_name = (f'{cluster_name_on_cloud}-'
194
+ f'{uuid.uuid4().hex[:4]}-{node_type}')
195
+ logger.debug(f'Launching instance: {instance_name}')
196
+
197
+ disk_name = 'disk-' + instance_name
198
+ cluster_id = None
199
+ # 8 GPU virtual machines can be grouped into a GPU cluster.
200
+ # The GPU clusters are built with InfiniBand secure high-speed networking.
201
+ # https://docs.nebius.com/compute/clusters/gpu
202
+ if platform in ('gpu-h100-sxm', 'gpu-h200-sxm'):
203
+ if preset == '8gpu-128vcpu-1600gb':
204
+ cluster_id = get_or_create_gpu_cluster(cluster_name_on_cloud,
205
+ region)
206
+
207
+ project_id = get_project_by_region(region)
208
+ service = nebius.compute().DiskServiceClient(nebius.sdk())
209
+ disk = service.create(nebius.compute().CreateDiskRequest(
210
+ metadata=nebius.nebius_common().ResourceMetadata(
211
+ parent_id=project_id,
212
+ name=disk_name,
213
+ ),
214
+ spec=nebius.compute().DiskSpec(
215
+ source_image_family=nebius.compute().SourceImageFamily(
216
+ image_family=image_family),
217
+ size_gibibytes=disk_size,
218
+ type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
219
+ ))).wait()
220
+ disk_id = disk.resource_id
221
+ retry_count = 0
222
+ while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
223
+ disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
224
+ parent_id=project_id,
225
+ name=disk_name,
226
+ )).wait()
227
+ if disk.status.state.name == 'READY':
228
+ break
229
+ logger.debug(f'Waiting for disk {disk_name} to be ready.')
230
+ time.sleep(POLL_INTERVAL)
231
+ retry_count += 1
232
+
233
+ if retry_count == nebius.MAX_RETRIES_TO_DISK_CREATE:
234
+ raise TimeoutError(
235
+ f'Exceeded maximum retries '
236
+ f'({nebius.MAX_RETRIES_TO_DISK_CREATE * POLL_INTERVAL}'
237
+ f' seconds) while waiting for disk {disk_name}'
238
+ f' to be ready.')
239
+
240
+ service = nebius.vpc().SubnetServiceClient(nebius.sdk())
241
+ sub_net = service.list(nebius.vpc().ListSubnetsRequest(
242
+ parent_id=project_id,)).wait()
243
+
244
+ service = nebius.compute().InstanceServiceClient(nebius.sdk())
245
+ service.create(nebius.compute().CreateInstanceRequest(
246
+ metadata=nebius.nebius_common().ResourceMetadata(
247
+ parent_id=project_id,
248
+ name=instance_name,
249
+ ),
250
+ spec=nebius.compute().InstanceSpec(
251
+ gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
252
+ if cluster_id is not None else None,
253
+ boot_disk=nebius.compute().AttachedDiskSpec(
254
+ attach_mode=nebius.compute(
255
+ ).AttachedDiskSpec.AttachMode.READ_WRITE,
256
+ existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
257
+ cloud_init_user_data=user_data,
258
+ resources=nebius.compute().ResourcesSpec(platform=platform,
259
+ preset=preset),
260
+ network_interfaces=[
261
+ nebius.compute().NetworkInterfaceSpec(
262
+ subnet_id=sub_net.items[0].metadata.id,
263
+ ip_address=nebius.compute().IPAddress(),
264
+ name='network-interface-0',
265
+ public_ip_address=nebius.compute().PublicIPAddress())
266
+ ]))).wait()
267
+ instance_id = ''
268
+ retry_count = 0
269
+ while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
270
+ service = nebius.compute().InstanceServiceClient(nebius.sdk())
271
+ instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
272
+ parent_id=project_id,
273
+ name=instance_name,
274
+ )).wait()
275
+ if instance.status.state.name == 'STARTING':
276
+ instance_id = instance.metadata.id
277
+ break
278
+ time.sleep(POLL_INTERVAL)
279
+ logger.debug(f'Waiting for instance {instance_name} start running.')
280
+ retry_count += 1
281
+
282
+ if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
283
+ raise TimeoutError(
284
+ f'Exceeded maximum retries '
285
+ f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
286
+ f' seconds) while waiting for instance {instance_name}'
287
+ f' to be ready.')
288
+ return instance_id
289
+
290
+
291
+ def remove(instance_id: str) -> None:
292
+ """Terminates the given instance."""
293
+ service = nebius.compute().InstanceServiceClient(nebius.sdk())
294
+ result = service.get(
295
+ nebius.compute().GetInstanceRequest(id=instance_id)).wait()
296
+ disk_id = result.spec.boot_disk.existing_disk.id
297
+ service.delete(
298
+ nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
299
+ retry_count = 0
300
+ # The instance begins deleting and attempts to delete the disk.
301
+ # Must wait until the disk is unlocked and becomes deletable.
302
+ while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
303
+ try:
304
+ service = nebius.compute().DiskServiceClient(nebius.sdk())
305
+ service.delete(
306
+ nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
307
+ break
308
+ except nebius.request_error():
309
+ logger.debug('Waiting for disk deletion.')
310
+ time.sleep(POLL_INTERVAL)
311
+ retry_count += 1
312
+
313
+ if retry_count == nebius.MAX_RETRIES_TO_DISK_DELETE:
314
+ raise TimeoutError(
315
+ f'Exceeded maximum retries '
316
+ f'({nebius.MAX_RETRIES_TO_DISK_DELETE * POLL_INTERVAL}'
317
+ f' seconds) while waiting for disk {disk_id}'
318
+ f' to be deleted.')
@@ -0,0 +1,15 @@
1
+ """OCI provisioner for SkyPilot.
2
+
3
+ History:
4
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
5
+ """
6
+
7
+ from sky.provision.oci.config import bootstrap_instances
8
+ from sky.provision.oci.instance import cleanup_ports
9
+ from sky.provision.oci.instance import get_cluster_info
10
+ from sky.provision.oci.instance import open_ports
11
+ from sky.provision.oci.instance import query_instances
12
+ from sky.provision.oci.instance import run_instances
13
+ from sky.provision.oci.instance import stop_instances
14
+ from sky.provision.oci.instance import terminate_instances
15
+ from sky.provision.oci.instance import wait_instances
@@ -0,0 +1,51 @@
1
+ """OCI configuration bootstrapping.
2
+
3
+ Creates the resource group and deploys the configuration template to OCI for
4
+ a cluster to be launched.
5
+
6
+ History:
7
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
8
+ """
9
+
10
+ from sky import exceptions
11
+ from sky import sky_logging
12
+ from sky.adaptors import oci as oci_adaptor
13
+ from sky.clouds.utils import oci_utils
14
+ from sky.provision import common
15
+ from sky.provision.oci.query_utils import query_helper
16
+
17
+ logger = sky_logging.init_logger(__name__)
18
+
19
+
20
+ @common.log_function_start_end
21
+ def bootstrap_instances(
22
+ region: str, cluster_name_on_cloud: str,
23
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
24
+ """See sky/provision/__init__.py"""
25
+ # OCI module import and oci client
26
+ oci_adaptor.get_core_client(region, oci_utils.oci_config.get_profile())
27
+
28
+ # Find / create a compartment for creating instances.
29
+ compartment = query_helper.find_compartment(region)
30
+
31
+ # Find the configured VCN, or create a new one.
32
+ vcn = query_helper.find_create_vcn_subnet(region)
33
+ if vcn is None:
34
+ # pylint: disable=line-too-long
35
+ raise exceptions.ResourcesUnavailableError(
36
+ 'Failed to create a new VCN, possibly you hit the resource limitation.'
37
+ )
38
+
39
+ node_config = config.node_config
40
+
41
+ # Subscribe the image if it is from Marketplace listing.
42
+ query_helper.subscribe_image(
43
+ compartment_id=compartment,
44
+ listing_id=node_config['AppCatalogListingId'],
45
+ resource_version=node_config['ResourceVersion'],
46
+ region=region,
47
+ )
48
+
49
+ logger.info(f'Using cluster name: {cluster_name_on_cloud}')
50
+
51
+ return config