skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,11 @@
1
+ """Lambda provisioner for SkyPilot."""
2
+
3
+ from sky.provision.lambda_cloud.config import bootstrap_instances
4
+ from sky.provision.lambda_cloud.instance import cleanup_ports
5
+ from sky.provision.lambda_cloud.instance import get_cluster_info
6
+ from sky.provision.lambda_cloud.instance import open_ports
7
+ from sky.provision.lambda_cloud.instance import query_instances
8
+ from sky.provision.lambda_cloud.instance import run_instances
9
+ from sky.provision.lambda_cloud.instance import stop_instances
10
+ from sky.provision.lambda_cloud.instance import terminate_instances
11
+ from sky.provision.lambda_cloud.instance import wait_instances
@@ -0,0 +1,10 @@
1
+ """Lambda Cloud configuration bootstrapping"""
2
+
3
+ from sky.provision import common
4
+
5
+
6
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return the provision config unchanged.

    No bootstrapping work is performed here: `region` and `cluster_name`
    are discarded and `config` is handed straight back to the caller.
    """
    del region  # Unused.
    del cluster_name  # Unused.
    return config
@@ -0,0 +1,265 @@
1
+ """Lambda instance provisioning."""
2
+
3
+ import time
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ from sky import sky_logging
7
+ from sky.provision import common
8
+ import sky.provision.lambda_cloud.lambda_utils as lambda_utils
9
+ from sky.utils import common_utils
10
+ from sky.utils import status_lib
11
+ from sky.utils import ux_utils
12
+
13
# Seconds to sleep between consecutive polls of the instance list.
POLL_INTERVAL = 1

logger = sky_logging.init_logger(__name__)
# Module-wide client instance; stays None until _get_lambda_client() runs.
_lambda_client = None
17
+
18
+
19
def _get_lambda_client():
    """Return the module-wide LambdaCloudClient, building it on first use."""
    global _lambda_client
    if _lambda_client is not None:
        return _lambda_client
    _lambda_client = lambda_utils.LambdaCloudClient()
    return _lambda_client
24
+
25
+
26
def _filter_instances(
        cluster_name_on_cloud: str,
        status_filters: Optional[List[str]]) -> Dict[str, Dict[str, Any]]:
    """Collect the instances that belong to a cluster, keyed by instance id.

    Args:
        cluster_name_on_cloud: Cluster name. An instance belongs to the
            cluster when its name is `<cluster>-head` or `<cluster>-worker`.
        status_filters: Statuses to keep; `None` keeps every status.

    Returns:
        Mapping from instance id to the instance record returned by the
        Lambda client's `list_instances()`.
    """
    all_instances = _get_lambda_client().list_instances()
    cluster_node_names = (
        f'{cluster_name_on_cloud}-head',
        f'{cluster_name_on_cloud}-worker',
    )

    def _belongs(record: Dict[str, Any]) -> bool:
        # The status is looked at before the name, and only when a filter
        # was supplied.
        if status_filters is not None:
            if record['status'] not in status_filters:
                return False
        return record.get('name') in cluster_node_names

    return {
        record['id']: record for record in all_instances if _belongs(record)
    }
44
+
45
+
46
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
47
+ head_instance_id = None
48
+ for instance_id, instance in instances.items():
49
+ if instance['name'].endswith('-head'):
50
+ head_instance_id = instance_id
51
+ break
52
+ return head_instance_id
53
+
54
+
55
+ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
56
+ private_ip = instance_info.get('private_ip')
57
+ if private_ip is None:
58
+ if single_node:
59
+ # The Lambda cloud API may return an instance info without
60
+ # private IP. It does not align with their docs, but we still
61
+ # allow single-node cluster to proceed with provisioning, by using
62
+ # 127.0.0.1, as private IP is not critical for single-node case.
63
+ return '127.0.0.1'
64
+ msg = f'Failed to retrieve private IP for instance {instance_info}.'
65
+ logger.error(msg)
66
+ raise RuntimeError(msg)
67
+ return private_ip
68
+
69
+
70
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Runs instances for the given cluster.

    Waits until no instance of the cluster is 'booting', counts the 'active'
    ones, launches whatever is missing (a head node first when none is
    active, then workers) and polls until `config.count` instances are
    active.

    Args:
        region: Region passed to the launch request and recorded in the
            result.
        cluster_name_on_cloud: Cluster name; nodes are named
            `<cluster>-head` / `<cluster>-worker`.
        config: Provision config; `count`, `node_config['InstanceType']` and
            `authentication_config['remote_key_name']` are read.

    Returns:
        A ProvisionRecord carrying the head instance id and the ids of the
        instances created by this call.

    Raises:
        RuntimeError: If more nodes are active than requested, or the
            requested number are active but none of them is a head node.
    """
    lambda_client = _get_lambda_client()
    # Let in-flight boots settle first so the 'active' count taken below is
    # not computed while instances are still coming up.
    pending_status = ['booting']
    while True:
        instances = _filter_instances(cluster_name_on_cloud, pending_status)
        if not instances:
            break
        logger.info(f'Waiting for {len(instances)} instances to be ready.')
        time.sleep(POLL_INTERVAL)
    exist_instances = _filter_instances(cluster_name_on_cloud, ['active'])
    head_instance_id = _get_head_instance_id(exist_instances)

    to_start_count = config.count - len(exist_instances)
    if to_start_count < 0:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(exist_instances)} nodes, but {config.count} are required.')
    if to_start_count == 0:
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head node.')
        logger.info(f'Cluster {cluster_name_on_cloud} already has '
                    f'{len(exist_instances)} nodes, no need to start more.')
        return common.ProvisionRecord(
            provider_name='lambda',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=None,
            head_instance_id=head_instance_id,
            resumed_instance_ids=[],
            created_instance_ids=[],
        )

    created_instance_ids = []
    remote_ssh_key_name = config.authentication_config['remote_key_name']

    def launch_nodes(node_type: str, quantity: int) -> List[str]:
        """Launch `quantity` nodes named `<cluster>-<node_type>`."""
        try:
            instance_ids = lambda_client.create_instances(
                instance_type=config.node_config['InstanceType'],
                region=region,
                name=f'{cluster_name_on_cloud}-{node_type}',
                quantity=quantity,
                ssh_key_name=remote_ssh_key_name,
            )
            logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
                        f'instance_ids: {instance_ids}')
            return instance_ids
        except Exception as e:
            logger.warning(f'run_instances error: {e}')
            raise

    # Every missing slot is a worker unless the head itself is missing, in
    # which case the head takes exactly one of them. Subtracting one
    # unconditionally would under-provision a cluster whose head is already
    # active and leave the poll loop below waiting forever.
    worker_node_count = to_start_count
    if head_instance_id is None:
        instance_ids = launch_nodes('head', 1)
        assert len(instance_ids) == 1
        created_instance_ids.append(instance_ids[0])
        head_instance_id = instance_ids[0]
        worker_node_count -= 1

    assert head_instance_id is not None, 'head_instance_id should not be None'

    if worker_node_count > 0:
        instance_ids = launch_nodes('worker', worker_node_count)
        created_instance_ids.extend(instance_ids)

    # NOTE: this poll has no timeout; it returns only once exactly
    # `config.count` instances of the cluster report 'active'.
    while True:
        instances = _filter_instances(cluster_name_on_cloud, ['active'])
        if len(instances) == config.count:
            break

        time.sleep(POLL_INTERVAL)

    return common.ProvisionRecord(
        provider_name='lambda',
        cluster_name=cluster_name_on_cloud,
        region=region,
        zone=None,
        head_instance_id=head_instance_id,
        resumed_instance_ids=[],
        created_instance_ids=created_instance_ids,
    )
153
+
154
+
155
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Do nothing: every argument is discarded and nothing is awaited."""
    del region  # Unused.
    del cluster_name_on_cloud  # Unused.
    del state  # Unused.
158
+
159
+
160
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Unconditionally reject the request.

    Raises:
        NotImplementedError: Always; no argument is inspected.
    """
    unsupported_msg = 'stop_instances is not supported for Lambda Cloud'
    raise NotImplementedError(unsupported_msg)
167
+
168
+
169
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """See sky/provision/__init__.py

    Terminates the cluster's instances regardless of their status. With
    `worker_only` set, only instances whose name ends with '-worker' are
    terminated.

    Raises:
        RuntimeError: If the terminate request fails; the original exception
            is chained as the cause.
    """
    del provider_config
    lambda_client = _get_lambda_client()
    instances = _filter_instances(cluster_name_on_cloud, None)

    instance_ids_to_terminate = [
        instance_id for instance_id, instance in instances.items()
        if not worker_only or instance['name'].endswith('-worker')
    ]
    if not instance_ids_to_terminate:
        # Nothing matched (e.g. the cluster is already gone, or there are no
        # workers): do not send a terminate request with an empty id list.
        logger.debug(
            f'No instances to terminate for cluster {cluster_name_on_cloud}.')
        return

    try:
        logger.debug(
            f'Terminating instances {", ".join(instance_ids_to_terminate)}')
        lambda_client.remove_instances(instance_ids_to_terminate)
    except Exception as e:  # pylint: disable=broad-except
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                f'Failed to terminate instances {instance_ids_to_terminate}: '
                f'{common_utils.format_exception(e, use_bracket=False)}') from e
194
+
195
+
196
def get_cluster_info(
    region: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
) -> common.ClusterInfo:
    """Build the ClusterInfo for the cluster's 'active' instances.

    Args:
        region: Ignored.
        cluster_name_on_cloud: Cluster whose active instances are described.
        provider_config: Passed through into the returned ClusterInfo.

    Returns:
        A ClusterInfo with one InstanceInfo per active instance. Its
        `head_instance_id` stays None when no active instance name ends
        with '-head'.
    """
    del region  # unused
    active_instances = _filter_instances(cluster_name_on_cloud, ['active'])
    # Exactly one active instance is treated as a single-node cluster, which
    # enables the loopback fallback in _get_private_ip().
    is_single_node = len(active_instances) == 1

    instances: Dict[str, List[common.InstanceInfo]] = {}
    head_instance_id = None
    for instance_id, record in active_instances.items():
        node_info = common.InstanceInfo(
            instance_id=instance_id,
            internal_ip=_get_private_ip(record, is_single_node),
            external_ip=record['ip'],
            ssh_port=22,
            tags={},
        )
        instances[instance_id] = [node_info]
        if record['name'].endswith('-head'):
            head_instance_id = instance_id

    return common.ClusterInfo(
        instances=instances,
        head_instance_id=head_instance_id,
        provider_name='lambda',
        provider_config=provider_config,
    )
225
+
226
+
227
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
    """See sky/provision/__init__.py

    Maps each instance of the cluster to a ClusterStatus. 'terminating' and
    any status missing from the table resolve to None; such entries are
    left out when `non_terminated_only` is true.
    """
    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
    instances = _filter_instances(cluster_name_on_cloud, None)

    status_map = {
        'booting': status_lib.ClusterStatus.INIT,
        'active': status_lib.ClusterStatus.UP,
        'unhealthy': status_lib.ClusterStatus.INIT,
        'terminating': None,
    }
    resolved = ((instance_id, status_map.get(record['status']))
                for instance_id, record in instances.items())
    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {
        instance_id: status
        for instance_id, status in resolved
        if not (non_terminated_only and status is None)
    }
    return statuses
249
+
250
+
251
def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Unconditionally reject the request.

    Raises:
        NotImplementedError: Always; no argument is inspected.
    """
    unsupported_msg = 'open_ports is not supported for Lambda Cloud'
    raise NotImplementedError(unsupported_msg)
257
+
258
+
259
def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """See sky/provision/__init__.py

    This implementation is a no-op: all arguments are discarded.
    """
    del cluster_name_on_cloud  # Unused.
    del ports  # Unused.
    del provider_config  # Unused.
@@ -1,4 +1,5 @@
1
1
  """Lambda Cloud helper functions."""
2
+
2
3
  import json
3
4
  import os
4
5
  import time
@@ -49,7 +50,7 @@ class Metadata:
49
50
  if value is None:
50
51
  if instance_id in metadata:
51
52
  metadata.pop(instance_id) # del entry
52
- if len(metadata) == 0:
53
+ if not metadata:
53
54
  if os.path.exists(self.path):
54
55
  os.remove(self.path)
55
56
  return
@@ -68,7 +69,7 @@ class Metadata:
68
69
  for instance_id in list(metadata.keys()):
69
70
  if instance_id not in instance_ids:
70
71
  del metadata[instance_id]
71
- if len(metadata) == 0:
72
+ if not metadata:
72
73
  os.remove(self.path)
73
74
  return
74
75
  with open(self.path, 'w', encoding='utf-8') as f:
@@ -76,12 +77,12 @@ class Metadata:
76
77
 
77
78
 
78
79
  def raise_lambda_error(response: requests.Response) -> None:
79
- """Raise LambdaCloudError if appropriate. """
80
+ """Raise LambdaCloudError if appropriate."""
80
81
  status_code = response.status_code
81
82
  if status_code == 200:
82
83
  return
83
84
  if status_code == 429:
84
- # https://docs.lambdalabs.com/cloud/rate-limiting/
85
+ # https://docs.lambdalabs.com/public-cloud/cloud-api/
85
86
  raise LambdaCloudError('Your API requests are being rate limited.')
86
87
  try:
87
88
  resp_json = response.json()
@@ -131,23 +132,25 @@ class LambdaCloudClient:
131
132
  self.api_key = self._credentials['api_key']
132
133
  self.headers = {'Authorization': f'Bearer {self.api_key}'}
133
134
 
134
- def create_instances(self,
135
- instance_type: str = 'gpu_1x_a100_sxm4',
136
- region: str = 'us-east-1',
137
- quantity: int = 1,
138
- name: str = '',
139
- ssh_key_name: str = '') -> List[str]:
135
+ def create_instances(
136
+ self,
137
+ instance_type: str = 'gpu_1x_a100_sxm4',
138
+ region: str = 'us-east-1',
139
+ quantity: int = 1,
140
+ name: str = '',
141
+ ssh_key_name: str = '',
142
+ ) -> List[str]:
140
143
  """Launch new instances."""
141
144
  # Optimization:
142
145
  # Most API requests are rate limited at ~1 request every second but
143
146
  # launch requests are rate limited at ~1 request every 10 seconds.
144
147
  # So don't use launch requests to check availability.
145
- # See https://docs.lambdalabs.com/cloud/rate-limiting/ for more.
146
- available_regions = self.list_catalog()[instance_type]\
147
- ['regions_with_capacity_available']
148
+ # See https://docs.lambdalabs.com/public-cloud/cloud-api/ for more.
149
+ available_regions = (self.list_catalog()[instance_type]
150
+ ['regions_with_capacity_available'])
148
151
  available_regions = [reg['name'] for reg in available_regions]
149
152
  if region not in available_regions:
150
- if len(available_regions) > 0:
153
+ if available_regions:
151
154
  aval_reg = ' '.join(available_regions)
152
155
  else:
153
156
  aval_reg = 'None'
@@ -163,27 +166,25 @@ class LambdaCloudClient:
163
166
  'instance_type_name': instance_type,
164
167
  'ssh_key_names': [ssh_key_name],
165
168
  'quantity': quantity,
166
- 'name': name
169
+ 'name': name,
167
170
  })
168
171
  response = _try_request_with_backoff(
169
172
  'post',
170
173
  f'{API_ENDPOINT}/instance-operations/launch',
171
174
  data=data,
172
- headers=self.headers)
175
+ headers=self.headers,
176
+ )
173
177
  return response.json().get('data', []).get('instance_ids', [])
174
178
 
175
- def remove_instances(self, *instance_ids: str) -> Dict[str, Any]:
179
+ def remove_instances(self, instance_ids: List[str]) -> Dict[str, Any]:
176
180
  """Terminate instances."""
177
- data = json.dumps({
178
- 'instance_ids': [
179
- instance_ids[0] # TODO(ewzeng) don't hardcode
180
- ]
181
- })
181
+ data = json.dumps({'instance_ids': instance_ids})
182
182
  response = _try_request_with_backoff(
183
183
  'post',
184
184
  f'{API_ENDPOINT}/instance-operations/terminate',
185
185
  data=data,
186
- headers=self.headers)
186
+ headers=self.headers,
187
+ )
187
188
  return response.json().get('data', []).get('terminated_instances', [])
188
189
 
189
190
  def list_instances(self) -> List[Dict[str, Any]]:
sky/provision/logging.py CHANGED
@@ -41,7 +41,7 @@ def setup_provision_logging(log_dir: str):
41
41
  # Disable propagation to avoid streaming logs to the console, which is
42
42
  # set up for sky root logger.
43
43
  provision_logger.propagate = False
44
- stream_handler = sky_logging.RichSafeStreamHandler(sys.stdout)
44
+ stream_handler = logging.StreamHandler(sys.stdout)
45
45
  stream_handler.flush = sys.stdout.flush # type: ignore
46
46
  stream_handler.setFormatter(sky_logging.DIM_FORMATTER)
47
47
  stream_handler.setLevel(logging.WARNING)
@@ -0,0 +1,11 @@
1
+ """Nebius provisioner for SkyPilot."""
2
+
3
+ from sky.provision.nebius.config import bootstrap_instances
4
+ from sky.provision.nebius.instance import cleanup_ports
5
+ from sky.provision.nebius.instance import get_cluster_info
6
+ from sky.provision.nebius.instance import open_ports
7
+ from sky.provision.nebius.instance import query_instances
8
+ from sky.provision.nebius.instance import run_instances
9
+ from sky.provision.nebius.instance import stop_instances
10
+ from sky.provision.nebius.instance import terminate_instances
11
+ from sky.provision.nebius.instance import wait_instances
@@ -0,0 +1,11 @@
1
+ """Nebius configuration bootstrapping."""
2
+
3
+ from sky.provision import common
4
+
5
+
6
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Bootstraps instances for the given cluster.

    No bootstrapping work is performed here: `region` and `cluster_name`
    are discarded and `config` is returned as received.
    """
    del region  # Unused.
    del cluster_name  # Unused.
    return config