skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -397,7 +397,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
397
397
  operation = compute.networks().getEffectiveFirewalls(project=project_id,
398
398
  network=vpc_name)
399
399
  response = operation.execute()
400
- if len(response) == 0:
400
+ if not response:
401
401
  return False
402
402
  effective_rules = response['firewalls']
403
403
 
@@ -515,7 +515,7 @@ def _create_rules(project_id: str, compute, rules, vpc_name):
515
515
  rule_list = _list_firewall_rules(project_id,
516
516
  compute,
517
517
  filter=f'(name={rule_name})')
518
- if len(rule_list) > 0:
518
+ if rule_list:
519
519
  _delete_firewall_rule(project_id, compute, rule_name)
520
520
 
521
521
  body = rule.copy()
@@ -624,7 +624,7 @@ def get_usable_vpc_and_subnet(
624
624
  vpc_list = _list_vpcnets(project_id,
625
625
  compute,
626
626
  filter=f'name={constants.SKYPILOT_VPC_NAME}')
627
- if len(vpc_list) == 0:
627
+ if not vpc_list:
628
628
  body = constants.VPC_TEMPLATE.copy()
629
629
  body['name'] = body['name'].format(VPC_NAME=constants.SKYPILOT_VPC_NAME)
630
630
  body['selfLink'] = body['selfLink'].format(
@@ -670,9 +670,14 @@ def _configure_subnet(region: str, cluster_name: str,
670
670
  'accessConfigs': [{
671
671
  'name': 'External NAT',
672
672
  'type': 'ONE_TO_ONE_NAT',
673
- }],
673
+ }]
674
674
  }]
675
- if config.provider_config.get('use_internal_ips', False):
675
+ # Add gVNIC if specified in config
676
+ enable_gvnic = config.provider_config.get('enable_gvnic', False)
677
+ if enable_gvnic:
678
+ default_interfaces[0]['nicType'] = 'gVNIC'
679
+ enable_external_ips = _enable_external_ips(config)
680
+ if not enable_external_ips:
676
681
  # Removing this key means the VM will not be assigned an external IP.
677
682
  default_interfaces[0].pop('accessConfigs')
678
683
 
@@ -686,14 +691,19 @@ def _configure_subnet(region: str, cluster_name: str,
686
691
  node_config['networkConfig'] = copy.deepcopy(default_interfaces)[0]
687
692
  # TPU doesn't have accessConfigs
688
693
  node_config['networkConfig'].pop('accessConfigs', None)
689
- if config.provider_config.get('use_internal_ips', False):
690
- node_config['networkConfig']['enableExternalIps'] = False
691
- else:
692
- node_config['networkConfig']['enableExternalIps'] = True
694
+ node_config['networkConfig']['enableExternalIps'] = enable_external_ips
693
695
 
694
696
  return config
695
697
 
696
698
 
699
+ def _enable_external_ips(config: common.ProvisionConfig) -> bool:
700
+ force_enable_external_ips = config.provider_config.get(
701
+ 'force_enable_external_ips', False)
702
+ use_internal_ips = config.provider_config.get('use_internal_ips', False)
703
+
704
+ return force_enable_external_ips or not use_internal_ips
705
+
706
+
697
707
  def _delete_firewall_rule(project_id: str, compute, name):
698
708
  operation = (compute.firewalls().delete(project=project_id,
699
709
  firewall=name).execute())
@@ -142,7 +142,7 @@ FIREWALL_RULES_TEMPLATE = [
142
142
  ]
143
143
 
144
144
  # A list of permissions required to run SkyPilot on GCP.
145
- # Keep this in sync with https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long
145
+ # Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long
146
146
  VM_MINIMAL_PERMISSIONS = [
147
147
  'compute.disks.create',
148
148
  'compute.disks.list',
@@ -214,3 +214,9 @@ POLL_INTERVAL = 1
214
214
  MAX_POLLS = 60 // POLL_INTERVAL
215
215
  # Stopping instances can take several minutes, so we increase the timeout
216
216
  MAX_POLLS_STOP = MAX_POLLS * 8
217
+
218
+ # MIG constants
219
+ MANAGED_INSTANCE_GROUP_CONFIG = 'managed-instance-group'
220
+ DEFAULT_MANAGED_INSTANCE_GROUP_PROVISION_TIMEOUT = 900 # 15 minutes
221
+ MIG_NAME_PREFIX = 'sky-mig-'
222
+ INSTANCE_TEMPLATE_NAME_PREFIX = 'sky-it-'
@@ -7,20 +7,16 @@ import time
7
7
  from typing import Any, Callable, Dict, Iterable, List, Optional, Type
8
8
 
9
9
  from sky import sky_logging
10
- from sky import status_lib
11
10
  from sky.adaptors import gcp
12
11
  from sky.provision import common
12
+ from sky.provision import constants as provision_constants
13
13
  from sky.provision.gcp import constants
14
14
  from sky.provision.gcp import instance_utils
15
15
  from sky.utils import common_utils
16
+ from sky.utils import status_lib
16
17
 
17
18
  logger = sky_logging.init_logger(__name__)
18
19
 
19
- TAG_SKYPILOT_HEAD_NODE = 'skypilot-head-node'
20
- # Tag uniquely identifying all nodes of a cluster
21
- TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
22
- TAG_RAY_NODE_KIND = 'ray-node-type'
23
-
24
20
  _INSTANCE_RESOURCE_NOT_FOUND_PATTERN = re.compile(
25
21
  r'The resource \'projects/.*/zones/.*/instances/.*\' was not found')
26
22
 
@@ -56,6 +52,8 @@ def _filter_instances(
56
52
  # non_terminated_only=True?
57
53
  # Will there be callers who would want this to be False?
58
54
  # stop() and terminate() for example already implicitly assume non-terminated.
55
+ # Currently, even with non_terminated_only=False, we may not have a dict entry
56
+ # for terminated instances, if they have already been fully deleted.
59
57
  @common_utils.retry
60
58
  def query_instances(
61
59
  cluster_name_on_cloud: str,
@@ -66,7 +64,9 @@ def query_instances(
66
64
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
67
65
  zone = provider_config['availability_zone']
68
66
  project_id = provider_config['project_id']
69
- label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
67
+ label_filters = {
68
+ provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
69
+ }
70
70
 
71
71
  handler: Type[
72
72
  instance_utils.GCPInstance] = instance_utils.GCPComputeInstance
@@ -124,15 +124,15 @@ def _wait_for_operations(
124
124
  logger.debug(
125
125
  f'wait_for_compute_{op_type}_operation: '
126
126
  f'Waiting for operation {operation["name"]} to finish...')
127
- handler.wait_for_operation(operation, project_id, zone)
127
+ handler.wait_for_operation(operation, project_id, zone=zone)
128
128
 
129
129
 
130
130
  def _get_head_instance_id(instances: List) -> Optional[str]:
131
131
  head_instance_id = None
132
132
  for inst in instances:
133
133
  labels = inst.get('labels', {})
134
- if (labels.get(TAG_RAY_NODE_KIND) == 'head' or
135
- labels.get(TAG_SKYPILOT_HEAD_NODE) == '1'):
134
+ if (labels.get(provision_constants.TAG_RAY_NODE_KIND) == 'head' or
135
+ labels.get(provision_constants.TAG_SKYPILOT_HEAD_NODE) == '1'):
136
136
  head_instance_id = inst['name']
137
137
  break
138
138
  return head_instance_id
@@ -158,12 +158,16 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
158
158
  resource: Type[instance_utils.GCPInstance]
159
159
  if node_type == instance_utils.GCPNodeType.COMPUTE:
160
160
  resource = instance_utils.GCPComputeInstance
161
+ elif node_type == instance_utils.GCPNodeType.MIG:
162
+ resource = instance_utils.GCPManagedInstanceGroup
161
163
  elif node_type == instance_utils.GCPNodeType.TPU:
162
164
  resource = instance_utils.GCPTPUVMInstance
163
165
  else:
164
166
  raise ValueError(f'Unknown node type {node_type}')
165
167
 
166
- filter_labels = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
168
+ filter_labels = {
169
+ provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
170
+ }
167
171
 
168
172
  # wait until all stopping instances are stopped/terminated
169
173
  while True:
@@ -264,12 +268,16 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
264
268
  if config.resume_stopped_nodes and to_start_count > 0 and stopped_instances:
265
269
  resumed_instance_ids = [n['name'] for n in stopped_instances]
266
270
  if resumed_instance_ids:
267
- for instance_id in resumed_instance_ids:
268
- resource.start_instance(instance_id, project_id,
269
- availability_zone)
270
- resource.set_labels(project_id, availability_zone, instance_id,
271
- labels)
272
- to_start_count -= len(resumed_instance_ids)
271
+ resumed_instance_ids = resource.start_instances(
272
+ cluster_name_on_cloud, project_id, availability_zone,
273
+ resumed_instance_ids, labels)
274
+ # In MIG case, the resumed_instance_ids will include the previously
275
+ # PENDING and RUNNING instances. To avoid double counting, we need to
276
+ # remove them from the resumed_instance_ids.
277
+ ready_instances = set(resumed_instance_ids)
278
+ ready_instances |= set([n['name'] for n in running_instances])
279
+ ready_instances |= set([n['name'] for n in pending_instances])
280
+ to_start_count = config.count - len(ready_instances)
273
281
 
274
282
  if head_instance_id is None:
275
283
  head_instance_id = resource.create_node_tag(
@@ -281,9 +289,14 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
281
289
 
282
290
  if to_start_count > 0:
283
291
  errors, created_instance_ids = resource.create_instances(
284
- cluster_name_on_cloud, project_id, availability_zone,
285
- config.node_config, labels, to_start_count,
286
- head_instance_id is None)
292
+ cluster_name_on_cloud,
293
+ project_id,
294
+ availability_zone,
295
+ config.node_config,
296
+ labels,
297
+ to_start_count,
298
+ total_count=config.count,
299
+ include_head_node=head_instance_id is None)
287
300
  if errors:
288
301
  error = common.ProvisionerError('Failed to launch instances.')
289
302
  error.errors = errors
@@ -387,7 +400,9 @@ def get_cluster_info(
387
400
  assert provider_config is not None, cluster_name_on_cloud
388
401
  zone = provider_config['availability_zone']
389
402
  project_id = provider_config['project_id']
390
- label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
403
+ label_filters = {
404
+ provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
405
+ }
391
406
 
392
407
  handlers: List[Type[instance_utils.GCPInstance]] = [
393
408
  instance_utils.GCPComputeInstance
@@ -415,7 +430,7 @@ def get_cluster_info(
415
430
  project_id,
416
431
  zone,
417
432
  {
418
- **label_filters, TAG_RAY_NODE_KIND: 'head'
433
+ **label_filters, provision_constants.TAG_RAY_NODE_KIND: 'head'
419
434
  },
420
435
  lambda h: [h.RUNNING_STATE],
421
436
  )
@@ -441,14 +456,16 @@ def stop_instances(
441
456
  assert provider_config is not None, cluster_name_on_cloud
442
457
  zone = provider_config['availability_zone']
443
458
  project_id = provider_config['project_id']
444
- label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
459
+ label_filters = {
460
+ provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
461
+ }
445
462
 
446
463
  tpu_node = provider_config.get('tpu_node')
447
464
  if tpu_node is not None:
448
465
  instance_utils.delete_tpu_node(project_id, zone, tpu_node)
449
466
 
450
467
  if worker_only:
451
- label_filters[TAG_RAY_NODE_KIND] = 'worker'
468
+ label_filters[provision_constants.TAG_RAY_NODE_KIND] = 'worker'
452
469
 
453
470
  handlers: List[Type[instance_utils.GCPInstance]] = [
454
471
  instance_utils.GCPComputeInstance
@@ -510,9 +527,18 @@ def terminate_instances(
510
527
  if tpu_node is not None:
511
528
  instance_utils.delete_tpu_node(project_id, zone, tpu_node)
512
529
 
513
- label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
530
+ use_mig = provider_config.get('use_managed_instance_group', False)
531
+ if use_mig:
532
+ # Deleting the MIG will also delete the instances.
533
+ instance_utils.GCPManagedInstanceGroup.delete_mig(
534
+ project_id, zone, cluster_name_on_cloud)
535
+ return
536
+
537
+ label_filters = {
538
+ provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
539
+ }
514
540
  if worker_only:
515
- label_filters[TAG_RAY_NODE_KIND] = 'worker'
541
+ label_filters[provision_constants.TAG_RAY_NODE_KIND] = 'worker'
516
542
 
517
543
  handlers: List[Type[instance_utils.GCPInstance]] = [
518
544
  instance_utils.GCPComputeInstance
@@ -555,7 +581,9 @@ def open_ports(
555
581
  project_id = provider_config['project_id']
556
582
  firewall_rule_name = provider_config['firewall_rule']
557
583
 
558
- label_filters = {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
584
+ label_filters = {
585
+ provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
586
+ }
559
587
  handlers: List[Type[instance_utils.GCPInstance]] = [
560
588
  instance_utils.GCPComputeInstance,
561
589
  instance_utils.GCPTPUVMInstance,
@@ -606,13 +634,6 @@ def cleanup_ports(
606
634
  del ports # Unused.
607
635
  assert provider_config is not None, cluster_name_on_cloud
608
636
  project_id = provider_config['project_id']
609
- if 'ports' in provider_config:
610
- # Backward compatibility for old provider config.
611
- # TODO(tian): remove this after 2 minor releases, 0.6.0.
612
- for port in provider_config['ports']:
613
- firewall_rule_name = f'user-ports-{cluster_name_on_cloud}-{port}'
614
- instance_utils.GCPComputeInstance.delete_firewall_rule(
615
- project_id, firewall_rule_name)
616
637
  if 'firewall_rule' in provider_config:
617
638
  firewall_rule_name = provider_config['firewall_rule']
618
639
  instance_utils.GCPComputeInstance.delete_firewall_rule(