skypilot-nightly 1.0.0.dev20250317__py3-none-any.whl → 1.0.0.dev20250319__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '5d5ab949f1f83301a2677989761c1eea06f0af00'
+_SKYPILOT_COMMIT_SHA = '246e69ba16705c31b69143bfe76efcee17b6407f'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250317'
+__version__ = '1.0.0.dev20250319'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/clouds/kubernetes.py CHANGED
@@ -229,32 +229,52 @@ class Kubernetes(clouds.Cloud):
         # Check if requested instance type will fit in the cluster.
         # TODO(zhwu,romilb): autoscaler type needs to be regional (per
         # kubernetes cluster/context).
-        regions_to_return = []
+        if instance_type is None:
+            return regions
+
         autoscaler_type = kubernetes_utils.get_autoscaler_type()
-        if autoscaler_type is None and instance_type is not None:
-            # If autoscaler is not set, check if the instance type fits in the
-            # cluster. Else, rely on the autoscaler to provision the right
-            # instance type without running checks. Worst case, if autoscaling
-            # fails, the pod will be stuck in pending state until
-            # provision_timeout, after which failover will be triggered.
-            for r in regions:
-                context = r.name
-                try:
-                    fits, reason = kubernetes_utils.check_instance_fits(
-                        context, instance_type)
-                except exceptions.KubeAPIUnreachableError as e:
-                    cls._log_unreachable_context(context, str(e))
-                    continue
-                if fits:
-                    regions_to_return.append(r)
-                else:
-                    logger.debug(
-                        f'Instance type {instance_type} does '
-                        'not fit in the Kubernetes cluster with context: '
-                        f'{context}. Reason: {reason}')
-        else:
-            regions_to_return = regions
+        if (autoscaler_type is not None and not kubernetes_utils.get_autoscaler(
+                autoscaler_type).can_query_backend):
+            # Unsupported autoscaler type. Rely on the autoscaler to
+            # provision the right instance type without running checks.
+            # Worst case, if autoscaling fails, the pod will be stuck in
+            # pending state until provision_timeout, after which failover
+            # will be triggered.
+            #
+            # Removing this if statement produces the same behavior,
+            # because can_create_new_instance_of_type() always returns True
+            # for unsupported autoscaler types.
+            # This check is here as a performance optimization to avoid
+            # further code execution that is known to return this result.
+            return regions
 
+        regions_to_return = []
+        for r in regions:
+            context = r.name
+            try:
+                fits, reason = kubernetes_utils.check_instance_fits(
+                    context, instance_type)
+            except exceptions.KubeAPIUnreachableError as e:
+                cls._log_unreachable_context(context, str(e))
+                continue
+            if fits:
+                regions_to_return.append(r)
+                continue
+            logger.debug(f'Instance type {instance_type} does '
+                         'not fit in the existing Kubernetes cluster '
+                         'with context: '
+                         f'{context}. Reason: {reason}')
+            if autoscaler_type is None:
+                continue
+            autoscaler = kubernetes_utils.get_autoscaler(autoscaler_type)
+            logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
+            if autoscaler.can_create_new_instance_of_type(
+                    context, instance_type):
+                logger.debug(f'Kubernetes cluster {context} can be '
+                             'autoscaled to create instance type '
+                             f'{instance_type}. Including {context} '
+                             'in the list of regions to return.')
+                regions_to_return.append(r)
         return regions_to_return
 
     def instance_type_to_hourly_cost(self,
@@ -618,7 +638,6 @@ class Kubernetes(clouds.Cloud):
         chosen_instance_type = (
             kubernetes_utils.KubernetesInstanceType.from_resources(
                 gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)
-
        # Check the availability of the specified instance type in all contexts.
        available_regions = self.regions_with_offering(
            chosen_instance_type,
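
Note: the refactor above changes regions_with_offering from "skip all checks whenever an autoscaler is configured" to "check every context, and keep contexts whose autoscaler reports it can scale up". A minimal sketch of the new decision flow, assuming the names from the diff (the instance type handling is simplified for illustration):

    # Hedged sketch of the per-context filtering introduced above.
    def filter_contexts(regions, instance_type, autoscaler_type):
        if instance_type is None:
            return regions  # nothing to check
        kept = []
        for r in regions:
            fits, _ = kubernetes_utils.check_instance_fits(r.name, instance_type)
            if fits:
                kept.append(r)  # existing nodes can already host the pod
            elif (autoscaler_type is not None and
                  kubernetes_utils.get_autoscaler(autoscaler_type)
                  .can_create_new_instance_of_type(r.name, instance_type)):
                kept.append(r)  # the autoscaler can add a suitable node
        return kept
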
sky/execution.py CHANGED
@@ -529,6 +529,11 @@ def launch(
         ]
         skip_unnecessary_provisioning = True
 
+    # Attach to setup if the cluster is a controller, so that the user can
+    # see the setup logs when inspecting the launch process to know
+    # exactly what the job is waiting for.
+    detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
+
     return _execute(
         entrypoint=entrypoint,
         dryrun=dryrun,
@@ -540,7 +545,7 @@ def launch(
         optimize_target=optimize_target,
         stages=stages,
         cluster_name=cluster_name,
-        detach_setup=True,
+        detach_setup=detach_setup,
         detach_run=True,
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
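
Note: the net effect of the hunk above, in miniature (names taken from the diff; Controllers.from_name returns a non-None entry only for controller clusters such as the jobs controller):

    # Regular clusters keep detach_setup=True; controllers attach to setup
    # so their logs stream while the launch is waiting.
    is_controller = controller_utils.Controllers.from_name(cluster_name) is not None
    detach_setup = not is_controller
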
sky/optimizer.py CHANGED
@@ -1328,13 +1328,17 @@ def _fill_in_launchable_resources(
                          f'{colorama.Style.RESET_ALL}')
         else:
             if resources.cpus is not None:
-                logger.info('Try specifying a different CPU count, '
+                logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                            '- Try specifying a different CPU count, '
                             'or add "+" to the end of the CPU count '
-                            'to allow for larger instances.')
+                            'to allow for larger instances.'
+                            f'{colorama.Style.RESET_ALL}')
             if resources.memory is not None:
-                logger.info('Try specifying a different memory size, '
+                logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                            '- Try specifying a different memory size, '
                             'or add "+" to the end of the memory size '
-                            'to allow for larger instances.')
+                            'to allow for larger instances.'
+                            f'{colorama.Style.RESET_ALL}')
         for cloud, hint in hints.items():
             logger.info(f'{repr(cloud)}: {hint}')
 
sky/provision/kubernetes/utils.py CHANGED
@@ -21,6 +21,7 @@ from sky import exceptions
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.adaptors import gcp
 from sky.adaptors import kubernetes
 from sky.provision import constants as provision_constants
 from sky.provision.kubernetes import constants as kubernetes_constants
@@ -96,6 +97,7 @@ GKE_TPU_ACCELERATOR_TO_GENERATION = {
     # Multi-host compatible v5e TPU configurations allowed.
     'tpu-v5-lite-podslice': 'v5e',
     'tpu-v5p-slice': 'v5p',
+    'tpu-v6e-slice': 'v6e',
 }
 
 POD_STATUSES = {
@@ -358,7 +360,8 @@ class GKELabelFormatter(GPULabelFormatter):
     # label to use in an autoscaling environment. For list of topologies, see:
     # tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
     # tpu v5p: https://cloud.google.com/tpu/docs/v5p
-    # TODO(romilb): Add support for TPU v4 and v6.
+    # tpu v6e: https://cloud.google.com/tpu/docs/v6e
+    # TODO(romilb): Add support for TPU v4.
     GKE_TPU_TOPOLOGIES = {
         'tpu-v5-lite-podslice': {
             1: '1x1',
@@ -373,6 +376,11 @@ class GKELabelFormatter(GPULabelFormatter):
         'tpu-v5p-slice': {
             4: '2x2x1'
         },
+        'tpu-v6e-slice': {
+            1: '1x1',
+            4: '2x2',
+            8: '2x4'
+        }
     }
 
     @classmethod
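
For illustration, the new entries make a 4-chip v6e request resolve to the 2x2 topology label:

    # Worked lookup against the table above.
    topology = GKELabelFormatter.GKE_TPU_TOPOLOGIES['tpu-v6e-slice'][4]  # '2x2'
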
@@ -517,13 +525,6 @@ LABEL_FORMATTER_REGISTRY = [
     GFDLabelFormatter, CoreWeaveLabelFormatter
 ]
 
-# Mapping of autoscaler type to label formatter
-AUTOSCALER_TO_LABEL_FORMATTER = {
-    kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter,
-    kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterLabelFormatter,  # pylint: disable=line-too-long
-    kubernetes_enums.KubernetesAutoscalerType.GENERIC: SkyPilotLabelFormatter,
-}
-
 
 @annotations.lru_cache(scope='request')
 def detect_gpu_label_formatter(
@@ -557,6 +558,314 @@ def detect_gpu_label_formatter(
     return label_formatter, node_labels
 
 
+class Autoscaler:
+    """Base class to define an autoscaler for a Kubernetes cluster.
+    An autoscaler is a class that defines how to detect if a Kubernetes
+    context can autoscale to meet the resource requirements of a task.
+    """
+
+    label_formatter: Any = None
+
+    # Whether the autoscaler backend can be queried for information.
+    # If True, SkyPilot will query the autoscaler backend to check if
+    # the Kubernetes context can autoscale to meet the resource requirements
+    # of a task.
+    can_query_backend: bool = False
+
+    @classmethod
+    # pylint: disable=unused-argument
+    def can_create_new_instance_of_type(cls, context: str,
+                                        instance_type: str) -> bool:
+        """Returns whether the Kubernetes context has an autoscaler
+        that can create a new node that satisfies the instance type.
+        Args:
+            context: The Kubernetes context to check.
+            instance_type: The instance type to check.
+        Returns:
+            bool: True if the Kubernetes context has an autoscaler that can
+                create a new node satisfying the instance type,
+                or if such determination is not possible.
+                False if the Kubernetes context autoscaler cannot create a new
+                node satisfying the instance type.
+        """
+        # For autoscalers that SkyPilot does not know how to interface with,
+        # assume the autoscaler can create a new node that satisfies
+        # the instance type.
+        # If this is not the case, the autoscaler will fail to provision the
+        # node and the pod will be stuck in pending state until
+        # provision_timeout, after which failover will be triggered.
+        return True
+
+
+class GKEAutoscaler(Autoscaler):
+    """GKE autoscaler
+    """
+
+    label_formatter: Any = GKELabelFormatter
+    can_query_backend: bool = True
+
+    # This variable is stored in memory in the server.
+    # The variable will reset if the server restarts.
+    _pip_install_gcp_hint_last_sent = 0.0
+
+    @classmethod
+    @annotations.lru_cache(scope='request', maxsize=10)
+    def can_create_new_instance_of_type(cls, context: str,
+                                        instance_type: str) -> bool:
+        """Looks at each node pool in the cluster and checks if
+        it can create a new node that satisfies the instance type.
+        If the context does not match standard GKE context naming convention,
+        or GKE credential is not set, this function returns True
+        for optimistic pod scheduling.
+        """
+        # assume context naming convention of
+        # gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        valid, project_id, location, cluster_name = cls._validate_context_name(
+            context)
+        if not valid:
+            # Context name is not in the format of
+            # gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
+            # Cannot determine if the context can autoscale;
+            # return True for optimistic pod scheduling.
+            logger.debug(f'context {context} is not in the format of '
+                         f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
+                         'reporting context as potentially capable of '
+                         'provisioning resources without further check')
+            return True
+        try:
+            logger.debug(
+                f'attempting to get information about cluster {cluster_name}')
+            container_service = gcp.build('container',
+                                          'v1',
+                                          credentials=None,
+                                          cache_discovery=False)
+            cluster = container_service.projects().locations().clusters().get(
+                name=f'projects/{project_id}'
+                f'/locations/{location}'
+                f'/clusters/{cluster_name}').execute()
+        except ImportError:
+            # If the gcp module is not installed, return True for
+            # optimistic pod scheduling.
+            # Remind the user once per day to install the gcp module for better
+            # pod scheduling with GKE autoscaler.
+            if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
+                logger.info(
+                    'Could not fetch autoscaler information from GKE. '
+                    'Run pip install "skypilot[gcp]" for more intelligent pod '
+                    'scheduling with GKE autoscaler.')
+                cls._pip_install_gcp_hint_last_sent = time.time()
+            return True
+        except gcp.http_error_exception() as e:
+            # Cluster information is not available.
+            # return True for optimistic pod scheduling.
+            logger.debug(f'{e.message}', exc_info=True)
+            return True
+
+        # Check if any node pool with autoscaling enabled can
+        # fit the instance type.
+        for node_pool in cluster['nodePools']:
+            logger.debug(f'checking if node pool {node_pool["name"]} '
+                         'has autoscaling enabled.')
+            if (node_pool['autoscaling'] is not None and
+                    'enabled' in node_pool['autoscaling'] and
+                    node_pool['autoscaling']['enabled']):
+                logger.debug(
+                    f'node pool {node_pool["name"]} has autoscaling enabled. '
+                    'Checking if it can create a node '
+                    f'satisfying {instance_type}')
+                if cls._check_instance_fits_gke_autoscaler_node_pool(
+                        instance_type, node_pool):
+                    return True
+        return False
+
+    @classmethod
+    def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
+        """Validates the context name is in the format of
+        gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        Returns:
+            bool: True if the context name is in the format of
+                gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+            str: project id
+            str: location
+            str: cluster name
+        """
+        context_components = context.split('_')
+        if len(context_components) != 4 or context_components[0] != 'gke':
+            logger.debug(
+                f'context {context} is not in valid GKE context format.')
+            return False, '', '', ''
+
+        logger.debug(f'context {context} is in valid GKE context format.')
+        return True, context_components[1], context_components[
+            2], context_components[3]
+
+    @classmethod
+    def _check_instance_fits_gke_autoscaler_node_pool(
+        cls, instance_type: str, node_pool: dict
+    ) -> bool:  # check if there is any spare capacity in the autoscaler.
+        node_pool_name = node_pool['name']
+        logger.debug(
+            f'checking if autoscale-enabled node pool {node_pool_name} '
+            f'can create a node satisfying {instance_type}')
+        k8s_instance_type = KubernetesInstanceType.\
+            from_instance_type(instance_type)
+        node_config = node_pool['config']
+        machine_type = node_config['machineType']
+
+        # Accelerator check
+        requested_acc_type = k8s_instance_type.accelerator_type
+        requested_acc_count = k8s_instance_type.accelerator_count
+        acc_is_tpu = (requested_acc_type is not None and
+                      is_tpu_on_gke(requested_acc_type))
+        if requested_acc_type is not None:
+            assert requested_acc_count is not None, (requested_acc_type,
+                                                     requested_acc_count)
+            accelerator_exists = False
+            if acc_is_tpu:
+                # Accelerator type is a TPU.
+                logger.debug(
+                    f'checking {node_pool_name} for TPU {requested_acc_type}:'
+                    f'{requested_acc_count}')
+                if 'resourceLabels' in node_config:
+                    accelerator_exists = cls._node_pool_has_tpu_capacity(
+                        node_config['resourceLabels'], machine_type,
+                        requested_acc_type, requested_acc_count)
+            else:
+                # Accelerator type is a GPU.
+                logger.debug(
+                    f'checking {node_pool_name} for GPU {requested_acc_type}:'
+                    f'{requested_acc_count}')
+                if 'accelerators' in node_config:
+                    accelerator_exists = cls._node_pool_has_gpu_capacity(
+                        node_config['accelerators'], requested_acc_type,
+                        requested_acc_count)
+
+            if not accelerator_exists:
+                logger.debug(f'{node_pool_name} does not have accelerators '
+                             f'{requested_acc_type}:{requested_acc_count}')
+                return False
+
+        # vcpu and memory check is not supported for TPU instances.
+        # TODO(seungjin): Correctly account for vcpu/memory for TPUs.
+        if acc_is_tpu:
+            # vcpu and memory check
+            logger.debug(f'vcpu and memory check is not supported for TPUs. '
+                         'Skipping vcpu and memory check for node pool '
+                         f'{node_pool_name}.')
+            return True
+
+        vcpus, mem = clouds.GCP.get_vcpus_mem_from_instance_type(machine_type)
+        if vcpus is not None and vcpus < k8s_instance_type.cpus:
+            logger.debug(f'vcpu check failed for {machine_type} '
+                         f'on node pool {node_pool_name}')
+            return False
+        if mem is not None and mem < k8s_instance_type.memory:
+            logger.debug(f'memory check failed for {machine_type} '
+                         f'on node pool {node_pool_name}')
+            return False
+
+        logger.debug(f'node pool {node_pool_name} can create a node '
+                     f'satisfying {instance_type}')
+        return True
+
+    @classmethod
+    def _node_pool_has_gpu_capacity(cls, node_pool_accelerators: List[dict],
+                                    requested_gpu_type: str,
+                                    requested_gpu_count: int) -> bool:
+        """Check if the node pool has enough GPU capacity
+        to fit the instance type.
+        """
+        for accelerator in node_pool_accelerators:
+            node_accelerator_type = GKELabelFormatter. \
+                get_accelerator_from_label_value(
+                    accelerator['acceleratorType'])
+            node_accelerator_count = accelerator['acceleratorCount']
+            if node_accelerator_type == requested_gpu_type and int(
+                    node_accelerator_count) >= requested_gpu_count:
+                return True
+        return False
+
+    @classmethod
+    def _node_pool_has_tpu_capacity(cls, node_pool_resource_labels: dict,
+                                    machine_type: str, requested_tpu_type: str,
+                                    requested_tpu_count: int) -> bool:
+        """Check if the node pool has enough TPU capacity
+        to fit the instance type.
+        """
+
+        if 'goog-gke-tpu-node-pool-type' not in node_pool_resource_labels:
+            # This node does not have TPUs.
+            return False
+        if cls._is_node_multi_host_tpu(node_pool_resource_labels):
+            # This node is a multi-host TPU.
+            # multi-host TPUs are not supported in SkyPilot yet.
+            return False
+        node_tpu_type = node_pool_resource_labels['goog-gke-accelerator-type']
+        # infer chip count from instance type
+        tpu_chip_count = cls._tpu_chip_count_from_instance_type(machine_type)
+
+        # For TPUs, the requested TPU count
+        # must exactly match the TPU count in the instance.
+        return (node_tpu_type == requested_tpu_type and
+                tpu_chip_count == requested_tpu_count)
+
+    @classmethod
+    def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
+        """Infer the number of TPU chips from the instance type."""
+        machine_type_parts = machine_type.split('-')
+        # according to
+        # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
+        # GKE TPU machine types have the format of
+        # ct<version>-<type>-<node-chip-count>t
+        logger.debug(
+            f'inferring TPU chip count from machine type: {machine_type}')
+        if (len(machine_type_parts) != 3 or
+                not machine_type_parts[0].startswith('ct') or
+                not machine_type_parts[2].endswith('t') or
+                not machine_type_parts[2].strip('t').isdigit()):
+            logger.debug(f'machine type {machine_type} is not a '
+                         'valid TPU machine type format.')
+            return 0
+        num_tpu_chips = int(machine_type_parts[2].strip('t'))
+        logger.debug(
+            f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
+        return num_tpu_chips
+
+    @classmethod
+    def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
+        """Check if the node pool is a multi-host TPU."""
+        return ('goog-gke-tpu-node-pool-type' in resource_labels and
+                resource_labels['goog-gke-tpu-node-pool-type'] == 'multi-host')
+
+
+class KarpenterAutoscaler(Autoscaler):
+    """Karpenter autoscaler
+    """
+
+    label_formatter: Any = KarpenterLabelFormatter
+    can_query_backend: bool = False
+
+
+class GenericAutoscaler(Autoscaler):
+    """Generic autoscaler
+    """
+
+    label_formatter: Any = SkyPilotLabelFormatter
+    can_query_backend: bool = False
+
+
+# Mapping of autoscaler type to autoscaler
+AUTOSCALER_TYPE_TO_AUTOSCALER = {
+    kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
+}
+
+
+def get_autoscaler(autoscaler_type: kubernetes_enums.KubernetesAutoscalerType):
+    return AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type, Autoscaler)
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 def detect_accelerator_resource(
         context: Optional[str]) -> Tuple[bool, Set[str]]:
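
Taken together, the pieces above can be exercised end to end. A hedged sketch, assuming the names from the diff; the context name, instance type, and machine type strings are illustrative (ct5lp-hightpu-4t follows the ct<version>-<type>-<node-chip-count>t format documented for GKE):

    # 1. Context-name convention parsed by GKEAutoscaler._validate_context_name.
    context = 'gke_my-project_us-central1-c_my-cluster'
    parts = context.split('_')
    # parts == ['gke', 'my-project', 'us-central1-c', 'my-cluster']
    assert len(parts) == 4 and parts[0] == 'gke'
    project_id, location, cluster_name = parts[1], parts[2], parts[3]

    # 2. Chip-count inference from _tpu_chip_count_from_instance_type.
    machine_type = 'ct5lp-hightpu-4t'
    num_tpu_chips = int(machine_type.split('-')[2].strip('t'))  # 4

    # 3. Registry lookup: unknown autoscaler types fall back to the base
    #    Autoscaler, whose can_query_backend is False.
    autoscaler = get_autoscaler(kubernetes_enums.KubernetesAutoscalerType.GKE)
    if autoscaler.can_query_backend:
        ok = autoscaler.can_create_new_instance_of_type(context, '8CPU--32GB')
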
@@ -710,7 +1019,8 @@ def check_instance_fits(context: Optional[str],
         node for node in nodes if gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] == gpu_label_val
     ]
-    assert gpu_nodes, 'GPU nodes not found'
+    if not gpu_nodes:
+        return False, f'No GPU nodes found with {acc_type} on the cluster'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
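
With this change, a missing GPU node is a soft failure instead of an AssertionError: callers of check_instance_fits receive a (fits, reason) pair, e.g. (the instance type string is illustrative):

    fits, reason = kubernetes_utils.check_instance_fits(context, '8CPU--32GB')
    # fits is False and reason explains the miss; no AssertionError is raised.
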
@@ -795,9 +1105,10 @@ def get_accelerator_label_key_value(
         # early since we assume the cluster autoscaler will handle GPU
         # node provisioning.
         return None, None, None, None
-    formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
-    assert formatter is not None, ('Unsupported autoscaler type:'
-                                   f' {autoscaler_type}')
+    autoscaler = AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type)
+    assert autoscaler is not None, ('Unsupported autoscaler type:'
+                                    f' {autoscaler_type}')
+    formatter = autoscaler.label_formatter
     tpu_topology_label_key = None
     tpu_topology_label_value = None
     if is_tpu_on_gke(acc_type):
sky/server/requests/executor.py CHANGED
@@ -49,7 +49,6 @@ from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import types
@@ -221,6 +220,10 @@ def _restore_output(original_stdout: int, original_stderr: int) -> None:
     os.close(original_stderr)
 
 
+def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
+    raise KeyboardInterrupt
+
+
 def _request_execution_wrapper(request_id: str,
                                ignore_return_value: bool) -> None:
     """Wrapper for a request execution.
@@ -232,12 +235,8 @@ def _request_execution_wrapper(request_id: str,
     3. Redirect the stdout and stderr of the execution to log file;
     4. Handle the SIGTERM signal to abort the request gracefully.
     """
-
-    def sigterm_handler(signum: int,
-                        frame: Optional['types.FrameType']) -> None:
-        raise KeyboardInterrupt
-
-    signal.signal(signal.SIGTERM, sigterm_handler)
+    # Handle the SIGTERM signal to abort the request processing gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
 
     pid = multiprocessing.current_process().pid
     logger.info(f'Running request {request_id} with pid {pid}')
@@ -355,6 +354,8 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     Args:
         max_parallel_size: Maximum number of parallel jobs this worker can run.
     """
+    # Handle the SIGTERM signal to abort the executor process gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
     proc_group = f'{worker.schedule_type.value}-{worker.id}'
     setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
     queue = _get_queue(worker.schedule_type)
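
Both the per-request wrapper and the worker loop now share the module-level handler, which folds SIGTERM into the existing KeyboardInterrupt path. A minimal sketch of the pattern:

    import signal

    def _sigterm_handler(signum, frame):
        raise KeyboardInterrupt  # reuse the Ctrl-C cleanup path

    # Registered once per process entrypoint.
    signal.signal(signal.SIGTERM, _sigterm_handler)
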
@@ -388,19 +389,11 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
                 logger.info(f'[{worker}] Finished request: {request_id}')
             else:
                 logger.info(f'[{worker}] Submitted request: {request_id}')
-        except KeyboardInterrupt:
-            # Interrupt the worker process will stop request execution, but
-            # the SIGTERM request should be respected anyway since it might
-            # be explicitly sent by user.
-            # TODO(aylei): crash the API server or recreate the worker process
-            # to avoid broken state.
-            logger.error(f'[{worker}] Worker process interrupted')
-            with ux_utils.print_exception_no_traceback():
-                raise
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
             # Catch any other exceptions to avoid crashing the worker process.
             logger.error(
-                f'[{worker}] Error processing request {request_id}: '
+                f'[{worker}] Error processing request: '
+                f'{request_id if "request_id" in locals() else ""} '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
 
 # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
@@ -409,12 +402,33 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     # We use executor instead of individual multiprocessing.Process to avoid
     # the overhead of forking a new process for each request, which can be about
     # 1s delay.
-    with concurrent.futures.ProcessPoolExecutor(
+    try:
+        executor = concurrent.futures.ProcessPoolExecutor(
             max_workers=max_parallel_size,
             initializer=executor_initializer,
-            initargs=(proc_group,)) as executor:
+            initargs=(proc_group,))
         while True:
             process_request(executor)
+    # TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # In most cases, here we receive either ctrl-c in foreground execution
+        # or SIGTERM on server exiting. Gracefully exit the worker process and
+        # the executor.
+        # TODO(aylei): worker may also be killed by system daemons like OOM
+        # killer; crash the API server or recreate the worker process to avoid
+        # broken state in such cases.
+        logger.info(f'[{worker}] Worker process interrupted')
+        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        executor.shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
 
 
 def start(deploy: bool) -> List[multiprocessing.Process]:
sky/server/server.py CHANGED
@@ -1140,6 +1140,9 @@ if __name__ == '__main__':
             # The process may not be started yet, close it anyway.
             proc.close()
 
+        # Terminate processes in reverse order in case of dependency, notably
+        # the queue server. Terminating the queue server first does not affect
+        # the correctness of cleanup but introduces redundant error messages.
         subprocess_utils.run_in_parallel(cleanup,
-                                         sub_procs,
+                                         list(reversed(sub_procs)),
                                          num_threads=len(sub_procs))
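
Illustration of the ordering fix, assuming sub_procs was built with the queue server first and the API workers after (hypothetical names):

    sub_procs = [queue_server_proc, worker_a, worker_b]
    list(reversed(sub_procs))  # [worker_b, worker_a, queue_server_proc]
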
sky/utils/subprocess_utils.py CHANGED
@@ -1,4 +1,5 @@
 """Utility functions for subprocesses."""
+import multiprocessing
 from multiprocessing import pool
 import os
 import random
@@ -181,29 +182,6 @@ def kill_children_processes(parent_pids: Optional[Union[
     if isinstance(parent_pids, int):
         parent_pids = [parent_pids]
 
-    def kill(proc: psutil.Process):
-        if not proc.is_running():
-            # Skip if the process is not running.
-            return
-        logger.debug(f'Killing process {proc.pid}')
-        try:
-            if force:
-                proc.kill()
-            else:
-                proc.terminate()
-                proc.wait(timeout=10)
-        except psutil.NoSuchProcess:
-            # The child process may have already been terminated.
-            pass
-        except psutil.TimeoutExpired:
-            logger.debug(
-                f'Process {proc.pid} did not terminate after 10 seconds')
-            # Attempt to force kill if the normal termination fails
-            if not force:
-                logger.debug(f'Force killing process {proc.pid}')
-                proc.kill()
-                proc.wait(timeout=5)  # Shorter timeout after force kill
-
     parent_processes = []
     if parent_pids is None:
         parent_processes = [psutil.Process()]
@@ -218,10 +196,54 @@ def kill_children_processes(parent_pids: Optional[Union[
     for parent_process in parent_processes:
         child_processes = parent_process.children(recursive=True)
         if parent_pids is not None:
-            kill(parent_process)
+            kill_process_with_grace_period(parent_process, force=force)
         logger.debug(f'Killing child processes: {child_processes}')
         for child in child_processes:
-            kill(child)
+            kill_process_with_grace_period(child, force=force)
+
+
+def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
+                                               psutil.Process],
+                                   force: bool = False,
+                                   grace_period: int = 10) -> None:
+    """Kill a process with SIGTERM and wait for it to exit.
+
+    Args:
+        proc: The process to kill, either a multiprocessing.Process or a
+            psutil.Process.
+        force: Whether to force kill the process.
+        grace_period: The grace period seconds to wait for the process to exit.
+    """
+    if isinstance(proc, psutil.Process):
+        alive = proc.is_running
+        wait = proc.wait
+    else:
+        alive = proc.is_alive
+        wait = proc.join
+    if not alive():
+        # Skip if the process is not running.
+        return
+    logger.debug(f'Killing process {proc.pid}')
+    try:
+        if force:
+            proc.kill()
+        else:
+            proc.terminate()
+        wait(timeout=grace_period)
+    except (psutil.NoSuchProcess, ValueError):
+        # The child process may have already been terminated.
+        return
+    except psutil.TimeoutExpired:
+        # Pass to finally to force kill the process.
+        pass
+    finally:
+        logger.debug(f'Process {proc.pid} did not terminate after '
+                     f'{grace_period} seconds')
+        # Attempt to force kill if the normal termination fails
+        if not force:
+            logger.debug(f'Force killing process {proc.pid}')
+            # Shorter timeout after force kill
+            kill_process_with_grace_period(proc, force=True, grace_period=5)
 
 
 def run_with_retries(
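
A hedged usage sketch of the new helper with a multiprocessing.Process; the 60-second sleep is just a stand-in for a worker:

    import multiprocessing
    import time

    proc = multiprocessing.Process(target=time.sleep, args=(60,))
    proc.start()
    # SIGTERM first; if the process survives the 10s grace period, the
    # helper recurses with force=True and a 5s timeout (SIGKILL path).
    kill_process_with_grace_period(proc)
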
skypilot_nightly-1.0.0.dev20250317.dist-info/METADATA → skypilot_nightly-1.0.0.dev20250319.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250317
+Version: 1.0.0.dev20250319
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
skypilot_nightly-1.0.0.dev20250317.dist-info/RECORD → skypilot_nightly-1.0.0.dev20250319.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=rrcorJA9XwEtr4hzoKv0Vw-COYQaUddD9wrZOVjTeIw,6428
+sky/__init__.py,sha256=3eIvmaqr9j7Q14zbXB6K1AYrtAYYBeSZaufG8cPHilk,6428
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
 sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
@@ -7,10 +7,10 @@ sky/cloud_stores.py,sha256=kEHXd2divyra-1c3EusHxKyM5yTQlTXc6cKVXofsefA,23978
 sky/core.py,sha256=MU9hcTdh8baMGrr2ZXmbxx12vNlhajrkeyg5QtV717c,47609
 sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
 sky/exceptions.py,sha256=cEZ5nm7RhTW22Npw-oYS5Wp9rtxoHxdPQHfkNa92wOo,16641
-sky/execution.py,sha256=0M4RTEzWn-B9oz221XdZOIGH12XOACmNq0j-WGUT_No,28023
+sky/execution.py,sha256=9L8NFOXNphtabnsL7mHGPJeGdw4n6gIIUEOzjW7CEHw,28294
 sky/global_user_state.py,sha256=sUDdSsJeiJkbgmZNwy8YGFK0XeNh-RBr1VDUvbmjf0g,33246
 sky/models.py,sha256=4xSW05BdDPEjW8Ubvj3VlVOVnzv0TbrolsFvR5R5v1U,638
-sky/optimizer.py,sha256=C82l9N3umdrJ2AaM-pSg0aK5rpOAX3lEAfFU7r6hqPo,60183
+sky/optimizer.py,sha256=7FeTo0Bk4M7OnXugv-YdCj50PTL2R7NVGHMsr7DWBJ0,60457
 sky/resources.py,sha256=f2Qo_Wt0kFruKmYm6cgYbICH_wn0Zkb8uIv6LA82SRs,72153
 sky/sky_logging.py,sha256=pID2RINjH62n7SZpv70DuN8BSFYdCfTJ2ScGQpVmugg,5725
 sky/skypilot_config.py,sha256=bt1vSis2aKKdQfPz80-KcjM9vNIg_qYKLNXur782Poo,8693
@@ -55,7 +55,7 @@ sky/clouds/do.py,sha256=hmksx0XML0dVHUZBMV2Wr3a5VilOsYfxX2dSBV_XK5o,11487
 sky/clouds/fluidstack.py,sha256=Eb0nlfU_EwTtGtV0nPKS2ueBlB0nYiDAN9swA-jjQV0,12446
 sky/clouds/gcp.py,sha256=cvFSeX8RcyhX5HJb57YposUr9p1RaUPmpxvg_AI_D3c,55978
 sky/clouds/ibm.py,sha256=R4JR96YfXstZ2B_IgFNVEX2SBAq3q0lSWz4y7FoFoeE,21474
-sky/clouds/kubernetes.py,sha256=xsYX8HhdcRzsdx6Gd_3kumNqjMjpo_l4cinhs3ZMwZM,35067
+sky/clouds/kubernetes.py,sha256=u8mRd75a0NS7-uHdGXk_cqqLc4Z2vU0CedwmLJpzmZ0,36081
 sky/clouds/lambda_cloud.py,sha256=ejqA_Wj5-325Y_QjQ__FY4HMO8sv_2tSRsufmaldcmI,12699
 sky/clouds/nebius.py,sha256=G3v73NZjLzGoCi0ZfHj6VkOt-fs1i6DDxCpNiE88BdA,12676
 sky/clouds/oci.py,sha256=irINbQsQ6YxRxGTMaCNsms3mZkIun2oJMMA1fMCRJyA,27072
@@ -165,7 +165,7 @@ sky/provision/kubernetes/constants.py,sha256=dZCUV8FOO9Gct80sdqeubKnxeW3CGl-u5mx
 sky/provision/kubernetes/instance.py,sha256=oag17OtuiqU-1RjkgW9NvEpxSGUFIYdI7M61S-YmPu8,50503
 sky/provision/kubernetes/network.py,sha256=AtcOM8wPs_-UlQJhGEQGP6Lh4HIgdx63Y0iWEhP5jyc,12673
 sky/provision/kubernetes/network_utils.py,sha256=Bwy5ZQb62ejC7ZHM4htjzhs86UNACK7AXN-NfQ9IJrE,11454
-sky/provision/kubernetes/utils.py,sha256=A2nzKUCFqmq5KveyagE5u4_p0b6frg6256lwvAlwPEA,110155
+sky/provision/kubernetes/utils.py,sha256=puwjlWM4EMExa1jO0cxluzg8ZSF-QX4rgZGksZdxKiQ,124015
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -230,13 +230,13 @@ sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,32
 sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
 sky/server/common.py,sha256=PMPaKoPtoUGolbdSW78VetUW5H0X7YKBT-z6Hbu3BJM,18471
 sky/server/constants.py,sha256=_ZNrxYh8vmgbf3DmkGDduxjvO2y43ZSPTkH5rCNsVjU,770
-sky/server/server.py,sha256=kEjwRjA7PJDZzx6KqD_NAFxryVLkzwCnuPfbmY_p30A,44232
+sky/server/server.py,sha256=62IysoY5jCbGi99xIsYrINFIuRgo-cKKIR8fXsKMuW0,44472
 sky/server/stream_utils.py,sha256=4JMHgtoXPpCT8JwtqyUcDQ9IdZFir9om0JaCRr8rvbQ,5849
 sky/server/uvicorn.py,sha256=wajwPHJ3IEEP3GMNOCc0S81-1v2qT5F-ejUkLFVhUzk,2953
 sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
 sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/server/requests/event_loop.py,sha256=OhpPbuce65bbjpGRlcJa78AVnYSm08SzFKt70ypCUuQ,1211
-sky/server/requests/executor.py,sha256=SuSr-cVrRnMzf-1SEz6O8HpcLzGM3mrbNc8re7QduYk,20862
+sky/server/requests/executor.py,sha256=BNJqkTQ3swYeRO5YVW-dTmobL2CYnDDf_m-kY7__n40,21684
 sky/server/requests/payloads.py,sha256=nVb7vr1SNAq6ay2dNe9301zLHp7NrM79M7nsWAECBms,16340
 sky/server/requests/preconditions.py,sha256=ipxIb_3JXG6S3-ymcOdqQNb7VDvoPqADxu9ZK7-nQWc,7179
 sky/server/requests/requests.py,sha256=Sys2rg22rIXn7SrHfKzDVuTjBdRlm5oZk58u1UmS6JA,21231
@@ -328,7 +328,7 @@ sky/utils/resources_utils.py,sha256=URp6OS9B9nc9tIB5ibZCgGK4XSABmI4kRG0wOM6qgvs,
 sky/utils/rich_utils.py,sha256=3xdDzmn-TQXAE83EevAtOf9N4aak3Bl4ZeD33xIxjOo,11931
 sky/utils/schemas.py,sha256=KJCHrn1nMZ3XqzddWuu_nFQoRQw01cZh9qh19OrRtps,30145
 sky/utils/status_lib.py,sha256=zn_MSuRYQdNKF8pnFOGQ54X_s_R7dyqWS6Q3a9zENw8,1512
-sky/utils/subprocess_utils.py,sha256=Q42CyjDNICXze2WCGuGxgpEjtjlka43_2ihRqKhSnQw,14916
+sky/utils/subprocess_utils.py,sha256=Ee4WajTJ6YLAjC8CgN5l1K7m6hsnpGqDa26MXkDifvw,15776
 sky/utils/timeline.py,sha256=ob6s3bc7nwAuSI76yLKBrSR5bzOHnOhbozz1avwoet4,4070
 sky/utils/ux_utils.py,sha256=ngcOCg1K44p-SOk6XfwxJGXwjoP__PRvNuEzj7t05Yc,10185
 sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
@@ -347,9 +347,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
 sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=otzHzpliHDCpzYT-nU9Q0ZExbiFpDPWvhxwkvchZj7k,10073
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.dev20250317.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20250317.dist-info/METADATA,sha256=QQAbZSEDZeyfbiMEHyn0Fvvb-dGb4B6lFHhJJTFe510,17919
-skypilot_nightly-1.0.0.dev20250317.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-skypilot_nightly-1.0.0.dev20250317.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20250317.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20250317.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20250319.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250319.dist-info/METADATA,sha256=Iys5Rb5saDPHcYoCslzL2WR1YyxDv2fSA-knwQQb6jc,17919
+skypilot_nightly-1.0.0.dev20250319.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+skypilot_nightly-1.0.0.dev20250319.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250319.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250319.dist-info/RECORD,,
skypilot_nightly-1.0.0.dev20250317.dist-info/WHEEL → skypilot_nightly-1.0.0.dev20250319.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (76.0.0)
+Generator: setuptools (76.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any