skypilot-nightly 1.0.0.dev20250317__py3-none-any.whl → 1.0.0.dev20250318__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/clouds/kubernetes.py +44 -25
- sky/provision/kubernetes/utils.py +314 -11
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250318.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250318.dist-info}/RECORD +9 -9
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250318.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250318.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250318.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250318.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '587ea1ec71d64d4994bdead5166f1ee3cb31d10f'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250317'
+__version__ = '1.0.0.dev20250318'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/clouds/kubernetes.py
CHANGED
@@ -229,32 +229,52 @@ class Kubernetes(clouds.Cloud):
         # Check if requested instance type will fit in the cluster.
         # TODO(zhwu,romilb): autoscaler type needs to be regional (per
         # kubernetes cluster/context).
-
+        if instance_type is None:
+            return regions
+
         autoscaler_type = kubernetes_utils.get_autoscaler_type()
-        if autoscaler_type is None and
-
-            #
-            # instance type without running checks.
-            # fails, the pod will be stuck in
-            # provision_timeout, after which failover
-
-
-
-
-
-
-
-
-                if fits:
-                    regions_to_return.append(r)
-                else:
-                    logger.debug(
-                        f'Instance type {instance_type} does '
-                        'not fit in the Kubernetes cluster with context: '
-                        f'{context}. Reason: {reason}')
-        else:
-            regions_to_return = regions
+        if (autoscaler_type is not None and not kubernetes_utils.get_autoscaler(
+                autoscaler_type).can_query_backend):
+            # Unsupported autoscaler type. Rely on the autoscaler to
+            # provision the right instance type without running checks.
+            # Worst case, if autoscaling fails, the pod will be stuck in
+            # pending state until provision_timeout, after which failover
+            # will be triggered.
+            #
+            # Removing this if statement produces the same behavior,
+            # because can_create_new_instance_of_type() always returns True
+            # for unsupported autoscaler types.
+            # This check is here as a performance optimization to avoid
+            # further code executions that is known to return this result.
+            return regions
 
+        regions_to_return = []
+        for r in regions:
+            context = r.name
+            try:
+                fits, reason = kubernetes_utils.check_instance_fits(
+                    context, instance_type)
+            except exceptions.KubeAPIUnreachableError as e:
+                cls._log_unreachable_context(context, str(e))
+                continue
+            if fits:
+                regions_to_return.append(r)
+                continue
+            logger.debug(f'Instance type {instance_type} does '
+                         'not fit in the existing Kubernetes cluster '
+                         'with context: '
+                         f'{context}. Reason: {reason}')
+            if autoscaler_type is None:
+                continue
+            autoscaler = kubernetes_utils.get_autoscaler(autoscaler_type)
+            logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
+            if autoscaler.can_create_new_instance_of_type(
+                    context, instance_type):
+                logger.debug(f'Kubernetes cluster {context} can be '
+                             'autoscaled to create instance type '
+                             f'{instance_type}. Including {context} '
+                             'in the list of regions to return.')
+                regions_to_return.append(r)
         return regions_to_return
 
     def instance_type_to_hourly_cost(self,
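In the rewritten regions_with_offering above, a context is kept if the requested instance type already fits, or, failing that, if the context's configured autoscaler reports it can create a suitable node; contexts whose autoscaler backend cannot be queried are accepted wholesale. The following is a minimal, self-contained sketch of that per-context decision, with stand-in callables and placeholder strings rather than the real SkyPilot APIs.

# Simplified sketch of the filtering logic in regions_with_offering (not the
# actual SkyPilot implementation). check_fits and can_autoscale stand in for
# kubernetes_utils.check_instance_fits and
# Autoscaler.can_create_new_instance_of_type.
from typing import Callable, List, Tuple


def filter_contexts(
        contexts: List[str], instance_type: str,
        check_fits: Callable[[str, str], Tuple[bool, str]],
        can_autoscale: Callable[[str, str], bool]) -> List[str]:
    kept = []
    for ctx in contexts:
        fits, _reason = check_fits(ctx, instance_type)
        # Keep the context if it already fits, or if its autoscaler claims it
        # could create a node that fits.
        if fits or can_autoscale(ctx, instance_type):
            kept.append(ctx)
    return kept


# Hypothetical example: neither context currently fits the request, but only
# 'ctx-b' reports that its autoscaler could create a suitable node.
print(filter_contexts(
    ['ctx-a', 'ctx-b'], 'example-instance-type',
    check_fits=lambda ctx, it: (False, 'no capacity'),
    can_autoscale=lambda ctx, it: ctx == 'ctx-b'))  # -> ['ctx-b']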
@@ -618,7 +638,6 @@ class Kubernetes(clouds.Cloud):
         chosen_instance_type = (
             kubernetes_utils.KubernetesInstanceType.from_resources(
                 gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)
-
         # Check the availability of the specified instance type in all contexts.
         available_regions = self.regions_with_offering(
             chosen_instance_type,
sky/provision/kubernetes/utils.py
CHANGED
@@ -21,6 +21,7 @@ from sky import exceptions
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.adaptors import gcp
 from sky.adaptors import kubernetes
 from sky.provision import constants as provision_constants
 from sky.provision.kubernetes import constants as kubernetes_constants
@@ -517,13 +518,6 @@ LABEL_FORMATTER_REGISTRY = [
     GFDLabelFormatter, CoreWeaveLabelFormatter
 ]
 
-# Mapping of autoscaler type to label formatter
-AUTOSCALER_TO_LABEL_FORMATTER = {
-    kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter,
-    kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterLabelFormatter,  # pylint: disable=line-too-long
-    kubernetes_enums.KubernetesAutoscalerType.GENERIC: SkyPilotLabelFormatter,
-}
-
 
 @annotations.lru_cache(scope='request')
 def detect_gpu_label_formatter(
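The AUTOSCALER_TO_LABEL_FORMATTER mapping removed here is superseded by the Autoscaler class hierarchy added further down in this diff, where each autoscaler class carries its label formatter as a class attribute. A toy sketch of the resulting lookup pattern, with placeholder names standing in for the real formatter classes:

# Toy illustration of the new lookup (simplified; not the SkyPilot code
# itself). Callers resolve type -> autoscaler class -> formatter instead of
# consulting a separate type -> formatter dict.
from enum import Enum
from typing import Any, Dict, Type


class AutoscalerType(Enum):
    GKE = 'gke'
    KARPENTER = 'karpenter'


class Autoscaler:
    label_formatter: Any = None


class GKEAutoscaler(Autoscaler):
    label_formatter: Any = 'GKELabelFormatter'  # placeholder for the class


AUTOSCALERS: Dict[AutoscalerType, Type[Autoscaler]] = {
    AutoscalerType.GKE: GKEAutoscaler,
}


def label_formatter_for(autoscaler_type: AutoscalerType) -> Any:
    # Unknown types fall back to the base class, whose formatter is None.
    return AUTOSCALERS.get(autoscaler_type, Autoscaler).label_formatter


print(label_formatter_for(AutoscalerType.GKE))  # -> 'GKELabelFormatter'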
@@ -557,6 +551,313 @@ def detect_gpu_label_formatter(
     return label_formatter, node_labels
 
 
+class Autoscaler:
+    """Base class to define a autoscaler for a Kubernetes cluster.
+    An autoscaler is a class that defines how to detect if a Kubernetes
+    context can autoscale to meet the resource requirements of a task.
+    """
+
+    label_formatter: Any = None
+
+    # returns if the autoscaler backend can be queried for information.
+    # If True, SkyPilot will query the autoscaler backend to check if
+    # the Kubernetes context can autoscale to meet the resource requirements
+    # of a task.
+    can_query_backend: bool = False
+
+    @classmethod
+    # pylint: disable=unused-argument
+    def can_create_new_instance_of_type(cls, context: str,
+                                        instance_type: str) -> bool:
+        """Returns if the Kubernetes context has an autoscaler
+        that can create a new node that satisfies the instance type.
+        Args:
+            context: The Kubernetes context to check.
+            instance_type: The instance type to check.
+        Returns:
+            bool: True if the Kubernetes context has an autoscaler that can
+                create a new node satisfying the instance type,
+                or if such determination is not possible.
+                False if the Kubernetes context autoscaler cannot create a new
+                node satisfying the instance type.
+        """
+        # For autoscalers that SkyPilot does not know how to interface with,
+        # assume the autoscaler can create a new node that satisfies
+        # the instance type.
+        # If this is not the case, the autoscaler will fail to provision the
+        # node and the pod will be stuck in pending state until
+        # provision_timeout, after which failover will be triggered.
+        return True
+
+
+class GKEAutoscaler(Autoscaler):
+    """GKE autoscaler
+    """
+
+    label_formatter: Any = GKELabelFormatter
+    can_query_backend: bool = True
+
+    # This variable is stored in memory in the server.
+    # The variable will reset if the server restarts.
+    _pip_install_gcp_hint_last_sent = 0.0
+
+    @classmethod
+    def can_create_new_instance_of_type(cls, context: str,
+                                        instance_type: str) -> bool:
+        """Looks at each node pool in the cluster and checks if
+        it can create a new node that satisfies the instance type.
+        If the context does not match standard GKE context naming convention,
+        or GKE credential is not set, this function returns True
+        for optimistic pod scheduling.
+        """
+        # assume context naming convention of
+        # gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        valid, project_id, location, cluster_name = cls._validate_context_name(
+            context)
+        if not valid:
+            # Context name is not in the format of
+            # gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
+            # Cannot determine if the context can autoscale
+            # return True for optimistic pod scheduling.
+            logger.debug(f'context {context} is not in the format of '
+                         f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
+                         'reporting context as potentially capable of '
+                         'provisioning resources without further check')
+            return True
+        try:
+            logger.debug(
+                f'attempting to get information about cluster {cluster_name}')
+            container_service = gcp.build('container',
+                                          'v1',
+                                          credentials=None,
+                                          cache_discovery=False)
+            cluster = container_service.projects().locations().clusters().get(
+                name=f'projects/{project_id}'
+                f'/locations/{location}'
+                f'/clusters/{cluster_name}').execute()
+        except ImportError:
+            # If the gcp module is not installed, return True for
+            # optimistic pod scheduling.
+            # Remind the user once per day to install the gcp module for better
+            # pod scheduling with GKE autoscaler.
+            if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
+                logger.info(
+                    'Could not fetch autoscaler information from GKE. '
+                    'Run pip install "skypilot[gcp]" for more intelligent pod '
+                    'scheduling with GKE autoscaler.')
+                cls._pip_install_gcp_hint_last_sent = time.time()
+            return True
+        except gcp.http_error_exception() as e:
+            # Cluster information is not available.
+            # return True for optimistic pod scheduling.
+            logger.debug(f'{e.message}', exc_info=True)
+            return True
+
+        # Check if any node pool with autoscaling enabled can
+        # fit the instance type.
+        for node_pool in cluster['nodePools']:
+            logger.debug(f'checking if node pool {node_pool["name"]} '
+                         'has autoscaling enabled.')
+            if (node_pool['autoscaling'] is not None and
+                    'enabled' in node_pool['autoscaling'] and
+                    node_pool['autoscaling']['enabled']):
+                logger.debug(
+                    f'node pool {node_pool["name"]} has autoscaling enabled. '
+                    'Checking if it can create a node '
+                    f'satisfying {instance_type}')
+                if cls._check_instance_fits_gke_autoscaler_node_pool(
+                        instance_type, node_pool):
+                    return True
+        return False
+
+    @classmethod
+    def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
+        """Validates the context name is in the format of
+        gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        Returns:
+            bool: True if the context name is in the format of
+                gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+            str: project id
+            str: location
+            str: cluster name
+        """
+        context_components = context.split('_')
+        if len(context_components) != 4 or context_components[0] != 'gke':
+            logger.debug(
+                f'context {context} is not in valid GKE context format.')
+            return False, '', '', ''
+
+        logger.debug(f'context {context} is in valid GKE context format.')
+        return True, context_components[1], context_components[
+            2], context_components[3]
+
+    @classmethod
+    def _check_instance_fits_gke_autoscaler_node_pool(
+        cls, instance_type: str, node_pool: dict
+    ) -> bool:  # check if there are any spare capacity in the autoscaler.
+        node_pool_name = node_pool['name']
+        logger.debug(
+            f'checking if autoscale-enabled node pool {node_pool_name} '
+            f'can create a node satisfying {instance_type}')
+        k8s_instance_type = KubernetesInstanceType.\
+            from_instance_type(instance_type)
+        node_config = node_pool['config']
+        machine_type = node_config['machineType']
+
+        # Accelerator check
+        requested_acc_type = k8s_instance_type.accelerator_type
+        requested_acc_count = k8s_instance_type.accelerator_count
+        acc_is_tpu = (requested_acc_type is not None and
+                      is_tpu_on_gke(requested_acc_type))
+        if requested_acc_type is not None:
+            assert requested_acc_count is not None, (requested_acc_type,
+                                                     requested_acc_count)
+            accelerator_exists = False
+            if acc_is_tpu:
+                # Accelerator type is a TPU.
+                logger.debug(
+                    f'checking {node_pool_name} for TPU {requested_acc_type}:'
+                    f'{requested_acc_count}')
+                if 'resourceLabels' in node_config:
+                    accelerator_exists = cls._node_pool_has_tpu_capacity(
+                        node_config['resourceLabels'], machine_type,
+                        requested_acc_type, requested_acc_count)
+            else:
+                # Accelerator type is a GPU.
+                logger.debug(
+                    f'checking {node_pool_name} for GPU {requested_acc_type}:'
+                    f'{requested_acc_count}')
+                if 'accelerators' in node_config:
+                    accelerator_exists = cls._node_pool_has_gpu_capacity(
+                        node_config['accelerators'], requested_acc_type,
+                        requested_acc_count)
+
+            if not accelerator_exists:
+                logger.debug(f'{node_pool_name} does not have accelerators '
+                             f'{requested_acc_type}:{requested_acc_count}')
+                return False
+
+        # vcpu and memory check is not supported for TPU instances.
+        # TODO(seungjin): Correctly account for vcpu/memory for TPUs.
+        if acc_is_tpu:
+            # vcpu and memory check
+            logger.debug(f'vcpu and memory check is not supported for TPUs. '
+                         'Skipping vcpu and memory check for node pool '
+                         f'{node_pool_name}.')
+            return True
+
+        vcpus, mem = clouds.GCP.get_vcpus_mem_from_instance_type(machine_type)
+        if vcpus is not None and vcpus < k8s_instance_type.cpus:
+            logger.debug(f'vcpu check failed for {machine_type} '
+                         f'on node pool {node_pool_name}')
+            return False
+        if mem is not None and mem < k8s_instance_type.memory:
+            logger.debug(f'memory check failed for {machine_type} '
+                         f'on node pool {node_pool_name}')
+            return False
+
+        logger.debug(f'node pool {node_pool_name} can create a node '
+                     f'satisfying {instance_type}')
+        return True
+
+    @classmethod
+    def _node_pool_has_gpu_capacity(cls, node_pool_accelerators: List[dict],
+                                    requested_gpu_type: str,
+                                    requested_gpu_count: int) -> bool:
+        """Check if the node pool has enough GPU capacity
+        to fit the instance type.
+        """
+        for accelerator in node_pool_accelerators:
+            node_accelerator_type = GKELabelFormatter. \
+                get_accelerator_from_label_value(
+                    accelerator['acceleratorType'])
+            node_accelerator_count = accelerator['acceleratorCount']
+            if node_accelerator_type == requested_gpu_type and int(
+                    node_accelerator_count) >= requested_gpu_count:
+                return True
+        return False
+
+    @classmethod
+    def _node_pool_has_tpu_capacity(cls, node_pool_resource_labels: dict,
+                                    machine_type: str, requested_tpu_type: str,
+                                    requested_tpu_count: int) -> bool:
+        """Check if the node pool has enough TPU capacity
+        to fit the instance type.
+        """
+        if 'goog-gke-tpu-node-pool-type' not in node_pool_resource_labels:
+            # This node does not have TPUs.
+            return False
+        if cls._is_node_multi_host_tpu(node_pool_resource_labels):
+            # This node is a multi-host TPU.
+            # multi-host TPUs are not supported in SkyPilot yet.
+            return False
+        node_tpu_type = node_pool_resource_labels['goog-gke-accelerator-type']
+        # infer chip count from instance type
+        tpu_chip_count = cls._tpu_chip_count_from_instance_type(machine_type)
+
+        # For TPUs, the number of requested TPU count
+        # must exactly match the TPU count in the instance.
+        return (node_tpu_type == requested_tpu_type and
+                tpu_chip_count == requested_tpu_count)
+
+    @classmethod
+    def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
+        """Infer the number of TPU chips from the instance type."""
+        machine_type_parts = machine_type.split('-')
+        # according to
+        # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
+        # GKE TPU machine types have the format of
+        # ct<version>-hightpu-<node-chip-count>t
+        logger.debug(
+            f'inferring TPU chip count from machine type: {machine_type}')
+        if (len(machine_type_parts) != 3 or
+                not machine_type_parts[0].startswith('ct') or
+                machine_type_parts[1] != 'hightpu' or
+                not machine_type_parts[2].endswith('t') or
+                not machine_type_parts[2].strip('t').isdigit()):
+            logger.debug(f'machine type {machine_type} is not a '
+                         'valid TPU machine type format.')
+            return 0
+        num_tpu_chips = int(machine_type_parts[2].strip('t'))
+        logger.debug(
+            f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
+        return num_tpu_chips
+
+    @classmethod
+    def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
+        """Check if the node pool is a multi-host TPU."""
+        return ('goog-gke-tpu-node-pool-type' in resource_labels and
+                resource_labels['goog-gke-tpu-node-pool-type'] == 'multi-host')
+
+
+class KarpenterAutoscaler(Autoscaler):
+    """Karpenter autoscaler
+    """
+
+    label_formatter: Any = KarpenterLabelFormatter
+    can_query_backend: bool = False
+
+
+class GenericAutoscaler(Autoscaler):
+    """Generic autoscaler
+    """
+
+    label_formatter: Any = SkyPilotLabelFormatter
+    can_query_backend: bool = False
+
+
+# Mapping of autoscaler type to autoscaler
+AUTOSCALER_TYPE_TO_AUTOSCALER = {
+    kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
+}
+
+
+def get_autoscaler(autoscaler_type: kubernetes_enums.KubernetesAutoscalerType):
+    return AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type, Autoscaler)
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 def detect_accelerator_resource(
         context: Optional[str]) -> Tuple[bool, Set[str]]:
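A brief usage sketch of the helpers added above (illustrative, not taken from the package): the context name and machine type are made-up values, the enum import path is the one the diffed module itself references, and the underscore-prefixed helpers are internal and called here only to show their behavior. The GCP lookup inside can_create_new_instance_of_type returns real data only when the gcp extra and GKE credentials are available.

# Hypothetical usage sketch (illustrative values; private helpers shown only
# to demonstrate behavior).
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums

autoscaler = kubernetes_utils.get_autoscaler(
    kubernetes_enums.KubernetesAutoscalerType.GKE)
print(autoscaler.can_query_backend)  # True: GKE node pool info can be queried

# GKE kubeconfig contexts follow gke_PROJECT-ID_LOCATION_CLUSTER-NAME;
# anything else is treated optimistically by GKEAutoscaler.
print(kubernetes_utils.GKEAutoscaler._validate_context_name(
    'gke_my-project_us-central1_my-cluster'))
# -> (True, 'my-project', 'us-central1', 'my-cluster')

# GKE TPU machine types encode the per-node chip count
# (ct<version>-hightpu-<count>t), e.g. ct5lp-hightpu-4t -> 4 chips.
print(kubernetes_utils.GKEAutoscaler._tpu_chip_count_from_instance_type(
    'ct5lp-hightpu-4t'))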
@@ -710,7 +1011,8 @@ def check_instance_fits(context: Optional[str],
             node for node in nodes if gpu_label_key in node.metadata.labels and
             node.metadata.labels[gpu_label_key] == gpu_label_val
         ]
-
+        if not gpu_nodes:
+            return False, f'No GPU nodes found with {acc_type} on the cluster'
         if is_tpu_on_gke(acc_type):
             # If requested accelerator is a TPU type, check if the cluster
             # has sufficient TPU resource to meet the requirement.
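This guard makes check_instance_fits return early, with a reason, when the GPU label filter leaves no candidate nodes. A toy sketch of the guard pattern (placeholder signature, not the real function):

# Toy sketch of the early-return guard (placeholder types; not the real
# check_instance_fits signature).
from typing import List, Optional, Tuple


def check_gpu_nodes(gpu_nodes: List[str],
                    acc_type: str) -> Tuple[bool, Optional[str]]:
    if not gpu_nodes:
        return False, f'No GPU nodes found with {acc_type} on the cluster'
    return True, None


print(check_gpu_nodes([], 'H100'))
# -> (False, 'No GPU nodes found with H100 on the cluster')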
@@ -795,9 +1097,10 @@ def get_accelerator_label_key_value(
             # early since we assume the cluster autoscaler will handle GPU
             # node provisioning.
             return None, None, None, None
-
-        assert
-
+        autoscaler = AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type)
+        assert autoscaler is not None, ('Unsupported autoscaler type:'
+                                        f' {autoscaler_type}')
+        formatter = autoscaler.label_formatter
         tpu_topology_label_key = None
         tpu_topology_label_value = None
         if is_tpu_on_gke(acc_type):
{skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250318.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=FI9zi2aUAWSFDtaG97GJau_uVb6mL5g5q7Co4_7rjmk,6428
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
 sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
@@ -55,7 +55,7 @@ sky/clouds/do.py,sha256=hmksx0XML0dVHUZBMV2Wr3a5VilOsYfxX2dSBV_XK5o,11487
 sky/clouds/fluidstack.py,sha256=Eb0nlfU_EwTtGtV0nPKS2ueBlB0nYiDAN9swA-jjQV0,12446
 sky/clouds/gcp.py,sha256=cvFSeX8RcyhX5HJb57YposUr9p1RaUPmpxvg_AI_D3c,55978
 sky/clouds/ibm.py,sha256=R4JR96YfXstZ2B_IgFNVEX2SBAq3q0lSWz4y7FoFoeE,21474
-sky/clouds/kubernetes.py,sha256=
+sky/clouds/kubernetes.py,sha256=u8mRd75a0NS7-uHdGXk_cqqLc4Z2vU0CedwmLJpzmZ0,36081
 sky/clouds/lambda_cloud.py,sha256=ejqA_Wj5-325Y_QjQ__FY4HMO8sv_2tSRsufmaldcmI,12699
 sky/clouds/nebius.py,sha256=G3v73NZjLzGoCi0ZfHj6VkOt-fs1i6DDxCpNiE88BdA,12676
 sky/clouds/oci.py,sha256=irINbQsQ6YxRxGTMaCNsms3mZkIun2oJMMA1fMCRJyA,27072
@@ -165,7 +165,7 @@ sky/provision/kubernetes/constants.py,sha256=dZCUV8FOO9Gct80sdqeubKnxeW3CGl-u5mx
 sky/provision/kubernetes/instance.py,sha256=oag17OtuiqU-1RjkgW9NvEpxSGUFIYdI7M61S-YmPu8,50503
 sky/provision/kubernetes/network.py,sha256=AtcOM8wPs_-UlQJhGEQGP6Lh4HIgdx63Y0iWEhP5jyc,12673
 sky/provision/kubernetes/network_utils.py,sha256=Bwy5ZQb62ejC7ZHM4htjzhs86UNACK7AXN-NfQ9IJrE,11454
-sky/provision/kubernetes/utils.py,sha256=
+sky/provision/kubernetes/utils.py,sha256=EO_CNkTZUl8gkmWNq1v37OoARJ8xS-f9m9I4o0qanPY,123837
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -347,9 +347,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
 sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=otzHzpliHDCpzYT-nU9Q0ZExbiFpDPWvhxwkvchZj7k,10073
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20250318.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250318.dist-info/METADATA,sha256=PBav6AniWi02Hrkry2Kyqu_8XuKXs_pKl1DdOiBn3h0,17919
+skypilot_nightly-1.0.0.dev20250318.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+skypilot_nightly-1.0.0.dev20250318.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250318.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250318.dist-info/RECORD,,
File without changes
|
File without changes
|
File without changes
|