skypilot-nightly 1.0.0.dev20250317__py3-none-any.whl → 1.0.0.dev20250319__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/clouds/kubernetes.py +44 -25
- sky/execution.py +6 -1
- sky/optimizer.py +8 -4
- sky/provision/kubernetes/utils.py +323 -12
- sky/server/requests/executor.py +33 -19
- sky/server/server.py +4 -1
- sky/utils/subprocess_utils.py +47 -25
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/RECORD +14 -14
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '246e69ba16705c31b69143bfe76efcee17b6407f'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250319'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/clouds/kubernetes.py
CHANGED
@@ -229,32 +229,52 @@ class Kubernetes(clouds.Cloud):
         # Check if requested instance type will fit in the cluster.
         # TODO(zhwu,romilb): autoscaler type needs to be regional (per
         # kubernetes cluster/context).
-
+        if instance_type is None:
+            return regions
+
         autoscaler_type = kubernetes_utils.get_autoscaler_type()
-        if autoscaler_type is None and
-
-            #
-            # instance type without running checks.
-            # fails, the pod will be stuck in
-            # provision_timeout, after which failover
-
-
-
-
-
-
-
-
-            if fits:
-                regions_to_return.append(r)
-            else:
-                logger.debug(
-                    f'Instance type {instance_type} does '
-                    'not fit in the Kubernetes cluster with context: '
-                    f'{context}. Reason: {reason}')
-        else:
-            regions_to_return = regions
+        if (autoscaler_type is not None and not kubernetes_utils.get_autoscaler(
+                autoscaler_type).can_query_backend):
+            # Unsupported autoscaler type. Rely on the autoscaler to
+            # provision the right instance type without running checks.
+            # Worst case, if autoscaling fails, the pod will be stuck in
+            # pending state until provision_timeout, after which failover
+            # will be triggered.
+            #
+            # Removing this if statement produces the same behavior,
+            # because can_create_new_instance_of_type() always returns True
+            # for unsupported autoscaler types.
+            # This check is here as a performance optimization to avoid
+            # further code executions that is known to return this result.
+            return regions
 
+        regions_to_return = []
+        for r in regions:
+            context = r.name
+            try:
+                fits, reason = kubernetes_utils.check_instance_fits(
+                    context, instance_type)
+            except exceptions.KubeAPIUnreachableError as e:
+                cls._log_unreachable_context(context, str(e))
+                continue
+            if fits:
+                regions_to_return.append(r)
+                continue
+            logger.debug(f'Instance type {instance_type} does '
+                         'not fit in the existing Kubernetes cluster '
+                         'with context: '
+                         f'{context}. Reason: {reason}')
+            if autoscaler_type is None:
+                continue
+            autoscaler = kubernetes_utils.get_autoscaler(autoscaler_type)
+            logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
+            if autoscaler.can_create_new_instance_of_type(
+                    context, instance_type):
+                logger.debug(f'Kubernetes cluster {context} can be '
+                             'autoscaled to create instance type '
+                             f'{instance_type}. Including {context} '
+                             'in the list of regions to return.')
+                regions_to_return.append(r)
         return regions_to_return
 
     def instance_type_to_hourly_cost(self,
@@ -618,7 +638,6 @@ class Kubernetes(clouds.Cloud):
             chosen_instance_type = (
                 kubernetes_utils.KubernetesInstanceType.from_resources(
                     gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)
-
             # Check the availability of the specified instance type in all contexts.
             available_regions = self.regions_with_offering(
                 chosen_instance_type,
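The reworked regions_with_offering above filters contexts in two passes: keep a context if the existing nodes fit the requested instance type, otherwise keep it only if its autoscaler reports it can create a fitting node. A minimal sketch of that ordering, with hypothetical check_fits/can_autoscale callables standing in for the kubernetes_utils calls:

from typing import Callable, List, Optional, Tuple

def filter_contexts(
    contexts: List[str],
    instance_type: Optional[str],
    check_fits: Callable[[str, str], Tuple[bool, Optional[str]]],
    can_autoscale: Optional[Callable[[str, str], bool]] = None,
) -> List[str]:
    """Keep contexts that fit the request now or can scale up to fit it."""
    if instance_type is None:
        return contexts  # Nothing to check; every context qualifies.
    kept = []
    for ctx in contexts:
        fits, _ = check_fits(ctx, instance_type)
        if fits:
            kept.append(ctx)
            continue
        # Existing nodes cannot host the instance type; ask the autoscaler.
        if can_autoscale is not None and can_autoscale(ctx, instance_type):
            kept.append(ctx)
    return kept

# ctx-b fits as-is; ctx-c fits only via autoscaling; ctx-a is dropped.
print(filter_contexts(
    ['ctx-a', 'ctx-b', 'ctx-c'], '4CPU--16GB',
    check_fits=lambda c, t: (c == 'ctx-b', None if c == 'ctx-b' else 'too small'),
    can_autoscale=lambda c, t: c == 'ctx-c'))  # ['ctx-b', 'ctx-c']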
sky/execution.py
CHANGED
@@ -529,6 +529,11 @@ def launch(
         ]
         skip_unnecessary_provisioning = True
 
+    # Attach to setup if the cluster is a controller, so that user can
+    # see the setup logs when inspecting the launch process to know
+    # excatly what the job is waiting for.
+    detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
+
     return _execute(
         entrypoint=entrypoint,
         dryrun=dryrun,
@@ -540,7 +545,7 @@ def launch(
         optimize_target=optimize_target,
         stages=stages,
         cluster_name=cluster_name,
-        detach_setup=
+        detach_setup=detach_setup,
         detach_run=True,
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
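detach_setup is now derived from whether the target cluster is a SkyPilot controller: controllers attach to setup so their launch logs show what the job is waiting for. A rough stand-alone illustration of the rule; the name prefixes below are assumptions for the sketch, not the actual values used by controller_utils:

# Hypothetical controller name prefixes, for illustration only.
CONTROLLER_NAME_PREFIXES = ('sky-jobs-controller-', 'sky-serve-controller-')

def compute_detach_setup(cluster_name: str) -> bool:
    # Controllers attach to setup so the launch logs show what the job is
    # waiting for; ordinary clusters keep setup detached.
    is_controller = cluster_name.startswith(CONTROLLER_NAME_PREFIXES)
    return not is_controller

assert compute_detach_setup('my-dev-cluster') is True
assert compute_detach_setup('sky-jobs-controller-abcd1234') is False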
sky/optimizer.py
CHANGED
@@ -1328,13 +1328,17 @@ def _fill_in_launchable_resources(
                     f'{colorama.Style.RESET_ALL}')
         else:
             if resources.cpus is not None:
-                logger.info('
+                logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                            '- Try specifying a different CPU count, '
                             'or add "+" to the end of the CPU count '
-                            'to allow for larger instances.'
+                            'to allow for larger instances.'
+                            f'{colorama.Style.RESET_ALL}')
             if resources.memory is not None:
-                logger.info('
+                logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                            '- Try specifying a different memory size, '
                             'or add "+" to the end of the memory size '
-                            'to allow for larger instances.'
+                            'to allow for larger instances.'
+                            f'{colorama.Style.RESET_ALL}')
         for cloud, hint in hints.items():
             logger.info(f'{repr(cloud)}: {hint}')
 
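Both optimizer hints are now wrapped in colorama's dim-gray codes so they render as secondary output. The same wrapping in minimal runnable form:

import colorama

colorama.init()
hint = (f'{colorama.Fore.LIGHTBLACK_EX}'
        '- Try specifying a different CPU count, '
        'or add "+" to the end of the CPU count '
        'to allow for larger instances.'
        f'{colorama.Style.RESET_ALL}')
print(hint)  # Renders dimmed on terminals that honor ANSI colors.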
sky/provision/kubernetes/utils.py
CHANGED
@@ -21,6 +21,7 @@ from sky import exceptions
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.adaptors import gcp
 from sky.adaptors import kubernetes
 from sky.provision import constants as provision_constants
 from sky.provision.kubernetes import constants as kubernetes_constants
@@ -96,6 +97,7 @@ GKE_TPU_ACCELERATOR_TO_GENERATION = {
     # Multi-host compatible v5e TPU configurations allowed.
     'tpu-v5-lite-podslice': 'v5e',
     'tpu-v5p-slice': 'v5p',
+    'tpu-v6e-slice': 'v6e',
 }
 
 POD_STATUSES = {
@@ -358,7 +360,8 @@ class GKELabelFormatter(GPULabelFormatter):
     # label to use in an autoscaling environment. For list of topologies, see:
     # tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
     # tpu v5p: https://cloud.google.com/tpu/docs/v5p
-    #
+    # tpu v6e: https://cloud.google.com/tpu/docs/v6e
+    # TODO(romilb): Add support for TPU v4.
     GKE_TPU_TOPOLOGIES = {
         'tpu-v5-lite-podslice': {
             1: '1x1',
@@ -373,6 +376,11 @@ class GKELabelFormatter(GPULabelFormatter):
         'tpu-v5p-slice': {
             4: '2x2x1'
         },
+        'tpu-v6e-slice': {
+            1: '1x1',
+            4: '2x2',
+            8: '2x4'
+        }
     }
 
     @classmethod
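Each entry in GKE_TPU_TOPOLOGIES maps an allowed chip count to the node topology string for that slice type, with tpu-v6e-slice now joining v5e and v5p. A small lookup sketch over the v6e entries added above:

GKE_TPU_TOPOLOGIES = {
    'tpu-v6e-slice': {1: '1x1', 4: '2x2', 8: '2x4'},
}

def topology_for(accelerator: str, count: int) -> str:
    # Raises KeyError when no single-node topology matches the chip count.
    return GKE_TPU_TOPOLOGIES[accelerator][count]

print(topology_for('tpu-v6e-slice', 8))  # 2x4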
@@ -517,13 +525,6 @@ LABEL_FORMATTER_REGISTRY = [
     GFDLabelFormatter, CoreWeaveLabelFormatter
 ]
 
-# Mapping of autoscaler type to label formatter
-AUTOSCALER_TO_LABEL_FORMATTER = {
-    kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter,
-    kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterLabelFormatter,  # pylint: disable=line-too-long
-    kubernetes_enums.KubernetesAutoscalerType.GENERIC: SkyPilotLabelFormatter,
-}
-
 
 @annotations.lru_cache(scope='request')
 def detect_gpu_label_formatter(
@@ -557,6 +558,314 @@ def detect_gpu_label_formatter(
     return label_formatter, node_labels
 
 
+class Autoscaler:
+    """Base class to define a autoscaler for a Kubernetes cluster.
+    An autoscaler is a class that defines how to detect if a Kubernetes
+    context can autoscale to meet the resource requirements of a task.
+    """
+
+    label_formatter: Any = None
+
+    # returns if the autoscaler backend can be queried for information.
+    # If True, SkyPilot will query the autoscaler backend to check if
+    # the Kubernetes context can autoscale to meet the resource requirements
+    # of a task.
+    can_query_backend: bool = False
+
+    @classmethod
+    # pylint: disable=unused-argument
+    def can_create_new_instance_of_type(cls, context: str,
+                                        instance_type: str) -> bool:
+        """Returns if the Kubernetes context has an autoscaler
+        that can create a new node that satisfies the instance type.
+        Args:
+            context: The Kubernetes context to check.
+            instance_type: The instance type to check.
+        Returns:
+            bool: True if the Kubernetes context has an autoscaler that can
+                create a new node satisfying the instance type,
+                or if such determination is not possible.
+                False if the Kubernetes context autoscaler cannot create a new
+                node satisfying the instance type.
+        """
+        # For autoscalers that SkyPilot does not know how to interface with,
+        # assume the autoscaler can create a new node that satisfies
+        # the instance type.
+        # If this is not the case, the autoscaler will fail to provision the
+        # node and the pod will be stuck in pending state until
+        # provision_timeout, after which failover will be triggered.
+        return True
+
+
+class GKEAutoscaler(Autoscaler):
+    """GKE autoscaler
+    """
+
+    label_formatter: Any = GKELabelFormatter
+    can_query_backend: bool = True
+
+    # This variable is stored in memory in the server.
+    # The variable will reset if the server restarts.
+    _pip_install_gcp_hint_last_sent = 0.0
+
+    @classmethod
+    @annotations.lru_cache(scope='request', maxsize=10)
+    def can_create_new_instance_of_type(cls, context: str,
+                                        instance_type: str) -> bool:
+        """Looks at each node pool in the cluster and checks if
+        it can create a new node that satisfies the instance type.
+        If the context does not match standard GKE context naming convention,
+        or GKE credential is not set, this function returns True
+        for optimistic pod scheduling.
+        """
+        # assume context naming convention of
+        # gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        valid, project_id, location, cluster_name = cls._validate_context_name(
+            context)
+        if not valid:
+            # Context name is not in the format of
+            # gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
+            # Cannot determine if the context can autoscale
+            # return True for optimistic pod scheduling.
+            logger.debug(f'context {context} is not in the format of '
+                         f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
+                         'reporting context as potentially capable of '
+                         'provisioning resources without further check')
+            return True
+        try:
+            logger.debug(
+                f'attempting to get information about cluster {cluster_name}')
+            container_service = gcp.build('container',
+                                          'v1',
+                                          credentials=None,
+                                          cache_discovery=False)
+            cluster = container_service.projects().locations().clusters().get(
+                name=f'projects/{project_id}'
+                f'/locations/{location}'
+                f'/clusters/{cluster_name}').execute()
+        except ImportError:
+            # If the gcp module is not installed, return True for
+            # optimistic pod scheduling.
+            # Remind the user once per day to install the gcp module for better
+            # pod scheduling with GKE autoscaler.
+            if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
+                logger.info(
+                    'Could not fetch autoscaler information from GKE. '
+                    'Run pip install "skypilot[gcp]" for more intelligent pod '
+                    'scheduling with GKE autoscaler.')
+                cls._pip_install_gcp_hint_last_sent = time.time()
+            return True
+        except gcp.http_error_exception() as e:
+            # Cluster information is not available.
+            # return True for optimistic pod scheduling.
+            logger.debug(f'{e.message}', exc_info=True)
+            return True
+
+        # Check if any node pool with autoscaling enabled can
+        # fit the instance type.
+        for node_pool in cluster['nodePools']:
+            logger.debug(f'checking if node pool {node_pool["name"]} '
+                         'has autoscaling enabled.')
+            if (node_pool['autoscaling'] is not None and
+                    'enabled' in node_pool['autoscaling'] and
+                    node_pool['autoscaling']['enabled']):
+                logger.debug(
+                    f'node pool {node_pool["name"]} has autoscaling enabled. '
+                    'Checking if it can create a node '
+                    f'satisfying {instance_type}')
+                if cls._check_instance_fits_gke_autoscaler_node_pool(
+                        instance_type, node_pool):
+                    return True
+        return False
+
+    @classmethod
+    def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
+        """Validates the context name is in the format of
+        gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        Returns:
+            bool: True if the context name is in the format of
+                gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+            str: project id
+            str: location
+            str: cluster name
+        """
+        context_components = context.split('_')
+        if len(context_components) != 4 or context_components[0] != 'gke':
+            logger.debug(
+                f'context {context} is not in valid GKE context format.')
+            return False, '', '', ''
+
+        logger.debug(f'context {context} is in valid GKE context format.')
+        return True, context_components[1], context_components[
+            2], context_components[3]
+
+    @classmethod
+    def _check_instance_fits_gke_autoscaler_node_pool(
+        cls, instance_type: str, node_pool: dict
+    ) -> bool:  # check if there are any spare capacity in the autoscaler.
+        node_pool_name = node_pool['name']
+        logger.debug(
+            f'checking if autoscale-enabled node pool {node_pool_name} '
+            f'can create a node satisfying {instance_type}')
+        k8s_instance_type = KubernetesInstanceType.\
+            from_instance_type(instance_type)
+        node_config = node_pool['config']
+        machine_type = node_config['machineType']
+
+        # Accelerator check
+        requested_acc_type = k8s_instance_type.accelerator_type
+        requested_acc_count = k8s_instance_type.accelerator_count
+        acc_is_tpu = (requested_acc_type is not None and
+                      is_tpu_on_gke(requested_acc_type))
+        if requested_acc_type is not None:
+            assert requested_acc_count is not None, (requested_acc_type,
+                                                     requested_acc_count)
+            accelerator_exists = False
+            if acc_is_tpu:
+                # Accelerator type is a TPU.
+                logger.debug(
+                    f'checking {node_pool_name} for TPU {requested_acc_type}:'
+                    f'{requested_acc_count}')
+                if 'resourceLabels' in node_config:
+                    accelerator_exists = cls._node_pool_has_tpu_capacity(
+                        node_config['resourceLabels'], machine_type,
+                        requested_acc_type, requested_acc_count)
+            else:
+                # Accelerator type is a GPU.
+                logger.debug(
+                    f'checking {node_pool_name} for GPU {requested_acc_type}:'
+                    f'{requested_acc_count}')
+                if 'accelerators' in node_config:
+                    accelerator_exists = cls._node_pool_has_gpu_capacity(
+                        node_config['accelerators'], requested_acc_type,
+                        requested_acc_count)
+
+            if not accelerator_exists:
+                logger.debug(f'{node_pool_name} does not have accelerators '
+                             f'{requested_acc_type}:{requested_acc_count}')
+                return False
+
+        # vcpu and memory check is not supported for TPU instances.
+        # TODO(seungjin): Correctly account for vcpu/memory for TPUs.
+        if acc_is_tpu:
+            # vcpu and memory check
+            logger.debug(f'vcpu and memory check is not supported for TPUs. '
+                         'Skipping vcpu and memory check for node pool '
+                         f'{node_pool_name}.')
+            return True
+
+        vcpus, mem = clouds.GCP.get_vcpus_mem_from_instance_type(machine_type)
+        if vcpus is not None and vcpus < k8s_instance_type.cpus:
+            logger.debug(f'vcpu check failed for {machine_type} '
+                         f'on node pool {node_pool_name}')
+            return False
+        if mem is not None and mem < k8s_instance_type.memory:
+            logger.debug(f'memory check failed for {machine_type} '
+                         f'on node pool {node_pool_name}')
+            return False
+
+        logger.debug(f'node pool {node_pool_name} can create a node '
+                     f'satisfying {instance_type}')
+        return True
+
+    @classmethod
+    def _node_pool_has_gpu_capacity(cls, node_pool_accelerators: List[dict],
+                                    requested_gpu_type: str,
+                                    requested_gpu_count: int) -> bool:
+        """Check if the node pool has enough GPU capacity
+        to fit the instance type.
+        """
+        for accelerator in node_pool_accelerators:
+            node_accelerator_type = GKELabelFormatter. \
+                get_accelerator_from_label_value(
+                    accelerator['acceleratorType'])
+            node_accelerator_count = accelerator['acceleratorCount']
+            if node_accelerator_type == requested_gpu_type and int(
+                    node_accelerator_count) >= requested_gpu_count:
+                return True
+        return False
+
+    @classmethod
+    def _node_pool_has_tpu_capacity(cls, node_pool_resource_labels: dict,
+                                    machine_type: str, requested_tpu_type: str,
+                                    requested_tpu_count: int) -> bool:
+        """Check if the node pool has enough TPU capacity
+        to fit the instance type.
+        """
+
+        if 'goog-gke-tpu-node-pool-type' not in node_pool_resource_labels:
+            # This node does not have TPUs.
+            return False
+        if cls._is_node_multi_host_tpu(node_pool_resource_labels):
+            # This node is a multi-host TPU.
+            # multi-host TPUs are not supported in SkyPilot yet.
+            return False
+        node_tpu_type = node_pool_resource_labels['goog-gke-accelerator-type']
+        # infer chip count from instance type
+        tpu_chip_count = cls._tpu_chip_count_from_instance_type(machine_type)
+
+        # For TPUs, the number of requested TPU count
+        # must exactly match the TPU count in the instance.
+        return (node_tpu_type == requested_tpu_type and
+                tpu_chip_count == requested_tpu_count)
+
+    @classmethod
+    def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
+        """Infer the number of TPU chips from the instance type."""
+        machine_type_parts = machine_type.split('-')
+        # according to
+        # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
+        # GKE TPU machine types have the format of
+        # ct<version>-<type>-<node-chip-count>t
+        logger.debug(
+            f'inferring TPU chip count from machine type: {machine_type}')
+        if (len(machine_type_parts) != 3 or
+                not machine_type_parts[0].startswith('ct') or
+                not machine_type_parts[2].endswith('t') or
+                not machine_type_parts[2].strip('t').isdigit()):
+            logger.debug(f'machine type {machine_type} is not a '
+                         'valid TPU machine type format.')
+            return 0
+        num_tpu_chips = int(machine_type_parts[2].strip('t'))
+        logger.debug(
+            f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
+        return num_tpu_chips
+
+    @classmethod
+    def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
+        """Check if the node pool is a multi-host TPU."""
+        return ('goog-gke-tpu-node-pool-type' in resource_labels and
+                resource_labels['goog-gke-tpu-node-pool-type'] == 'multi-host')
+
+
+class KarpenterAutoscaler(Autoscaler):
+    """Karpenter autoscaler
+    """
+
+    label_formatter: Any = KarpenterLabelFormatter
+    can_query_backend: bool = False
+
+
+class GenericAutoscaler(Autoscaler):
+    """Generic autoscaler
+    """
+
+    label_formatter: Any = SkyPilotLabelFormatter
+    can_query_backend: bool = False
+
+
+# Mapping of autoscaler type to autoscaler
+AUTOSCALER_TYPE_TO_AUTOSCALER = {
+    kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
+}
+
+
+def get_autoscaler(autoscaler_type: kubernetes_enums.KubernetesAutoscalerType):
+    return AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type, Autoscaler)
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 def detect_accelerator_resource(
         context: Optional[str]) -> Tuple[bool, Set[str]]:
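Two helpers in the block above are easy to exercise in isolation: _validate_context_name requires a GKE context to split into exactly four underscore-separated parts, and _tpu_chip_count_from_instance_type reads the trailing chip count from a ct<version>-<type>-<N>t machine type. A self-contained re-implementation of both rules (simplified: logging dropped, [:-1] in place of strip('t')):

from typing import Tuple

def validate_gke_context(context: str) -> Tuple[bool, str, str, str]:
    # gke_PROJECT-ID_LOCATION_CLUSTER-NAME -> (ok, project, location, cluster)
    parts = context.split('_')
    if len(parts) != 4 or parts[0] != 'gke':
        return False, '', '', ''
    return True, parts[1], parts[2], parts[3]

def tpu_chip_count(machine_type: str) -> int:
    # 'ct5lp-hightpu-4t' -> 4; returns 0 for non-TPU machine types.
    parts = machine_type.split('-')
    if (len(parts) != 3 or not parts[0].startswith('ct') or
            not parts[2].endswith('t') or not parts[2][:-1].isdigit()):
        return 0
    return int(parts[2][:-1])

print(validate_gke_context('gke_my-proj_us-central1_my-cluster'))
# (True, 'my-proj', 'us-central1', 'my-cluster')
print(tpu_chip_count('ct5lp-hightpu-4t'))  # 4
print(tpu_chip_count('n2-standard-8'))     # 0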
@@ -710,7 +1019,8 @@ def check_instance_fits(context: Optional[str],
         node for node in nodes if gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] == gpu_label_val
     ]
-
+    if not gpu_nodes:
+        return False, f'No GPU nodes found with {acc_type} on the cluster'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
@@ -795,9 +1105,10 @@ def get_accelerator_label_key_value(
|
|
795
1105
|
# early since we assume the cluster autoscaler will handle GPU
|
796
1106
|
# node provisioning.
|
797
1107
|
return None, None, None, None
|
798
|
-
|
799
|
-
assert
|
800
|
-
|
1108
|
+
autoscaler = AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type)
|
1109
|
+
assert autoscaler is not None, ('Unsupported autoscaler type:'
|
1110
|
+
f' {autoscaler_type}')
|
1111
|
+
formatter = autoscaler.label_formatter
|
801
1112
|
tpu_topology_label_key = None
|
802
1113
|
tpu_topology_label_value = None
|
803
1114
|
if is_tpu_on_gke(acc_type):
|
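get_autoscaler resolves the configured autoscaler type through the registry and falls back to the permissive base class for anything unregistered, which is what keeps scheduling optimistic for unknown backends. The pattern in miniature, with plain strings standing in for the kubernetes_enums values:

class Autoscaler:
    can_query_backend = False

    @classmethod
    def can_create_new_instance_of_type(cls, context: str,
                                        instance_type: str) -> bool:
        return True  # Optimistic default for backends we cannot query.

class GKEAutoscaler(Autoscaler):
    can_query_backend = True

REGISTRY = {'gke': GKEAutoscaler}

def get_autoscaler(kind: str):
    # Unregistered kinds fall back to the permissive base class.
    return REGISTRY.get(kind, Autoscaler)

assert get_autoscaler('gke').can_query_backend
assert not get_autoscaler('karpenter').can_query_backend
assert get_autoscaler('karpenter').can_create_new_instance_of_type('ctx', 't')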
sky/server/requests/executor.py
CHANGED
@@ -49,7 +49,6 @@ from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import types
@@ -221,6 +220,10 @@ def _restore_output(original_stdout: int, original_stderr: int) -> None:
     os.close(original_stderr)
 
 
+def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
+    raise KeyboardInterrupt
+
+
 def _request_execution_wrapper(request_id: str,
                                ignore_return_value: bool) -> None:
     """Wrapper for a request execution.
@@ -232,12 +235,8 @@ def _request_execution_wrapper(request_id: str,
     3. Redirect the stdout and stderr of the execution to log file;
     4. Handle the SIGTERM signal to abort the request gracefully.
     """
-
-    def sigterm_handler(signum: int,
-                        frame: Optional['types.FrameType']) -> None:
-        raise KeyboardInterrupt
-
-    signal.signal(signal.SIGTERM, sigterm_handler)
+    # Handle the SIGTERM signal to abort the request processing gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
 
     pid = multiprocessing.current_process().pid
     logger.info(f'Running request {request_id} with pid {pid}')
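Hoisting the handler to module scope lets _request_execution_wrapper and request_worker route SIGTERM through the same KeyboardInterrupt unwinding. A stand-alone demonstration of the translation (POSIX only):

import os
import signal
import types
from typing import Optional

def _sigterm_handler(signum: int, frame: Optional[types.FrameType]) -> None:
    # Re-raise as KeyboardInterrupt so the same except/finally blocks
    # handle SIGTERM and ctrl-c uniformly.
    raise KeyboardInterrupt

signal.signal(signal.SIGTERM, _sigterm_handler)
try:
    os.kill(os.getpid(), signal.SIGTERM)
except KeyboardInterrupt:
    print('SIGTERM observed as KeyboardInterrupt')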
@@ -355,6 +354,8 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     Args:
         max_parallel_size: Maximum number of parallel jobs this worker can run.
     """
+    # Handle the SIGTERM signal to abort the executor process gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
     proc_group = f'{worker.schedule_type.value}-{worker.id}'
     setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
     queue = _get_queue(worker.schedule_type)
@@ -388,19 +389,11 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
                 logger.info(f'[{worker}] Finished request: {request_id}')
             else:
                 logger.info(f'[{worker}] Submitted request: {request_id}')
-        except KeyboardInterrupt:
-            # Interrupt the worker process will stop request execution, but
-            # the SIGTERM request should be respected anyway since it might
-            # be explicitly sent by user.
-            # TODO(aylei): crash the API server or recreate the worker process
-            # to avoid broken state.
-            logger.error(f'[{worker}] Worker process interrupted')
-            with ux_utils.print_exception_no_traceback():
-                raise
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
             # Catch any other exceptions to avoid crashing the worker process.
             logger.error(
-                f'[{worker}] Error processing request
+                f'[{worker}] Error processing request: '
+                f'{request_id if "request_id" in locals() else ""} '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
 
     # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
@@ -409,12 +402,33 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     # We use executor instead of individual multiprocessing.Process to avoid
     # the overhead of forking a new process for each request, which can be about
     # 1s delay.
-    executor = concurrent.futures.ProcessPoolExecutor(
+    try:
+        executor = concurrent.futures.ProcessPoolExecutor(
             max_workers=max_parallel_size,
             initializer=executor_initializer,
-        initargs=(proc_group,))
+            initargs=(proc_group,))
         while True:
             process_request(executor)
+    # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # In most cases, here we receive either ctrl-c in foreground execution
+        # or SIGTERM on server exiting. Gracefully exit the worker process and
+        # the executor.
+        # TODO(aylei): worker may also be killed by system daemons like OOM
+        # killer, crash the API server or recreate the worker process to avoid
+        # broken state in such cases.
+        logger.info(f'[{worker}] Worker process interrupted')
+        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        executor.shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
 
 
 def start(deploy: bool) -> List[multiprocessing.Process]:
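The try/finally teardown above can be reproduced with any ProcessPoolExecutor: swallow the interrupt, shut the pool down without waiting, then signal the pool's worker processes so an in-flight task cannot block exit indefinitely. A condensed sketch; like the code above, it reaches into the executor's private _processes map:

import concurrent.futures
import time

def slow_task():
    time.sleep(60)

if __name__ == '__main__':
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=2)
    try:
        executor.submit(slow_task)
        raise KeyboardInterrupt  # Simulate ctrl-c / SIGTERM arriving.
    except KeyboardInterrupt:
        pass
    finally:
        procs = list(executor._processes.values())  # pylint: disable=protected-access
        # Let workers exit once their current task finishes...
        executor.shutdown(wait=False)
        # ...but proactively interrupt in-flight work so exit is not blocked.
        for p in procs:
            p.terminate()
        for p in procs:
            p.join(timeout=10)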
sky/server/server.py
CHANGED
@@ -1140,6 +1140,9 @@ if __name__ == '__main__':
             # The process may not be started yet, close it anyway.
             proc.close()
 
+        # Terminate processes in reverse order in case dependency, especially
+        # queue server. Terminate queue server first does not affect the
+        # correctness of cleanup but introduce redundant error messages.
         subprocess_utils.run_in_parallel(cleanup,
-                                         sub_procs,
+                                         list(reversed(sub_procs)),
                                          num_threads=len(sub_procs))
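Cleaning up in reverse creation order tears dependents down before the process they depend on, with the queue server going last. A simplified sequential illustration (the real code fans the cleanup calls out across threads):

sub_procs = ['queue-server', 'worker-1', 'worker-2']  # Creation order (assumed).

def cleanup(name: str) -> None:
    print(f'terminating {name}')

# Dependents go down first; the queue server they rely on goes down last.
for name in reversed(sub_procs):
    cleanup(name)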
sky/utils/subprocess_utils.py
CHANGED
@@ -1,4 +1,5 @@
 """Utility functions for subprocesses."""
+import multiprocessing
 from multiprocessing import pool
 import os
 import random
@@ -181,29 +182,6 @@ def kill_children_processes(parent_pids: Optional[Union[
     if isinstance(parent_pids, int):
         parent_pids = [parent_pids]
 
-    def kill(proc: psutil.Process):
-        if not proc.is_running():
-            # Skip if the process is not running.
-            return
-        logger.debug(f'Killing process {proc.pid}')
-        try:
-            if force:
-                proc.kill()
-            else:
-                proc.terminate()
-            proc.wait(timeout=10)
-        except psutil.NoSuchProcess:
-            # The child process may have already been terminated.
-            pass
-        except psutil.TimeoutExpired:
-            logger.debug(
-                f'Process {proc.pid} did not terminate after 10 seconds')
-            # Attempt to force kill if the normal termination fails
-            if not force:
-                logger.debug(f'Force killing process {proc.pid}')
-                proc.kill()
-                proc.wait(timeout=5)  # Shorter timeout after force kill
-
     parent_processes = []
     if parent_pids is None:
         parent_processes = [psutil.Process()]
@@ -218,10 +196,54 @@ def kill_children_processes(parent_pids: Optional[Union[
     for parent_process in parent_processes:
         child_processes = parent_process.children(recursive=True)
         if parent_pids is not None:
-            kill(parent_process)
+            kill_process_with_grace_period(parent_process, force=force)
         logger.debug(f'Killing child processes: {child_processes}')
         for child in child_processes:
-            kill(child)
+            kill_process_with_grace_period(child, force=force)
+
+
+def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
+                                               psutil.Process],
+                                   force: bool = False,
+                                   grace_period: int = 10) -> None:
+    """Kill a process with SIGTERM and wait for it to exit.
+
+    Args:
+        proc: The process to kill, either a multiprocessing.Process or a
+            psutil.Process.
+        force: Whether to force kill the process.
+        grace_period: The grace period seconds to wait for the process to exit.
+    """
+    if isinstance(proc, psutil.Process):
+        alive = proc.is_running
+        wait = proc.wait
+    else:
+        alive = proc.is_alive
+        wait = proc.join
+    if not alive():
+        # Skip if the process is not running.
+        return
+    logger.debug(f'Killing process {proc.pid}')
+    try:
+        if force:
+            proc.kill()
+        else:
+            proc.terminate()
+        wait(timeout=grace_period)
+    except (psutil.NoSuchProcess, ValueError):
+        # The child process may have already been terminated.
+        return
+    except psutil.TimeoutExpired:
+        # Pass to finally to force kill the process.
+        pass
+    finally:
+        logger.debug(f'Process {proc.pid} did not terminate after '
+                     f'{grace_period} seconds')
+        # Attempt to force kill if the normal termination fails
+        if not force:
+            logger.debug(f'Force killing process {proc.pid}')
+            # Shorter timeout after force kill
+            kill_process_with_grace_period(proc, force=True, grace_period=5)
 
 
 def run_with_retries(
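kill_process_with_grace_period handles either process flavor by picking the matching liveness/wait pair (is_running/wait for psutil.Process, is_alive/join for multiprocessing.Process). A usage sketch that hand-rolls the same terminate-wait-escalate sequence for a multiprocessing.Process:

import multiprocessing
import time

def worker():
    time.sleep(60)

if __name__ == '__main__':
    proc = multiprocessing.Process(target=worker)
    proc.start()
    # SIGTERM first, wait out the grace period, then escalate to SIGKILL;
    # mirrors kill_process_with_grace_period(proc, grace_period=5).
    proc.terminate()
    proc.join(timeout=5)
    if proc.is_alive():
        proc.kill()
        proc.join()
    print('exitcode:', proc.exitcode)  # -15 (SIGTERM) on POSIX.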
{skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=3eIvmaqr9j7Q14zbXB6K1AYrtAYYBeSZaufG8cPHilk,6428
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
 sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
@@ -7,10 +7,10 @@ sky/cloud_stores.py,sha256=kEHXd2divyra-1c3EusHxKyM5yTQlTXc6cKVXofsefA,23978
 sky/core.py,sha256=MU9hcTdh8baMGrr2ZXmbxx12vNlhajrkeyg5QtV717c,47609
 sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
 sky/exceptions.py,sha256=cEZ5nm7RhTW22Npw-oYS5Wp9rtxoHxdPQHfkNa92wOo,16641
-sky/execution.py,sha256=
+sky/execution.py,sha256=9L8NFOXNphtabnsL7mHGPJeGdw4n6gIIUEOzjW7CEHw,28294
 sky/global_user_state.py,sha256=sUDdSsJeiJkbgmZNwy8YGFK0XeNh-RBr1VDUvbmjf0g,33246
 sky/models.py,sha256=4xSW05BdDPEjW8Ubvj3VlVOVnzv0TbrolsFvR5R5v1U,638
-sky/optimizer.py,sha256=
+sky/optimizer.py,sha256=7FeTo0Bk4M7OnXugv-YdCj50PTL2R7NVGHMsr7DWBJ0,60457
 sky/resources.py,sha256=f2Qo_Wt0kFruKmYm6cgYbICH_wn0Zkb8uIv6LA82SRs,72153
 sky/sky_logging.py,sha256=pID2RINjH62n7SZpv70DuN8BSFYdCfTJ2ScGQpVmugg,5725
 sky/skypilot_config.py,sha256=bt1vSis2aKKdQfPz80-KcjM9vNIg_qYKLNXur782Poo,8693
@@ -55,7 +55,7 @@ sky/clouds/do.py,sha256=hmksx0XML0dVHUZBMV2Wr3a5VilOsYfxX2dSBV_XK5o,11487
 sky/clouds/fluidstack.py,sha256=Eb0nlfU_EwTtGtV0nPKS2ueBlB0nYiDAN9swA-jjQV0,12446
 sky/clouds/gcp.py,sha256=cvFSeX8RcyhX5HJb57YposUr9p1RaUPmpxvg_AI_D3c,55978
 sky/clouds/ibm.py,sha256=R4JR96YfXstZ2B_IgFNVEX2SBAq3q0lSWz4y7FoFoeE,21474
-sky/clouds/kubernetes.py,sha256=
+sky/clouds/kubernetes.py,sha256=u8mRd75a0NS7-uHdGXk_cqqLc4Z2vU0CedwmLJpzmZ0,36081
 sky/clouds/lambda_cloud.py,sha256=ejqA_Wj5-325Y_QjQ__FY4HMO8sv_2tSRsufmaldcmI,12699
 sky/clouds/nebius.py,sha256=G3v73NZjLzGoCi0ZfHj6VkOt-fs1i6DDxCpNiE88BdA,12676
 sky/clouds/oci.py,sha256=irINbQsQ6YxRxGTMaCNsms3mZkIun2oJMMA1fMCRJyA,27072
@@ -165,7 +165,7 @@ sky/provision/kubernetes/constants.py,sha256=dZCUV8FOO9Gct80sdqeubKnxeW3CGl-u5mx
 sky/provision/kubernetes/instance.py,sha256=oag17OtuiqU-1RjkgW9NvEpxSGUFIYdI7M61S-YmPu8,50503
 sky/provision/kubernetes/network.py,sha256=AtcOM8wPs_-UlQJhGEQGP6Lh4HIgdx63Y0iWEhP5jyc,12673
 sky/provision/kubernetes/network_utils.py,sha256=Bwy5ZQb62ejC7ZHM4htjzhs86UNACK7AXN-NfQ9IJrE,11454
-sky/provision/kubernetes/utils.py,sha256=
+sky/provision/kubernetes/utils.py,sha256=puwjlWM4EMExa1jO0cxluzg8ZSF-QX4rgZGksZdxKiQ,124015
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -230,13 +230,13 @@ sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,32
 sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
 sky/server/common.py,sha256=PMPaKoPtoUGolbdSW78VetUW5H0X7YKBT-z6Hbu3BJM,18471
 sky/server/constants.py,sha256=_ZNrxYh8vmgbf3DmkGDduxjvO2y43ZSPTkH5rCNsVjU,770
-sky/server/server.py,sha256=
+sky/server/server.py,sha256=62IysoY5jCbGi99xIsYrINFIuRgo-cKKIR8fXsKMuW0,44472
 sky/server/stream_utils.py,sha256=4JMHgtoXPpCT8JwtqyUcDQ9IdZFir9om0JaCRr8rvbQ,5849
 sky/server/uvicorn.py,sha256=wajwPHJ3IEEP3GMNOCc0S81-1v2qT5F-ejUkLFVhUzk,2953
 sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
 sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/server/requests/event_loop.py,sha256=OhpPbuce65bbjpGRlcJa78AVnYSm08SzFKt70ypCUuQ,1211
-sky/server/requests/executor.py,sha256=
+sky/server/requests/executor.py,sha256=BNJqkTQ3swYeRO5YVW-dTmobL2CYnDDf_m-kY7__n40,21684
 sky/server/requests/payloads.py,sha256=nVb7vr1SNAq6ay2dNe9301zLHp7NrM79M7nsWAECBms,16340
 sky/server/requests/preconditions.py,sha256=ipxIb_3JXG6S3-ymcOdqQNb7VDvoPqADxu9ZK7-nQWc,7179
 sky/server/requests/requests.py,sha256=Sys2rg22rIXn7SrHfKzDVuTjBdRlm5oZk58u1UmS6JA,21231
@@ -328,7 +328,7 @@ sky/utils/resources_utils.py,sha256=URp6OS9B9nc9tIB5ibZCgGK4XSABmI4kRG0wOM6qgvs,
 sky/utils/rich_utils.py,sha256=3xdDzmn-TQXAE83EevAtOf9N4aak3Bl4ZeD33xIxjOo,11931
 sky/utils/schemas.py,sha256=KJCHrn1nMZ3XqzddWuu_nFQoRQw01cZh9qh19OrRtps,30145
 sky/utils/status_lib.py,sha256=zn_MSuRYQdNKF8pnFOGQ54X_s_R7dyqWS6Q3a9zENw8,1512
-sky/utils/subprocess_utils.py,sha256=
+sky/utils/subprocess_utils.py,sha256=Ee4WajTJ6YLAjC8CgN5l1K7m6hsnpGqDa26MXkDifvw,15776
 sky/utils/timeline.py,sha256=ob6s3bc7nwAuSI76yLKBrSR5bzOHnOhbozz1avwoet4,4070
 sky/utils/ux_utils.py,sha256=ngcOCg1K44p-SOk6XfwxJGXwjoP__PRvNuEzj7t05Yc,10185
 sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
@@ -347,9 +347,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
 sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=otzHzpliHDCpzYT-nU9Q0ZExbiFpDPWvhxwkvchZj7k,10073
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20250319.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250319.dist-info/METADATA,sha256=Iys5Rb5saDPHcYoCslzL2WR1YyxDv2fSA-knwQQb6jc,17919
+skypilot_nightly-1.0.0.dev20250319.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+skypilot_nightly-1.0.0.dev20250319.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250319.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250319.dist-info/RECORD,,
{skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/LICENSE
File without changes
{skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/entry_points.txt
File without changes
{skypilot_nightly-1.0.0.dev20250317.dist-info → skypilot_nightly-1.0.0.dev20250319.dist-info}/top_level.txt
File without changes