skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -67
- sky/check.py +31 -1
- sky/cli.py +11 -34
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/core.py +8 -5
- sky/data/storage.py +66 -14
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +263 -21
- sky/jobs/utils.py +338 -96
- sky/provision/aws/config.py +48 -26
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +76 -18
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +13 -0
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/provision/aws/config.py
CHANGED
@@ -553,17 +553,28 @@ def _configure_security_group(ec2, vpc_id: str, expected_sg_name: str,
 
 def _get_or_create_vpc_security_group(ec2, vpc_id: str,
                                       expected_sg_name: str) -> Any:
-    vpc_to_existing_sg = {
-        sg.vpc_id: sg for sg in _get_security_groups_from_vpc_ids(
-            ec2,
-            [vpc_id],
-            [expected_sg_name],
-        )
-    }
+    """Find or create a security group in the specified VPC.
 
+    Args:
+        ec2: The initialized EC2 client object.
+        vpc_id: The ID of the VPC where the security group should be queried
+            or created.
+        expected_sg_name: The expected name of the security group.
+
+    Returns:
+        The security group object containing the details of the security group.
+
+    Raises:
+        exceptions.NoClusterLaunchedError: If the security group creation fails
+            and is not due to an existing duplicate.
+        botocore.exceptions.ClientError: If the security group creation fails
+            due to AWS service issues.
+    """
+    # Figure out which security groups with this name exist for each VPC...
+    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
+                                                     expected_sg_name)
+    if security_group is not None:
+        return security_group
 
     try:
         # create a new security group
@@ -573,34 +584,45 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
             VpcId=vpc_id,
         )
     except ec2.meta.client.exceptions.ClientError as e:
+        if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
+            # The security group already exists, but we didn't see it
+            # because of eventual consistency.
+            logger.warning(f'{expected_sg_name} already exists when creating.')
+            security_group = _get_security_group_from_vpc_id(
+                ec2, vpc_id, expected_sg_name)
+            assert (security_group is not None and
+                    security_group.group_name == expected_sg_name), (
+                        f'Expected {expected_sg_name} but got {security_group}')
+            logger.info(
+                f'Found existing security group {colorama.Style.BRIGHT}'
+                f'{security_group.group_name}{colorama.Style.RESET_ALL} '
+                f'[id={security_group.id}]')
+            return security_group
         message = ('Failed to create security group. Error: '
                    f'{common_utils.format_exception(e)}')
         logger.warning(message)
         raise exceptions.NoClusterLaunchedError(message) from e
 
-    security_group = _get_security_groups_from_vpc_ids(
-        ec2, [vpc_id], [expected_sg_name])
-    assert security_group, 'Failed to create security group'
-    security_group = security_group[0]
+    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
+                                                     expected_sg_name)
+    assert security_group is not None, 'Failed to create security group'
     logger.info(f'Created new security group {colorama.Style.BRIGHT}'
                 f'{security_group.group_name}{colorama.Style.RESET_ALL} '
                 f'[id={security_group.id}]')
     return security_group
 
 
-def _get_security_groups_from_vpc_ids(ec2, vpc_ids, group_names):
-    unique_vpc_ids = list(set(vpc_ids))
-    unique_group_names = set(group_names)
+def _get_security_group_from_vpc_id(ec2, vpc_id: str,
+                                    group_name: str) -> Optional[Any]:
+    """Get security group by VPC ID and group name."""
     existing_groups = list(
         ec2.security_groups.filter(Filters=[{
             'Name': 'vpc-id',
-            'Values': unique_vpc_ids,
+            'Values': [vpc_id]
         }]))
-    filtered_groups = [
-        sg for sg in existing_groups if sg.group_name in unique_group_names
-    ]
-    return filtered_groups
+
+    for sg in existing_groups:
+        if sg.group_name == group_name:
+            return sg
+
+    return None
sky/provision/gcp/instance_utils.py
CHANGED
@@ -38,7 +38,7 @@ _FIREWALL_RESOURCE_NOT_FOUND_PATTERN = re.compile(
     r'The resource \'projects/.*/global/firewalls/.*\' was not found')
 
 
-def _retry_on_http_exception(
+def _retry_on_gcp_http_exception(
     regex: Optional[str] = None,
     max_retries: int = GCP_MAX_RETRIES,
     retry_interval_s: int = GCP_RETRY_INTERVAL_SECONDS,
@@ -49,17 +49,18 @@ def _retry_on_http_exception(
 
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
-        exception_type = gcp.http_error_exception()
 
         def try_catch_exc():
             try:
                 value = func(*args, **kwargs)
                 return value
             except Exception as e:  # pylint: disable=broad-except
-                if ...
+                if (isinstance(e, gcp.http_error_exception()) and
+                        (regex is None or re.search(regex, str(e)))):
+                    logger.error(
+                        f'Retrying for gcp.http_error_exception: {e}')
+                    return e
+                raise
 
         for _ in range(max_retries):
             ret = try_catch_exc()
@@ -431,7 +432,7 @@ class GCPComputeInstance(GCPInstance):
         logger.debug(
             f'Waiting GCP operation {operation["name"]} to be ready ...')
 
-        @_retry_on_http_exception(
+        @_retry_on_gcp_http_exception(
             f'Failed to wait for operation {operation["name"]}')
         def call_operation(fn, timeout: int):
             request = fn(
@@ -613,6 +614,11 @@ class GCPComputeInstance(GCPInstance):
         return operation
 
     @classmethod
+    # When there is a cloud function running in parallel to set labels for
+    # newly created instances, it may fail with the following error:
+    # "Labels fingerprint either invalid or resource labels have changed"
+    # We should retry until the labels are set successfully.
+    @_retry_on_gcp_http_exception('Labels fingerprint either invalid')
     def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
                    labels: dict) -> None:
         node = cls.load_resource().instances().get(
@@ -1211,7 +1217,7 @@ class GCPTPUVMInstance(GCPInstance):
         """Poll for TPU operation until finished."""
         del project_id, region, zone  # unused
 
-        @_retry_on_http_exception(
+        @_retry_on_gcp_http_exception(
             f'Failed to wait for operation {operation["name"]}')
         def call_operation(fn, timeout: int):
             request = fn(name=operation['name'])
@@ -1379,7 +1385,7 @@ class GCPTPUVMInstance(GCPInstance):
             f'Failed to get VPC name for instance {instance}') from e
 
     @classmethod
-    @_retry_on_http_exception('unable to queue the operation')
+    @_retry_on_gcp_http_exception('unable to queue the operation')
     def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
                    labels: dict) -> None:
         while True:
sky/provision/kubernetes/instance.py
CHANGED
@@ -976,7 +976,7 @@ def terminate_instances(
         _terminate_node(namespace, context, pod_name)
 
     # Run pod termination in parallel
-    subprocess_utils.run_in_parallel(_terminate_pod_thread, pods.items(),
+    subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
                                      _NUM_THREADS)
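The one-line fix snapshots the dict items before handing them to worker threads. This is likely defensive: a live dict view fed to a pool can still be consumed while workers mutate the dict, which raises RuntimeError in Python. A small sketch of the pitfall (ThreadPool stands in for SkyPilot's run_in_parallel helper):

from multiprocessing.pool import ThreadPool

pods = {'pod-a': 1, 'pod-b': 2}

def terminate(item):
    name, _ = item
    pods.pop(name, None)  # workers mutate the shared dict

with ThreadPool(2) as pool:
    # Safe: workers iterate a snapshot taken before any mutation.
    pool.map(terminate, list(pods.items()))
    # Risky: pool.map(terminate, pods.items()) feeds a live view that can
    # raise "RuntimeError: dictionary changed size during iteration".
assert not pods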
sky/provision/kubernetes/utils.py
CHANGED
@@ -7,6 +7,7 @@ import os
 import re
 import shutil
 import subprocess
+import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse
@@ -105,6 +106,75 @@ ANNOTATIONS_POD_NOT_FOUND_ERROR_MSG = ('Pod {pod_name} not found in namespace '
 
 logger = sky_logging.init_logger(__name__)
 
+# Default retry settings for Kubernetes API calls
+DEFAULT_MAX_RETRIES = 3
+DEFAULT_RETRY_INTERVAL_SECONDS = 1
+
+
+def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
+                    retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
+                    resource_type: Optional[str] = None):
+    """Decorator to retry Kubernetes API calls on transient failures.
+
+    Args:
+        max_retries: Maximum number of retry attempts
+        retry_interval: Initial seconds to wait between retries
+        resource_type: Type of resource being accessed (e.g. 'node', 'pod').
+            Used to provide more specific error messages.
+    """
+
+    def decorator(func):
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+            backoff = common_utils.Backoff(initial_backoff=retry_interval,
+                                           max_backoff_factor=3)
+
+            for attempt in range(max_retries):
+                try:
+                    return func(*args, **kwargs)
+                except (kubernetes.max_retry_error(),
+                        kubernetes.api_exception(),
+                        kubernetes.config_exception()) as e:
+                    last_exception = e
+                    # Don't retry on permanent errors like 401 (Unauthorized)
+                    # or 403 (Forbidden)
+                    if (isinstance(e, kubernetes.api_exception()) and
+                            e.status in (401, 403)):
+                        raise
+                    if attempt < max_retries - 1:
+                        sleep_time = backoff.current_backoff()
+                        logger.debug(f'Kubernetes API call {func.__name__} '
+                                     f'failed with {str(e)}. Retrying in '
+                                     f'{sleep_time:.1f}s...')
+                        time.sleep(sleep_time)
+                        continue
+
+            # Format error message based on the type of exception
+            resource_msg = f' when trying to get {resource_type} info' \
+                if resource_type else ''
+            debug_cmd = f' To debug, run: kubectl get {resource_type}s' \
+                if resource_type else ''
+
+            if isinstance(last_exception, kubernetes.max_retry_error()):
+                error_msg = f'Timed out{resource_msg} from Kubernetes cluster.'
+            elif isinstance(last_exception, kubernetes.api_exception()):
+                error_msg = (f'Kubernetes API error{resource_msg}: '
+                             f'{str(last_exception)}')
+            else:
+                error_msg = (f'Kubernetes configuration error{resource_msg}: '
+                             f'{str(last_exception)}')
+
+            raise exceptions.ResourcesUnavailableError(
+                f'{error_msg}'
+                f' Please check if the cluster is healthy and retry.'
+                f'{debug_cmd}') from last_exception
+
+        return wrapper
+
+    return decorator
+
 
 class GPULabelFormatter:
     """Base class to define a GPU label formatter for a Kubernetes cluster
@@ -446,6 +516,7 @@ def detect_accelerator_resource(
 
 
 @functools.lru_cache(maxsize=10)
+@_retry_on_error(resource_type='node')
 def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
     """Gets the kubernetes nodes in the context.
 
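Note the decorator order here: functools.lru_cache sits outside _retry_on_error, so a node list that is eventually fetched successfully gets cached, while a call that exhausts its retries raises and is not cached (lru_cache does not memoize exceptions), leaving the next call free to retry. A minimal sketch of that caching behavior (flaky_once is a stand-in for a transient failure, not SkyPilot code):

import functools

calls = 0

def flaky_once(func):
    # Simulates a transient failure on the first call only.
    @functools.wraps(func)
    def wrapper():
        global calls
        calls += 1
        if calls == 1:
            raise RuntimeError('transient')
        return func()
    return wrapper

@functools.lru_cache(maxsize=1)
@flaky_once
def get_nodes():
    return ('node-1',)

try:
    get_nodes()            # fails; lru_cache does not cache the exception
except RuntimeError:
    pass
assert get_nodes() == ('node-1',)  # succeeds and is now cached
assert calls == 2                  # a third call would hit the cache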
@@ -454,17 +525,12 @@ def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
     if context is None:
         context = get_current_kube_config_context_name()
 
-    try:
-        nodes = kubernetes.core_api(context).list_node(
-            _request_timeout=kubernetes.API_TIMEOUT).items
-    except kubernetes.max_retry_error():
-        raise exceptions.ResourcesUnavailableError(
-            'Timed out when trying to get node info from Kubernetes cluster. '
-            'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get nodes') from None
+    nodes = kubernetes.core_api(context).list_node(
+        _request_timeout=kubernetes.API_TIMEOUT).items
     return nodes
 
 
+@_retry_on_error(resource_type='pod')
 def get_all_pods_in_kubernetes_cluster(
         context: Optional[str] = None) -> List[Any]:
     """Gets pods in all namespaces in kubernetes cluster indicated by context.
@@ -474,14 +540,8 @@ def get_all_pods_in_kubernetes_cluster(
     if context is None:
         context = get_current_kube_config_context_name()
 
-    try:
-        pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            _request_timeout=kubernetes.API_TIMEOUT).items
-    except kubernetes.max_retry_error():
-        raise exceptions.ResourcesUnavailableError(
-            'Timed out when trying to get pod info from Kubernetes cluster. '
-            'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods') from None
+    pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
+        _request_timeout=kubernetes.API_TIMEOUT).items
     return pods
@@ -1758,8 +1818,6 @@ def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
         else:
             destination[key].extend(value)
     else:
-        if destination is None:
-            destination = {}
         destination[key] = value
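The deleted guard was dead code: merge_dicts mutates destination in place, so rebinding the local name to a fresh dict could never reach the caller, and by this point destination has already been indexed, so it cannot be None. A short demonstration of why such a branch cannot help:

def broken_merge(source, destination):
    # Mirrors the deleted lines: this rebinds the *local* name only,
    # so the caller's value is untouched.
    if destination is None:
        destination = {}
    destination.update(source)

d = None
broken_merge({'a': 1}, d)
assert d is None  # the freshly created dict was lost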
sky/resources.py
CHANGED
@@ -540,7 +540,7 @@ class Resources:
         if memory_gb <= 0:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(
-                    f'The "...
+                    f'The "memory" field should be positive. Found: {memory!r}')
 
     def _set_accelerators(
         self,