skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +50 -67
  3. sky/check.py +31 -1
  4. sky/cli.py +11 -34
  5. sky/clouds/kubernetes.py +3 -3
  6. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  7. sky/core.py +8 -5
  8. sky/data/storage.py +66 -14
  9. sky/global_user_state.py +1 -1
  10. sky/jobs/constants.py +8 -7
  11. sky/jobs/controller.py +19 -22
  12. sky/jobs/core.py +0 -2
  13. sky/jobs/recovery_strategy.py +114 -143
  14. sky/jobs/scheduler.py +283 -0
  15. sky/jobs/state.py +263 -21
  16. sky/jobs/utils.py +338 -96
  17. sky/provision/aws/config.py +48 -26
  18. sky/provision/gcp/instance_utils.py +15 -9
  19. sky/provision/kubernetes/instance.py +1 -1
  20. sky/provision/kubernetes/utils.py +76 -18
  21. sky/resources.py +1 -1
  22. sky/serve/autoscalers.py +359 -301
  23. sky/serve/controller.py +10 -8
  24. sky/serve/core.py +84 -7
  25. sky/serve/load_balancer.py +27 -10
  26. sky/serve/replica_managers.py +1 -3
  27. sky/serve/serve_state.py +10 -5
  28. sky/serve/serve_utils.py +28 -1
  29. sky/serve/service.py +4 -3
  30. sky/serve/service_spec.py +31 -0
  31. sky/skylet/constants.py +1 -1
  32. sky/skylet/events.py +7 -3
  33. sky/skylet/job_lib.py +10 -30
  34. sky/skylet/log_lib.py +8 -8
  35. sky/skylet/log_lib.pyi +3 -0
  36. sky/skylet/skylet.py +1 -1
  37. sky/templates/jobs-controller.yaml.j2 +7 -3
  38. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  39. sky/utils/db_utils.py +18 -4
  40. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  41. sky/utils/resources_utils.py +25 -21
  42. sky/utils/schemas.py +13 -0
  43. sky/utils/subprocess_utils.py +48 -9
  44. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
  45. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
  46. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  47. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
  48. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  49. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
@@ -553,17 +553,28 @@ def _configure_security_group(ec2, vpc_id: str, expected_sg_name: str,
553
553
 
554
554
  def _get_or_create_vpc_security_group(ec2, vpc_id: str,
555
555
  expected_sg_name: str) -> Any:
556
- # Figure out which security groups with this name exist for each VPC...
557
- vpc_to_existing_sg = {
558
- sg.vpc_id: sg for sg in _get_security_groups_from_vpc_ids(
559
- ec2,
560
- [vpc_id],
561
- [expected_sg_name],
562
- )
563
- }
556
+ """Find or create a security group in the specified VPC.
564
557
 
565
- if vpc_id in vpc_to_existing_sg:
566
- return vpc_to_existing_sg[vpc_id]
558
+ Args:
559
+ ec2: The initialized EC2 client object.
560
+ vpc_id: The ID of the VPC where the security group should be queried
561
+ or created.
562
+ expected_sg_name: The expected name of the security group.
563
+
564
+ Returns:
565
+ The security group object containing the details of the security group.
566
+
567
+ Raises:
568
+ exceptions.NoClusterLaunchedError: If the security group creation fails
569
+ and is not due to an existing duplicate.
570
+ botocore.exceptions.ClientError: If the security group creation fails
571
+ due to AWS service issues.
572
+ """
573
+ # Figure out which security groups with this name exist for each VPC...
574
+ security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
575
+ expected_sg_name)
576
+ if security_group is not None:
577
+ return security_group
567
578
 
568
579
  try:
569
580
  # create a new security group
@@ -573,34 +584,45 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
573
584
  VpcId=vpc_id,
574
585
  )
575
586
  except ec2.meta.client.exceptions.ClientError as e:
587
+ if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
588
+ # The security group already exists, but we didn't see it
589
+ # because of eventual consistency.
590
+ logger.warning(f'{expected_sg_name} already exists when creating.')
591
+ security_group = _get_security_group_from_vpc_id(
592
+ ec2, vpc_id, expected_sg_name)
593
+ assert (security_group is not None and
594
+ security_group.group_name == expected_sg_name), (
595
+ f'Expected {expected_sg_name} but got {security_group}')
596
+ logger.info(
597
+ f'Found existing security group {colorama.Style.BRIGHT}'
598
+ f'{security_group.group_name}{colorama.Style.RESET_ALL} '
599
+ f'[id={security_group.id}]')
600
+ return security_group
576
601
  message = ('Failed to create security group. Error: '
577
602
  f'{common_utils.format_exception(e)}')
578
603
  logger.warning(message)
579
604
  raise exceptions.NoClusterLaunchedError(message) from e
580
605
 
581
- security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
582
- [expected_sg_name])
583
-
584
- assert security_group, 'Failed to create security group'
585
- security_group = security_group[0]
586
-
606
+ security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
607
+ expected_sg_name)
608
+ assert security_group is not None, 'Failed to create security group'
587
609
  logger.info(f'Created new security group {colorama.Style.BRIGHT}'
588
610
  f'{security_group.group_name}{colorama.Style.RESET_ALL} '
589
611
  f'[id={security_group.id}]')
590
612
  return security_group
591
613
 
592
614
 
593
- def _get_security_groups_from_vpc_ids(ec2, vpc_ids: List[str],
594
- group_names: List[str]) -> List[Any]:
595
- unique_vpc_ids = list(set(vpc_ids))
596
- unique_group_names = set(group_names)
597
-
615
+ def _get_security_group_from_vpc_id(ec2, vpc_id: str,
616
+ group_name: str) -> Optional[Any]:
617
+ """Get security group by VPC ID and group name."""
598
618
  existing_groups = list(
599
619
  ec2.security_groups.filter(Filters=[{
600
620
  'Name': 'vpc-id',
601
- 'Values': unique_vpc_ids
621
+ 'Values': [vpc_id]
602
622
  }]))
603
- filtered_groups = [
604
- sg for sg in existing_groups if sg.group_name in unique_group_names
605
- ]
606
- return filtered_groups
623
+
624
+ for sg in existing_groups:
625
+ if sg.group_name == group_name:
626
+ return sg
627
+
628
+ return None
@@ -38,7 +38,7 @@ _FIREWALL_RESOURCE_NOT_FOUND_PATTERN = re.compile(
38
38
  r'The resource \'projects/.*/global/firewalls/.*\' was not found')
39
39
 
40
40
 
41
- def _retry_on_http_exception(
41
+ def _retry_on_gcp_http_exception(
42
42
  regex: Optional[str] = None,
43
43
  max_retries: int = GCP_MAX_RETRIES,
44
44
  retry_interval_s: int = GCP_RETRY_INTERVAL_SECONDS,
@@ -49,17 +49,18 @@ def _retry_on_http_exception(
49
49
 
50
50
  @functools.wraps(func)
51
51
  def wrapper(*args, **kwargs):
52
- exception_type = gcp.http_error_exception()
53
52
 
54
53
  def try_catch_exc():
55
54
  try:
56
55
  value = func(*args, **kwargs)
57
56
  return value
58
57
  except Exception as e: # pylint: disable=broad-except
59
- if not isinstance(e, exception_type) or (
60
- regex and not re.search(regex, str(e))):
61
- raise
62
- return e
58
+ if (isinstance(e, gcp.http_error_exception()) and
59
+ (regex is None or re.search(regex, str(e)))):
60
+ logger.error(
61
+ f'Retrying for gcp.http_error_exception: {e}')
62
+ return e
63
+ raise
63
64
 
64
65
  for _ in range(max_retries):
65
66
  ret = try_catch_exc()
@@ -431,7 +432,7 @@ class GCPComputeInstance(GCPInstance):
431
432
  logger.debug(
432
433
  f'Waiting GCP operation {operation["name"]} to be ready ...')
433
434
 
434
- @_retry_on_http_exception(
435
+ @_retry_on_gcp_http_exception(
435
436
  f'Failed to wait for operation {operation["name"]}')
436
437
  def call_operation(fn, timeout: int):
437
438
  request = fn(
@@ -613,6 +614,11 @@ class GCPComputeInstance(GCPInstance):
613
614
  return operation
614
615
 
615
616
  @classmethod
617
+ # When there is a cloud function running in parallel to set labels for
618
+ # newly created instances, it may fail with the following error:
619
+ # "Labels fingerprint either invalid or resource labels have changed"
620
+ # We should retry until the labels are set successfully.
621
+ @_retry_on_gcp_http_exception('Labels fingerprint either invalid')
616
622
  def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
617
623
  labels: dict) -> None:
618
624
  node = cls.load_resource().instances().get(
@@ -1211,7 +1217,7 @@ class GCPTPUVMInstance(GCPInstance):
1211
1217
  """Poll for TPU operation until finished."""
1212
1218
  del project_id, region, zone # unused
1213
1219
 
1214
- @_retry_on_http_exception(
1220
+ @_retry_on_gcp_http_exception(
1215
1221
  f'Failed to wait for operation {operation["name"]}')
1216
1222
  def call_operation(fn, timeout: int):
1217
1223
  request = fn(name=operation['name'])
@@ -1379,7 +1385,7 @@ class GCPTPUVMInstance(GCPInstance):
1379
1385
  f'Failed to get VPC name for instance {instance}') from e
1380
1386
 
1381
1387
  @classmethod
1382
- @_retry_on_http_exception('unable to queue the operation')
1388
+ @_retry_on_gcp_http_exception('unable to queue the operation')
1383
1389
  def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
1384
1390
  labels: dict) -> None:
1385
1391
  while True:
@@ -976,7 +976,7 @@ def terminate_instances(
976
976
  _terminate_node(namespace, context, pod_name)
977
977
 
978
978
  # Run pod termination in parallel
979
- subprocess_utils.run_in_parallel(_terminate_pod_thread, pods.items(),
979
+ subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
980
980
  _NUM_THREADS)
981
981
 
982
982
 
@@ -7,6 +7,7 @@ import os
7
7
  import re
8
8
  import shutil
9
9
  import subprocess
10
+ import time
10
11
  import typing
11
12
  from typing import Any, Dict, List, Optional, Set, Tuple, Union
12
13
  from urllib.parse import urlparse
@@ -105,6 +106,75 @@ ANNOTATIONS_POD_NOT_FOUND_ERROR_MSG = ('Pod {pod_name} not found in namespace '
105
106
 
106
107
  logger = sky_logging.init_logger(__name__)
107
108
 
109
+ # Default retry settings for Kubernetes API calls
110
+ DEFAULT_MAX_RETRIES = 3
111
+ DEFAULT_RETRY_INTERVAL_SECONDS = 1
112
+
113
+
114
+ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
115
+ retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
116
+ resource_type: Optional[str] = None):
117
+ """Decorator to retry Kubernetes API calls on transient failures.
118
+
119
+ Args:
120
+ max_retries: Maximum number of retry attempts
121
+ retry_interval: Initial seconds to wait between retries
122
+ resource_type: Type of resource being accessed (e.g. 'node', 'pod').
123
+ Used to provide more specific error messages.
124
+ """
125
+
126
+ def decorator(func):
127
+
128
+ @functools.wraps(func)
129
+ def wrapper(*args, **kwargs):
130
+ last_exception = None
131
+ backoff = common_utils.Backoff(initial_backoff=retry_interval,
132
+ max_backoff_factor=3)
133
+
134
+ for attempt in range(max_retries):
135
+ try:
136
+ return func(*args, **kwargs)
137
+ except (kubernetes.max_retry_error(),
138
+ kubernetes.api_exception(),
139
+ kubernetes.config_exception()) as e:
140
+ last_exception = e
141
+ # Don't retry on permanent errors like 401 (Unauthorized)
142
+ # or 403 (Forbidden)
143
+ if (isinstance(e, kubernetes.api_exception()) and
144
+ e.status in (401, 403)):
145
+ raise
146
+ if attempt < max_retries - 1:
147
+ sleep_time = backoff.current_backoff()
148
+ logger.debug(f'Kubernetes API call {func.__name__} '
149
+ f'failed with {str(e)}. Retrying in '
150
+ f'{sleep_time:.1f}s...')
151
+ time.sleep(sleep_time)
152
+ continue
153
+
154
+ # Format error message based on the type of exception
155
+ resource_msg = f' when trying to get {resource_type} info' \
156
+ if resource_type else ''
157
+ debug_cmd = f' To debug, run: kubectl get {resource_type}s' \
158
+ if resource_type else ''
159
+
160
+ if isinstance(last_exception, kubernetes.max_retry_error()):
161
+ error_msg = f'Timed out{resource_msg} from Kubernetes cluster.'
162
+ elif isinstance(last_exception, kubernetes.api_exception()):
163
+ error_msg = (f'Kubernetes API error{resource_msg}: '
164
+ f'{str(last_exception)}')
165
+ else:
166
+ error_msg = (f'Kubernetes configuration error{resource_msg}: '
167
+ f'{str(last_exception)}')
168
+
169
+ raise exceptions.ResourcesUnavailableError(
170
+ f'{error_msg}'
171
+ f' Please check if the cluster is healthy and retry.'
172
+ f'{debug_cmd}') from last_exception
173
+
174
+ return wrapper
175
+
176
+ return decorator
177
+
108
178
 
109
179
  class GPULabelFormatter:
110
180
  """Base class to define a GPU label formatter for a Kubernetes cluster
@@ -446,6 +516,7 @@ def detect_accelerator_resource(
446
516
 
447
517
 
448
518
  @functools.lru_cache(maxsize=10)
519
+ @_retry_on_error(resource_type='node')
449
520
  def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
450
521
  """Gets the kubernetes nodes in the context.
451
522
 
@@ -454,17 +525,12 @@ def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
454
525
  if context is None:
455
526
  context = get_current_kube_config_context_name()
456
527
 
457
- try:
458
- nodes = kubernetes.core_api(context).list_node(
459
- _request_timeout=kubernetes.API_TIMEOUT).items
460
- except kubernetes.max_retry_error():
461
- raise exceptions.ResourcesUnavailableError(
462
- 'Timed out when trying to get node info from Kubernetes cluster. '
463
- 'Please check if the cluster is healthy and retry. To debug, run: '
464
- 'kubectl get nodes') from None
528
+ nodes = kubernetes.core_api(context).list_node(
529
+ _request_timeout=kubernetes.API_TIMEOUT).items
465
530
  return nodes
466
531
 
467
532
 
533
+ @_retry_on_error(resource_type='pod')
468
534
  def get_all_pods_in_kubernetes_cluster(
469
535
  context: Optional[str] = None) -> List[Any]:
470
536
  """Gets pods in all namespaces in kubernetes cluster indicated by context.
@@ -474,14 +540,8 @@ def get_all_pods_in_kubernetes_cluster(
474
540
  if context is None:
475
541
  context = get_current_kube_config_context_name()
476
542
 
477
- try:
478
- pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
479
- _request_timeout=kubernetes.API_TIMEOUT).items
480
- except kubernetes.max_retry_error():
481
- raise exceptions.ResourcesUnavailableError(
482
- 'Timed out when trying to get pod info from Kubernetes cluster. '
483
- 'Please check if the cluster is healthy and retry. To debug, run: '
484
- 'kubectl get pods') from None
543
+ pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
544
+ _request_timeout=kubernetes.API_TIMEOUT).items
485
545
  return pods
486
546
 
487
547
 
@@ -1758,8 +1818,6 @@ def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
1758
1818
  else:
1759
1819
  destination[key].extend(value)
1760
1820
  else:
1761
- if destination is None:
1762
- destination = {}
1763
1821
  destination[key] = value
1764
1822
 
1765
1823
 
sky/resources.py CHANGED
@@ -540,7 +540,7 @@ class Resources:
540
540
  if memory_gb <= 0:
541
541
  with ux_utils.print_exception_no_traceback():
542
542
  raise ValueError(
543
- f'The "cpus" field should be positive. Found: {memory!r}')
543
+ f'The "memory" field should be positive. Found: {memory!r}')
544
544
 
545
545
  def _set_accelerators(
546
546
  self,