skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +31 -3
  5. sky/backends/cloud_vm_ray_backend.py +22 -29
  6. sky/backends/wheel_utils.py +9 -0
  7. sky/check.py +1 -1
  8. sky/cli.py +253 -74
  9. sky/client/cli.py +253 -74
  10. sky/client/common.py +10 -3
  11. sky/client/sdk.py +11 -8
  12. sky/clouds/aws.py +2 -2
  13. sky/clouds/kubernetes.py +0 -8
  14. sky/clouds/oci.py +1 -1
  15. sky/core.py +17 -11
  16. sky/dashboard/out/404.html +1 -0
  17. sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
  21. sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
  25. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
  37. sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
  38. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
  39. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
  40. sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
  41. sky/dashboard/out/clusters/[cluster].html +1 -0
  42. sky/dashboard/out/clusters.html +1 -0
  43. sky/dashboard/out/favicon.ico +0 -0
  44. sky/dashboard/out/index.html +1 -0
  45. sky/dashboard/out/jobs/[job].html +1 -0
  46. sky/dashboard/out/jobs.html +1 -0
  47. sky/dashboard/out/skypilot.svg +15 -0
  48. sky/dashboard/out/videos/cursor-small.mp4 +0 -0
  49. sky/data/data_transfer.py +2 -1
  50. sky/data/storage.py +24 -14
  51. sky/exceptions.py +5 -0
  52. sky/jobs/constants.py +8 -1
  53. sky/jobs/server/core.py +12 -8
  54. sky/models.py +28 -0
  55. sky/optimizer.py +7 -9
  56. sky/provision/kubernetes/config.py +1 -1
  57. sky/provision/kubernetes/instance.py +16 -14
  58. sky/provision/kubernetes/network_utils.py +1 -1
  59. sky/provision/kubernetes/utils.py +50 -22
  60. sky/provision/provisioner.py +2 -1
  61. sky/resources.py +56 -2
  62. sky/serve/__init__.py +2 -0
  63. sky/serve/autoscalers.py +6 -2
  64. sky/serve/client/sdk.py +61 -0
  65. sky/serve/constants.py +6 -0
  66. sky/serve/load_balancing_policies.py +0 -4
  67. sky/serve/replica_managers.py +6 -8
  68. sky/serve/serve_state.py +0 -6
  69. sky/serve/serve_utils.py +33 -1
  70. sky/serve/server/core.py +192 -7
  71. sky/serve/server/server.py +28 -0
  72. sky/server/common.py +152 -47
  73. sky/server/constants.py +7 -1
  74. sky/server/requests/executor.py +4 -0
  75. sky/server/requests/payloads.py +12 -15
  76. sky/server/requests/serializers/decoders.py +2 -5
  77. sky/server/requests/serializers/encoders.py +2 -5
  78. sky/server/server.py +44 -1
  79. sky/setup_files/MANIFEST.in +1 -0
  80. sky/setup_files/dependencies.py +1 -0
  81. sky/sky_logging.py +12 -2
  82. sky/skylet/constants.py +5 -7
  83. sky/skylet/job_lib.py +3 -3
  84. sky/skypilot_config.py +225 -84
  85. sky/templates/kubernetes-ray.yml.j2 +7 -3
  86. sky/utils/cli_utils/status_utils.py +12 -5
  87. sky/utils/config_utils.py +39 -15
  88. sky/utils/controller_utils.py +44 -7
  89. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  90. sky/utils/kubernetes/gpu_labeler.py +99 -16
  91. sky/utils/schemas.py +24 -0
  92. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +2 -1
  93. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +97 -64
  94. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
  95. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
sky/jobs/constants.py CHANGED
@@ -1,5 +1,5 @@
 """Constants used for Managed Jobs."""
-from typing import Dict, Union
+from typing import Any, Dict, Union

 from sky.skylet import constants as skylet_constants

@@ -23,6 +23,13 @@ CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
     'disk_size': 50
 }

+# Autostop config for the jobs controller. These are the default values for
+# jobs.controller.autostop in ~/.sky/config.yaml.
+CONTROLLER_AUTOSTOP: Dict[str, Any] = {
+    'idle_minutes': 10,
+    'down': False,
+}
+
 # TODO(zhwu): This is no longer accurate, after #4592, which increases the
 # length of user hash appended to the cluster name from 4 to 8 chars. This makes
 # the cluster name on GCP being wrapped twice. However, we cannot directly
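These defaults are what jobs.controller.autostop in ~/.sky/config.yaml falls back to. A minimal sketch of the overlay semantics implied by the comment above (resolve_autostop is a hypothetical helper; the real merging lives in sky/utils/controller_utils.py, file 88 in the list):

    import copy

    CONTROLLER_AUTOSTOP = {'idle_minutes': 10, 'down': False}

    def resolve_autostop(user_config: dict) -> dict:
        # Overlay jobs.controller.autostop from the user config onto defaults.
        resolved = copy.deepcopy(CONTROLLER_AUTOSTOP)
        resolved.update(
            user_config.get('jobs', {}).get('controller', {}).get('autostop', {}))
        return resolved

    # Keep the 10-minute default, but tear the controller down when idle:
    merged = resolve_autostop({'jobs': {'controller': {'autostop': {'down': True}}}})
    assert merged == {'idle_minutes': 10, 'down': True}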
sky/jobs/server/core.py CHANGED
@@ -144,6 +144,9 @@ def launch(
     controller_resources = controller_utils.get_controller_resources(
         controller=controller_utils.Controllers.JOBS_CONTROLLER,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
+    controller_idle_minutes_to_autostop, controller_down = (
+        controller_utils.get_controller_autostop_config(
+            controller=controller_utils.Controllers.JOBS_CONTROLLER))

     vars_to_fill = {
         'remote_user_yaml_path': remote_user_yaml_path,
@@ -185,14 +188,15 @@ def launch(
     # Launch with the api server's user hash, so that sky status does not
     # show the owner of the controller as whatever user launched it first.
     with common.with_server_user_hash():
-        return execution.launch(task=controller_task,
-                                cluster_name=controller_name,
-                                stream_logs=stream_logs,
-                                idle_minutes_to_autostop=skylet_constants.
-                                CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
-                                retry_until_up=True,
-                                fast=True,
-                                _disable_controller_check=True)
+        return execution.launch(
+            task=controller_task,
+            cluster_name=controller_name,
+            stream_logs=stream_logs,
+            idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
+            down=controller_down,
+            retry_until_up=True,
+            fast=True,
+            _disable_controller_check=True)


 def queue_from_kubernetes_pod(
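get_controller_autostop_config itself is not part of this diff (it was added in sky/utils/controller_utils.py, file 88 in the list); the call site above only requires that it return an (idle_minutes, down) pair. A hypothetical stand-in illustrating that contract:

    from typing import Tuple

    def get_controller_autostop_config_sketch(autostop: dict) -> Tuple[int, bool]:
        # Stand-in only: unpack the merged autostop config for execution.launch.
        return autostop['idle_minutes'], autostop['down']

    idle_minutes, down = get_controller_autostop_config_sketch(
        {'idle_minutes': 10, 'down': False})
    assert (idle_minutes, down) == (10, False)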
sky/models.py CHANGED
@@ -28,3 +28,31 @@ class KubernetesNodeInfo:
     # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
     total: Dict[str, int]
     free: Dict[str, int]
+
+
+@dataclasses.dataclass
+class KubernetesNodesInfo:
+    """Dataclass to store Kubernetes node info map."""
+    # The nodes in the cluster, keyed by node name.
+    node_info_dict: Dict[str, KubernetesNodeInfo]
+    # Additional hint for the node info.
+    hint: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'node_info_dict': {
+                node_name: dataclasses.asdict(node_info)
+                for node_name, node_info in self.node_info_dict.items()
+            },
+            'hint': self.hint,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'KubernetesNodesInfo':
+        return cls(
+            node_info_dict={
+                node_name: KubernetesNodeInfo(**node_info)
+                for node_name, node_info in data['node_info_dict'].items()
+            },
+            hint=data['hint'],
+        )
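The to_dict/from_dict pair gives the model a plain-dict wire format, which is what keeps legacy clients working when fields are added later. A self-contained round-trip check using stand-in dataclasses that mirror only the pattern above (the real KubernetesNodeInfo carries more fields, e.g. accelerator_type; values here are made up):

    import dataclasses
    from typing import Any, Dict

    @dataclasses.dataclass
    class NodeInfoStandIn:
        total: Dict[str, int]
        free: Dict[str, int]

    @dataclasses.dataclass
    class NodesInfoStandIn:
        node_info_dict: Dict[str, NodeInfoStandIn]
        hint: str

        def to_dict(self) -> Dict[str, Any]:
            return {
                'node_info_dict': {
                    name: dataclasses.asdict(info)
                    for name, info in self.node_info_dict.items()
                },
                'hint': self.hint,
            }

        @classmethod
        def from_dict(cls, data: Dict[str, Any]) -> 'NodesInfoStandIn':
            return cls(
                node_info_dict={
                    name: NodeInfoStandIn(**info)
                    for name, info in data['node_info_dict'].items()
                },
                hint=data['hint'],
            )

    original = NodesInfoStandIn(
        node_info_dict={'node-1': NodeInfoStandIn(total={'gpu': 2}, free={'gpu': 1})},
        hint='')
    assert NodesInfoStandIn.from_dict(original.to_dict()) == original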
sky/optimizer.py CHANGED
@@ -335,9 +335,6 @@ class Optimizer:
                             orig_resources)

             for resources in launchable_list:
-                if do_print:
-                    logger.debug(f'resources: {resources}')
-
                 if minimize_cost:
                     cost_per_node = resources.get_cost(estimated_runtime)
                     num_available_reserved_nodes = (
@@ -355,13 +352,14 @@ class Optimizer:
                     # Minimize run time.
                     estimated_cost_or_time = estimated_runtime
                 if do_print:
-                    logger.debug(
-                        '  estimated_runtime: {:.0f} s ({:.1f} hr)'.format(
-                            estimated_runtime, estimated_runtime / 3600))
+                    debug_msg = (
+                        f'resources: {resources}, '
+                        f'estimated_runtime: {estimated_runtime} s '
+                        f'({estimated_runtime / 3600:.1f} hr)')
                     if minimize_cost:
-                        logger.debug(
-                            '  estimated_cost (not incl. egress): ${:.1f}'.
-                            format(estimated_cost_or_time))
+                        debug_msg += (', estimated_cost: '
+                                      f'${estimated_cost_or_time:.1f}')
+                    logger.debug(debug_msg)
                 node_to_cost_map[node][resources] = estimated_cost_or_time
             if not node_to_cost_map[node]:
                 source_hint = 'catalog'
sky/provision/kubernetes/config.py CHANGED
@@ -43,7 +43,7 @@ def bootstrap_instances(
     if (requested_service_account ==
             kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
         # If the user has requested a different service account (via pod_config
-        # in ~/.sky/skyconfig.yaml), we assume they have already set up the
+        # in ~/.sky/config.yaml), we assume they have already set up the
        # necessary roles and role bindings.
        # If not, set up the roles and bindings for skypilot-service-account
        # here.
sky/provision/kubernetes/instance.py CHANGED
@@ -720,7 +720,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 f'{common_utils.format_exception(e)}'
                 'Continuing without using nvidia RuntimeClass.\n'
                 'If you are on a K3s cluster, manually '
-                'override runtimeClassName in ~/.sky/skyconfig.yaml. '
+                'override runtimeClassName in ~/.sky/config.yaml. '
                 'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html')  # pylint: disable=line-too-long

     needs_gpus = False
@@ -879,8 +879,8 @@ def stop_instances(
     raise NotImplementedError()


-def _terminate_node(namespace: str, context: Optional[str],
-                    pod_name: str) -> None:
+def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
+                    is_head: bool) -> None:
     """Terminate a pod."""
     logger.debug('terminate_instances: calling delete_namespaced_pod')

@@ -918,16 +918,18 @@ def _terminate_node(namespace: str, context: Optional[str],
         else:
             raise

-    # Delete services for the pod
-    for service_name in [pod_name, f'{pod_name}-ssh']:
-        _delete_k8s_resource_with_retry(
-            delete_func=lambda name=service_name: kubernetes.core_api(
-                context).delete_namespaced_service(name=name,
-                                                   namespace=namespace,
-                                                   _request_timeout=config_lib.
-                                                   DELETION_TIMEOUT),
-            resource_type='service',
-            resource_name=service_name)
+    if is_head:
+        # Delete services for the head pod
+        # services are specified in sky/templates/kubernetes-ray.yml.j2
+        for service_name in [pod_name, f'{pod_name}-ssh']:
+            _delete_k8s_resource_with_retry(
+                delete_func=lambda name=service_name: kubernetes.core_api(
+                    context).delete_namespaced_service(
+                        name=name,
+                        namespace=namespace,
+                        _request_timeout=config_lib.DELETION_TIMEOUT),
+                resource_type='service',
+                resource_name=service_name)

     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
@@ -974,7 +976,7 @@ def terminate_instances(
         if _is_head(pod) and worker_only:
             return
         logger.debug(f'Terminating instance {pod_name}: {pod}')
-        _terminate_node(namespace, context, pod_name)
+        _terminate_node(namespace, context, pod_name, _is_head(pod))

     # Run pod termination in parallel
     subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
sky/provision/kubernetes/network_utils.py CHANGED
@@ -66,7 +66,7 @@ def get_networking_mode(
     except ValueError as e:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(str(e) +
-                             ' Please check: ~/.sky/skyconfig.yaml.') from None
+                             ' Please check: ~/.sky/config.yaml.') from None
     return networking_mode


sky/provision/kubernetes/utils.py CHANGED
@@ -1336,13 +1336,19 @@ def check_credentials(context: Optional[str],
         return False, ('An error occurred: '
                        f'{common_utils.format_exception(e, use_bracket=True)}')

+    # Check if $KUBECONFIG envvar consists of multiple paths. We run this before
+    # optional checks.
+    try:
+        _ = _get_kubeconfig_path()
+    except ValueError as e:
+        return False, f'{common_utils.format_exception(e, use_bracket=True)}'
+
     # If we reach here, the credentials are valid and Kubernetes cluster is up.
     if not run_optional_checks:
         return True, None

     # We now do softer checks to check if exec based auth is used and to
     # see if the cluster is GPU-enabled.
-
     _, exec_msg = is_kubeconfig_exec_auth(context)

     # We now check if GPUs are available and labels are set correctly on the
@@ -1454,14 +1460,14 @@ def is_kubeconfig_exec_auth(


     Using exec-based authentication is problematic when used in conjunction
-    with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/skyconfig.yaml.
+    with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/config.yaml.
     This is because the exec-based authentication may not have the relevant
     dependencies installed on the remote cluster or may have hardcoded paths
     that are not available on the remote cluster.

     Returns:
         bool: True if exec-based authentication is used and LOCAL_CREDENTIAL
-        mode is used for remote_identity in ~/.sky/skyconfig.yaml.
+        mode is used for remote_identity in ~/.sky/config.yaml.
         str: Error message if exec-based authentication is used, None otherwise
     """
     k8s = kubernetes.kubernetes
@@ -1489,9 +1495,8 @@ def is_kubeconfig_exec_auth(
     # K8s api does not provide a mechanism to get the user details from the
     # context. We need to load the kubeconfig file and parse it to get the
     # user details.
-    kubeconfig_path = os.path.expanduser(
-        os.getenv('KUBECONFIG',
-                  k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION))
+    kubeconfig_path = _get_kubeconfig_path()
+
     # Load the kubeconfig file as a dictionary
     with open(kubeconfig_path, 'r', encoding='utf-8') as f:
         kubeconfig = yaml.safe_load(f)
@@ -1514,7 +1519,7 @@ def is_kubeconfig_exec_auth(
                     'Managed Jobs or SkyServe controller on Kubernetes. '
                     'To fix, configure SkyPilot to create a service account '
                     'for running pods by setting the following in '
-                    '~/.sky/skyconfig.yaml:\n'
+                    '~/.sky/config.yaml:\n'
                     '    kubernetes:\n'
                     '      remote_identity: SERVICE_ACCOUNT\n'
                     '    More: https://docs.skypilot.co/en/latest/'
@@ -2252,7 +2257,7 @@ def combine_pod_config_fields(
     cluster_config_overrides: Dict[str, Any],
 ) -> None:
     """Adds or updates fields in the YAML with fields from the
-    ~/.sky/skyconfig.yaml's kubernetes.pod_spec dict.
+    ~/.sky/config.yaml's kubernetes.pod_spec dict.
     This can be used to add fields to the YAML that are not supported by
     SkyPilot yet, or require simple configuration (e.g., adding an
     imagePullSecrets field).
@@ -2312,7 +2317,7 @@ def combine_pod_config_fields(

 def combine_metadata_fields(cluster_yaml_path: str) -> None:
     """Updates the metadata for all Kubernetes objects created by SkyPilot with
-    fields from the ~/.sky/skyconfig.yaml's kubernetes.custom_metadata dict.
+    fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.

     Obeys the same add or update semantics as combine_pod_config_fields().
     """
@@ -2538,9 +2543,15 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:


 def get_kubernetes_node_info(
-        context: Optional[str] = None) -> Dict[str, models.KubernetesNodeInfo]:
+        context: Optional[str] = None) -> models.KubernetesNodesInfo:
     """Gets the resource information for all the nodes in the cluster.

+    This function returns a model with node info map as a nested field. This
+    allows future extensions while keeping the client-server compatibility,
+    e.g. when adding a new field to the model, the legacy clients will not be
+    affected and new clients can opt-in new behavior if the new field is
+    presented.
+
     Currently only GPU resources are supported. The function returns the total
     number of GPUs available on the node and the number of free GPUs on the
     node.
@@ -2549,8 +2560,8 @@ def get_kubernetes_node_info(
     namespaces, the function will return free GPUs as -1.

     Returns:
-        Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
-        key and the KubernetesNodeInfo object as value
+        KubernetesNodesInfo: A model that contains the node info map and other
+        information.
     """
     nodes = get_kubernetes_nodes(context=context)
     # Get the pods to get the real-time resource usage
@@ -2569,6 +2580,7 @@ def get_kubernetes_node_info(
     label_keys = lf.get_label_keys()

     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
+    has_multi_host_tpu = False

     for node in nodes:
         accelerator_name = None
@@ -2605,6 +2617,7 @@ def get_kubernetes_node_info(
             # TODO(Doyoung): Remove the logic when adding support for
             # multi-host TPUs.
             if is_multi_host_tpu(node.metadata.labels):
+                has_multi_host_tpu = True
                 continue

         node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
@@ -2612,8 +2625,15 @@ def get_kubernetes_node_info(
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)})
+    hint = ''
+    if has_multi_host_tpu:
+        hint = ('(Note: Multi-host TPUs are detected and excluded from the '
+                'display as multi-host TPUs are not supported.)')

-    return node_info_dict
+    return models.KubernetesNodesInfo(
+        node_info_dict=node_info_dict,
+        hint=hint,
+    )


 def to_label_selector(tags):
@@ -2860,15 +2880,6 @@ def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
     return False


-def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
-    """Checks if there exists a multi-host TPU within the cluster."""
-    nodes = get_kubernetes_nodes(context=context)
-    for node in nodes:
-        if is_multi_host_tpu(node.metadata.labels):
-            return True
-    return False
-
-
 @dataclasses.dataclass
 class KubernetesSkyPilotClusterInfo:
     cluster_name_on_cloud: str
@@ -3017,3 +3028,20 @@ def get_gpu_resource_key():
     # Else use default.
     # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
     return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
+
+
+def _get_kubeconfig_path() -> str:
+    """Get the path to the kubeconfig file.
+    Parses `KUBECONFIG` env var if present, else uses the default path.
+    Currently, specifying multiple KUBECONFIG paths in the envvar is not
+    allowed, hence will raise a ValueError.
+    """
+    kubeconfig_path = os.path.expanduser(
+        os.getenv(
+            'KUBECONFIG', kubernetes.kubernetes.config.kube_config.
+            KUBE_CONFIG_DEFAULT_LOCATION))
+    if len(kubeconfig_path.split(os.pathsep)) > 1:
+        raise ValueError('SkyPilot currently only supports one '
+                         'config file path with $KUBECONFIG. Current '
+                         f'path(s) are {kubeconfig_path}.')
+    return kubeconfig_path
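The new helper fails fast on multi-path $KUBECONFIG values. A self-contained repro of just that validation step (on POSIX, os.pathsep is ':'):

    import os

    def check_single_kubeconfig(value: str) -> str:
        # Mirrors the new check: exactly one kubeconfig path is allowed.
        path = os.path.expanduser(value)
        if len(path.split(os.pathsep)) > 1:
            raise ValueError('SkyPilot currently only supports one '
                             'config file path with $KUBECONFIG. Current '
                             f'path(s) are {path}.')
        return path

    check_single_kubeconfig('~/.kube/config')  # accepted
    try:
        check_single_kubeconfig('~/.kube/a' + os.pathsep + '~/.kube/b')
    except ValueError as e:
        print(e)  # rejected: two paths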
sky/provision/provisioner.py CHANGED
@@ -670,6 +670,7 @@ def post_provision_runtime_setup(
                 ux_utils.error_message(
                     'Failed to set up SkyPilot runtime on cluster.',
                     provision_logging.config.log_path))
-            logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
+            if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+                logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
             with ux_utils.print_exception_no_traceback():
                 raise
sky/resources.py CHANGED
@@ -18,6 +18,7 @@ from sky.skylet import constants
 from sky.utils import accelerator_registry
 from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import config_utils
 from sky.utils import log_utils
 from sky.utils import registry
 from sky.utils import resources_utils
@@ -28,6 +29,10 @@ logger = sky_logging.init_logger(__name__)

 _DEFAULT_DISK_SIZE_GB = 256

+RESOURCE_CONFIG_ALIASES = {
+    'gpus': 'accelerators',
+}
+

 class Resources:
     """Resources: compute requirements of Tasks.
@@ -1290,6 +1295,22 @@ class Resources:
     def copy(self, **override) -> 'Resources':
         """Returns a copy of the given Resources."""
         use_spot = self.use_spot if self._use_spot_specified else None
+
+        current_override_configs = self._cluster_config_overrides
+        if self._cluster_config_overrides is None:
+            current_override_configs = {}
+        new_override_configs = override.pop('_cluster_config_overrides', {})
+        overlaid_configs = skypilot_config.overlay_skypilot_config(
+            original_config=config_utils.Config(current_override_configs),
+            override_configs=new_override_configs,
+        )
+        override_configs = config_utils.Config()
+        for key in constants.OVERRIDEABLE_CONFIG_KEYS_IN_TASK:
+            elem = overlaid_configs.get_nested(key, None)
+            if elem is not None:
+                override_configs.set_nested(key, elem)
+
+        override_configs = dict(override_configs) if override_configs else None
         resources = Resources(
             cloud=override.pop('cloud', self.cloud),
             instance_type=override.pop('instance_type', self.instance_type),
@@ -1315,8 +1336,7 @@ class Resources:
             _is_image_managed=override.pop('_is_image_managed',
                                            self._is_image_managed),
             _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
-            _cluster_config_overrides=override.pop(
-                '_cluster_config_overrides', self._cluster_config_overrides),
+            _cluster_config_overrides=override_configs,
         )
         assert not override
         return resources
@@ -1349,12 +1369,46 @@ class Resources:
             features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
         return features

+    @staticmethod
+    def _apply_resource_config_aliases(
+            config: Optional[Dict[str, Any]]) -> None:
+        """Mutatively applies overriding aliases to the passed in config.
+
+        Note: Nested aliases are not supported.
+        The preferred way to support nested aliases would be to cast
+        the parsed resource config dictionary to a config_utils.Config object
+        and use the get_, set_, and pop_ nested methods accordingly.
+        However, this approach comes at a significant memory cost as get_
+        and pop_nested create deep copies of the config.
+        """
+        if not config:
+            return
+
+        for alias, canonical in RESOURCE_CONFIG_ALIASES.items():
+            if alias in config:
+                if canonical in config:
+                    raise exceptions.InvalidSkyPilotConfigError(
+                        f'Cannot specify both {alias} '
+                        f'and {canonical} in config.')
+                config[canonical] = config[alias]
+                del config[alias]
+
     @classmethod
     def from_yaml_config(
         cls, config: Optional[Dict[str, Any]]
     ) -> Union[Set['Resources'], List['Resources']]:
         if config is None:
             return {Resources()}
+
+        Resources._apply_resource_config_aliases(config)
+        anyof = config.get('any_of')
+        if anyof is not None and isinstance(anyof, list):
+            for anyof_config in anyof:
+                Resources._apply_resource_config_aliases(anyof_config)
+        ordered = config.get('ordered')
+        if ordered is not None and isinstance(ordered, list):
+            for ordered_config in ordered:
+                Resources._apply_resource_config_aliases(ordered_config)
         common_utils.validate_schema(config, schemas.get_resources_schema(),
                                      'Invalid resources YAML: ')
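The alias rewrite is a plain in-place key substitution. A self-contained mirror of the behavior (using ValueError in place of the project's InvalidSkyPilotConfigError so the snippet runs standalone):

    RESOURCE_CONFIG_ALIASES = {'gpus': 'accelerators'}

    def apply_aliases(config: dict) -> None:
        # Rewrite each alias key to its canonical name, in place.
        for alias, canonical in RESOURCE_CONFIG_ALIASES.items():
            if alias in config:
                if canonical in config:
                    raise ValueError(
                        f'Cannot specify both {alias} and {canonical} in config.')
                config[canonical] = config.pop(alias)

    cfg = {'gpus': 'A100:1', 'cpus': '4+'}
    apply_aliases(cfg)
    assert cfg == {'accelerators': 'A100:1', 'cpus': '4+'}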
sky/serve/__init__.py CHANGED
@@ -3,6 +3,7 @@ import os

 from sky.serve.client.sdk import down
 from sky.serve.client.sdk import status
+from sky.serve.client.sdk import sync_down_logs
 from sky.serve.client.sdk import tail_logs
 from sky.serve.client.sdk import terminate_replica
 from sky.serve.client.sdk import up
@@ -37,6 +38,7 @@ __all__ = [
     'LB_POLICIES',
     'ReplicaStatus',
     'ServiceComponent',
+    'sync_down_logs',
     'ServiceStatus',
     'ServeCodeGen',
     'SkyServiceSpec',
sky/serve/autoscalers.py CHANGED
@@ -676,8 +676,12 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
             # because the provisioning spot can fail to UP due to the capacity
             # issue, and on-demand should fill the gap between the required
             # number of spot and ready spot.
-            num_ondemand_to_provision += (num_spot_to_provision -
-                                          num_ready_spot)
+            # When scaling down spot instances, it is possible that the number
+            # of ready spot is more than the number of spot to provision, thus
+            # generate a negative number. In this case, we don't need to
+            # provision on-demand instances.
+            num_ondemand_to_provision += max(
+                0, num_spot_to_provision - num_ready_spot)

         # Make sure we don't launch on-demand fallback for
         # overprovisioned replicas.
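A quick worked example of the clamp, with made-up counts from a scale-down in progress: 5 spot replicas are ready while only 2 are wanted, so the old expression subtracted from the on-demand target.

    num_spot_to_provision, num_ready_spot = 2, 5
    old_contribution = num_spot_to_provision - num_ready_spot          # -3 (bug)
    new_contribution = max(0, num_spot_to_provision - num_ready_spot)  # 0
    assert (old_contribution, new_contribution) == (-3, 0)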
sky/serve/client/sdk.py CHANGED
@@ -374,3 +374,64 @@ def tail_logs(service_name: str,
     )
     request_id = server_common.get_request_id(response)
     sdk.stream_response(request_id, response, output_stream)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+def sync_down_logs(service_name: str,
+                   local_dir: str,
+                   *,
+                   targets: Optional[Union[
+                       str, 'serve_utils.ServiceComponent',
+                       List[Union[str,
+                                  'serve_utils.ServiceComponent']]]] = None,
+                   replica_ids: Optional[List[int]] = None) -> None:
+    """Sync down logs from the service components to a local directory.
+
+    This function syncs logs from the specified service components (controller,
+    load balancer, replicas) via the API server to a specified local directory.
+
+    Args:
+        service_name: The name of the service to download logs from.
+        targets: Which component(s) to download logs for. If None or empty,
+            means download all logs (controller, load-balancer, all replicas).
+            Can be a string (e.g. "controller"), or a `ServiceComponent` object,
+            or a list of them for multiple components. Currently accepted
+            values:
+            - "controller"/ServiceComponent.CONTROLLER
+            - "load_balancer"/ServiceComponent.LOAD_BALANCER
+            - "replica"/ServiceComponent.REPLICA
+        replica_ids: The list of replica IDs to download logs from, specified
+            when target includes `ServiceComponent.REPLICA`. If target includes
+            `ServiceComponent.REPLICA` but this is None/empty, logs for all
+            replicas will be downloaded.
+        local_dir: Local directory to sync down logs to. Defaults to
+            `~/sky_logs`.
+
+    Raises:
+        RuntimeError: If fails to gather logs or fails to rsync from the
+            controller.
+        sky.exceptions.ClusterNotUpError: If the controller is not up.
+        ValueError: Arguments not valid.
+    """
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    body = payloads.ServeDownloadLogsBody(
+        service_name=service_name,
+        # No need to set here, since the server will override it
+        # to a directory on the API server.
+        local_dir=local_dir,
+        targets=targets,
+        replica_ids=replica_ids,
+    )
+    response = requests.post(
+        f'{server_common.get_server_url()}/serve/sync-down-logs',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None),
+    )
+    remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
+
+    # Download from API server paths to the client's local_dir
+    client_common.download_logs_from_api_server([remote_dir], remote_dir,
+                                                local_dir)
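For reference, a hedged usage sketch of the new entrypoint as re-exported from sky.serve (service name and directory are made up; assumes a reachable API server and an existing service):

    import sky.serve

    # Download controller and replica-1 logs into ./service_logs.
    sky.serve.sync_down_logs('my-service',
                             './service_logs',
                             targets=['controller', 'replica'],
                             replica_ids=[1])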
sky/serve/constants.py CHANGED
@@ -66,6 +66,12 @@ AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS = 1200
 # disk space. Maybe we could use a larger disk size, migrate to cloud storage or
 # do some log rotation.
 CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200}
+# Autostop config for the serve controller. These are the default values for
+# serve.controller.autostop in ~/.sky/config.yaml.
+CONTROLLER_AUTOSTOP = {
+    'idle_minutes': 10,
+    'down': False,
+}

 # Due to the CPU/memory usage of the controller process launched with a job on
 # controller VM (use ray job under the hood), we need to reserve some CPU/memory
sky/serve/load_balancing_policies.py CHANGED
@@ -15,10 +15,6 @@ logger = sky_logging.init_logger(__name__)
 # Define a registry for load balancing policies
 LB_POLICIES = {}
 DEFAULT_LB_POLICY = None
-# Prior to #4439, the default policy was round_robin. We store the legacy
-# default policy here to maintain backwards compatibility. Remove this after
-# 2 minor release, i.e., 0.9.0.
-LEGACY_DEFAULT_POLICY = 'round_robin'


 def _request_repr(request: 'fastapi.Request') -> str:
sky/serve/replica_managers.py CHANGED
@@ -257,14 +257,6 @@ class ReplicaStatusProperty:
     # is set to True and it can fail immediately due to spot availability.
     failed_spot_availability: bool = False

-    def remove_terminated_replica(self) -> bool:
-        """Whether to remove the replica record from the replica table.
-
-        If not, the replica will stay in the replica table permanently to
-        notify the user that something is wrong with the user code / setup.
-        """
-        return self.is_scale_down
-
     def unrecoverable_failure(self) -> bool:
         """Whether the replica fails and cannot be recovered.

@@ -730,6 +722,12 @@ class SkyPilotReplicaManager(ReplicaManager):
                            replica_drain_delay_seconds: int,
                            is_scale_down: bool = False,
                            purge: bool = False) -> None:
+        left_in_record = not (is_scale_down or purge)
+        if left_in_record:
+            assert sync_down_logs, (
+                'For the replica left in the record, '
+                'the logs should always be synced down. '
+                'So that the user can see the logs to debug.')

         if replica_id in self._launch_process_pool:
             info = serve_state.get_replica_info_from_id(self._service_name,
sky/serve/serve_state.py CHANGED
@@ -11,7 +11,6 @@ from typing import Any, Dict, List, Optional, Tuple
 import colorama

 from sky.serve import constants
-from sky.serve import load_balancing_policies as lb_policies
 from sky.utils import db_utils

 if typing.TYPE_CHECKING:
@@ -335,11 +334,6 @@ def _get_service_from_row(row) -> Dict[str, Any]:
     (current_version, name, controller_job_id, controller_port,
      load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
      _, active_versions, load_balancing_policy, tls_encrypted) = row[:15]
-    if load_balancing_policy is None:
-        # This entry in database was added in #4439, and it will always be set
-        # to a str value. If it is None, it means it is an legacy entry and is
-        # using the legacy default policy.
-        load_balancing_policy = lb_policies.LEGACY_DEFAULT_POLICY
     return {
         'name': name,
         'controller_job_id': controller_job_id,