skypilot-nightly 1.0.0.dev20241110__py3-none-any.whl → 1.0.0.dev20241112__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'dddd65187953a5d6b32f762bea78eed1f109ec3c'
+_SKYPILOT_COMMIT_SHA = '140125eaad5fb64da37934c8f6650d68aa135f77'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241110'
+__version__ = '1.0.0.dev20241112'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/cli.py CHANGED
@@ -3102,6 +3102,7 @@ def show_gpus(
     kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
     kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
         sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
+    no_permissions_str = '<no permissions>'
 
     def _list_to_str(lst):
         return ', '.join([str(e) for e in lst])
@@ -3146,9 +3147,11 @@ def show_gpus(
                             debug_msg)
            raise ValueError(full_err_msg)
        for gpu, _ in sorted(counts.items()):
+            available_qty = available[gpu] if available[gpu] != -1 else (
+                no_permissions_str)
            realtime_gpu_table.add_row([
                gpu,
-                _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu]
+                _list_to_str(counts.pop(gpu)), capacity[gpu], available_qty
            ])
        return realtime_gpu_table
@@ -3158,10 +3161,11 @@ def show_gpus(
 
        node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
        for node_name, node_info in node_info_dict.items():
+            available = node_info.free['nvidia.com/gpu'] if node_info.free[
+                'nvidia.com/gpu'] != -1 else no_permissions_str
            node_table.add_row([
                node_name, node_info.gpu_type,
-                node_info.total['nvidia.com/gpu'],
-                node_info.free['nvidia.com/gpu']
+                node_info.total['nvidia.com/gpu'], available
            ])
        return node_table
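Note: the two hunks above make `sky show-gpus` render the -1 sentinel (returned by the catalog and node-info layers when pods cannot be listed) as `<no permissions>` rather than a misleading negative count. A toy standalone rendering helper illustrating the same substitution (the helper name is ours, not SkyPilot's):

def _render_gpu_qty(qty: int, no_permissions_str: str = '<no permissions>') -> str:
    # -1 is the sentinel for "could not list pods (403 Forbidden)".
    return no_permissions_str if qty == -1 else str(qty)

assert _render_gpu_qty(8) == '8'
assert _render_gpu_qty(-1) == '<no permissions>'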
 
sky/clouds/service_catalog/kubernetes_catalog.py CHANGED
@@ -10,6 +10,7 @@ from typing import Dict, List, Optional, Set, Tuple
 from sky import check as sky_check
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
+from sky.adaptors import kubernetes
 from sky.clouds import Kubernetes
 from sky.clouds.service_catalog import CloudFilter
 from sky.clouds.service_catalog import common
@@ -22,6 +23,8 @@ if typing.TYPE_CHECKING:
 else:
     pd = adaptors_common.LazyImport('pandas')
 
+logger = sky_logging.init_logger(__name__)
+
 _PULL_FREQUENCY_HOURS = 7
 
 # We keep pull_frequency_hours so we can remotely update the default image paths
@@ -77,6 +80,11 @@ def list_accelerators_realtime(
     require_price: bool = True
 ) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
                                                                           int]]:
+    """List accelerators in the Kubernetes cluster.
+
+    If the user does not have sufficient permissions to list pods in all
+    namespaces, the function will return free GPUs as -1.
+    """
     # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
     # function from kubernetes_utils.
     del all_regions, require_price  # Unused.
@@ -108,7 +116,17 @@ def list_accelerators_realtime(
     key = label_formatter.get_label_key()
     nodes = kubernetes_utils.get_kubernetes_nodes(context)
     # Get the pods to get the real-time GPU usage
-    pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
+    try:
+        pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
+    except kubernetes.api_exception() as e:
+        if e.status == 403:
+            logger.warning('Failed to get pods in the Kubernetes cluster '
+                           '(forbidden). Please check if your account has '
+                           'necessary permissions to list pods. Realtime GPU '
+                           'availability information may be incorrect.')
+            pods = None
+        else:
+            raise
     # Total number of GPUs in the cluster
     total_accelerators_capacity: Dict[str, int] = {}
     # Total number of GPUs currently available in the cluster
@@ -141,6 +159,21 @@ def list_accelerators_realtime(
             if accelerator_count not in accelerators_qtys:
                 accelerators_qtys.add((accelerator_name, accelerator_count))
 
+            if accelerator_count >= min_quantity_filter:
+                quantized_count = (min_quantity_filter *
+                                   (accelerator_count // min_quantity_filter))
+                if accelerator_name not in total_accelerators_capacity:
+                    total_accelerators_capacity[
+                        accelerator_name] = quantized_count
+                else:
+                    total_accelerators_capacity[
+                        accelerator_name] += quantized_count
+
+            if pods is None:
+                # If we can't get the pods, we can't get the GPU usage
+                total_accelerators_available[accelerator_name] = -1
+                continue
+
             for pod in pods:
                 # Get all the pods running on the node
                 if (pod.spec.node_name == node.metadata.name and
@@ -155,16 +188,6 @@ def list_accelerators_realtime(
 
             accelerators_available = accelerator_count - allocated_qty
 
-            if accelerator_count >= min_quantity_filter:
-                quantized_count = (min_quantity_filter *
-                                   (accelerator_count // min_quantity_filter))
-                if accelerator_name not in total_accelerators_capacity:
-                    total_accelerators_capacity[
-                        accelerator_name] = quantized_count
-                else:
-                    total_accelerators_capacity[
-                        accelerator_name] += quantized_count
-
             if accelerator_name not in total_accelerators_available:
                 total_accelerators_available[accelerator_name] = 0
             if accelerators_available >= min_quantity_filter:
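The catalog change above degrades gracefully when the active credentials cannot list pods cluster-wide: a 403 from the Kubernetes API is logged as a warning and free-GPU counts fall back to the -1 sentinel. A minimal self-contained sketch of the same pattern, written against the official `kubernetes` Python client rather than SkyPilot's adaptors (the function and constant names here are illustrative):

from typing import List, Optional

from kubernetes import client, config
from kubernetes.client.rest import ApiException

NO_PERMISSION_SENTINEL = -1  # hypothetical name for the -1 sentinel above


def list_pods_or_none(context: Optional[str] = None) -> Optional[List]:
    """Return all pods, or None if the account lacks list-pods permission."""
    config.load_kube_config(context=context)
    try:
        return client.CoreV1Api().list_pod_for_all_namespaces().items
    except ApiException as e:
        if e.status == 403:
            # Forbidden: degrade gracefully instead of failing the command.
            return None
        raise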
sky/provision/docker_utils.py CHANGED
@@ -20,7 +20,7 @@ SETUP_ENV_VARS_CMD = (
     '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
     'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > '  # pylint: disable=line-too-long
     '~/container_env_var.sh && '
-    '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
+    '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;'
 )
 
 # Docker daemon may not be ready when the machine is firstly started. The error
sky/provision/kubernetes/instance.py CHANGED
@@ -333,52 +333,37 @@ def _run_function_with_retries(func: Callable,
                 raise
 
 
-def _set_env_vars_in_pods(namespace: str, context: Optional[str],
-                          new_pods: List):
-    """Setting environment variables in pods.
-
-    Once all containers are ready, we can exec into them and set env vars.
-    Kubernetes automatically populates containers with critical
-    environment variables, such as those for discovering services running
-    in the cluster and CUDA/nvidia environment variables. We need to
-    make sure these env vars are available in every task and ssh session.
-    This is needed for GPU support and service discovery.
-    See https://github.com/skypilot-org/skypilot/issues/2287 for
-    more details.
-
-    To do so, we capture env vars from the pod's runtime and write them to
-    /etc/profile.d/, making them available for all users in future
-    shell sessions.
-    """
-    set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
-
-    def _set_env_vars_thread(new_pod):
-        pod_name = new_pod.metadata.name
-        logger.info(f'{"-"*20}Start: Set up env vars in pod {pod_name!r} '
-                    f'{"-"*20}')
-        runner = command_runner.KubernetesCommandRunner(
-            ((namespace, context), pod_name))
-
-        def _run_env_vars_cmd():
-            rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
-                                       require_outputs=True,
-                                       stream_logs=False)
-            _raise_command_running_error('set env vars', set_k8s_env_var_cmd,
-                                         pod_name, rc, stdout)
-
-        _run_function_with_retries(_run_env_vars_cmd,
-                                   f'set env vars in pod {pod_name}')
-        logger.info(f'{"-"*20}End: Set up env vars in pod {pod_name!r} '
-                    f'{"-"*20}')
-
-    subprocess_utils.run_in_parallel(_set_env_vars_thread, new_pods,
-                                     NUM_THREADS)
-
-
-def _check_user_privilege(namespace: str, context: Optional[str],
-                          new_nodes: List) -> None:
-    # Checks if the default user has sufficient privilege to set up
-    # the kubernetes instance pod.
+def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
+    """Pre-initialization step for SkyPilot pods.
+
+    This step is run in the pod right after it is created and before the
+    SkyPilot runtime is set up.
+
+    It includes three key steps:
+
+    1. Privilege check: Checks if the default user has sufficient privilege
+       to set up the kubernetes instance pod.
+    2. SSH setup: Sets up SSH for the pod instance.
+    3. Environment variable setup to populate k8s env vars in the pod.
+
+    Make sure commands used in these steps are generic and work
+    on most base images. E.g., do not use Python, since that may not
+    be installed by default.
+
+    If you run any apt commands, be sure to check if the lock is available.
+    It is possible the `apt update` run in the pod container args may still
+    be running.
+
+    Args:
+        namespace (str): Kubernetes namespace.
+        context (Optional[str]): Kubernetes context.
+        new_nodes (List): List of new pod instances.
+
+    Raises:
+        config_lib.KubernetesError: If user privileges are insufficient or
+            setup fails.
+    """
     check_k8s_user_sudo_cmd = (
         'if [ $(id -u) -eq 0 ]; then'
         # If user is root, create an alias for sudo used in skypilot setup
@@ -386,56 +371,67 @@ def _check_user_privilege(namespace: str, context: Optional[str],
         'else '
         '  if command -v sudo >/dev/null 2>&1; then '
         '    timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
-        f'  ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
+        f'  ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
+        f'    exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
         '  else '
-        f'  ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
+        f'  ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
+        f'    exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
         '  fi; '
-        'fi')
+        'fi;')
+
+    # Kubernetes automatically populates containers with critical
+    # environment variables, such as those for discovering services running
+    # in the cluster and CUDA/nvidia environment variables. We need to
+    # make sure these env vars are available in every task and ssh session.
+    # This is needed for GPU support and service discovery.
+    # See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
+    # To do so, we capture env vars from the pod's runtime and write them to
+    # /etc/profile.d/, making them available for all users in future
+    # shell sessions.
+    set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
 
-    # This check needs to run on a per-image basis, so running the check on
-    # any one pod is sufficient.
-    new_node = new_nodes[0]
-    pod_name = new_node.metadata.name
+    check_apt_update_complete_cmd = (
+        'echo "Checking if apt update from container init is complete..."; '
+        'timeout_secs=600; '
+        'start_time=$(date +%s); '
+        'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
+        '  echo "apt update still running. Logs:"; '
+        '  cat /tmp/apt-update.log; '
+        '  current_time=$(date +%s); '
+        '  elapsed=$((current_time - start_time)); '
+        '  if [ $elapsed -ge $timeout_secs ]; then '
+        '    echo "Timed out waiting for apt update"; '
+        '    exit 1; '
+        '  fi; '
+        '  sleep 5; '
+        'done; '
+        'echo "apt update complete."; ')
 
-    runner = command_runner.KubernetesCommandRunner(
-        ((namespace, context), pod_name))
-    logger.info(f'{"-"*20}Start: Check user privilege in pod {pod_name!r} '
-                f'{"-"*20}')
-
-    def _run_privilege_check():
-        rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
-                                        require_outputs=True,
-                                        separate_stderr=True,
-                                        stream_logs=False)
-        _raise_command_running_error('check user privilege',
-                                     check_k8s_user_sudo_cmd, pod_name, rc,
-                                     stdout + stderr)
-        return stdout
-
-    stdout = _run_function_with_retries(
-        _run_privilege_check, f'check user privilege in pod {pod_name!r}')
-
-    if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
-        raise config_lib.KubernetesError(
-            'Insufficient system privileges detected. '
-            'Ensure the default user has root access or '
-            '"sudo" is installed and the user is added to the sudoers '
-            'from the image.')
-    logger.info(f'{"-"*20}End: Check user privilege in pod {pod_name!r} '
-                f'{"-"*20}')
-
-
-def _setup_ssh_in_pods(namespace: str, context: Optional[str],
-                       new_nodes: List) -> None:
-    # Setting up ssh for the pod instance. This is already setup for
-    # the jump pod so it does not need to be run for it.
-    set_k8s_ssh_cmd = (
-        'set -ex; '
+    install_ssh_k8s_cmd = (
         'prefix_cmd() '
         '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
         'export DEBIAN_FRONTEND=noninteractive;'
-        '$(prefix_cmd) apt-get update;'
-        '$(prefix_cmd) apt install openssh-server rsync -y; '
+        'echo "Installing missing packages..."; '
+        'for i in {1..5}; do '
+        '  output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
+        '  rc=$?; '
+        '  if [ $rc -eq 0 ]; then '
+        '    break; '
+        '  fi; '
+        '  echo "$output" | grep -qi "could not get lock" || '
+        '  grep -qi "Unable to acquire the dpkg frontend lock"; '
+        '  if [ $? -eq 0 ]; then '
+        '    echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
+        '    sleep 5; '
+        '  else '
+        '    echo "apt install failed for a non-lock reason: $output"; '
+        '    exit $rc; '
+        '  fi; '
+        'done; '
+        'if [ $rc -ne 0 ]; then '
+        '  echo "apt install failed after 5 attempts due to lock errors."; '
+        '  exit $rc; '
+        'fi; '
        '$(prefix_cmd) mkdir -p /var/run/sshd; '
        '$(prefix_cmd) '
        'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
@@ -456,24 +452,35 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
        # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device  # pylint: disable=line-too-long
        '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
 
-    def _setup_ssh_thread(new_node):
+    pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
+                    set_k8s_env_var_cmd + check_apt_update_complete_cmd +
+                    install_ssh_k8s_cmd)
+
+    def _pre_init_thread(new_node):
         pod_name = new_node.metadata.name
+        logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
         runner = command_runner.KubernetesCommandRunner(
             ((namespace, context), pod_name))
-        logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
 
-        def _run_ssh_setup():
-            rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
-                                       require_outputs=True,
-                                       stream_logs=False)
-            _raise_command_running_error('setup ssh', set_k8s_ssh_cmd, pod_name,
-                                         rc, stdout)
+        # Run the combined pre-init command
+        rc, stdout, _ = runner.run(pre_init_cmd,
+                                   require_outputs=True,
+                                   stream_logs=False)
+        if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
+            raise config_lib.KubernetesError(
+                'Insufficient system privileges detected. '
+                'Ensure the default user has root access or '
+                '"sudo" is installed and the user is added to the sudoers '
+                'from the image.')
+
+        op_name = 'pre-init'
+        _raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
+                                     stdout)
 
-        _run_function_with_retries(_run_ssh_setup,
-                                   f'setup ssh in pod {pod_name!r}')
-        logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
+        logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
 
-    subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes, NUM_THREADS)
+    # Run pre_init in parallel across all new_nodes
+    subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, NUM_THREADS)
 
 
 def _label_pod(namespace: str, context: Optional[str], pod_name: str,
@@ -724,13 +731,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                     f'pods: {list(uninitialized_pods.keys())}')
        uninitialized_pods_list = list(uninitialized_pods.values())
 
-        # Setup SSH and environment variables in pods.
-        # Make sure commands used in these methods are generic and work
-        # on most base images. E.g., do not use Python, since that may not
-        # be installed by default.
-        _check_user_privilege(namespace, context, uninitialized_pods_list)
-        _setup_ssh_in_pods(namespace, context, uninitialized_pods_list)
-        _set_env_vars_in_pods(namespace, context, uninitialized_pods_list)
+        # Run pre-init steps in the pod.
+        pre_init(namespace, context, uninitialized_pods_list)
 
        for pod in uninitialized_pods.values():
            _label_pod(namespace,
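The refactor above collapses what used to be three separate exec round trips per pod (_check_user_privilege, _setup_ssh_in_pods, _set_env_vars_in_pods) into one combined pre_init command string run once per pod, with the privilege failure now signaled through the shell exit code (INSUFFICIENT_PRIVILEGES_CODE) instead of parsed stdout. A simplified sketch of the concatenate-and-run-once idea, using kubectl as a stand-in runner (the command fragments and runner below are placeholders, not SkyPilot's):

import subprocess

# Stand-ins for the real shell fragments defined in the diff above.
check_privilege_cmd = 'id -u >/dev/null; '
set_env_vars_cmd = 'touch /tmp/env_vars_done; '
install_ssh_cmd = 'echo "would install sshd"; '

# One exec into the pod instead of three separate round trips.
pre_init_cmd = ('set -ex; ' + check_privilege_cmd + set_env_vars_cmd +
                install_ssh_cmd)


def run_in_pod(pod_name: str, namespace: str, cmd: str) -> int:
    """Run a shell command inside a pod via kubectl (illustrative runner)."""
    proc = subprocess.run(['kubectl', 'exec', '-n', namespace, pod_name,
                           '--', '/bin/bash', '-c', cmd],
                          check=False)
    return proc.returncode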
sky/provision/kubernetes/utils.py CHANGED
@@ -1801,13 +1801,22 @@ def get_kubernetes_node_info(
         number of GPUs available on the node and the number of free GPUs on the
         node.
 
+    If the user does not have sufficient permissions to list pods in all
+    namespaces, the function will return free GPUs as -1.
+
     Returns:
         Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
             key and the KubernetesNodeInfo object as value
     """
     nodes = get_kubernetes_nodes(context)
     # Get the pods to get the real-time resource usage
-    pods = get_all_pods_in_kubernetes_cluster(context)
+    try:
+        pods = get_all_pods_in_kubernetes_cluster(context)
+    except kubernetes.api_exception() as e:
+        if e.status == 403:
+            pods = None
+        else:
+            raise
 
     label_formatter, _ = detect_gpu_label_formatter(context)
     if not label_formatter:
@@ -1828,19 +1837,22 @@ def get_kubernetes_node_info(
         accelerator_count = int(node.status.allocatable.get(
             'nvidia.com/gpu', 0))
 
-        for pod in pods:
-            # Get all the pods running on the node
-            if (pod.spec.node_name == node.metadata.name and
-                    pod.status.phase in ['Running', 'Pending']):
-                # Iterate over all the containers in the pod and sum the
-                # GPU requests
-                for container in pod.spec.containers:
-                    if container.resources.requests:
-                        allocated_qty += int(
-                            container.resources.requests.get(
-                                'nvidia.com/gpu', 0))
-
-        accelerators_available = accelerator_count - allocated_qty
+        if pods is None:
+            accelerators_available = -1
+
+        else:
+            for pod in pods:
+                # Get all the pods running on the node
+                if (pod.spec.node_name == node.metadata.name and
+                        pod.status.phase in ['Running', 'Pending']):
+                    # Iterate over all the containers in the pod and sum the
+                    # GPU requests
+                    for container in pod.spec.containers:
+                        if container.resources.requests:
+                            allocated_qty += int(
+                                container.resources.requests.get(
+                                    'nvidia.com/gpu', 0))
+            accelerators_available = accelerator_count - allocated_qty
 
         node_info_dict[node.metadata.name] = KubernetesNodeInfo(
             name=node.metadata.name,
sky/serve/__init__.py CHANGED
@@ -11,6 +11,7 @@ from sky.serve.core import tail_logs
 from sky.serve.core import terminate_replica
 from sky.serve.core import up
 from sky.serve.core import update
+from sky.serve.load_balancing_policies import LB_POLICIES
 from sky.serve.serve_state import ReplicaStatus
 from sky.serve.serve_state import ServiceStatus
 from sky.serve.serve_utils import DEFAULT_UPDATE_MODE
@@ -35,6 +36,7 @@ __all__ = [
     'get_endpoint',
     'INITIAL_VERSION',
     'LB_CONTROLLER_SYNC_INTERVAL_SECONDS',
+    'LB_POLICIES',
     'ReplicaStatus',
     'ServiceComponent',
     'ServiceStatus',
sky/serve/load_balancer.py CHANGED
@@ -2,7 +2,7 @@
 import asyncio
 import logging
 import threading
-from typing import Dict, Union
+from typing import Dict, Optional, Union
 
 import aiohttp
 import fastapi
@@ -27,18 +27,24 @@ class SkyServeLoadBalancer:
     policy.
     """
 
-    def __init__(self, controller_url: str, load_balancer_port: int) -> None:
+    def __init__(self,
+                 controller_url: str,
+                 load_balancer_port: int,
+                 load_balancing_policy_name: Optional[str] = None) -> None:
         """Initialize the load balancer.
 
         Args:
             controller_url: The URL of the controller.
             load_balancer_port: The port where the load balancer listens to.
+            load_balancing_policy_name: The name of the load balancing policy
+                to use. Defaults to None.
         """
         self._app = fastapi.FastAPI()
         self._controller_url: str = controller_url
         self._load_balancer_port: int = load_balancer_port
-        self._load_balancing_policy: lb_policies.LoadBalancingPolicy = (
-            lb_policies.RoundRobinPolicy())
+        # Use the registry to create the load balancing policy
+        self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make(
+            load_balancing_policy_name)
         self._request_aggregator: serve_utils.RequestsAggregator = (
             serve_utils.RequestTimestamp())
         # TODO(tian): httpx.Client has a resource limit of 100 max connections
@@ -223,9 +229,21 @@ class SkyServeLoadBalancer:
         uvicorn.run(self._app, host='0.0.0.0', port=self._load_balancer_port)
 
 
-def run_load_balancer(controller_addr: str, load_balancer_port: int):
-    load_balancer = SkyServeLoadBalancer(controller_url=controller_addr,
-                                         load_balancer_port=load_balancer_port)
+def run_load_balancer(controller_addr: str,
+                      load_balancer_port: int,
+                      load_balancing_policy_name: Optional[str] = None) -> None:
+    """Run the load balancer.
+
+    Args:
+        controller_addr: The address of the controller.
+        load_balancer_port: The port where the load balancer listens to.
+        load_balancing_policy_name: The name of the load balancing policy to
+            use. Defaults to None.
+    """
+    load_balancer = SkyServeLoadBalancer(
+        controller_url=controller_addr,
+        load_balancer_port=load_balancer_port,
+        load_balancing_policy_name=load_balancing_policy_name)
     load_balancer.run()
 
 
@@ -241,5 +259,13 @@ if __name__ == '__main__':
                         required=True,
                         default=8890,
                         help='The port where the load balancer listens to.')
+    available_policies = list(lb_policies.LB_POLICIES.keys())
+    parser.add_argument(
+        '--load-balancing-policy',
+        choices=available_policies,
+        default='round_robin',
+        help=f'The load balancing policy to use. Available policies: '
+        f'{", ".join(available_policies)}.')
     args = parser.parse_args()
-    run_load_balancer(args.controller_addr, args.load_balancer_port)
+    run_load_balancer(args.controller_addr, args.load_balancer_port,
+                      args.load_balancing_policy)
sky/serve/load_balancing_policies.py CHANGED
@@ -10,6 +10,10 @@ if typing.TYPE_CHECKING:
 
 logger = sky_logging.init_logger(__name__)
 
+# Define a registry for load balancing policies
+LB_POLICIES = {}
+DEFAULT_LB_POLICY = None
+
 
 def _request_repr(request: 'fastapi.Request') -> str:
     return ('<Request '
@@ -25,6 +29,24 @@ class LoadBalancingPolicy:
     def __init__(self) -> None:
         self.ready_replicas: List[str] = []
 
+    def __init_subclass__(cls, name: str, default: bool = False):
+        LB_POLICIES[name] = cls
+        if default:
+            global DEFAULT_LB_POLICY
+            assert DEFAULT_LB_POLICY is None, (
+                'Only one policy can be default.')
+            DEFAULT_LB_POLICY = name
+
+    @classmethod
+    def make(cls, policy_name: Optional[str] = None) -> 'LoadBalancingPolicy':
+        """Create a load balancing policy from a name."""
+        if policy_name is None:
+            policy_name = DEFAULT_LB_POLICY
+
+        if policy_name not in LB_POLICIES:
+            raise ValueError(f'Unknown load balancing policy: {policy_name}')
+        return LB_POLICIES[policy_name]()
+
     def set_ready_replicas(self, ready_replicas: List[str]) -> None:
         raise NotImplementedError
 
@@ -44,7 +66,7 @@ class LoadBalancingPolicy:
         raise NotImplementedError
 
 
-class RoundRobinPolicy(LoadBalancingPolicy):
+class RoundRobinPolicy(LoadBalancingPolicy, name='round_robin', default=True):
     """Round-robin load balancing policy."""
 
     def __init__(self) -> None:
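The hunks above turn load balancing policies into a name-keyed registry: subclasses register themselves through `__init_subclass__` keyword arguments, and `LoadBalancingPolicy.make()` instantiates a policy by name, falling back to the single declared default. A self-contained sketch reproducing the registry mechanics from the diff (the policy body is elided):

from typing import Dict, Optional

LB_POLICIES: Dict[str, type] = {}
DEFAULT_LB_POLICY: Optional[str] = None


class LoadBalancingPolicy:

    def __init_subclass__(cls, name: str, default: bool = False):
        # Each subclass declares its registry name in its class statement.
        LB_POLICIES[name] = cls
        if default:
            global DEFAULT_LB_POLICY
            assert DEFAULT_LB_POLICY is None, 'Only one policy can be default.'
            DEFAULT_LB_POLICY = name

    @classmethod
    def make(cls, policy_name: Optional[str] = None) -> 'LoadBalancingPolicy':
        """Create a load balancing policy from a name."""
        if policy_name is None:
            policy_name = DEFAULT_LB_POLICY
        if policy_name not in LB_POLICIES:
            raise ValueError(f'Unknown load balancing policy: {policy_name}')
        return LB_POLICIES[policy_name]()


class RoundRobinPolicy(LoadBalancingPolicy, name='round_robin', default=True):
    """Round-robin load balancing policy (body elided)."""


assert isinstance(LoadBalancingPolicy.make(), RoundRobinPolicy)
assert isinstance(LoadBalancingPolicy.make('round_robin'), RoundRobinPolicy)

Registering inside `__init_subclass__` keeps the registry complete by construction: defining a new policy class is the registration.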
sky/serve/service.py CHANGED
@@ -219,6 +219,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
     load_balancer_port = common_utils.find_free_port(
         constants.LOAD_BALANCER_PORT_START)
 
+    # Extract the load balancing policy from the service spec
+    policy_name = service_spec.load_balancing_policy
+
     # Start the load balancer.
     # TODO(tian): Probably we could enable multiple ports specified in
     # service spec and we could start multiple load balancers.
@@ -227,7 +230,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
         target=ux_utils.RedirectOutputForProcess(
             load_balancer.run_load_balancer,
             load_balancer_log_file).run,
-        args=(controller_addr, load_balancer_port))
+        args=(controller_addr, load_balancer_port, policy_name))
     load_balancer_process.start()
     serve_state.set_service_load_balancer_port(service_name,
                                                load_balancer_port)
sky/serve/service_spec.py CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Dict, Optional
 
 import yaml
 
+from sky import serve
 from sky.serve import constants
 from sky.utils import common_utils
 from sky.utils import schemas
@@ -29,6 +30,7 @@ class SkyServiceSpec:
         base_ondemand_fallback_replicas: Optional[int] = None,
         upscale_delay_seconds: Optional[int] = None,
         downscale_delay_seconds: Optional[int] = None,
+        load_balancing_policy: Optional[str] = None,
     ) -> None:
         if max_replicas is not None and max_replicas < min_replicas:
             with ux_utils.print_exception_no_traceback():
@@ -55,6 +57,13 @@ class SkyServiceSpec:
                 raise ValueError('readiness_path must start with a slash (/). '
                                  f'Got: {readiness_path}')
 
+        # Add the check for unknown load balancing policies
+        if (load_balancing_policy is not None and
+                load_balancing_policy not in serve.LB_POLICIES):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Unknown load balancing policy: {load_balancing_policy}. '
+                    f'Available policies: {list(serve.LB_POLICIES.keys())}')
         self._readiness_path: str = readiness_path
         self._initial_delay_seconds: int = initial_delay_seconds
         self._readiness_timeout_seconds: int = readiness_timeout_seconds
@@ -69,6 +78,7 @@ class SkyServiceSpec:
             int] = base_ondemand_fallback_replicas
         self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
         self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
+        self._load_balancing_policy: Optional[str] = load_balancing_policy
 
         self._use_ondemand_fallback: bool = (
             self.dynamic_ondemand_fallback is not None and
@@ -150,6 +160,8 @@ class SkyServiceSpec:
             service_config['dynamic_ondemand_fallback'] = policy_section.get(
                 'dynamic_ondemand_fallback', None)
 
+        service_config['load_balancing_policy'] = config.get(
+            'load_balancing_policy', None)
         return SkyServiceSpec(**service_config)
 
     @staticmethod
@@ -205,6 +217,8 @@ class SkyServiceSpec:
                         self.upscale_delay_seconds)
         add_if_not_none('replica_policy', 'downscale_delay_seconds',
                         self.downscale_delay_seconds)
+        add_if_not_none('load_balancing_policy', None,
+                        self._load_balancing_policy)
         return config
 
     def probe_str(self):
@@ -256,6 +270,7 @@ class SkyServiceSpec:
         Readiness probe timeout seconds: {self.readiness_timeout_seconds}
         Replica autoscaling policy: {self.autoscaling_policy_str()}
         Spot Policy: {self.spot_policy_str()}
+        Load Balancing Policy: {self.load_balancing_policy}
         """)
 
     @property
@@ -310,3 +325,7 @@ class SkyServiceSpec:
     @property
     def use_ondemand_fallback(self) -> bool:
         return self._use_ondemand_fallback
+
+    @property
+    def load_balancing_policy(self) -> Optional[str]:
+        return self._load_balancing_policy
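End to end, the new field flows from the service YAML through SkyServiceSpec (validated against serve.LB_POLICIES) into run_load_balancer. A hedged sketch of that validation step applied to a hypothetical service YAML (the YAML content and the LB_POLICIES stand-in are illustrative; the real schema is defined in sky/utils/schemas.py below):

import yaml

LB_POLICIES = {'round_robin': object()}  # stand-in for sky.serve.LB_POLICIES

service_yaml = """\
service:
  readiness_probe: /health
  replicas: 2
  load_balancing_policy: round_robin
"""

config = yaml.safe_load(service_yaml)['service']
policy = config.get('load_balancing_policy', None)
if policy is not None and policy not in LB_POLICIES:
    raise ValueError(f'Unknown load balancing policy: {policy}. '
                     f'Available policies: {list(LB_POLICIES.keys())}')
print(f'Using load balancing policy: {policy}')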
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -324,6 +324,15 @@ available_node_types:
           command: ["/bin/bash", "-c", "--"]
           args:
             - |
+              # Helper function to conditionally use sudo
+              prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
+
+              # Run apt update in background and log to a file
+              (
+                DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
+                echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
+              ) &
+
               function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
 
               # Tails file and checks every 5 sec for
@@ -419,7 +428,18 @@ setup_commands:
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  - sudo DEBIAN_FRONTEND=noninteractive apt install lsof gcc patch pciutils rsync fuse curl -y;
+  - |
+    PACKAGES="gcc patch pciutils rsync fuse curl";
+    MISSING_PACKAGES="";
+    for pkg in $PACKAGES; do
+      if ! dpkg -l | grep -q "^ii  $pkg "; then
+        MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
+      fi
+    done;
+    if [ ! -z "$MISSING_PACKAGES" ]; then
+      echo "Installing missing packages: $MISSING_PACKAGES";
+      sudo DEBIAN_FRONTEND=noninteractive apt-get install -y $MISSING_PACKAGES;
+    fi;
     mkdir -p ~/.ssh; touch ~/.ssh/config;
   {%- for initial_setup_command in initial_setup_commands %}
   {{ initial_setup_command }}
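The template change above starts `apt-get update` in the background during container init and logs it to /tmp/apt-update.log; the pre_init step in instance.py then polls that log for the "Fetched" marker before installing packages. A Python stand-in for that poll-with-timeout loop (the shell version in the diff is the authoritative one; this only sketches the logic):

import time


def wait_for_marker(log_path: str, marker: str = 'Fetched',
                    timeout_secs: int = 600, poll_secs: int = 5) -> None:
    """Block until `marker` appears in `log_path`, or raise on timeout."""
    start = time.time()
    while True:
        try:
            with open(log_path, 'r', encoding='utf-8') as f:
                if marker in f.read():
                    return
        except FileNotFoundError:
            pass  # Log not created yet; keep waiting.
        if time.time() - start >= timeout_secs:
            raise TimeoutError(f'Timed out waiting for {marker!r} in {log_path}')
        time.sleep(poll_secs)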
sky/utils/kubernetes/generate_kubeconfig.sh CHANGED
@@ -112,6 +112,9 @@ rules:
 - apiGroups: ["networking.k8s.io"]  # Required for exposing services through ingresses
   resources: ["ingressclasses"]
   verbs: ["get", "list", "watch"]
+- apiGroups: [""]  # Required for sky show-gpus command
+  resources: ["pods"]
+  verbs: ["get", "list"]
 ---
 # ClusterRoleBinding for the service account
 apiVersion: rbac.authorization.k8s.io/v1
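This RBAC addition grants get/list on pods so that `sky show-gpus` can compute real-time GPU availability; without it, the 403 fallback paths added above report `<no permissions>`. A sketch for checking whether the current credentials can list pods, using the official `kubernetes` client's SelfSubjectAccessReview (a standalone diagnostic, not part of SkyPilot):

from kubernetes import client, config


def can_list_pods() -> bool:
    """Ask the API server whether the current identity may list pods."""
    config.load_kube_config()
    review = client.V1SelfSubjectAccessReview(
        spec=client.V1SelfSubjectAccessReviewSpec(
            resource_attributes=client.V1ResourceAttributes(
                verb='list', resource='pods')))
    resp = client.AuthorizationV1Api().create_self_subject_access_review(review)
    return bool(resp.status.allowed)


if __name__ == '__main__':
    print('list pods allowed:', can_list_pods())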
sky/utils/schemas.py CHANGED
@@ -308,6 +308,9 @@ def get_storage_schema():
 
 def get_service_schema():
     """Schema for top-level `service:` field (for SkyServe)."""
+    # To avoid circular imports, only import when needed.
+    # pylint: disable=import-outside-toplevel
+    from sky.serve import load_balancing_policies
     return {
         '$schema': 'https://json-schema.org/draft/2020-12/schema',
         'type': 'object',
@@ -382,6 +385,11 @@ def get_service_schema():
             'replicas': {
                 'type': 'integer',
             },
+            'load_balancing_policy': {
+                'type': 'string',
+                'case_insensitive_enum': list(
+                    load_balancing_policies.LB_POLICIES.keys())
+            },
         }
     }
 
skypilot_nightly-1.0.0.dev20241110.dist-info/METADATA → skypilot_nightly-1.0.0.dev20241112.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: skypilot-nightly
-Version: 1.0.0.dev20241110
+Version: 1.0.0.dev20241112
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -309,7 +309,7 @@ Runnable examples:
 - [LocalGPT](./llm/localgpt)
 - [Falcon](./llm/falcon)
 - Add yours here & see more in [`llm/`](./llm)!
-- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
+- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
 
 Case Studies and Integrations: [Community Spotlights](https://blog.skypilot.co/community/)
 
skypilot_nightly-1.0.0.dev20241110.dist-info/RECORD → skypilot_nightly-1.0.0.dev20241112.dist-info/RECORD RENAMED
@@ -1,8 +1,8 @@
-sky/__init__.py,sha256=j3vy9X4XOYIefQk15d_c6Q_mpDjII9Nltso4xgrFI1o,5882
+sky/__init__.py,sha256=8VuuTyDTVZB1BaeWD7OwBFZwpwweQkb0DNyPpsitRQs,5882
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
 sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
-sky/cli.py,sha256=jEjXs5Z0u263eJIsTHoKyG9oOY6giqw19s2di9kEv1s,212088
+sky/cli.py,sha256=oGBQrCYVWqRTcWR-yCKZY7dmUOUnP5Xuvz_zcFXzqlw,212342
 sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
 sky/core.py,sha256=0-4W_DKJZgbwXuzNZKQ2R_qJxqxbqqNfyi0U0PQBKvQ,38230
 sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
@@ -65,7 +65,7 @@ sky/clouds/service_catalog/cudo_catalog.py,sha256=V_takvL6dWTGQaTLCEvjKIotCDPnMu
 sky/clouds/service_catalog/fluidstack_catalog.py,sha256=21-cvrYEYTIi7n3ZNF2e7_0QX-PF4BkhlVJUWQOvKrY,5059
 sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOiQjNarMnU_qLA,24379
 sky/clouds/service_catalog/ibm_catalog.py,sha256=1iK0KvbI82U7sySb7chr-qm_16x3tTnZ6nIo7o76ouc,4493
-sky/clouds/service_catalog/kubernetes_catalog.py,sha256=5ilQ-JK1ZS2EZp8GpCKok0H3S1fdI_aAznzIDWCY1NY,9110
+sky/clouds/service_catalog/kubernetes_catalog.py,sha256=c6Oot8RC1ujcFmfJbkeJKUWsw3aX0iNvKL1fJg-FoOc,10020
 sky/clouds/service_catalog/lambda_catalog.py,sha256=2R-ccu63BbdvO6X80MtxiniA-jLewXb6I0Ye1rYD9fY,5302
 sky/clouds/service_catalog/oci_catalog.py,sha256=cyA6ZqwHGOKuPxUl_dKmFGdeWdQGMrvl_-o2MtyF998,8580
 sky/clouds/service_catalog/paperspace_catalog.py,sha256=MOlfoGRChjEwMzu4nRAho8DrIwwUJ3QlRzrMA1RLqvE,3789
@@ -106,7 +106,7 @@ sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x
 sky/provision/__init__.py,sha256=llAtnAAzx0TKT17B0JL_2ZiKea9RRQRxSzkWHQYqWTo,6292
 sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
 sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
-sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
+sky/provision/docker_utils.py,sha256=l4AMzwXGZd8RyNq8AwOaKV9bFSofLYfSyj2NBhkXYsY,19200
 sky/provision/instance_setup.py,sha256=gI739UMCqtPqdA522D92bPu5sA3OHBMDmIGmqqxsIwY,23652
 sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
 sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
@@ -137,10 +137,10 @@ sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHa
 sky/provision/gcp/mig_utils.py,sha256=oFpcFZoapHMILSE4iIm8V5bxP1RhbMHRF7cciqq8qAk,7883
 sky/provision/kubernetes/__init__.py,sha256=y6yVfii81WYG3ROxv4hiIj-ydinS5-xGxLvXnARVQoI,719
 sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2vmzldc,29072
-sky/provision/kubernetes/instance.py,sha256=rY43hZOInP20kYofW0MGs7wDbJ4NxMw1FtKAJAPGIOU,43960
+sky/provision/kubernetes/instance.py,sha256=MFtTh-dNIuTZcHD20PQG_QuULFRFaPxwlUczR6sRnsk,43601
 sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
 sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
-sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
+sky/provision/kubernetes/utils.py,sha256=PEDyZnf-dSmQ4dXyS_0x9OYHt9SbY7A6urd436f-WyQ,89923
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -175,18 +175,18 @@ sky/provision/vsphere/common/service_manager_factory.py,sha256=YkvfHiRXFK_Nb406z
 sky/provision/vsphere/common/ssl_helper.py,sha256=TYzN9K0i_Mk_17PKGyGPgvOGfoizysuuIeYapcy_tWE,795
 sky/provision/vsphere/common/vapiconnect.py,sha256=R2I1ZWBA19d11fZ_FrIzQT8E1aLl1HU4Rdcj8Z5r3NE,2932
 sky/provision/vsphere/common/vim_utils.py,sha256=EMWLS8ILpdx6XwUZ9I53y0B_1yFrRrlr4jjIMT84hAc,17877
-sky/serve/__init__.py,sha256=gFZt7W3UPMi4qvYe2xgkHg1VxbR1WGavKyWLBUD3mpg,1731
+sky/serve/__init__.py,sha256=Bqw8nB9u1QF3ryjbV797SPZq0DWAcjT94E_5B8J24ag,1808
 sky/serve/autoscalers.py,sha256=khY1oZ22PRaUQNsLCoNKH178X_NiJw0LSLOKr7_LNgY,30275
 sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
 sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
 sky/serve/core.py,sha256=hszs95BwtC4wIJujGNokvFC46VjojgRz1BbYOIIPh6k,31601
-sky/serve/load_balancer.py,sha256=aUfDsgUT_fYrchCwJCeunMPXmAkwJAY58BEu-IN2FaA,11571
-sky/serve/load_balancing_policies.py,sha256=ExdwH_pxPYpJ6CkoTQCOPSa4lzwbq1LFFMKzmIu8ryk,2331
+sky/serve/load_balancer.py,sha256=I4W66eh1t1kA_C_VaMPI76WeDTCl3Z6rFxF6rQIWd6E,12636
+sky/serve/load_balancing_policies.py,sha256=_k4tkwIvhulR02Ln9ixYB_b97KOypr2xfSjMx8_zky0,3143
 sky/serve/replica_managers.py,sha256=1xYDK9Te5wFEF5hUK0gyNIUib0MY-HScLHUBDlTSl-k,57774
 sky/serve/serve_state.py,sha256=Q7De4GoBEPxlN_t1Lpn-Y1fd94SeHZ3E-94f1OTuhpc,19086
 sky/serve/serve_utils.py,sha256=wqBxChpJylZ_qHWyFmMBJqrG8_7xTIOr9nlOeyHs9P8,39431
-sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
-sky/serve/service_spec.py,sha256=1aS6b-ku7W4CjyekXKDxjZsDdt-O8ygos-jFeXu31cA,13766
+sky/serve/service.py,sha256=gVem2vX8XuR_1wTqwrzbszQAbjzjDP2ddd787aynT9g,12017
+sky/serve/service_spec.py,sha256=34dMQ37INHltBzWaxHl3y_o9X3wLOCWA5jUhmhH1II4,14740
 sky/setup_files/MANIFEST.in,sha256=WF0T89NLichHxZDDSQzvSpiONtAEFyur2MPmGczgTIo,555
 sky/setup_files/setup.py,sha256=G767GNB-jXqyC8MR-IdiojnnI2E6tP4gMYenKU14ZGA,12156
 sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
@@ -228,7 +228,7 @@ sky/templates/jobs-controller.yaml.j2,sha256=Gu3ogFxFYr09VEXP-6zEbrCUOFo1aYxWEjA
 sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
 sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8wyVEbRNFHRsBuLsw,626
 sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
-sky/templates/kubernetes-ray.yml.j2,sha256=dsWlkX-0b1igeZI4c0u0Jzia5I_9gezCiewR6pX1LlY,18374
+sky/templates/kubernetes-ray.yml.j2,sha256=Ek6nePe_IP1b0mqMLnbyjp7wpo1-kwranD_AFRXJ9tU,19152
 sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
 sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
 sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
@@ -257,7 +257,7 @@ sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM
 sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
 sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
 sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
-sky/utils/schemas.py,sha256=mogoStpQ77S936VfChinAW2I1DT4q2c5E7qY_qNiO0w,29094
+sky/utils/schemas.py,sha256=67LK87wBywblIyF-QgG5hgL1BvBuHsxeQLQBO0M5OH4,29447
 sky/utils/subprocess_utils.py,sha256=mMFCTfxbyav5LJ1epJJXkgfFYmd828naTOMVfYjuEWY,6905
 sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
 sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
@@ -269,15 +269,15 @@ sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnv
 sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
 sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=vGj0mD0tejHDRy8ulwKOvOF2mfLyT5J8fp7GVqEe_EY,8478
 sky/utils/kubernetes/generate_kind_config.py,sha256=_TNLnifA_r7-CRq083IP1xjelYqiLjzQX9ohuqYpDH8,3187
-sky/utils/kubernetes/generate_kubeconfig.sh,sha256=AcYhuuG5jXWGHUmyRuH-oKy5qcn92gXhu6bXOt6eD6g,9274
+sky/utils/kubernetes/generate_kubeconfig.sh,sha256=livvxDKV-_xx8-dYWNyo4wlg3sOldeHefI37JXKLXu0,9398
 sky/utils/kubernetes/gpu_labeler.py,sha256=MEUv0U4ACDcNwtFVltlv017XJMjxx1Bndf6fL0i6eqg,6960
 sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7ZWF5gdVIZPupCCo9A,1224
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.dev20241110.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20241110.dist-info/METADATA,sha256=4ar4pUczmGqsEHMG-85ANcAB_ifYgIDJRr0BJfypruA,19708
-skypilot_nightly-1.0.0.dev20241110.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-skypilot_nightly-1.0.0.dev20241110.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20241110.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20241110.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20241112.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20241112.dist-info/METADATA,sha256=Ui6L9CmuvZsIg2D0paU-NiqfVLtywzq5GLpCrJes-eY,19699
+skypilot_nightly-1.0.0.dev20241112.dist-info/WHEEL,sha256=a7TGlA-5DaHMRrarXjVbQagU3Man_dCnGIWMJr5kRWo,91
+skypilot_nightly-1.0.0.dev20241112.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20241112.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20241112.dist-info/RECORD,,
skypilot_nightly-1.0.0.dev20241110.dist-info/WHEEL → skypilot_nightly-1.0.0.dev20241112.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.3.0)
+Generator: setuptools (75.4.0)
 Root-Is-Purelib: true
 Tag: py3-none-any