skypilot-nightly 1.0.0.dev20241110__py3-none-any.whl → 1.0.0.dev20241111__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = 'dddd65187953a5d6b32f762bea78eed1f109ec3c'
8
+ _SKYPILOT_COMMIT_SHA = '91323d86baaeb1341c6953e15bbf19f2896b67ad'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241110'
38
+ __version__ = '1.0.0.dev20241111'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -20,7 +20,7 @@ SETUP_ENV_VARS_CMD = (
20
20
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
21
21
  'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
22
22
  '~/container_env_var.sh && '
23
- '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
23
+ '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;'
24
24
  )
25
25
 
26
26
  # Docker daemon may not be ready when the machine is first started. The error
@@ -333,52 +333,37 @@ def _run_function_with_retries(func: Callable,
333
333
  raise
334
334
 
335
335
 
336
- def _set_env_vars_in_pods(namespace: str, context: Optional[str],
337
- new_pods: List):
338
- """Setting environment variables in pods.
339
-
340
- Once all containers are ready, we can exec into them and set env vars.
341
- Kubernetes automatically populates containers with critical
342
- environment variables, such as those for discovering services running
343
- in the cluster and CUDA/nvidia environment variables. We need to
344
- make sure these env vars are available in every task and ssh session.
345
- This is needed for GPU support and service discovery.
346
- See https://github.com/skypilot-org/skypilot/issues/2287 for
347
- more details.
348
-
349
- To do so, we capture env vars from the pod's runtime and write them to
350
- /etc/profile.d/, making them available for all users in future
351
- shell sessions.
352
- """
353
- set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
336
+ def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
337
+ """Pre-initialization step for SkyPilot pods.
354
338
 
355
- def _set_env_vars_thread(new_pod):
356
- pod_name = new_pod.metadata.name
357
- logger.info(f'{"-"*20}Start: Set up env vars in pod {pod_name!r} '
358
- f'{"-"*20}')
359
- runner = command_runner.KubernetesCommandRunner(
360
- ((namespace, context), pod_name))
339
+ This step is run in the pod right after it is created and before the
340
+ SkyPilot runtime is set up.
361
341
 
362
- def _run_env_vars_cmd():
363
- rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
364
- require_outputs=True,
365
- stream_logs=False)
366
- _raise_command_running_error('set env vars', set_k8s_env_var_cmd,
367
- pod_name, rc, stdout)
342
+ This step consists of three sub-steps:
368
343
 
369
- _run_function_with_retries(_run_env_vars_cmd,
370
- f'set env vars in pod {pod_name}')
371
- logger.info(f'{"-"*20}End: Set up env vars in pod {pod_name!r} '
372
- f'{"-"*20}')
344
+ 1. Privilege check: Checks if the default user has sufficient privilege
345
+ to set up the kubernetes instance pod.
346
+ 2. SSH setup: Sets up SSH for the pod instance.
347
+ 3. Environment variable setup to populate k8s env vars in the pod.
373
348
 
374
- subprocess_utils.run_in_parallel(_set_env_vars_thread, new_pods,
375
- NUM_THREADS)
349
+ Make sure commands used in these methods are generic and work
350
+ on most base images. E.g., do not use Python, since that may not
351
+ be installed by default.
376
352
 
353
+ If you run any apt commands, be sure to check if the lock is available.
354
+ It is possible the `apt update` run in the pod container args may still
355
+ be running.
356
+
357
+ Args:
358
+ namespace (str): Kubernetes namespace.
359
+ context (Optional[str]): Kubernetes context.
360
+ new_nodes (List): List of new pod instances.
361
+
362
+ Raises:
363
+ config_lib.KubernetesError: If user privileges are insufficient or
364
+ setup fails.
365
+ """
377
366
 
378
- def _check_user_privilege(namespace: str, context: Optional[str],
379
- new_nodes: List) -> None:
380
- # Checks if the default user has sufficient privilege to set up
381
- # the kubernetes instance pod.
382
367
  check_k8s_user_sudo_cmd = (
383
368
  'if [ $(id -u) -eq 0 ]; then'
384
369
  # If user is root, create an alias for sudo used in skypilot setup
@@ -386,56 +371,67 @@ def _check_user_privilege(namespace: str, context: Optional[str],
386
371
  'else '
387
372
  ' if command -v sudo >/dev/null 2>&1; then '
388
373
  ' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
389
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
374
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
375
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
390
376
  ' else '
391
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
377
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
378
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
392
379
  ' fi; '
393
- 'fi')
380
+ 'fi;')
381
+
382
+ # Kubernetes automatically populates containers with critical
383
+ # environment variables, such as those for discovering services running
384
+ # in the cluster and CUDA/nvidia environment variables. We need to
385
+ # make sure these env vars are available in every task and ssh session.
386
+ # This is needed for GPU support and service discovery.
387
+ # See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
388
+ # To do so, we capture env vars from the pod's runtime and write them to
389
+ # /etc/profile.d/, making them available for all users in future
390
+ # shell sessions.
391
+ set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
394
392
 
395
- # This check needs to run on a per-image basis, so running the check on
396
- # any one pod is sufficient.
397
- new_node = new_nodes[0]
398
- pod_name = new_node.metadata.name
393
+ check_apt_update_complete_cmd = (
394
+ 'echo "Checking if apt update from container init is complete..."; '
395
+ 'timeout_secs=600; '
396
+ 'start_time=$(date +%s); '
397
+ 'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
398
+ ' echo "apt update still running. Logs:"; '
399
+ ' cat /tmp/apt-update.log; '
400
+ ' current_time=$(date +%s); '
401
+ ' elapsed=$((current_time - start_time)); '
402
+ ' if [ $elapsed -ge $timeout_secs ]; then '
403
+ ' echo "Timed out waiting for apt update"; '
404
+ ' exit 1; '
405
+ ' fi; '
406
+ ' sleep 5; '
407
+ 'done; '
408
+ 'echo "apt update complete."; ')
399
409
 
400
- runner = command_runner.KubernetesCommandRunner(
401
- ((namespace, context), pod_name))
402
- logger.info(f'{"-"*20}Start: Check user privilege in pod {pod_name!r} '
403
- f'{"-"*20}')
404
-
405
- def _run_privilege_check():
406
- rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
407
- require_outputs=True,
408
- separate_stderr=True,
409
- stream_logs=False)
410
- _raise_command_running_error('check user privilege',
411
- check_k8s_user_sudo_cmd, pod_name, rc,
412
- stdout + stderr)
413
- return stdout
414
-
415
- stdout = _run_function_with_retries(
416
- _run_privilege_check, f'check user privilege in pod {pod_name!r}')
417
-
418
- if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
419
- raise config_lib.KubernetesError(
420
- 'Insufficient system privileges detected. '
421
- 'Ensure the default user has root access or '
422
- '"sudo" is installed and the user is added to the sudoers '
423
- 'from the image.')
424
- logger.info(f'{"-"*20}End: Check user privilege in pod {pod_name!r} '
425
- f'{"-"*20}')
426
-
427
-
428
- def _setup_ssh_in_pods(namespace: str, context: Optional[str],
429
- new_nodes: List) -> None:
430
- # Setting up ssh for the pod instance. This is already setup for
431
- # the jump pod so it does not need to be run for it.
432
- set_k8s_ssh_cmd = (
433
- 'set -ex; '
410
+ install_ssh_k8s_cmd = (
434
411
  'prefix_cmd() '
435
412
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
436
413
  'export DEBIAN_FRONTEND=noninteractive;'
437
- '$(prefix_cmd) apt-get update;'
438
- '$(prefix_cmd) apt install openssh-server rsync -y; '
414
+ 'echo "Installing missing packages..."; '
415
+ 'for i in {1..5}; do '
416
+ ' output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
417
+ ' rc=$?; '
418
+ ' if [ $rc -eq 0 ]; then '
419
+ ' break; '
420
+ ' fi; '
421
+ ' echo "$output" | grep -qi "could not get lock" || '
422
+ ' grep -qi "Unable to acquire the dpkg frontend lock"; '
423
+ ' if [ $? -eq 0 ]; then '
424
+ ' echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
425
+ ' sleep 5; '
426
+ ' else '
427
+ ' echo "apt install failed for a non-lock reason: $output"; '
428
+ ' exit $rc; '
429
+ ' fi; '
430
+ 'done; '
431
+ 'if [ $rc -ne 0 ]; then '
432
+ ' echo "apt install failed after 5 attempts due to lock errors."; '
433
+ ' exit $rc; '
434
+ 'fi; '
439
435
  '$(prefix_cmd) mkdir -p /var/run/sshd; '
440
436
  '$(prefix_cmd) '
441
437
  'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
@@ -456,24 +452,35 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
456
452
  # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
457
453
  '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
458
454
 
459
- def _setup_ssh_thread(new_node):
455
+ pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
456
+ set_k8s_env_var_cmd + check_apt_update_complete_cmd +
457
+ install_ssh_k8s_cmd)
458
+
459
+ def _pre_init_thread(new_node):
460
460
  pod_name = new_node.metadata.name
461
+ logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
461
462
  runner = command_runner.KubernetesCommandRunner(
462
463
  ((namespace, context), pod_name))
463
- logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
464
464
 
465
- def _run_ssh_setup():
466
- rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
467
- require_outputs=True,
468
- stream_logs=False)
469
- _raise_command_running_error('setup ssh', set_k8s_ssh_cmd, pod_name,
470
- rc, stdout)
465
+ # Run the combined pre-init command
466
+ rc, stdout, _ = runner.run(pre_init_cmd,
467
+ require_outputs=True,
468
+ stream_logs=False)
469
+ if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
470
+ raise config_lib.KubernetesError(
471
+ 'Insufficient system privileges detected. '
472
+ 'Ensure the default user has root access or '
473
+ '"sudo" is installed and the user is added to the sudoers '
474
+ 'from the image.')
475
+
476
+ op_name = 'pre-init'
477
+ _raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
478
+ stdout)
471
479
 
472
- _run_function_with_retries(_run_ssh_setup,
473
- f'setup ssh in pod {pod_name!r}')
474
- logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
480
+ logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
475
481
 
476
- subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes, NUM_THREADS)
482
+ # Run pre_init in parallel across all new_nodes
483
+ subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, NUM_THREADS)
477
484
 
478
485
 
479
486
  def _label_pod(namespace: str, context: Optional[str], pod_name: str,
@@ -724,13 +731,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
724
731
  f'pods: {list(uninitialized_pods.keys())}')
725
732
  uninitialized_pods_list = list(uninitialized_pods.values())
726
733
 
727
- # Setup SSH and environment variables in pods.
728
- # Make sure commands used in these methods are generic and work
729
- # on most base images. E.g., do not use Python, since that may not
730
- # be installed by default.
731
- _check_user_privilege(namespace, context, uninitialized_pods_list)
732
- _setup_ssh_in_pods(namespace, context, uninitialized_pods_list)
733
- _set_env_vars_in_pods(namespace, context, uninitialized_pods_list)
734
+ # Run pre-init steps in the pod.
735
+ pre_init(namespace, context, uninitialized_pods_list)
734
736
 
735
737
  for pod in uninitialized_pods.values():
736
738
  _label_pod(namespace,
sky/serve/__init__.py CHANGED
@@ -11,6 +11,7 @@ from sky.serve.core import tail_logs
11
11
  from sky.serve.core import terminate_replica
12
12
  from sky.serve.core import up
13
13
  from sky.serve.core import update
14
+ from sky.serve.load_balancing_policies import LB_POLICIES
14
15
  from sky.serve.serve_state import ReplicaStatus
15
16
  from sky.serve.serve_state import ServiceStatus
16
17
  from sky.serve.serve_utils import DEFAULT_UPDATE_MODE
@@ -35,6 +36,7 @@ __all__ = [
35
36
  'get_endpoint',
36
37
  'INITIAL_VERSION',
37
38
  'LB_CONTROLLER_SYNC_INTERVAL_SECONDS',
39
+ 'LB_POLICIES',
38
40
  'ReplicaStatus',
39
41
  'ServiceComponent',
40
42
  'ServiceStatus',
@@ -2,7 +2,7 @@
2
2
  import asyncio
3
3
  import logging
4
4
  import threading
5
- from typing import Dict, Union
5
+ from typing import Dict, Optional, Union
6
6
 
7
7
  import aiohttp
8
8
  import fastapi
@@ -27,18 +27,24 @@ class SkyServeLoadBalancer:
27
27
  policy.
28
28
  """
29
29
 
30
- def __init__(self, controller_url: str, load_balancer_port: int) -> None:
30
+ def __init__(self,
31
+ controller_url: str,
32
+ load_balancer_port: int,
33
+ load_balancing_policy_name: Optional[str] = None) -> None:
31
34
  """Initialize the load balancer.
32
35
 
33
36
  Args:
34
37
  controller_url: The URL of the controller.
35
38
  load_balancer_port: The port where the load balancer listens to.
39
+ load_balancing_policy_name: The name of the load balancing policy
40
+ to use. Defaults to None.
36
41
  """
37
42
  self._app = fastapi.FastAPI()
38
43
  self._controller_url: str = controller_url
39
44
  self._load_balancer_port: int = load_balancer_port
40
- self._load_balancing_policy: lb_policies.LoadBalancingPolicy = (
41
- lb_policies.RoundRobinPolicy())
45
+ # Use the registry to create the load balancing policy
46
+ self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make(
47
+ load_balancing_policy_name)
42
48
  self._request_aggregator: serve_utils.RequestsAggregator = (
43
49
  serve_utils.RequestTimestamp())
44
50
  # TODO(tian): httpx.Client has a resource limit of 100 max connections
@@ -223,9 +229,21 @@ class SkyServeLoadBalancer:
223
229
  uvicorn.run(self._app, host='0.0.0.0', port=self._load_balancer_port)
224
230
 
225
231
 
226
- def run_load_balancer(controller_addr: str, load_balancer_port: int):
227
- load_balancer = SkyServeLoadBalancer(controller_url=controller_addr,
228
- load_balancer_port=load_balancer_port)
232
+ def run_load_balancer(controller_addr: str,
233
+ load_balancer_port: int,
234
+ load_balancing_policy_name: Optional[str] = None) -> None:
235
+ """Run the load balancer.
236
+
237
+ Args:
238
+ controller_addr: The address of the controller.
239
+ load_balancer_port: The port where the load balancer listens to.
240
+ load_balancing_policy_name: The name of the load balancing policy to use. Defaults to
241
+ None.
242
+ """
243
+ load_balancer = SkyServeLoadBalancer(
244
+ controller_url=controller_addr,
245
+ load_balancer_port=load_balancer_port,
246
+ load_balancing_policy_name=load_balancing_policy_name)
229
247
  load_balancer.run()
230
248
 
231
249
 
@@ -241,5 +259,13 @@ if __name__ == '__main__':
241
259
  required=True,
242
260
  default=8890,
243
261
  help='The port where the load balancer listens to.')
262
+ available_policies = list(lb_policies.LB_POLICIES.keys())
263
+ parser.add_argument(
264
+ '--load-balancing-policy',
265
+ choices=available_policies,
266
+ default='round_robin',
267
+ help=f'The load balancing policy to use. Available policies: '
268
+ f'{", ".join(available_policies)}.')
244
269
  args = parser.parse_args()
245
- run_load_balancer(args.controller_addr, args.load_balancer_port)
270
+ run_load_balancer(args.controller_addr, args.load_balancer_port,
271
+ args.load_balancing_policy)
@@ -10,6 +10,10 @@ if typing.TYPE_CHECKING:
10
10
 
11
11
  logger = sky_logging.init_logger(__name__)
12
12
 
13
+ # Define a registry for load balancing policies
14
+ LB_POLICIES = {}
15
+ DEFAULT_LB_POLICY = None
16
+
13
17
 
14
18
  def _request_repr(request: 'fastapi.Request') -> str:
15
19
  return ('<Request '
@@ -25,6 +29,24 @@ class LoadBalancingPolicy:
25
29
  def __init__(self) -> None:
26
30
  self.ready_replicas: List[str] = []
27
31
 
32
+ def __init_subclass__(cls, name: str, default: bool = False):
33
+ LB_POLICIES[name] = cls
34
+ if default:
35
+ global DEFAULT_LB_POLICY
36
+ assert DEFAULT_LB_POLICY is None, (
37
+ 'Only one policy can be default.')
38
+ DEFAULT_LB_POLICY = name
39
+
40
+ @classmethod
41
+ def make(cls, policy_name: Optional[str] = None) -> 'LoadBalancingPolicy':
42
+ """Create a load balancing policy from a name."""
43
+ if policy_name is None:
44
+ policy_name = DEFAULT_LB_POLICY
45
+
46
+ if policy_name not in LB_POLICIES:
47
+ raise ValueError(f'Unknown load balancing policy: {policy_name}')
48
+ return LB_POLICIES[policy_name]()
49
+
28
50
  def set_ready_replicas(self, ready_replicas: List[str]) -> None:
29
51
  raise NotImplementedError
30
52
 
@@ -44,7 +66,7 @@ class LoadBalancingPolicy:
44
66
  raise NotImplementedError
45
67
 
46
68
 
47
- class RoundRobinPolicy(LoadBalancingPolicy):
69
+ class RoundRobinPolicy(LoadBalancingPolicy, name='round_robin', default=True):
48
70
  """Round-robin load balancing policy."""
49
71
 
50
72
  def __init__(self) -> None:
sky/serve/service.py CHANGED
@@ -219,6 +219,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
219
219
  load_balancer_port = common_utils.find_free_port(
220
220
  constants.LOAD_BALANCER_PORT_START)
221
221
 
222
+ # Extract the load balancing policy from the service spec
223
+ policy_name = service_spec.load_balancing_policy
224
+
222
225
  # Start the load balancer.
223
226
  # TODO(tian): Probably we could enable multiple ports specified in
224
227
  # service spec and we could start multiple load balancers.
@@ -227,7 +230,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
227
230
  target=ux_utils.RedirectOutputForProcess(
228
231
  load_balancer.run_load_balancer,
229
232
  load_balancer_log_file).run,
230
- args=(controller_addr, load_balancer_port))
233
+ args=(controller_addr, load_balancer_port, policy_name))
231
234
  load_balancer_process.start()
232
235
  serve_state.set_service_load_balancer_port(service_name,
233
236
  load_balancer_port)
sky/serve/service_spec.py CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Dict, Optional
6
6
 
7
7
  import yaml
8
8
 
9
+ from sky import serve
9
10
  from sky.serve import constants
10
11
  from sky.utils import common_utils
11
12
  from sky.utils import schemas
@@ -29,6 +30,7 @@ class SkyServiceSpec:
29
30
  base_ondemand_fallback_replicas: Optional[int] = None,
30
31
  upscale_delay_seconds: Optional[int] = None,
31
32
  downscale_delay_seconds: Optional[int] = None,
33
+ load_balancing_policy: Optional[str] = None,
32
34
  ) -> None:
33
35
  if max_replicas is not None and max_replicas < min_replicas:
34
36
  with ux_utils.print_exception_no_traceback():
@@ -55,6 +57,13 @@ class SkyServiceSpec:
55
57
  raise ValueError('readiness_path must start with a slash (/). '
56
58
  f'Got: {readiness_path}')
57
59
 
60
+ # Add the check for unknown load balancing policies
61
+ if (load_balancing_policy is not None and
62
+ load_balancing_policy not in serve.LB_POLICIES):
63
+ with ux_utils.print_exception_no_traceback():
64
+ raise ValueError(
65
+ f'Unknown load balancing policy: {load_balancing_policy}. '
66
+ f'Available policies: {list(serve.LB_POLICIES.keys())}')
58
67
  self._readiness_path: str = readiness_path
59
68
  self._initial_delay_seconds: int = initial_delay_seconds
60
69
  self._readiness_timeout_seconds: int = readiness_timeout_seconds
@@ -69,6 +78,7 @@ class SkyServiceSpec:
69
78
  int] = base_ondemand_fallback_replicas
70
79
  self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
71
80
  self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
81
+ self._load_balancing_policy: Optional[str] = load_balancing_policy
72
82
 
73
83
  self._use_ondemand_fallback: bool = (
74
84
  self.dynamic_ondemand_fallback is not None and
@@ -150,6 +160,8 @@ class SkyServiceSpec:
150
160
  service_config['dynamic_ondemand_fallback'] = policy_section.get(
151
161
  'dynamic_ondemand_fallback', None)
152
162
 
163
+ service_config['load_balancing_policy'] = config.get(
164
+ 'load_balancing_policy', None)
153
165
  return SkyServiceSpec(**service_config)
154
166
 
155
167
  @staticmethod
@@ -205,6 +217,8 @@ class SkyServiceSpec:
205
217
  self.upscale_delay_seconds)
206
218
  add_if_not_none('replica_policy', 'downscale_delay_seconds',
207
219
  self.downscale_delay_seconds)
220
+ add_if_not_none('load_balancing_policy', None,
221
+ self._load_balancing_policy)
208
222
  return config
209
223
 
210
224
  def probe_str(self):
@@ -256,6 +270,7 @@ class SkyServiceSpec:
256
270
  Readiness probe timeout seconds: {self.readiness_timeout_seconds}
257
271
  Replica autoscaling policy: {self.autoscaling_policy_str()}
258
272
  Spot Policy: {self.spot_policy_str()}
273
+ Load Balancing Policy: {self.load_balancing_policy}
259
274
  """)
260
275
 
261
276
  @property
@@ -310,3 +325,7 @@ class SkyServiceSpec:
310
325
  @property
311
326
  def use_ondemand_fallback(self) -> bool:
312
327
  return self._use_ondemand_fallback
328
+
329
+ @property
330
+ def load_balancing_policy(self) -> Optional[str]:
331
+ return self._load_balancing_policy
@@ -324,6 +324,15 @@ available_node_types:
324
324
  command: ["/bin/bash", "-c", "--"]
325
325
  args:
326
326
  - |
327
+ # Helper function to conditionally use sudo
328
+ prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
329
+
330
+ # Run apt update in background and log to a file
331
+ (
332
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
333
+ echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
334
+ ) &
335
+
327
336
  function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
328
337
 
329
338
  # Tails file and checks every 5 sec for
@@ -419,7 +428,18 @@ setup_commands:
419
428
  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
420
429
  # Line 'mkdir -p ..': disable host key check
421
430
  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
422
- - sudo DEBIAN_FRONTEND=noninteractive apt install lsof gcc patch pciutils rsync fuse curl -y;
431
+ - |
432
+ PACKAGES="gcc patch pciutils rsync fuse curl";
433
+ MISSING_PACKAGES="";
434
+ for pkg in $PACKAGES; do
435
+ if ! dpkg -l | grep -q "^ii $pkg "; then
436
+ MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
437
+ fi
438
+ done;
439
+ if [ ! -z "$MISSING_PACKAGES" ]; then
440
+ echo "Installing missing packages: $MISSING_PACKAGES";
441
+ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y $MISSING_PACKAGES;
442
+ fi;
423
443
  mkdir -p ~/.ssh; touch ~/.ssh/config;
424
444
  {%- for initial_setup_command in initial_setup_commands %}
425
445
  {{ initial_setup_command }}
sky/utils/schemas.py CHANGED
@@ -308,6 +308,9 @@ def get_storage_schema():
308
308
 
309
309
  def get_service_schema():
310
310
  """Schema for top-level `service:` field (for SkyServe)."""
311
+ # To avoid circular imports, only import when needed.
312
+ # pylint: disable=import-outside-toplevel
313
+ from sky.serve import load_balancing_policies
311
314
  return {
312
315
  '$schema': 'https://json-schema.org/draft/2020-12/schema',
313
316
  'type': 'object',
@@ -382,6 +385,11 @@ def get_service_schema():
382
385
  'replicas': {
383
386
  'type': 'integer',
384
387
  },
388
+ 'load_balancing_policy': {
389
+ 'type': 'string',
390
+ 'case_insensitive_enum': list(
391
+ load_balancing_policies.LB_POLICIES.keys())
392
+ },
385
393
  }
386
394
  }
387
395
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241110
3
+ Version: 1.0.0.dev20241111
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,4 +1,4 @@
1
- sky/__init__.py,sha256=j3vy9X4XOYIefQk15d_c6Q_mpDjII9Nltso4xgrFI1o,5882
1
+ sky/__init__.py,sha256=JxZi3opPkeceUxnwl2tlNNr19fC_0QQ_mQ9N6cSQb-Q,5882
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
4
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
@@ -106,7 +106,7 @@ sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x
106
106
  sky/provision/__init__.py,sha256=llAtnAAzx0TKT17B0JL_2ZiKea9RRQRxSzkWHQYqWTo,6292
107
107
  sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
108
108
  sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
109
- sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
109
+ sky/provision/docker_utils.py,sha256=l4AMzwXGZd8RyNq8AwOaKV9bFSofLYfSyj2NBhkXYsY,19200
110
110
  sky/provision/instance_setup.py,sha256=gI739UMCqtPqdA522D92bPu5sA3OHBMDmIGmqqxsIwY,23652
111
111
  sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
112
112
  sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
@@ -137,7 +137,7 @@ sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHa
137
137
  sky/provision/gcp/mig_utils.py,sha256=oFpcFZoapHMILSE4iIm8V5bxP1RhbMHRF7cciqq8qAk,7883
138
138
  sky/provision/kubernetes/__init__.py,sha256=y6yVfii81WYG3ROxv4hiIj-ydinS5-xGxLvXnARVQoI,719
139
139
  sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2vmzldc,29072
140
- sky/provision/kubernetes/instance.py,sha256=rY43hZOInP20kYofW0MGs7wDbJ4NxMw1FtKAJAPGIOU,43960
140
+ sky/provision/kubernetes/instance.py,sha256=MFtTh-dNIuTZcHD20PQG_QuULFRFaPxwlUczR6sRnsk,43601
141
141
  sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
142
142
  sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
143
143
  sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
@@ -175,18 +175,18 @@ sky/provision/vsphere/common/service_manager_factory.py,sha256=YkvfHiRXFK_Nb406z
175
175
  sky/provision/vsphere/common/ssl_helper.py,sha256=TYzN9K0i_Mk_17PKGyGPgvOGfoizysuuIeYapcy_tWE,795
176
176
  sky/provision/vsphere/common/vapiconnect.py,sha256=R2I1ZWBA19d11fZ_FrIzQT8E1aLl1HU4Rdcj8Z5r3NE,2932
177
177
  sky/provision/vsphere/common/vim_utils.py,sha256=EMWLS8ILpdx6XwUZ9I53y0B_1yFrRrlr4jjIMT84hAc,17877
178
- sky/serve/__init__.py,sha256=gFZt7W3UPMi4qvYe2xgkHg1VxbR1WGavKyWLBUD3mpg,1731
178
+ sky/serve/__init__.py,sha256=Bqw8nB9u1QF3ryjbV797SPZq0DWAcjT94E_5B8J24ag,1808
179
179
  sky/serve/autoscalers.py,sha256=khY1oZ22PRaUQNsLCoNKH178X_NiJw0LSLOKr7_LNgY,30275
180
180
  sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
181
181
  sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
182
182
  sky/serve/core.py,sha256=hszs95BwtC4wIJujGNokvFC46VjojgRz1BbYOIIPh6k,31601
183
- sky/serve/load_balancer.py,sha256=aUfDsgUT_fYrchCwJCeunMPXmAkwJAY58BEu-IN2FaA,11571
184
- sky/serve/load_balancing_policies.py,sha256=ExdwH_pxPYpJ6CkoTQCOPSa4lzwbq1LFFMKzmIu8ryk,2331
183
+ sky/serve/load_balancer.py,sha256=I4W66eh1t1kA_C_VaMPI76WeDTCl3Z6rFxF6rQIWd6E,12636
184
+ sky/serve/load_balancing_policies.py,sha256=_k4tkwIvhulR02Ln9ixYB_b97KOypr2xfSjMx8_zky0,3143
185
185
  sky/serve/replica_managers.py,sha256=1xYDK9Te5wFEF5hUK0gyNIUib0MY-HScLHUBDlTSl-k,57774
186
186
  sky/serve/serve_state.py,sha256=Q7De4GoBEPxlN_t1Lpn-Y1fd94SeHZ3E-94f1OTuhpc,19086
187
187
  sky/serve/serve_utils.py,sha256=wqBxChpJylZ_qHWyFmMBJqrG8_7xTIOr9nlOeyHs9P8,39431
188
- sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
189
- sky/serve/service_spec.py,sha256=1aS6b-ku7W4CjyekXKDxjZsDdt-O8ygos-jFeXu31cA,13766
188
+ sky/serve/service.py,sha256=gVem2vX8XuR_1wTqwrzbszQAbjzjDP2ddd787aynT9g,12017
189
+ sky/serve/service_spec.py,sha256=34dMQ37INHltBzWaxHl3y_o9X3wLOCWA5jUhmhH1II4,14740
190
190
  sky/setup_files/MANIFEST.in,sha256=WF0T89NLichHxZDDSQzvSpiONtAEFyur2MPmGczgTIo,555
191
191
  sky/setup_files/setup.py,sha256=G767GNB-jXqyC8MR-IdiojnnI2E6tP4gMYenKU14ZGA,12156
192
192
  sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
@@ -228,7 +228,7 @@ sky/templates/jobs-controller.yaml.j2,sha256=Gu3ogFxFYr09VEXP-6zEbrCUOFo1aYxWEjA
228
228
  sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
229
229
  sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8wyVEbRNFHRsBuLsw,626
230
230
  sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
231
- sky/templates/kubernetes-ray.yml.j2,sha256=dsWlkX-0b1igeZI4c0u0Jzia5I_9gezCiewR6pX1LlY,18374
231
+ sky/templates/kubernetes-ray.yml.j2,sha256=Ek6nePe_IP1b0mqMLnbyjp7wpo1-kwranD_AFRXJ9tU,19152
232
232
  sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
233
233
  sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
234
234
  sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
@@ -257,7 +257,7 @@ sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM
257
257
  sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
258
258
  sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
259
259
  sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
260
- sky/utils/schemas.py,sha256=mogoStpQ77S936VfChinAW2I1DT4q2c5E7qY_qNiO0w,29094
260
+ sky/utils/schemas.py,sha256=67LK87wBywblIyF-QgG5hgL1BvBuHsxeQLQBO0M5OH4,29447
261
261
  sky/utils/subprocess_utils.py,sha256=mMFCTfxbyav5LJ1epJJXkgfFYmd828naTOMVfYjuEWY,6905
262
262
  sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
263
263
  sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
@@ -275,9 +275,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
275
275
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
276
276
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
277
277
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
278
- skypilot_nightly-1.0.0.dev20241110.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
279
- skypilot_nightly-1.0.0.dev20241110.dist-info/METADATA,sha256=4ar4pUczmGqsEHMG-85ANcAB_ifYgIDJRr0BJfypruA,19708
280
- skypilot_nightly-1.0.0.dev20241110.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
281
- skypilot_nightly-1.0.0.dev20241110.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
282
- skypilot_nightly-1.0.0.dev20241110.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
283
- skypilot_nightly-1.0.0.dev20241110.dist-info/RECORD,,
278
+ skypilot_nightly-1.0.0.dev20241111.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
279
+ skypilot_nightly-1.0.0.dev20241111.dist-info/METADATA,sha256=ILiS9hM4X6WG3syvXek7BxYF7SvXnZ9o8h5bmcFL2sI,19708
280
+ skypilot_nightly-1.0.0.dev20241111.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
281
+ skypilot_nightly-1.0.0.dev20241111.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
282
+ skypilot_nightly-1.0.0.dev20241111.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
283
+ skypilot_nightly-1.0.0.dev20241111.dist-info/RECORD,,