skypilot-nightly 1.0.0.dev20241110__py3-none-any.whl → 1.0.0.dev20241111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/provision/docker_utils.py +1 -1
- sky/provision/kubernetes/instance.py +104 -102
- sky/serve/__init__.py +2 -0
- sky/serve/load_balancer.py +34 -8
- sky/serve/load_balancing_policies.py +23 -1
- sky/serve/service.py +4 -1
- sky/serve/service_spec.py +19 -0
- sky/templates/kubernetes-ray.yml.j2 +21 -1
- sky/utils/schemas.py +8 -0
- {skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/RECORD +16 -16
- {skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '91323d86baaeb1341c6953e15bbf19f2896b67ad'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20241110'
|
38
|
+
__version__ = '1.0.0.dev20241111'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/provision/docker_utils.py
CHANGED
@@ -20,7 +20,7 @@ SETUP_ENV_VARS_CMD = (
|
|
20
20
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
|
21
21
|
'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
|
22
22
|
'~/container_env_var.sh && '
|
23
|
-
'$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
|
23
|
+
'$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;'
|
24
24
|
)
|
25
25
|
|
26
26
|
# Docker daemon may not be ready when the machine is firstly started. The error
|
@@ -333,52 +333,37 @@ def _run_function_with_retries(func: Callable,
|
|
333
333
|
raise
|
334
334
|
|
335
335
|
|
336
|
-
def
|
337
|
-
|
338
|
-
"""Setting environment variables in pods.
|
339
|
-
|
340
|
-
Once all containers are ready, we can exec into them and set env vars.
|
341
|
-
Kubernetes automatically populates containers with critical
|
342
|
-
environment variables, such as those for discovering services running
|
343
|
-
in the cluster and CUDA/nvidia environment variables. We need to
|
344
|
-
make sure these env vars are available in every task and ssh session.
|
345
|
-
This is needed for GPU support and service discovery.
|
346
|
-
See https://github.com/skypilot-org/skypilot/issues/2287 for
|
347
|
-
more details.
|
348
|
-
|
349
|
-
To do so, we capture env vars from the pod's runtime and write them to
|
350
|
-
/etc/profile.d/, making them available for all users in future
|
351
|
-
shell sessions.
|
352
|
-
"""
|
353
|
-
set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
|
336
|
+
def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
|
337
|
+
"""Pre-initialization step for SkyPilot pods.
|
354
338
|
|
355
|
-
|
356
|
-
|
357
|
-
logger.info(f'{"-"*20}Start: Set up env vars in pod {pod_name!r} '
|
358
|
-
f'{"-"*20}')
|
359
|
-
runner = command_runner.KubernetesCommandRunner(
|
360
|
-
((namespace, context), pod_name))
|
339
|
+
This step is run in the pod right after it is created and before the
|
340
|
+
SkyPilot runtime is setup.
|
361
341
|
|
362
|
-
|
363
|
-
rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
|
364
|
-
require_outputs=True,
|
365
|
-
stream_logs=False)
|
366
|
-
_raise_command_running_error('set env vars', set_k8s_env_var_cmd,
|
367
|
-
pod_name, rc, stdout)
|
342
|
+
This step includes three key steps:
|
368
343
|
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
344
|
+
1. Privilege check: Checks if the default user has sufficient privilege
|
345
|
+
to set up the kubernetes instance pod.
|
346
|
+
2. SSH setup: Sets up SSH for the pod instance.
|
347
|
+
3. Environment variable setup to populate k8s env vars in the pod.
|
373
348
|
|
374
|
-
|
375
|
-
|
349
|
+
Make sure commands used in these methods are generic and work
|
350
|
+
on most base images. E.g., do not use Python, since that may not
|
351
|
+
be installed by default.
|
376
352
|
|
353
|
+
If you run any apt commands, be sure to check if the lock is available.
|
354
|
+
It is possible the `apt update` run in the pod container args may still
|
355
|
+
be running.
|
356
|
+
|
357
|
+
Args:
|
358
|
+
namespace (str): Kubernetes namespace.
|
359
|
+
context (Optional[str]): Kubernetes context.
|
360
|
+
new_nodes (List): List of new pod instances.
|
361
|
+
|
362
|
+
Raises:
|
363
|
+
config_lib.KubernetesError: If user privileges are insufficient or
|
364
|
+
setup fails.
|
365
|
+
"""
|
377
366
|
|
378
|
-
def _check_user_privilege(namespace: str, context: Optional[str],
|
379
|
-
new_nodes: List) -> None:
|
380
|
-
# Checks if the default user has sufficient privilege to set up
|
381
|
-
# the kubernetes instance pod.
|
382
367
|
check_k8s_user_sudo_cmd = (
|
383
368
|
'if [ $(id -u) -eq 0 ]; then'
|
384
369
|
# If user is root, create an alias for sudo used in skypilot setup
|
@@ -386,56 +371,67 @@ def _check_user_privilege(namespace: str, context: Optional[str],
|
|
386
371
|
'else '
|
387
372
|
' if command -v sudo >/dev/null 2>&1; then '
|
388
373
|
' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
|
389
|
-
f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r};
|
374
|
+
f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
|
375
|
+
f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
|
390
376
|
' else '
|
391
|
-
f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r};
|
377
|
+
f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
|
378
|
+
f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
|
392
379
|
' fi; '
|
393
|
-
'fi')
|
380
|
+
'fi;')
|
381
|
+
|
382
|
+
# Kubernetes automatically populates containers with critical
|
383
|
+
# environment variables, such as those for discovering services running
|
384
|
+
# in the cluster and CUDA/nvidia environment variables. We need to
|
385
|
+
# make sure these env vars are available in every task and ssh session.
|
386
|
+
# This is needed for GPU support and service discovery.
|
387
|
+
# See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
|
388
|
+
# To do so, we capture env vars from the pod's runtime and write them to
|
389
|
+
# /etc/profile.d/, making them available for all users in future
|
390
|
+
# shell sessions.
|
391
|
+
set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
|
394
392
|
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
393
|
+
check_apt_update_complete_cmd = (
|
394
|
+
'echo "Checking if apt update from container init is complete..."; '
|
395
|
+
'timeout_secs=600; '
|
396
|
+
'start_time=$(date +%s); '
|
397
|
+
'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
|
398
|
+
' echo "apt update still running. Logs:"; '
|
399
|
+
' cat /tmp/apt-update.log; '
|
400
|
+
' current_time=$(date +%s); '
|
401
|
+
' elapsed=$((current_time - start_time)); '
|
402
|
+
' if [ $elapsed -ge $timeout_secs ]; then '
|
403
|
+
' echo "Timed out waiting for apt update"; '
|
404
|
+
' exit 1; '
|
405
|
+
' fi; '
|
406
|
+
' sleep 5; '
|
407
|
+
'done; '
|
408
|
+
'echo "apt update complete."; ')
|
399
409
|
|
400
|
-
|
401
|
-
((namespace, context), pod_name))
|
402
|
-
logger.info(f'{"-"*20}Start: Check user privilege in pod {pod_name!r} '
|
403
|
-
f'{"-"*20}')
|
404
|
-
|
405
|
-
def _run_privilege_check():
|
406
|
-
rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
|
407
|
-
require_outputs=True,
|
408
|
-
separate_stderr=True,
|
409
|
-
stream_logs=False)
|
410
|
-
_raise_command_running_error('check user privilege',
|
411
|
-
check_k8s_user_sudo_cmd, pod_name, rc,
|
412
|
-
stdout + stderr)
|
413
|
-
return stdout
|
414
|
-
|
415
|
-
stdout = _run_function_with_retries(
|
416
|
-
_run_privilege_check, f'check user privilege in pod {pod_name!r}')
|
417
|
-
|
418
|
-
if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
|
419
|
-
raise config_lib.KubernetesError(
|
420
|
-
'Insufficient system privileges detected. '
|
421
|
-
'Ensure the default user has root access or '
|
422
|
-
'"sudo" is installed and the user is added to the sudoers '
|
423
|
-
'from the image.')
|
424
|
-
logger.info(f'{"-"*20}End: Check user privilege in pod {pod_name!r} '
|
425
|
-
f'{"-"*20}')
|
426
|
-
|
427
|
-
|
428
|
-
def _setup_ssh_in_pods(namespace: str, context: Optional[str],
|
429
|
-
new_nodes: List) -> None:
|
430
|
-
# Setting up ssh for the pod instance. This is already setup for
|
431
|
-
# the jump pod so it does not need to be run for it.
|
432
|
-
set_k8s_ssh_cmd = (
|
433
|
-
'set -ex; '
|
410
|
+
install_ssh_k8s_cmd = (
|
434
411
|
'prefix_cmd() '
|
435
412
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
|
436
413
|
'export DEBIAN_FRONTEND=noninteractive;'
|
437
|
-
'
|
438
|
-
'
|
414
|
+
'echo "Installing missing packages..."; '
|
415
|
+
'for i in {1..5}; do '
|
416
|
+
' output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
|
417
|
+
' rc=$?; '
|
418
|
+
' if [ $rc -eq 0 ]; then '
|
419
|
+
' break; '
|
420
|
+
' fi; '
|
421
|
+
' echo "$output" | grep -qi "could not get lock" || '
|
422
|
+
' grep -qi "Unable to acquire the dpkg frontend lock"; '
|
423
|
+
' if [ $? -eq 0 ]; then '
|
424
|
+
' echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
|
425
|
+
' sleep 5; '
|
426
|
+
' else '
|
427
|
+
' echo "apt install failed for a non-lock reason: $output"; '
|
428
|
+
' exit $rc; '
|
429
|
+
' fi; '
|
430
|
+
'done; '
|
431
|
+
'if [ $rc -ne 0 ]; then '
|
432
|
+
' echo "apt install failed after 5 attempts due to lock errors."; '
|
433
|
+
' exit $rc; '
|
434
|
+
'fi; '
|
439
435
|
'$(prefix_cmd) mkdir -p /var/run/sshd; '
|
440
436
|
'$(prefix_cmd) '
|
441
437
|
'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
|
@@ -456,24 +452,35 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
|
|
456
452
|
# See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
|
457
453
|
'$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
|
458
454
|
|
459
|
-
|
455
|
+
pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
|
456
|
+
set_k8s_env_var_cmd + check_apt_update_complete_cmd +
|
457
|
+
install_ssh_k8s_cmd)
|
458
|
+
|
459
|
+
def _pre_init_thread(new_node):
|
460
460
|
pod_name = new_node.metadata.name
|
461
|
+
logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
|
461
462
|
runner = command_runner.KubernetesCommandRunner(
|
462
463
|
((namespace, context), pod_name))
|
463
|
-
logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
|
464
464
|
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
465
|
+
# Run the combined pre-init command
|
466
|
+
rc, stdout, _ = runner.run(pre_init_cmd,
|
467
|
+
require_outputs=True,
|
468
|
+
stream_logs=False)
|
469
|
+
if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
|
470
|
+
raise config_lib.KubernetesError(
|
471
|
+
'Insufficient system privileges detected. '
|
472
|
+
'Ensure the default user has root access or '
|
473
|
+
'"sudo" is installed and the user is added to the sudoers '
|
474
|
+
'from the image.')
|
475
|
+
|
476
|
+
op_name = 'pre-init'
|
477
|
+
_raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
|
478
|
+
stdout)
|
471
479
|
|
472
|
-
|
473
|
-
f'setup ssh in pod {pod_name!r}')
|
474
|
-
logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
|
480
|
+
logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
|
475
481
|
|
476
|
-
|
482
|
+
# Run pre_init in parallel across all new_nodes
|
483
|
+
subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, NUM_THREADS)
|
477
484
|
|
478
485
|
|
479
486
|
def _label_pod(namespace: str, context: Optional[str], pod_name: str,
|
@@ -724,13 +731,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
724
731
|
f'pods: {list(uninitialized_pods.keys())}')
|
725
732
|
uninitialized_pods_list = list(uninitialized_pods.values())
|
726
733
|
|
727
|
-
#
|
728
|
-
|
729
|
-
# on most base images. E.g., do not use Python, since that may not
|
730
|
-
# be installed by default.
|
731
|
-
_check_user_privilege(namespace, context, uninitialized_pods_list)
|
732
|
-
_setup_ssh_in_pods(namespace, context, uninitialized_pods_list)
|
733
|
-
_set_env_vars_in_pods(namespace, context, uninitialized_pods_list)
|
734
|
+
# Run pre-init steps in the pod.
|
735
|
+
pre_init(namespace, context, uninitialized_pods_list)
|
734
736
|
|
735
737
|
for pod in uninitialized_pods.values():
|
736
738
|
_label_pod(namespace,
|
sky/serve/__init__.py
CHANGED
@@ -11,6 +11,7 @@ from sky.serve.core import tail_logs
|
|
11
11
|
from sky.serve.core import terminate_replica
|
12
12
|
from sky.serve.core import up
|
13
13
|
from sky.serve.core import update
|
14
|
+
from sky.serve.load_balancing_policies import LB_POLICIES
|
14
15
|
from sky.serve.serve_state import ReplicaStatus
|
15
16
|
from sky.serve.serve_state import ServiceStatus
|
16
17
|
from sky.serve.serve_utils import DEFAULT_UPDATE_MODE
|
@@ -35,6 +36,7 @@ __all__ = [
|
|
35
36
|
'get_endpoint',
|
36
37
|
'INITIAL_VERSION',
|
37
38
|
'LB_CONTROLLER_SYNC_INTERVAL_SECONDS',
|
39
|
+
'LB_POLICIES',
|
38
40
|
'ReplicaStatus',
|
39
41
|
'ServiceComponent',
|
40
42
|
'ServiceStatus',
|
sky/serve/load_balancer.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
import asyncio
|
3
3
|
import logging
|
4
4
|
import threading
|
5
|
-
from typing import Dict, Union
|
5
|
+
from typing import Dict, Optional, Union
|
6
6
|
|
7
7
|
import aiohttp
|
8
8
|
import fastapi
|
@@ -27,18 +27,24 @@ class SkyServeLoadBalancer:
|
|
27
27
|
policy.
|
28
28
|
"""
|
29
29
|
|
30
|
-
def __init__(self,
|
30
|
+
def __init__(self,
|
31
|
+
controller_url: str,
|
32
|
+
load_balancer_port: int,
|
33
|
+
load_balancing_policy_name: Optional[str] = None) -> None:
|
31
34
|
"""Initialize the load balancer.
|
32
35
|
|
33
36
|
Args:
|
34
37
|
controller_url: The URL of the controller.
|
35
38
|
load_balancer_port: The port where the load balancer listens to.
|
39
|
+
load_balancing_policy_name: The name of the load balancing policy
|
40
|
+
to use. Defaults to None.
|
36
41
|
"""
|
37
42
|
self._app = fastapi.FastAPI()
|
38
43
|
self._controller_url: str = controller_url
|
39
44
|
self._load_balancer_port: int = load_balancer_port
|
40
|
-
|
41
|
-
|
45
|
+
# Use the registry to create the load balancing policy
|
46
|
+
self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make(
|
47
|
+
load_balancing_policy_name)
|
42
48
|
self._request_aggregator: serve_utils.RequestsAggregator = (
|
43
49
|
serve_utils.RequestTimestamp())
|
44
50
|
# TODO(tian): httpx.Client has a resource limit of 100 max connections
|
@@ -223,9 +229,21 @@ class SkyServeLoadBalancer:
|
|
223
229
|
uvicorn.run(self._app, host='0.0.0.0', port=self._load_balancer_port)
|
224
230
|
|
225
231
|
|
226
|
-
def run_load_balancer(controller_addr: str,
|
227
|
-
|
228
|
-
|
232
|
+
def run_load_balancer(controller_addr: str,
|
233
|
+
load_balancer_port: int,
|
234
|
+
load_balancing_policy_name: Optional[str] = None) -> None:
|
235
|
+
""" Run the load balancer.
|
236
|
+
|
237
|
+
Args:
|
238
|
+
controller_addr: The address of the controller.
|
239
|
+
load_balancer_port: The port where the load balancer listens to.
|
240
|
+
policy_name: The name of the load balancing policy to use. Defaults to
|
241
|
+
None.
|
242
|
+
"""
|
243
|
+
load_balancer = SkyServeLoadBalancer(
|
244
|
+
controller_url=controller_addr,
|
245
|
+
load_balancer_port=load_balancer_port,
|
246
|
+
load_balancing_policy_name=load_balancing_policy_name)
|
229
247
|
load_balancer.run()
|
230
248
|
|
231
249
|
|
@@ -241,5 +259,13 @@ if __name__ == '__main__':
|
|
241
259
|
required=True,
|
242
260
|
default=8890,
|
243
261
|
help='The port where the load balancer listens to.')
|
262
|
+
available_policies = list(lb_policies.LB_POLICIES.keys())
|
263
|
+
parser.add_argument(
|
264
|
+
'--load-balancing-policy',
|
265
|
+
choices=available_policies,
|
266
|
+
default='round_robin',
|
267
|
+
help=f'The load balancing policy to use. Available policies: '
|
268
|
+
f'{", ".join(available_policies)}.')
|
244
269
|
args = parser.parse_args()
|
245
|
-
run_load_balancer(args.controller_addr, args.load_balancer_port)
|
270
|
+
run_load_balancer(args.controller_addr, args.load_balancer_port,
|
271
|
+
args.load_balancing_policy)
|
sky/serve/load_balancing_policies.py
CHANGED
@@ -10,6 +10,10 @@ if typing.TYPE_CHECKING:
|
|
10
10
|
|
11
11
|
logger = sky_logging.init_logger(__name__)
|
12
12
|
|
13
|
+
# Define a registry for load balancing policies
|
14
|
+
LB_POLICIES = {}
|
15
|
+
DEFAULT_LB_POLICY = None
|
16
|
+
|
13
17
|
|
14
18
|
def _request_repr(request: 'fastapi.Request') -> str:
|
15
19
|
return ('<Request '
|
@@ -25,6 +29,24 @@ class LoadBalancingPolicy:
|
|
25
29
|
def __init__(self) -> None:
|
26
30
|
self.ready_replicas: List[str] = []
|
27
31
|
|
32
|
+
def __init_subclass__(cls, name: str, default: bool = False):
|
33
|
+
LB_POLICIES[name] = cls
|
34
|
+
if default:
|
35
|
+
global DEFAULT_LB_POLICY
|
36
|
+
assert DEFAULT_LB_POLICY is None, (
|
37
|
+
'Only one policy can be default.')
|
38
|
+
DEFAULT_LB_POLICY = name
|
39
|
+
|
40
|
+
@classmethod
|
41
|
+
def make(cls, policy_name: Optional[str] = None) -> 'LoadBalancingPolicy':
|
42
|
+
"""Create a load balancing policy from a name."""
|
43
|
+
if policy_name is None:
|
44
|
+
policy_name = DEFAULT_LB_POLICY
|
45
|
+
|
46
|
+
if policy_name not in LB_POLICIES:
|
47
|
+
raise ValueError(f'Unknown load balancing policy: {policy_name}')
|
48
|
+
return LB_POLICIES[policy_name]()
|
49
|
+
|
28
50
|
def set_ready_replicas(self, ready_replicas: List[str]) -> None:
|
29
51
|
raise NotImplementedError
|
30
52
|
|
@@ -44,7 +66,7 @@ class LoadBalancingPolicy:
|
|
44
66
|
raise NotImplementedError
|
45
67
|
|
46
68
|
|
47
|
-
class RoundRobinPolicy(LoadBalancingPolicy):
|
69
|
+
class RoundRobinPolicy(LoadBalancingPolicy, name='round_robin', default=True):
|
48
70
|
"""Round-robin load balancing policy."""
|
49
71
|
|
50
72
|
def __init__(self) -> None:
|
sky/serve/service.py
CHANGED
@@ -219,6 +219,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
219
219
|
load_balancer_port = common_utils.find_free_port(
|
220
220
|
constants.LOAD_BALANCER_PORT_START)
|
221
221
|
|
222
|
+
# Extract the load balancing policy from the service spec
|
223
|
+
policy_name = service_spec.load_balancing_policy
|
224
|
+
|
222
225
|
# Start the load balancer.
|
223
226
|
# TODO(tian): Probably we could enable multiple ports specified in
|
224
227
|
# service spec and we could start multiple load balancers.
|
@@ -227,7 +230,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
227
230
|
target=ux_utils.RedirectOutputForProcess(
|
228
231
|
load_balancer.run_load_balancer,
|
229
232
|
load_balancer_log_file).run,
|
230
|
-
args=(controller_addr, load_balancer_port))
|
233
|
+
args=(controller_addr, load_balancer_port, policy_name))
|
231
234
|
load_balancer_process.start()
|
232
235
|
serve_state.set_service_load_balancer_port(service_name,
|
233
236
|
load_balancer_port)
|
sky/serve/service_spec.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Dict, Optional
|
|
6
6
|
|
7
7
|
import yaml
|
8
8
|
|
9
|
+
from sky import serve
|
9
10
|
from sky.serve import constants
|
10
11
|
from sky.utils import common_utils
|
11
12
|
from sky.utils import schemas
|
@@ -29,6 +30,7 @@ class SkyServiceSpec:
|
|
29
30
|
base_ondemand_fallback_replicas: Optional[int] = None,
|
30
31
|
upscale_delay_seconds: Optional[int] = None,
|
31
32
|
downscale_delay_seconds: Optional[int] = None,
|
33
|
+
load_balancing_policy: Optional[str] = None,
|
32
34
|
) -> None:
|
33
35
|
if max_replicas is not None and max_replicas < min_replicas:
|
34
36
|
with ux_utils.print_exception_no_traceback():
|
@@ -55,6 +57,13 @@ class SkyServiceSpec:
|
|
55
57
|
raise ValueError('readiness_path must start with a slash (/). '
|
56
58
|
f'Got: {readiness_path}')
|
57
59
|
|
60
|
+
# Add the check for unknown load balancing policies
|
61
|
+
if (load_balancing_policy is not None and
|
62
|
+
load_balancing_policy not in serve.LB_POLICIES):
|
63
|
+
with ux_utils.print_exception_no_traceback():
|
64
|
+
raise ValueError(
|
65
|
+
f'Unknown load balancing policy: {load_balancing_policy}. '
|
66
|
+
f'Available policies: {list(serve.LB_POLICIES.keys())}')
|
58
67
|
self._readiness_path: str = readiness_path
|
59
68
|
self._initial_delay_seconds: int = initial_delay_seconds
|
60
69
|
self._readiness_timeout_seconds: int = readiness_timeout_seconds
|
@@ -69,6 +78,7 @@ class SkyServiceSpec:
|
|
69
78
|
int] = base_ondemand_fallback_replicas
|
70
79
|
self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
|
71
80
|
self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
|
81
|
+
self._load_balancing_policy: Optional[str] = load_balancing_policy
|
72
82
|
|
73
83
|
self._use_ondemand_fallback: bool = (
|
74
84
|
self.dynamic_ondemand_fallback is not None and
|
@@ -150,6 +160,8 @@ class SkyServiceSpec:
|
|
150
160
|
service_config['dynamic_ondemand_fallback'] = policy_section.get(
|
151
161
|
'dynamic_ondemand_fallback', None)
|
152
162
|
|
163
|
+
service_config['load_balancing_policy'] = config.get(
|
164
|
+
'load_balancing_policy', None)
|
153
165
|
return SkyServiceSpec(**service_config)
|
154
166
|
|
155
167
|
@staticmethod
|
@@ -205,6 +217,8 @@ class SkyServiceSpec:
|
|
205
217
|
self.upscale_delay_seconds)
|
206
218
|
add_if_not_none('replica_policy', 'downscale_delay_seconds',
|
207
219
|
self.downscale_delay_seconds)
|
220
|
+
add_if_not_none('load_balancing_policy', None,
|
221
|
+
self._load_balancing_policy)
|
208
222
|
return config
|
209
223
|
|
210
224
|
def probe_str(self):
|
@@ -256,6 +270,7 @@ class SkyServiceSpec:
|
|
256
270
|
Readiness probe timeout seconds: {self.readiness_timeout_seconds}
|
257
271
|
Replica autoscaling policy: {self.autoscaling_policy_str()}
|
258
272
|
Spot Policy: {self.spot_policy_str()}
|
273
|
+
Load Balancing Policy: {self.load_balancing_policy}
|
259
274
|
""")
|
260
275
|
|
261
276
|
@property
|
@@ -310,3 +325,7 @@ class SkyServiceSpec:
|
|
310
325
|
@property
|
311
326
|
def use_ondemand_fallback(self) -> bool:
|
312
327
|
return self._use_ondemand_fallback
|
328
|
+
|
329
|
+
@property
|
330
|
+
def load_balancing_policy(self) -> Optional[str]:
|
331
|
+
return self._load_balancing_policy
|
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -324,6 +324,15 @@ available_node_types:
|
|
324
324
|
command: ["/bin/bash", "-c", "--"]
|
325
325
|
args:
|
326
326
|
- |
|
327
|
+
# Helper function to conditionally use sudo
|
328
|
+
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
|
329
|
+
|
330
|
+
# Run apt update in background and log to a file
|
331
|
+
(
|
332
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
|
333
|
+
echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
|
334
|
+
) &
|
335
|
+
|
327
336
|
function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
|
328
337
|
|
329
338
|
# Tails file and checks every 5 sec for
|
@@ -419,7 +428,18 @@ setup_commands:
|
|
419
428
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
|
420
429
|
# Line 'mkdir -p ..': disable host key check
|
421
430
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
422
|
-
-
|
431
|
+
- |
|
432
|
+
PACKAGES="gcc patch pciutils rsync fuse curl";
|
433
|
+
MISSING_PACKAGES="";
|
434
|
+
for pkg in $PACKAGES; do
|
435
|
+
if ! dpkg -l | grep -q "^ii $pkg "; then
|
436
|
+
MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
|
437
|
+
fi
|
438
|
+
done;
|
439
|
+
if [ ! -z "$MISSING_PACKAGES" ]; then
|
440
|
+
echo "Installing missing packages: $MISSING_PACKAGES";
|
441
|
+
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y $MISSING_PACKAGES;
|
442
|
+
fi;
|
423
443
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
424
444
|
{%- for initial_setup_command in initial_setup_commands %}
|
425
445
|
{{ initial_setup_command }}
|
sky/utils/schemas.py
CHANGED
@@ -308,6 +308,9 @@ def get_storage_schema():
|
|
308
308
|
|
309
309
|
def get_service_schema():
|
310
310
|
"""Schema for top-level `service:` field (for SkyServe)."""
|
311
|
+
# To avoid circular imports, only import when needed.
|
312
|
+
# pylint: disable=import-outside-toplevel
|
313
|
+
from sky.serve import load_balancing_policies
|
311
314
|
return {
|
312
315
|
'$schema': 'https://json-schema.org/draft/2020-12/schema',
|
313
316
|
'type': 'object',
|
@@ -382,6 +385,11 @@ def get_service_schema():
|
|
382
385
|
'replicas': {
|
383
386
|
'type': 'integer',
|
384
387
|
},
|
388
|
+
'load_balancing_policy': {
|
389
|
+
'type': 'string',
|
390
|
+
'case_insensitive_enum': list(
|
391
|
+
load_balancing_policies.LB_POLICIES.keys())
|
392
|
+
},
|
385
393
|
}
|
386
394
|
}
|
387
395
|
|
{skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=JxZi3opPkeceUxnwl2tlNNr19fC_0QQ_mQ9N6cSQb-Q,5882
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
|
4
4
|
sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
|
@@ -106,7 +106,7 @@ sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x
|
|
106
106
|
sky/provision/__init__.py,sha256=llAtnAAzx0TKT17B0JL_2ZiKea9RRQRxSzkWHQYqWTo,6292
|
107
107
|
sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
|
108
108
|
sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
|
109
|
-
sky/provision/docker_utils.py,sha256=
|
109
|
+
sky/provision/docker_utils.py,sha256=l4AMzwXGZd8RyNq8AwOaKV9bFSofLYfSyj2NBhkXYsY,19200
|
110
110
|
sky/provision/instance_setup.py,sha256=gI739UMCqtPqdA522D92bPu5sA3OHBMDmIGmqqxsIwY,23652
|
111
111
|
sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
|
112
112
|
sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
|
@@ -137,7 +137,7 @@ sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHa
|
|
137
137
|
sky/provision/gcp/mig_utils.py,sha256=oFpcFZoapHMILSE4iIm8V5bxP1RhbMHRF7cciqq8qAk,7883
|
138
138
|
sky/provision/kubernetes/__init__.py,sha256=y6yVfii81WYG3ROxv4hiIj-ydinS5-xGxLvXnARVQoI,719
|
139
139
|
sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2vmzldc,29072
|
140
|
-
sky/provision/kubernetes/instance.py,sha256=
|
140
|
+
sky/provision/kubernetes/instance.py,sha256=MFtTh-dNIuTZcHD20PQG_QuULFRFaPxwlUczR6sRnsk,43601
|
141
141
|
sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
|
142
142
|
sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
|
143
143
|
sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
|
@@ -175,18 +175,18 @@ sky/provision/vsphere/common/service_manager_factory.py,sha256=YkvfHiRXFK_Nb406z
|
|
175
175
|
sky/provision/vsphere/common/ssl_helper.py,sha256=TYzN9K0i_Mk_17PKGyGPgvOGfoizysuuIeYapcy_tWE,795
|
176
176
|
sky/provision/vsphere/common/vapiconnect.py,sha256=R2I1ZWBA19d11fZ_FrIzQT8E1aLl1HU4Rdcj8Z5r3NE,2932
|
177
177
|
sky/provision/vsphere/common/vim_utils.py,sha256=EMWLS8ILpdx6XwUZ9I53y0B_1yFrRrlr4jjIMT84hAc,17877
|
178
|
-
sky/serve/__init__.py,sha256=
|
178
|
+
sky/serve/__init__.py,sha256=Bqw8nB9u1QF3ryjbV797SPZq0DWAcjT94E_5B8J24ag,1808
|
179
179
|
sky/serve/autoscalers.py,sha256=khY1oZ22PRaUQNsLCoNKH178X_NiJw0LSLOKr7_LNgY,30275
|
180
180
|
sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
|
181
181
|
sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
|
182
182
|
sky/serve/core.py,sha256=hszs95BwtC4wIJujGNokvFC46VjojgRz1BbYOIIPh6k,31601
|
183
|
-
sky/serve/load_balancer.py,sha256=
|
184
|
-
sky/serve/load_balancing_policies.py,sha256=
|
183
|
+
sky/serve/load_balancer.py,sha256=I4W66eh1t1kA_C_VaMPI76WeDTCl3Z6rFxF6rQIWd6E,12636
|
184
|
+
sky/serve/load_balancing_policies.py,sha256=_k4tkwIvhulR02Ln9ixYB_b97KOypr2xfSjMx8_zky0,3143
|
185
185
|
sky/serve/replica_managers.py,sha256=1xYDK9Te5wFEF5hUK0gyNIUib0MY-HScLHUBDlTSl-k,57774
|
186
186
|
sky/serve/serve_state.py,sha256=Q7De4GoBEPxlN_t1Lpn-Y1fd94SeHZ3E-94f1OTuhpc,19086
|
187
187
|
sky/serve/serve_utils.py,sha256=wqBxChpJylZ_qHWyFmMBJqrG8_7xTIOr9nlOeyHs9P8,39431
|
188
|
-
sky/serve/service.py,sha256=
|
189
|
-
sky/serve/service_spec.py,sha256=
|
188
|
+
sky/serve/service.py,sha256=gVem2vX8XuR_1wTqwrzbszQAbjzjDP2ddd787aynT9g,12017
|
189
|
+
sky/serve/service_spec.py,sha256=34dMQ37INHltBzWaxHl3y_o9X3wLOCWA5jUhmhH1II4,14740
|
190
190
|
sky/setup_files/MANIFEST.in,sha256=WF0T89NLichHxZDDSQzvSpiONtAEFyur2MPmGczgTIo,555
|
191
191
|
sky/setup_files/setup.py,sha256=G767GNB-jXqyC8MR-IdiojnnI2E6tP4gMYenKU14ZGA,12156
|
192
192
|
sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
|
@@ -228,7 +228,7 @@ sky/templates/jobs-controller.yaml.j2,sha256=Gu3ogFxFYr09VEXP-6zEbrCUOFo1aYxWEjA
|
|
228
228
|
sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
|
229
229
|
sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8wyVEbRNFHRsBuLsw,626
|
230
230
|
sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
|
231
|
-
sky/templates/kubernetes-ray.yml.j2,sha256=
|
231
|
+
sky/templates/kubernetes-ray.yml.j2,sha256=Ek6nePe_IP1b0mqMLnbyjp7wpo1-kwranD_AFRXJ9tU,19152
|
232
232
|
sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
|
233
233
|
sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
|
234
234
|
sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
|
@@ -257,7 +257,7 @@ sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM
|
|
257
257
|
sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
|
258
258
|
sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
|
259
259
|
sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
|
260
|
-
sky/utils/schemas.py,sha256=
|
260
|
+
sky/utils/schemas.py,sha256=67LK87wBywblIyF-QgG5hgL1BvBuHsxeQLQBO0M5OH4,29447
|
261
261
|
sky/utils/subprocess_utils.py,sha256=mMFCTfxbyav5LJ1epJJXkgfFYmd828naTOMVfYjuEWY,6905
|
262
262
|
sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
|
263
263
|
sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
|
@@ -275,9 +275,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
|
|
275
275
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
276
276
|
sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
|
277
277
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.
|
280
|
-
skypilot_nightly-1.0.0.
|
281
|
-
skypilot_nightly-1.0.0.
|
282
|
-
skypilot_nightly-1.0.0.
|
283
|
-
skypilot_nightly-1.0.0.
|
278
|
+
skypilot_nightly-1.0.0.dev20241111.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
279
|
+
skypilot_nightly-1.0.0.dev20241111.dist-info/METADATA,sha256=ILiS9hM4X6WG3syvXek7BxYF7SvXnZ9o8h5bmcFL2sI,19708
|
280
|
+
skypilot_nightly-1.0.0.dev20241111.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
281
|
+
skypilot_nightly-1.0.0.dev20241111.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
282
|
+
skypilot_nightly-1.0.0.dev20241111.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
283
|
+
skypilot_nightly-1.0.0.dev20241111.dist-info/RECORD,,
|
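{skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/LICENSE
RENAMED
File without changes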
|
{skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/WHEEL
RENAMED
File without changes
|
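{skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/entry_points.txt
RENAMED
File without changes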
|
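{skypilot_nightly-1.0.0.dev20241110.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/top_level.txt
RENAMED
File without changes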
|