skypilot-nightly 1.0.0.dev20250428__py3-none-any.whl → 1.0.0.dev20250430__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +28 -40
- sky/backends/backend_utils.py +2 -0
- sky/cli.py +90 -37
- sky/client/cli.py +90 -37
- sky/client/sdk.py +3 -2
- sky/clouds/cloud.py +5 -2
- sky/clouds/kubernetes.py +4 -4
- sky/clouds/nebius.py +16 -10
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
- sky/core.py +58 -29
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/optimizer.py +35 -11
- sky/provision/docker_utils.py +22 -16
- sky/provision/kubernetes/utils.py +26 -24
- sky/resources.py +1 -1
- sky/server/common.py +6 -3
- sky/server/config.py +184 -0
- sky/server/requests/executor.py +17 -156
- sky/server/server.py +4 -4
- sky/setup_files/dependencies.py +0 -1
- sky/setup_files/setup.py +1 -1
- sky/skylet/constants.py +18 -0
- sky/skypilot_config.py +32 -11
- sky/templates/aws-ray.yml.j2 +2 -1
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +1 -1
- sky/templates/ibm-ray.yml.j2 +3 -3
- sky/templates/kubernetes-ray.yml.j2 +26 -14
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +64 -0
- sky/templates/oci-ray.yml.j2 +1 -1
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/vast-ray.yml.j2 +1 -1
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/utils/aws/__init__.py +0 -0
- sky/utils/aws/get_default_security_group.py +11 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/METADATA +3 -3
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/RECORD +58 -55
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/top_level.txt +0 -0
sky/provision/docker_utils.py
CHANGED
@@ -28,6 +28,9 @@ SETUP_ENV_VARS_CMD = (
 # the command.
 DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
                                 'the Docker daemon socket')
+
+DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
+
 _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30


@@ -173,22 +176,25 @@ class DockerInitializer:
                 stream_logs=False,
                 separate_stderr=separate_stderr,
                 log_path=self.log_path)
-            if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr
+            if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
+                    DOCKER_SOCKET_NOT_READY_STR in stdout + stderr):
+                if wait_for_docker_daemon:
+                    if time.time(
+                    ) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
+                        if rc == 0:
+                            # Set returncode to 1 if failed to connect to docker
+                            # daemon after timeout.
+                            rc = 1
+                        break
+                # Close the cached connection to make the permission update
+                # of ssh user take effect, e.g. usermod -aG docker $USER,
+                # called by cloud-init of Azure.
+                self.runner.close_cached_connection()
+                logger.info(
+                    'Failed to connect to docker daemon. It might be '
+                    'initializing, retrying in 5 seconds...')
+                time.sleep(5)
+                continue
             break
         subprocess_utils.handle_returncode(
             rc,
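The added branch above retries the Docker command while the daemon is still starting ("Is the docker daemon running?") instead of failing immediately, and gives up after a timeout. A minimal standalone sketch of the same wait-with-timeout pattern; `run_docker_cmd` and the constant names here are hypothetical stand-ins for the real command runner and module constants:

import time

DAEMON_NOT_READY = 'Is the docker daemon running?'  # assumed marker string
WAIT_TIMEOUT_SECONDS = 30
RETRY_INTERVAL_SECONDS = 5


def wait_for_daemon(run_docker_cmd, wait: bool = True) -> int:
    """Retry a docker command until the daemon answers or a timeout passes.

    `run_docker_cmd` is a hypothetical callable returning (rc, stdout, stderr).
    """
    start = time.time()
    while True:
        rc, stdout, stderr = run_docker_cmd()
        if wait and DAEMON_NOT_READY in stdout + stderr:
            if time.time() - start > WAIT_TIMEOUT_SECONDS:
                # Report failure if the daemon never became ready.
                return rc if rc != 0 else 1
            time.sleep(RETRY_INTERVAL_SECONDS)
            continue
        return rc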
sky/provision/kubernetes/utils.py
CHANGED
@@ -243,7 +243,7 @@ class GPULabelFormatter:
         raise NotImplementedError

     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
+    def get_label_values(cls, accelerator: str) -> List[str]:
         """Given a GPU type, returns the label value to be used"""
         raise NotImplementedError

@@ -311,10 +311,10 @@ class SkyPilotLabelFormatter(GPULabelFormatter):
         return [cls.LABEL_KEY]

     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
+    def get_label_values(cls, accelerator: str) -> List[str]:
         # For SkyPilot formatter, we use the accelerator str directly.
         # See sky.utils.kubernetes.gpu_labeler.
-        return accelerator.lower()
+        return [accelerator.lower()]

     @classmethod
     def match_label_key(cls, label_key: str) -> bool:

@@ -351,8 +351,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
         return [cls.LABEL_KEY]

     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
-        return accelerator.upper()
+    def get_label_values(cls, accelerator: str) -> List[str]:
+        return [accelerator.upper()]

     @classmethod
     def match_label_key(cls, label_key: str) -> bool:

@@ -438,8 +438,8 @@ class GKELabelFormatter(GPULabelFormatter):
         return count_to_topology

     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
-        return get_gke_accelerator_name(accelerator)
+    def get_label_values(cls, accelerator: str) -> List[str]:
+        return [get_gke_accelerator_name(accelerator)]

     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:

@@ -472,7 +472,7 @@ class GFDLabelFormatter(GPULabelFormatter):
     https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html

     This LabelFormatter can't be used in autoscaling clusters since accelerators
-    may map to multiple label, so we're not implementing `get_label_value`
+    may map to multiple label, so we're not implementing `get_label_values`
     """

     LABEL_KEY = 'nvidia.com/gpu.product'

@@ -486,10 +486,10 @@ class GFDLabelFormatter(GPULabelFormatter):
         return [cls.LABEL_KEY]

     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
+    def get_label_values(cls, accelerator: str) -> List[str]:
+        # An accelerator can map to many Nvidia GFD labels
+        # (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB).
+        # TODO implement get_label_values for GFDLabelFormatter
         raise NotImplementedError

     @classmethod

@@ -1032,15 +1032,17 @@ def check_instance_fits(context: Optional[str],
     # met.
     assert acc_count is not None, (acc_type, acc_count)
     try:
-        gpu_label_key,
+        gpu_label_key, gpu_label_values, _, _ = (
+            get_accelerator_label_key_values(context, acc_type, acc_count))
+        if gpu_label_values is None:
+            gpu_label_values = []
     except exceptions.ResourcesUnavailableError as e:
         # If GPU not found, return empty list and error message.
         return False, str(e)
     # Get the set of nodes that have the GPU type
     gpu_nodes = [
         node for node in nodes if gpu_label_key in node.metadata.labels and
-        node.metadata.labels[gpu_label_key]
+        node.metadata.labels[gpu_label_key] in gpu_label_values
     ]
     if not gpu_nodes:
         return False, f'No GPU nodes found with {acc_type} on the cluster'

@@ -1082,12 +1084,12 @@ def check_instance_fits(context: Optional[str],
     return fits, reason


-def get_accelerator_label_key_value(
+def get_accelerator_label_key_values(
         context: Optional[str],
         acc_type: str,
         acc_count: int,
         check_mode=False
-) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
+) -> Tuple[Optional[str], Optional[List[str]], Optional[str], Optional[str]]:
     """Returns the label key and value for the given GPU/TPU type.

     Args:

@@ -1141,7 +1143,7 @@ def get_accelerator_label_key_value(
             tpu_topology_label_key = formatter.get_tpu_topology_label_key()
             tpu_topology_label_value = formatter.get_tpu_topology_label_value(
                 acc_type, acc_count)
-            return formatter.get_label_key(acc_type), formatter.get_label_value(
+            return formatter.get_label_key(acc_type), formatter.get_label_values(
                 acc_type), tpu_topology_label_key, tpu_topology_label_value

     has_gpus, cluster_resources = detect_accelerator_resource(context)

@@ -1220,12 +1222,12 @@ def get_accelerator_label_key_value(
                         # different topologies that maps to identical
                         # number of TPU chips.
                         if tpu_topology_chip_count == acc_count:
-                            return (label, value, topology_label_key,
+                            return (label, [value], topology_label_key,
                                     topology_value)
                         else:
                             continue
                     else:
-                        return label, value, None, None
+                        return label, [value], None, None

     # If no node is found with the requested acc_type, raise error
     with ux_utils.print_exception_no_traceback():

@@ -1387,10 +1389,10 @@ def check_credentials(context: Optional[str],
         # `get_unlabeled_accelerator_nodes`.
         # Therefore, if `get_unlabeled_accelerator_nodes` detects unlabelled
         # nodes, we skip this check.
+        get_accelerator_label_key_values(context,
+                                         acc_type='',
+                                         acc_count=0,
+                                         check_mode=True)
     except exceptions.ResourcesUnavailableError as e:
         # If GPUs are not available, we return cluster as enabled
         # (since it can be a CPU-only cluster) but we also return the
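The formatter API above changes from a single label value (`get_label_value`) to a list (`get_label_values`), so node selection now matches a node whose GPU label carries any of the candidate values. A rough illustrative sketch of that list-based filtering; the node dictionaries below are hypothetical stand-ins for Kubernetes node objects (the real code reads `node.metadata.labels`):

from typing import Dict, List


def filter_gpu_nodes(nodes: List[Dict[str, str]], gpu_label_key: str,
                     gpu_label_values: List[str]) -> List[Dict[str, str]]:
    """Keep nodes whose GPU label matches any accepted value."""
    return [
        node for node in nodes
        if gpu_label_key in node and node[gpu_label_key] in gpu_label_values
    ]


# Example: a GFD-style label can map one accelerator to several product
# strings, which is why a list of values is needed.
nodes = [{'nvidia.com/gpu.product': 'A100-SXM4-80GB'},
         {'nvidia.com/gpu.product': 'H100-80GB-HBM3'}]
print(filter_gpu_nodes(nodes, 'nvidia.com/gpu.product',
                       ['A100-80GB-PCIE', 'A100-SXM4-80GB']))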
sky/resources.py
CHANGED
@@ -1707,7 +1707,7 @@ class Resources:
             # multiple contexts, we now set the region to the context name.
             # Since we do not have information on which context the cluster
             # was run in, we default it to the current active context.
-            legacy_region =
+            legacy_region = 'kubernetes'
             original_cloud = state.get('_cloud', None)
             original_region = state.get('_region', None)
             if (isinstance(original_cloud, clouds.Kubernetes) and
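The one-line change above gives older serialized Kubernetes resources a default region when they are loaded. A small illustrative sketch of the same defaulting pattern during unpickling; the class and fields here are simplified placeholders, not SkyPilot's actual implementation:

class Record:
    """Illustrative object whose older pickles lack a region value."""

    def __init__(self, cloud: str, region: str):
        self._cloud = cloud
        self._region = region

    def __setstate__(self, state: dict):
        # Older versions never stored a region; default it for the
        # 'kubernetes' cloud, mirroring the intent of the diff above.
        if state.get('_cloud') == 'kubernetes' and state.get('_region') is None:
            state['_region'] = 'kubernetes'
        self.__dict__.update(state)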
sky/server/common.py
CHANGED
@@ -333,7 +333,7 @@ def _start_api_server(deploy: bool = False,
             break

     server_url = get_server_url(host)
-    dashboard_msg =
+    dashboard_msg = ''
     api_server_info = get_api_server_status(server_url)
     if api_server_info.version == _DEV_VERSION:
         dashboard_msg += (

@@ -343,12 +343,15 @@ def _start_api_server(deploy: bool = False,
             dashboard_msg += (
                 'Dashboard is not built, '
                 'to build: npm --prefix sky/dashboard install '
-                '&& npm --prefix sky/dashboard run build')
+                '&& npm --prefix sky/dashboard run build\n')
         else:
             dashboard_msg += (
                 'Dashboard may be stale when installed from source, '
                 'to rebuild: npm --prefix sky/dashboard install '
-                '&& npm --prefix sky/dashboard run build')
+                '&& npm --prefix sky/dashboard run build\n')
+    dashboard_msg += (
+        f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
+        f'Dashboard: {get_dashboard_url(server_url)}')
     dashboard_msg += f'{colorama.Style.RESET_ALL}'
     logger.info(
         ux_utils.finishing_message(
sky/server/config.py
ADDED
@@ -0,0 +1,184 @@
+"""SkyPilot API Server configuration."""
+
+import dataclasses
+import enum
+
+from sky import sky_logging
+from sky.server import constants as server_constants
+from sky.utils import common_utils
+
+# Constants based on profiling the peak memory usage while serving various
+# sky commands. These estimation are highly related to usage patterns
+# (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
+# the profiling covers major clouds and common usage patterns. For user has
+# deviated usage pattern, they can override the default estimation by
+# environment variables.
+# NOTE(dev): update these constants for each release according to the load
+# test results.
+# TODO(aylei): maintaining these constants is error-prone, we may need to
+# automatically tune parallelism at runtime according to system usage stats
+# in the future.
+_LONG_WORKER_MEM_GB = 0.4
+_SHORT_WORKER_MEM_GB = 0.25
+# To control the number of long workers.
+_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
+# Limit the number of long workers of local API server, since local server is
+# typically:
+# 1. launched automatically in an environment with high resource contention
+# (e.g. Laptop)
+# 2. used by a single user
+_MAX_LONG_WORKERS_LOCAL = 4
+# Percentage of memory for long requests
+# from the memory reserved for SkyPilot.
+# This is to reserve some memory for short requests.
+_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
+# Minimal number of long workers to ensure responsiveness.
+_MIN_LONG_WORKERS = 1
+# Minimal number of short workers, there is a daemon task running on short
+# workers so at least 2 workers are needed to ensure responsiveness.
+_MIN_SHORT_WORKERS = 2
+
+# Default number of burstable workers for local API server. A heuristic number
+# that is large enough for most local cases.
+# TODO(aylei): the number of burstable workers should be auto-tuned based on the
+# system usage stats.
+_BURSTABLE_WORKERS_FOR_LOCAL = 1024
+
+logger = sky_logging.init_logger(__name__)
+
+
+class QueueBackend(enum.Enum):
+    # Local queue backend serves queues in each process locally, which has
+    # lower resource usage but the consumer must be in the same process, i.e.
+    # this only works in single-process mode.
+    LOCAL = 'local'
+    # Multi-process queue backend starts a dedicated process for serving queues.
+    MULTIPROCESSING = 'multiprocessing'
+    # TODO(zhwu): we can add redis backend in the future.
+
+
+@dataclasses.dataclass
+class WorkerConfig:
+    garanteed_parallelism: int
+    burstable_parallelism: int
+
+
+@dataclasses.dataclass
+class ServerConfig:
+    num_server_workers: int
+    long_worker_config: WorkerConfig
+    short_worker_config: WorkerConfig
+    queue_backend: QueueBackend
+
+
+def compute_server_config(deploy: bool) -> ServerConfig:
+    """Compute the server config based on environment.
+
+    We have different assumptions for the resources in different deployment
+    modes, which leads to different worker setups:
+
+    - Deployment mode (deploy=True), we assume the resources are dedicated to
+      the API server and the resources will be tuned for serious use cases, so:
+      - Use multiprocessing queue backend and dedicated workers processes to
+        avoid GIL contention.
+      - Parallelism (number of executor processes) is fixed and executor
+        processes have same lifecycle with the server, which ensures
+        best-effort cache reusing and stable resources consumption.
+      - Reject to start in low resource environments, to avoid flaky
+        deployments.
+    - Local mode (deploy=False), we assume the server is running in a shared
+      environment (e.g. laptop) and users typically do not pay attention to
+      the resource setup of the server. Moreover, existing users may expect
+      some consistent behaviors with old versions, i.e. before API server was
+      introduced, so:
+      - The max number of long-running executor processes are limited, to avoid
+        high memory consumption when the server is idle.
+      - Allow burstable workers to handle requests when all long-running
+        workers are busy, which mimics the behavior of local sky CLI before
+        API server was introduced.
+      - Works in low resources environments, and further reduce the memory
+        consumption in low resource environments.
+
+    Note that there is still significant overhead for SDK users when migrate to
+    local API server. Since the users are free to run sky operations in Threads
+    when using SDK but all client operations will occupy at least one worker
+    process after API server was introduced.
+    """
+    cpu_count = common_utils.get_cpu_count()
+    mem_size_gb = common_utils.get_mem_size_gb()
+    max_parallel_for_long = _max_long_worker_parallism(cpu_count,
+                                                       mem_size_gb,
+                                                       local=not deploy)
+    max_parallel_for_short = _max_short_worker_parallism(
+        mem_size_gb, max_parallel_for_long)
+    queue_backend = QueueBackend.MULTIPROCESSING
+    burstable_parallel_for_long = 0
+    burstable_parallel_for_short = 0
+    num_server_workers = cpu_count
+    if not deploy:
+        # For local mode, use local queue backend since we only run 1 uvicorn
+        # worker in local mode and no multiprocessing is needed.
+        num_server_workers = 1
+        queue_backend = QueueBackend.LOCAL
+        # Enable burstable workers for local API server.
+        burstable_parallel_for_long = _BURSTABLE_WORKERS_FOR_LOCAL
+        burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
+    # Runs in low resource mode if the available memory is less than
+    # server_constants.MIN_AVAIL_MEM_GB.
+    if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        # Permanent worker process may have significant memory consumption
+        # (~350MB per worker) after running commands like `sky check`, so we
+        # don't start any permanent workers in low resource local mode. This
+        # mimics the behavior of local sky CLI before API server was
+        # introduced, where the CLI will start new process everytime and
+        # never reject to start due to resource constraints.
+        # Note that the refresh daemon will still occupy one worker
+        # permanently because it never exits.
+        max_parallel_for_long = 0
+        max_parallel_for_short = 0
+        logger.warning(
+            'SkyPilot API server will run in low resource mode because '
+            'the available memory is less than '
+            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    logger.info(
+        f'SkyPilot API server will start {num_server_workers} server processes '
+        f'with {max_parallel_for_long} background workers for long requests '
+        f'and will allow at max {max_parallel_for_short} short requests in '
+        f'parallel.')
+    return ServerConfig(
+        num_server_workers=num_server_workers,
+        queue_backend=queue_backend,
+        long_worker_config=WorkerConfig(
+            garanteed_parallelism=max_parallel_for_long,
+            burstable_parallelism=burstable_parallel_for_long),
+        short_worker_config=WorkerConfig(
+            garanteed_parallelism=max_parallel_for_short,
+            burstable_parallelism=burstable_parallel_for_short),
+    )
+
+
+def _max_long_worker_parallism(cpu_count: int,
+                               mem_size_gb: float,
+                               local=False) -> int:
+    """Max parallelism for long workers."""
+    # Reserve min available memory to avoid OOM.
+    available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
+    mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
+                                 _LONG_WORKER_MEM_GB)
+    n = max(_MIN_LONG_WORKERS,
+            min(cpu_based_max_parallel, mem_based_max_parallel))
+    if local:
+        return min(n, _MAX_LONG_WORKERS_LOCAL)
+    return n
+
+
+def _max_short_worker_parallism(mem_size_gb: float,
+                                long_worker_parallism: int) -> int:
+    """Max parallelism for short workers."""
+    # Reserve memory for long workers and min available memory.
+    reserved_mem = server_constants.MIN_AVAIL_MEM_GB + (long_worker_parallism *
+                                                        _LONG_WORKER_MEM_GB)
+    available_mem = max(0, mem_size_gb - reserved_mem)
+    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
+    return n
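Based on the module added above, a quick hedged sketch of how the computed config could be inspected from an environment where the package is installed; the printed values depend on the local CPU count and memory:

from sky.server import config as server_config

# Local mode: one uvicorn worker, local queue backend, burstable workers.
local_cfg = server_config.compute_server_config(deploy=False)
print(local_cfg.num_server_workers, local_cfg.queue_backend)
print(local_cfg.long_worker_config.garanteed_parallelism,
      local_cfg.long_worker_config.burstable_parallelism)

# Deployment mode: parallelism sized from CPU count and available memory.
deploy_cfg = server_config.compute_server_config(deploy=True)
print(deploy_cfg.short_worker_config.garanteed_parallelism)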
sky/server/requests/executor.py
CHANGED
@@ -19,7 +19,6 @@ The number of the workers is determined by the system resources.
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
 import contextlib
-import enum
 import multiprocessing
 import os
 import queue as queue_lib

@@ -37,6 +36,7 @@ from sky import models
 from sky import sky_logging
 from sky import skypilot_config
 from sky.server import common as server_common
+from sky.server import config as server_config
 from sky.server import constants as server_constants
 from sky.server.requests import payloads
 from sky.server.requests import preconditions

@@ -70,53 +70,6 @@ logger = sky_logging.init_logger(__name__)
 # platforms, including macOS.
 multiprocessing.set_start_method('spawn', force=True)

-# Constants based on profiling the peak memory usage while serving various
-# sky commands. These estimation are highly related to usage patterns
-# (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
-# the profiling covers major clouds and common usage patterns. For user has
-# deviated usage pattern, they can override the default estimation by
-# environment variables.
-# NOTE(dev): update these constants for each release according to the load
-# test results.
-# TODO(aylei): maintaining these constants is error-prone, we may need to
-# automatically tune parallelism at runtime according to system usage stats
-# in the future.
-_LONG_WORKER_MEM_GB = 0.4
-_SHORT_WORKER_MEM_GB = 0.25
-# To control the number of long workers.
-_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
-# Limit the number of long workers of local API server, since local server is
-# typically:
-# 1. launched automatically in an environment with high resource contention
-# (e.g. Laptop)
-# 2. used by a single user
-_MAX_LONG_WORKERS_LOCAL = 4
-# Percentage of memory for long requests
-# from the memory reserved for SkyPilot.
-# This is to reserve some memory for short requests.
-_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
-# Minimal number of long workers to ensure responsiveness.
-_MIN_LONG_WORKERS = 1
-# Minimal number of short workers, there is a daemon task running on short
-# workers so at least 2 workers are needed to ensure responsiveness.
-_MIN_SHORT_WORKERS = 2
-
-# Default number of burstable workers for local API server. A heuristic number
-# that is large enough for most local cases.
-# TODO(aylei): the number of burstable workers should be auto-tuned based on the
-# system usage stats.
-_BURSTABLE_WORKERS_FOR_LOCAL = 1024
-
-
-class QueueBackend(enum.Enum):
-    # Local queue backend serves queues in each process locally, which has
-    # lower resource usage but the consumer must be in the same process, i.e.
-    # this only works in single-process mode.
-    LOCAL = 'local'
-    # Multi-process queue backend starts a dedicated process for serving queues.
-    MULTIPROCESSING = 'multiprocessing'
-    # TODO(zhwu): we can add redis backend in the future.
-

 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.

@@ -126,12 +79,12 @@ class RequestQueue:

     def __init__(self,
                  schedule_type: api_requests.ScheduleType,
-                 backend: Optional[QueueBackend] = None) -> None:
+                 backend: Optional[server_config.QueueBackend] = None) -> None:
         self.name = schedule_type.value
         self.backend = backend
-        if backend == QueueBackend.MULTIPROCESSING:
+        if backend == server_config.QueueBackend.MULTIPROCESSING:
             self.queue = mp_queue.get_queue(self.name)
-        elif backend == QueueBackend.LOCAL:
+        elif backend == server_config.QueueBackend.LOCAL:
             self.queue = local_queue.get_queue(self.name)
         else:
             raise RuntimeError(f'Invalid queue backend: {backend}')

@@ -162,7 +115,7 @@ class RequestQueue:
         return self.queue.qsize()


-queue_backend = QueueBackend.MULTIPROCESSING
+queue_backend = server_config.QueueBackend.MULTIPROCESSING


 def executor_initializer(proc_group: str):

@@ -186,13 +139,11 @@ class RequestWorker:
     # if there are available CPU/memory resources.
     burstable_parallelism: int = 0

-    def __init__(self,
-                 garanteed_parallelism: int,
-                 burstable_parallelism: int = 0) -> None:
+    def __init__(self, schedule_type: api_requests.ScheduleType,
+                 config: server_config.WorkerConfig) -> None:
         self.schedule_type = schedule_type
-        self.garanteed_parallelism = garanteed_parallelism
-        self.burstable_parallelism = burstable_parallelism
+        self.garanteed_parallelism = config.garanteed_parallelism
+        self.burstable_parallelism = config.burstable_parallelism

     def __str__(self) -> str:
         return f'Worker(schedule_type={self.schedule_type.value})'

@@ -455,80 +406,17 @@ def schedule_request(
     enqueue()


-def start(deploy: bool) -> List[multiprocessing.Process]:
+def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
     """Start the request workers.

     Request workers run in background, schedule the requests and delegate the
-    request execution to executor processes.
-    the resources in different deployment modes, which leads to different
-    worker setups:
-
-    - Deployment mode (deploy=True), we assume the resources are dedicated to
-      the API server and the resources will be tuned for serious use cases, so:
-      - Use multiprocessing queue backend and dedicated workers processes to
-        avoid GIL contention.
-      - Parallelism (number of executor processes) is fixed and executor
-        processes have same lifecycle with the server, which ensures
-        best-effort cache reusing and stable resources consumption.
-      - Reject to start in low resource environments, to avoid flaky
-        deployments.
-    - Local mode (deploy=False), we assume the server is running in a shared
-      environment (e.g. laptop) and users typically do not pay attention to
-      the resource setup of the server. Moreover, existing users may expect
-      some consistent behaviors with old versions, i.e. before API server was
-      introduced, so:
-      - The max number of long-running executor processes are limited, to avoid
-        high memory consumption when the server is idle.
-      - Allow burstable workers to handle requests when all long-running
-        workers are busy, which mimics the behavior of local sky CLI before
-        API server was introduced.
-      - Works in low resources environments, and further reduce the memory
-        consumption in low resource environments.
-
-    Note that there is still significant overhead for SDK users when migrate to
-    local API server. Since the users are free to run sky operations in Threads
-    when using SDK but all client operations will occupy at least one worker
-    process after API server was introduced.
+    request execution to executor processes.
     """
-    mem_size_gb = common_utils.get_mem_size_gb()
-    mem_for_workers = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
-    # Runs in low resource mode if the available memory is less than
-    # server_constants.MIN_AVAIL_MEM_GB.
-    max_parallel_for_long = _max_long_worker_parallism(cpu_count,
-                                                       mem_for_workers,
-                                                       local=not deploy)
-    max_parallel_for_short = _max_short_worker_parallism(
-        mem_for_workers, max_parallel_for_long)
-    if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
-        # Permanent worker process may have significant memory consumption
-        # (~350MB per worker) after running commands like `sky check`, so we
-        # don't start any permanent workers in low resource local mode. This
-        # mimics the behavior of local sky CLI before API server was
-        # introduced, where the CLI will start new process everytime and
-        # never reject to start due to resource constraints.
-        # Note that the refresh daemon will still occupy one worker
-        # permanently because it never exits.
-        max_parallel_for_long = 0
-        max_parallel_for_short = 0
-        logger.warning(
-            'SkyPilot API server will run in low resource mode because '
-            'the available memory is less than '
-            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
-    else:
-        logger.info(
-            f'SkyPilot API server will start {max_parallel_for_long} workers '
-            f'for long requests and will allow at max '
-            f'{max_parallel_for_short} short requests in parallel.')
-    if not deploy:
-        # For local mode, use local queue backend since we only run 1 uvicorn
-        # worker in local mode.
-        global queue_backend
-        queue_backend = QueueBackend.LOCAL
+    global queue_backend
+    queue_backend = config.queue_backend
     sub_procs = []
     # Setup the queues.
-    if queue_backend == QueueBackend.MULTIPROCESSING:
+    if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
         logger.info('Creating shared request queues')
         queue_names = [
             schedule_type.value for schedule_type in api_requests.ScheduleType

@@ -547,7 +435,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
         mp_queue.wait_for_queues_to_be_ready(queue_names,
                                              queue_server,
                                              port=port)
-    elif queue_backend == QueueBackend.LOCAL:
+    elif queue_backend == server_config.QueueBackend.LOCAL:
         # No setup is needed for local queue backend.
         pass
     else:

@@ -563,40 +451,13 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
         thread = threading.Thread(target=worker.run, daemon=True)
         thread.start()

-    burstable_parallelism = _BURSTABLE_WORKERS_FOR_LOCAL if not deploy else 0
     # Start a worker for long requests.
     long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
-                                burstable_parallelism=burstable_parallelism)
+                                config=config.long_worker_config)
     run_worker_in_background(long_worker)

     # Start a worker for short requests.
     short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
-                                 burstable_parallelism=burstable_parallelism)
+                                 config=config.short_worker_config)
     run_worker_in_background(short_worker)
     return sub_procs
-
-
-@annotations.lru_cache(scope='global', maxsize=1)
-def _max_long_worker_parallism(cpu_count: int,
-                               mem_size_gb: float,
-                               local=False) -> int:
-    """Max parallelism for long workers."""
-    cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
-    mem_based_max_parallel = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
-                                 _LONG_WORKER_MEM_GB)
-    n = max(_MIN_LONG_WORKERS,
-            min(cpu_based_max_parallel, mem_based_max_parallel))
-    if local:
-        return min(n, _MAX_LONG_WORKERS_LOCAL)
-    return n
-
-
-@annotations.lru_cache(scope='global', maxsize=1)
-def _max_short_worker_parallism(mem_size_gb: float,
-                                long_worker_parallism: int) -> int:
-    """Max parallelism for short workers."""
-    available_mem = mem_size_gb - (long_worker_parallism * _LONG_WORKER_MEM_GB)
-    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
-    return n
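With `start()` now taking a `server_config.ServerConfig` instead of a bare `deploy` flag, the caller is expected to compute the config first and hand it to the executor. A hedged sketch of that wiring, using only names visible in this diff; the `start_workers` helper itself is illustrative:

from sky.server import config as server_config
from sky.server.requests import executor


def start_workers(deploy: bool):
    # Sizing decisions (queue backend, guaranteed/burstable parallelism)
    # now live in compute_server_config(); the executor just consumes them.
    cfg = server_config.compute_server_config(deploy)
    sub_procs = executor.start(cfg)
    return sub_procs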