skypilot-nightly 1.0.0.dev20250428__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/cli.py +90 -37
  4. sky/client/cli.py +90 -37
  5. sky/client/sdk.py +3 -2
  6. sky/clouds/cloud.py +5 -2
  7. sky/clouds/kubernetes.py +4 -4
  8. sky/clouds/nebius.py +16 -10
  9. sky/clouds/service_catalog/constants.py +1 -1
  10. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  11. sky/core.py +58 -29
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  14. sky/dashboard/out/clusters/[cluster].html +1 -1
  15. sky/dashboard/out/clusters.html +1 -1
  16. sky/dashboard/out/favicon.ico +0 -0
  17. sky/dashboard/out/index.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/optimizer.py +35 -11
  21. sky/provision/docker_utils.py +22 -16
  22. sky/provision/kubernetes/utils.py +26 -24
  23. sky/server/common.py +6 -3
  24. sky/server/config.py +184 -0
  25. sky/server/requests/executor.py +17 -156
  26. sky/server/server.py +4 -4
  27. sky/setup_files/dependencies.py +0 -1
  28. sky/skypilot_config.py +27 -6
  29. sky/templates/kubernetes-ray.yml.j2 +23 -13
  30. sky/templates/nebius-ray.yml.j2 +63 -0
  31. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
  32. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +38 -37
  33. /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
  34. /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
  35. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +0 -0
  36. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
  37. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
  38. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,9 @@ SETUP_ENV_VARS_CMD = (
  # the command.
  DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
  'the Docker daemon socket')
+
+ DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
+
  _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30


@@ -173,22 +176,25 @@ class DockerInitializer:
  stream_logs=False,
  separate_stderr=separate_stderr,
  log_path=self.log_path)
- if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr and
- wait_for_docker_daemon):
- if time.time() - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
- if rc == 0:
- # Set returncode to 1 if failed to connect to docker
- # daemon after timeout.
- rc = 1
- break
- # Close the cached connection to make the permission update of
- # ssh user take effect, e.g. usermod -aG docker $USER, called
- # by cloud-init of Azure.
- self.runner.close_cached_connection()
- logger.info('Failed to connect to docker daemon. It might be '
- 'initializing, retrying in 5 seconds...')
- time.sleep(5)
- continue
+ if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
+ DOCKER_SOCKET_NOT_READY_STR in stdout + stderr):
+ if wait_for_docker_daemon:
+ if time.time(
+ ) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
+ if rc == 0:
+ # Set returncode to 1 if failed to connect to docker
+ # daemon after timeout.
+ rc = 1
+ break
+ # Close the cached connection to make the permission update
+ # of ssh user take effect, e.g. usermod -aG docker $USER,
+ # called by cloud-init of Azure.
+ self.runner.close_cached_connection()
+ logger.info(
+ 'Failed to connect to docker daemon. It might be '
+ 'initializing, retrying in 5 seconds...')
+ time.sleep(5)
+ continue
  break
  subprocess_utils.handle_returncode(
  rc,
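For readability, here is a minimal, hypothetical sketch of the retry loop after this change. Only the two error strings and the timeout/sleep values come from the diff; the `run_docker_info` callable and its return shape are stand-ins for the runner call in the actual code.

```python
import time

DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
                                'the Docker daemon socket')
DOCKER_SOCKET_NOT_READY_STR = 'Is the docker daemon running?'
_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30


def wait_for_docker(run_docker_info, wait_for_docker_daemon: bool = True) -> int:
    """Retry a docker probe until the daemon is reachable or the timeout hits.

    `run_docker_info` is a hypothetical stand-in that returns (rc, stdout, stderr).
    """
    start = time.time()
    while True:
        rc, stdout, stderr = run_docker_info()
        output = stdout + stderr
        # The new condition also matches the transient "Is the docker daemon
        # running?" error, not only the permission-denied case.
        if (DOCKER_PERMISSION_DENIED_STR in output or
                DOCKER_SOCKET_NOT_READY_STR in output):
            if wait_for_docker_daemon:
                if time.time() - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
                    # Force a non-zero return code if we timed out.
                    return rc if rc != 0 else 1
                time.sleep(5)
                continue
        return rc
```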
sky/provision/kubernetes/utils.py CHANGED
@@ -243,7 +243,7 @@ class GPULabelFormatter:
  raise NotImplementedError

  @classmethod
- def get_label_value(cls, accelerator: str) -> str:
+ def get_label_values(cls, accelerator: str) -> List[str]:
  """Given a GPU type, returns the label value to be used"""
  raise NotImplementedError

@@ -311,10 +311,10 @@ class SkyPilotLabelFormatter(GPULabelFormatter):
  return [cls.LABEL_KEY]

  @classmethod
- def get_label_value(cls, accelerator: str) -> str:
+ def get_label_values(cls, accelerator: str) -> List[str]:
  # For SkyPilot formatter, we use the accelerator str directly.
  # See sky.utils.kubernetes.gpu_labeler.
- return accelerator.lower()
+ return [accelerator.lower()]

  @classmethod
  def match_label_key(cls, label_key: str) -> bool:
@@ -351,8 +351,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
  return [cls.LABEL_KEY]

  @classmethod
- def get_label_value(cls, accelerator: str) -> str:
- return accelerator.upper()
+ def get_label_values(cls, accelerator: str) -> List[str]:
+ return [accelerator.upper()]

  @classmethod
  def match_label_key(cls, label_key: str) -> bool:
@@ -438,8 +438,8 @@ class GKELabelFormatter(GPULabelFormatter):
  return count_to_topology

  @classmethod
- def get_label_value(cls, accelerator: str) -> str:
- return get_gke_accelerator_name(accelerator)
+ def get_label_values(cls, accelerator: str) -> List[str]:
+ return [get_gke_accelerator_name(accelerator)]

  @classmethod
  def get_accelerator_from_label_value(cls, value: str) -> str:
@@ -472,7 +472,7 @@ class GFDLabelFormatter(GPULabelFormatter):
  https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html

  This LabelFormatter can't be used in autoscaling clusters since accelerators
- may map to multiple label, so we're not implementing `get_label_value`
+ may map to multiple label, so we're not implementing `get_label_values`
  """

  LABEL_KEY = 'nvidia.com/gpu.product'
@@ -486,10 +486,10 @@ class GFDLabelFormatter(GPULabelFormatter):
  return [cls.LABEL_KEY]

  @classmethod
- def get_label_value(cls, accelerator: str) -> str:
- """An accelerator can map to many Nvidia GFD labels
- (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB).
- As a result, we do not support get_label_value for GFDLabelFormatter."""
+ def get_label_values(cls, accelerator: str) -> List[str]:
+ # An accelerator can map to many Nvidia GFD labels
+ # (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB).
+ # TODO implement get_label_values for GFDLabelFormatter
  raise NotImplementedError

  @classmethod
@@ -1032,15 +1032,17 @@ def check_instance_fits(context: Optional[str],
  # met.
  assert acc_count is not None, (acc_type, acc_count)
  try:
- gpu_label_key, gpu_label_val, _, _ = (
- get_accelerator_label_key_value(context, acc_type, acc_count))
+ gpu_label_key, gpu_label_values, _, _ = (
+ get_accelerator_label_key_values(context, acc_type, acc_count))
+ if gpu_label_values is None:
+ gpu_label_values = []
  except exceptions.ResourcesUnavailableError as e:
  # If GPU not found, return empty list and error message.
  return False, str(e)
  # Get the set of nodes that have the GPU type
  gpu_nodes = [
  node for node in nodes if gpu_label_key in node.metadata.labels and
- node.metadata.labels[gpu_label_key] == gpu_label_val
+ node.metadata.labels[gpu_label_key] in gpu_label_values
  ]
  if not gpu_nodes:
  return False, f'No GPU nodes found with {acc_type} on the cluster'
@@ -1082,12 +1084,12 @@ def check_instance_fits(context: Optional[str],
  return fits, reason


- def get_accelerator_label_key_value(
+ def get_accelerator_label_key_values(
  context: Optional[str],
  acc_type: str,
  acc_count: int,
  check_mode=False
- ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
+ ) -> Tuple[Optional[str], Optional[List[str]], Optional[str], Optional[str]]:
  """Returns the label key and value for the given GPU/TPU type.

  Args:
@@ -1141,7 +1143,7 @@ get_accelerator_label_key_value(
  tpu_topology_label_key = formatter.get_tpu_topology_label_key()
  tpu_topology_label_value = formatter.get_tpu_topology_label_value(
  acc_type, acc_count)
- return formatter.get_label_key(acc_type), formatter.get_label_value(
+ return formatter.get_label_key(acc_type), formatter.get_label_values(
  acc_type), tpu_topology_label_key, tpu_topology_label_value

  has_gpus, cluster_resources = detect_accelerator_resource(context)
@@ -1220,12 +1222,12 @@ get_accelerator_label_key_value(
  # different topologies that maps to identical
  # number of TPU chips.
  if tpu_topology_chip_count == acc_count:
- return (label, value, topology_label_key,
+ return (label, [value], topology_label_key,
  topology_value)
  else:
  continue
  else:
- return label, value, None, None
+ return label, [value], None, None

  # If no node is found with the requested acc_type, raise error
  with ux_utils.print_exception_no_traceback():
@@ -1387,10 +1389,10 @@ def check_credentials(context: Optional[str],
  # `get_unlabeled_accelerator_nodes`.
  # Therefore, if `get_unlabeled_accelerator_nodes` detects unlabelled
  # nodes, we skip this check.
- get_accelerator_label_key_value(context,
- acc_type='',
- acc_count=0,
- check_mode=True)
+ get_accelerator_label_key_values(context,
+ acc_type='',
+ acc_count=0,
+ check_mode=True)
  except exceptions.ResourcesUnavailableError as e:
  # If GPUs are not available, we return cluster as enabled
  # (since it can be a CPU-only cluster) but we also return the
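The net effect of the rename above is that formatters now return a list of acceptable label values, and node matching becomes a membership test instead of an equality check. A minimal, hypothetical sketch (the node dictionaries and the label key string are illustrative stand-ins, not the real Kubernetes objects or constants):

```python
from typing import Dict, List


class SkyPilotLabelFormatter:
    # Illustrative label key; the real formatter defines its own LABEL_KEY.
    LABEL_KEY = 'skypilot.co/accelerator'

    @classmethod
    def get_label_values(cls, accelerator: str) -> List[str]:
        # One accelerator may now map to several acceptable label values.
        return [accelerator.lower()]


def find_gpu_nodes(nodes: List[Dict[str, Dict[str, str]]],
                   acc_type: str) -> List[Dict[str, Dict[str, str]]]:
    gpu_label_key = SkyPilotLabelFormatter.LABEL_KEY
    gpu_label_values = SkyPilotLabelFormatter.get_label_values(acc_type)
    # Membership test replaces the old equality check against a single value.
    return [
        node for node in nodes
        if gpu_label_key in node['labels'] and
        node['labels'][gpu_label_key] in gpu_label_values
    ]


nodes = [{'labels': {'skypilot.co/accelerator': 'a100'}}, {'labels': {}}]
print(find_gpu_nodes(nodes, 'A100'))  # -> only the first node matches
```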
sky/server/common.py CHANGED
@@ -333,7 +333,7 @@ def _start_api_server(deploy: bool = False,
  break

  server_url = get_server_url(host)
- dashboard_msg = (f'Dashboard: {get_dashboard_url(server_url)}')
+ dashboard_msg = ''
  api_server_info = get_api_server_status(server_url)
  if api_server_info.version == _DEV_VERSION:
  dashboard_msg += (
@@ -343,12 +343,15 @@
  dashboard_msg += (
  'Dashboard is not built, '
  'to build: npm --prefix sky/dashboard install '
- '&& npm --prefix sky/dashboard run build')
+ '&& npm --prefix sky/dashboard run build\n')
  else:
  dashboard_msg += (
  'Dashboard may be stale when installed from source, '
  'to rebuild: npm --prefix sky/dashboard install '
- '&& npm --prefix sky/dashboard run build')
+ '&& npm --prefix sky/dashboard run build\n')
+ dashboard_msg += (
+ f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
+ f'Dashboard: {get_dashboard_url(server_url)}')
  dashboard_msg += f'{colorama.Style.RESET_ALL}'
  logger.info(
  ux_utils.finishing_message(
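A small, hypothetical rendering of the reordered message: the stale/unbuilt dashboard note comes first, and the dashboard URL is appended last on its own colored line. The indent glyph and URL below are placeholders, not values taken from the diff.

```python
import colorama

dashboard_msg = ''
dashboard_msg += ('Dashboard may be stale when installed from source, '
                  'to rebuild: npm --prefix sky/dashboard install '
                  '&& npm --prefix sky/dashboard run build\n')
# ux_utils.INDENT_LAST_SYMBOL is approximated with a literal glyph here.
dashboard_msg += (f'└── {colorama.Fore.GREEN}'
                  'Dashboard: http://127.0.0.1:46580/dashboard')
dashboard_msg += f'{colorama.Style.RESET_ALL}'
print(dashboard_msg)
```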
sky/server/config.py ADDED
@@ -0,0 +1,184 @@
+ """SkyPilot API Server configuration."""
+
+ import dataclasses
+ import enum
+
+ from sky import sky_logging
+ from sky.server import constants as server_constants
+ from sky.utils import common_utils
+
+ # Constants based on profiling the peak memory usage while serving various
+ # sky commands. These estimation are highly related to usage patterns
+ # (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
+ # the profiling covers major clouds and common usage patterns. For user has
+ # deviated usage pattern, they can override the default estimation by
+ # environment variables.
+ # NOTE(dev): update these constants for each release according to the load
+ # test results.
+ # TODO(aylei): maintaining these constants is error-prone, we may need to
+ # automatically tune parallelism at runtime according to system usage stats
+ # in the future.
+ _LONG_WORKER_MEM_GB = 0.4
+ _SHORT_WORKER_MEM_GB = 0.25
+ # To control the number of long workers.
+ _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
+ # Limit the number of long workers of local API server, since local server is
+ # typically:
+ # 1. launched automatically in an environment with high resource contention
+ # (e.g. Laptop)
+ # 2. used by a single user
+ _MAX_LONG_WORKERS_LOCAL = 4
+ # Percentage of memory for long requests
+ # from the memory reserved for SkyPilot.
+ # This is to reserve some memory for short requests.
+ _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
+ # Minimal number of long workers to ensure responsiveness.
+ _MIN_LONG_WORKERS = 1
+ # Minimal number of short workers, there is a daemon task running on short
+ # workers so at least 2 workers are needed to ensure responsiveness.
+ _MIN_SHORT_WORKERS = 2
+
+ # Default number of burstable workers for local API server. A heuristic number
+ # that is large enough for most local cases.
+ # TODO(aylei): the number of burstable workers should be auto-tuned based on the
+ # system usage stats.
+ _BURSTABLE_WORKERS_FOR_LOCAL = 1024
+
+ logger = sky_logging.init_logger(__name__)
+
+
+ class QueueBackend(enum.Enum):
+ # Local queue backend serves queues in each process locally, which has
+ # lower resource usage but the consumer must be in the same process, i.e.
+ # this only works in single-process mode.
+ LOCAL = 'local'
+ # Multi-process queue backend starts a dedicated process for serving queues.
+ MULTIPROCESSING = 'multiprocessing'
+ # TODO(zhwu): we can add redis backend in the future.
+
+
+ @dataclasses.dataclass
+ class WorkerConfig:
+ garanteed_parallelism: int
+ burstable_parallelism: int
+
+
+ @dataclasses.dataclass
+ class ServerConfig:
+ num_server_workers: int
+ long_worker_config: WorkerConfig
+ short_worker_config: WorkerConfig
+ queue_backend: QueueBackend
+
+
+ def compute_server_config(deploy: bool) -> ServerConfig:
+ """Compute the server config based on environment.
+
+ We have different assumptions for the resources in different deployment
+ modes, which leads to different worker setups:
+
+ - Deployment mode (deploy=True), we assume the resources are dedicated to
+ the API server and the resources will be tuned for serious use cases, so:
+ - Use multiprocessing queue backend and dedicated workers processes to
+ avoid GIL contention.
+ - Parallelism (number of executor processes) is fixed and executor
+ processes have same lifecycle with the server, which ensures
+ best-effort cache reusing and stable resources consumption.
+ - Reject to start in low resource environments, to avoid flaky
+ deployments.
+ - Local mode (deploy=False), we assume the server is running in a shared
+ environment (e.g. laptop) and users typically do not pay attention to
+ the resource setup of the server. Moreover, existing users may expect
+ some consistent behaviors with old versions, i.e. before API server was
+ introduced, so:
+ - The max number of long-running executor processes are limited, to avoid
+ high memory consumption when the server is idle.
+ - Allow burstable workers to handle requests when all long-running
+ workers are busy, which mimics the behavior of local sky CLI before
+ API server was introduced.
+ - Works in low resources environments, and further reduce the memory
+ consumption in low resource environments.
+
+ Note that there is still significant overhead for SDK users when migrate to
+ local API server. Since the users are free to run sky operations in Threads
+ when using SDK but all client operations will occupy at least one worker
+ process after API server was introduced.
+ """
+ cpu_count = common_utils.get_cpu_count()
+ mem_size_gb = common_utils.get_mem_size_gb()
+ max_parallel_for_long = _max_long_worker_parallism(cpu_count,
+ mem_size_gb,
+ local=not deploy)
+ max_parallel_for_short = _max_short_worker_parallism(
+ mem_size_gb, max_parallel_for_long)
+ queue_backend = QueueBackend.MULTIPROCESSING
+ burstable_parallel_for_long = 0
+ burstable_parallel_for_short = 0
+ num_server_workers = cpu_count
+ if not deploy:
+ # For local mode, use local queue backend since we only run 1 uvicorn
+ # worker in local mode and no multiprocessing is needed.
+ num_server_workers = 1
+ queue_backend = QueueBackend.LOCAL
+ # Enable burstable workers for local API server.
+ burstable_parallel_for_long = _BURSTABLE_WORKERS_FOR_LOCAL
+ burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
+ # Runs in low resource mode if the available memory is less than
+ # server_constants.MIN_AVAIL_MEM_GB.
+ if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+ # Permanent worker process may have significant memory consumption
+ # (~350MB per worker) after running commands like `sky check`, so we
+ # don't start any permanent workers in low resource local mode. This
+ # mimics the behavior of local sky CLI before API server was
+ # introduced, where the CLI will start new process everytime and
+ # never reject to start due to resource constraints.
+ # Note that the refresh daemon will still occupy one worker
+ # permanently because it never exits.
+ max_parallel_for_long = 0
+ max_parallel_for_short = 0
+ logger.warning(
+ 'SkyPilot API server will run in low resource mode because '
+ 'the available memory is less than '
+ f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+ logger.info(
+ f'SkyPilot API server will start {num_server_workers} server processes '
+ f'with {max_parallel_for_long} background workers for long requests '
+ f'and will allow at max {max_parallel_for_short} short requests in '
+ f'parallel.')
+ return ServerConfig(
+ num_server_workers=num_server_workers,
+ queue_backend=queue_backend,
+ long_worker_config=WorkerConfig(
+ garanteed_parallelism=max_parallel_for_long,
+ burstable_parallelism=burstable_parallel_for_long),
+ short_worker_config=WorkerConfig(
+ garanteed_parallelism=max_parallel_for_short,
+ burstable_parallelism=burstable_parallel_for_short),
+ )
+
+
+ def _max_long_worker_parallism(cpu_count: int,
+ mem_size_gb: float,
+ local=False) -> int:
+ """Max parallelism for long workers."""
+ # Reserve min available memory to avoid OOM.
+ available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+ cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
+ mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
+ _LONG_WORKER_MEM_GB)
+ n = max(_MIN_LONG_WORKERS,
+ min(cpu_based_max_parallel, mem_based_max_parallel))
+ if local:
+ return min(n, _MAX_LONG_WORKERS_LOCAL)
+ return n
+
+
+ def _max_short_worker_parallism(mem_size_gb: float,
+ long_worker_parallism: int) -> int:
+ """Max parallelism for short workers."""
+ # Reserve memory for long workers and min available memory.
+ reserved_mem = server_constants.MIN_AVAIL_MEM_GB + (long_worker_parallism *
+ _LONG_WORKER_MEM_GB)
+ available_mem = max(0, mem_size_gb - reserved_mem)
+ n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
+ return n
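For intuition, a hypothetical worked example of the sizing logic above, assuming `deploy=True`, an 8-CPU machine with 16 GB of memory, and `server_constants.MIN_AVAIL_MEM_GB` of 2 (the real constant lives in `sky/server/constants.py` and may differ):

```python
# Long workers: bounded by both CPU count and the memory share for long requests.
cpu_based = 8 * 2                          # _CPU_MULTIPLIER_FOR_LONG_WORKERS
mem_based = int((16 - 2) * 0.6 / 0.4)      # available memory * blocking share
long_workers = max(1, min(cpu_based, mem_based))      # CPU-bound here: 16

# Short workers: whatever memory is left after reserving for long workers.
reserved = 2 + long_workers * 0.4          # MIN_AVAIL_MEM_GB + long worker memory
short_workers = max(2, int((16 - reserved) / 0.25))   # int(7.6 / 0.25) = 30

print(long_workers, short_workers)  # 16 30
```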
sky/server/requests/executor.py CHANGED
@@ -19,7 +19,6 @@ The number of the workers is determined by the system resources.
  See the [README.md](../README.md) for detailed architecture of the executor.
  """
  import contextlib
- import enum
  import multiprocessing
  import os
  import queue as queue_lib
@@ -37,6 +36,7 @@ from sky import models
  from sky import sky_logging
  from sky import skypilot_config
  from sky.server import common as server_common
+ from sky.server import config as server_config
  from sky.server import constants as server_constants
  from sky.server.requests import payloads
  from sky.server.requests import preconditions
@@ -70,53 +70,6 @@ logger = sky_logging.init_logger(__name__)
  # platforms, including macOS.
  multiprocessing.set_start_method('spawn', force=True)

- # Constants based on profiling the peak memory usage while serving various
- # sky commands. These estimation are highly related to usage patterns
- # (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
- # the profiling covers major clouds and common usage patterns. For user has
- # deviated usage pattern, they can override the default estimation by
- # environment variables.
- # NOTE(dev): update these constants for each release according to the load
- # test results.
- # TODO(aylei): maintaining these constants is error-prone, we may need to
- # automatically tune parallelism at runtime according to system usage stats
- # in the future.
- _LONG_WORKER_MEM_GB = 0.4
- _SHORT_WORKER_MEM_GB = 0.25
- # To control the number of long workers.
- _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
- # Limit the number of long workers of local API server, since local server is
- # typically:
- # 1. launched automatically in an environment with high resource contention
- # (e.g. Laptop)
- # 2. used by a single user
- _MAX_LONG_WORKERS_LOCAL = 4
- # Percentage of memory for long requests
- # from the memory reserved for SkyPilot.
- # This is to reserve some memory for short requests.
- _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
- # Minimal number of long workers to ensure responsiveness.
- _MIN_LONG_WORKERS = 1
- # Minimal number of short workers, there is a daemon task running on short
- # workers so at least 2 workers are needed to ensure responsiveness.
- _MIN_SHORT_WORKERS = 2
-
- # Default number of burstable workers for local API server. A heuristic number
- # that is large enough for most local cases.
- # TODO(aylei): the number of burstable workers should be auto-tuned based on the
- # system usage stats.
- _BURSTABLE_WORKERS_FOR_LOCAL = 1024
-
-
- class QueueBackend(enum.Enum):
- # Local queue backend serves queues in each process locally, which has
- # lower resource usage but the consumer must be in the same process, i.e.
- # this only works in single-process mode.
- LOCAL = 'local'
- # Multi-process queue backend starts a dedicated process for serving queues.
- MULTIPROCESSING = 'multiprocessing'
- # TODO(zhwu): we can add redis backend in the future.
-

  class RequestQueue:
  """The queue for the requests, either redis or multiprocessing.
@@ -126,12 +79,12 @@ class RequestQueue:

  def __init__(self,
  schedule_type: api_requests.ScheduleType,
- backend: Optional[QueueBackend] = None) -> None:
+ backend: Optional[server_config.QueueBackend] = None) -> None:
  self.name = schedule_type.value
  self.backend = backend
- if backend == QueueBackend.MULTIPROCESSING:
+ if backend == server_config.QueueBackend.MULTIPROCESSING:
  self.queue = mp_queue.get_queue(self.name)
- elif backend == QueueBackend.LOCAL:
+ elif backend == server_config.QueueBackend.LOCAL:
  self.queue = local_queue.get_queue(self.name)
  else:
  raise RuntimeError(f'Invalid queue backend: {backend}')
@@ -162,7 +115,7 @@ class RequestQueue:
  return self.queue.qsize()


- queue_backend = QueueBackend.MULTIPROCESSING
+ queue_backend = server_config.QueueBackend.MULTIPROCESSING


  def executor_initializer(proc_group: str):
@@ -186,13 +139,11 @@ class RequestWorker:
  # if there are available CPU/memory resources.
  burstable_parallelism: int = 0

- def __init__(self,
- schedule_type: api_requests.ScheduleType,
- garanteed_parallelism: int,
- burstable_parallelism: int = 0) -> None:
+ def __init__(self, schedule_type: api_requests.ScheduleType,
+ config: server_config.WorkerConfig) -> None:
  self.schedule_type = schedule_type
- self.garanteed_parallelism = garanteed_parallelism
- self.burstable_parallelism = burstable_parallelism
+ self.garanteed_parallelism = config.garanteed_parallelism
+ self.burstable_parallelism = config.burstable_parallelism

  def __str__(self) -> str:
  return f'Worker(schedule_type={self.schedule_type.value})'
@@ -455,80 +406,17 @@ def schedule_request(
  enqueue()


- def start(deploy: bool) -> List[multiprocessing.Process]:
+ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
  """Start the request workers.

  Request workers run in background, schedule the requests and delegate the
- request execution to executor processes. We have different assumptions for
- the resources in different deployment modes, which leads to different
- worker setups:
-
- - Deployment mode (deploy=True), we assume the resources are dedicated to
- the API server and the resources will be tuned for serious use cases, so:
- - Use multiprocessing queue backend and dedicated workers processes to
- avoid GIL contention.
- - Parallelism (number of executor processes) is fixed and executor
- processes have same lifecycle with the server, which ensures
- best-effort cache reusing and stable resources consumption.
- - Reject to start in low resource environments, to avoid flaky
- deployments.
- - Local mode (deploy=False), we assume the server is running in a shared
- environment (e.g. laptop) and users typically do not pay attention to
- the resource setup of the server. Moreover, existing users may expect
- some consistent behaviors with old versions, i.e. before API server was
- introduced, so:
- - The max number of long-running executor processes are limited, to avoid
- high memory consumption when the server is idle.
- - Allow burstable workers to handle requests when all long-running
- workers are busy, which mimics the behavior of local sky CLI before
- API server was introduced.
- - Works in low resources environments, and further reduce the memory
- consumption in low resource environments.
-
- Note that there is still significant overhead for SDK users when migrate to
- local API server. Since the users are free to run sky operations in Threads
- when using SDK but all client operations will occupy at least one worker
- process after API server was introduced.
+ request execution to executor processes.
  """
- # Determine the job capacity of the workers based on the system resources.
- cpu_count = common_utils.get_cpu_count()
- mem_size_gb = common_utils.get_mem_size_gb()
- mem_for_workers = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
- # Runs in low resource mode if the available memory is less than
- # server_constants.MIN_AVAIL_MEM_GB.
- max_parallel_for_long = _max_long_worker_parallism(cpu_count,
- mem_for_workers,
- local=not deploy)
- max_parallel_for_short = _max_short_worker_parallism(
- mem_for_workers, max_parallel_for_long)
- if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
- # Permanent worker process may have significant memory consumption
- # (~350MB per worker) after running commands like `sky check`, so we
- # don't start any permanent workers in low resource local mode. This
- # mimics the behavior of local sky CLI before API server was
- # introduced, where the CLI will start new process everytime and
- # never reject to start due to resource constraints.
- # Note that the refresh daemon will still occupy one worker
- # permanently because it never exits.
- max_parallel_for_long = 0
- max_parallel_for_short = 0
- logger.warning(
- 'SkyPilot API server will run in low resource mode because '
- 'the available memory is less than '
- f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
- else:
- logger.info(
- f'SkyPilot API server will start {max_parallel_for_long} workers '
- f'for long requests and will allow at max '
- f'{max_parallel_for_short} short requests in parallel.')
- if not deploy:
- # For local mode, use local queue backend since we only run 1 uvicorn
- # worker in local mode.
- global queue_backend
- queue_backend = QueueBackend.LOCAL
+ global queue_backend
+ queue_backend = config.queue_backend
  sub_procs = []
  # Setup the queues.
- if queue_backend == QueueBackend.MULTIPROCESSING:
+ if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
  logger.info('Creating shared request queues')
  queue_names = [
  schedule_type.value for schedule_type in api_requests.ScheduleType
@@ -547,7 +435,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
  mp_queue.wait_for_queues_to_be_ready(queue_names,
  queue_server,
  port=port)
- elif queue_backend == QueueBackend.LOCAL:
+ elif queue_backend == server_config.QueueBackend.LOCAL:
  # No setup is needed for local queue backend.
  pass
  else:
@@ -563,40 +451,13 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
  thread = threading.Thread(target=worker.run, daemon=True)
  thread.start()

- burstable_parallelism = _BURSTABLE_WORKERS_FOR_LOCAL if not deploy else 0
  # Start a worker for long requests.
  long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
- garanteed_parallelism=max_parallel_for_long,
- burstable_parallelism=burstable_parallelism)
+ config=config.long_worker_config)
  run_worker_in_background(long_worker)

  # Start a worker for short requests.
  short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
- garanteed_parallelism=max_parallel_for_short,
- burstable_parallelism=burstable_parallelism)
+ config=config.short_worker_config)
  run_worker_in_background(short_worker)
  return sub_procs
-
-
- @annotations.lru_cache(scope='global', maxsize=1)
- def _max_long_worker_parallism(cpu_count: int,
- mem_size_gb: float,
- local=False) -> int:
- """Max parallelism for long workers."""
- cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
- mem_based_max_parallel = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
- _LONG_WORKER_MEM_GB)
- n = max(_MIN_LONG_WORKERS,
- min(cpu_based_max_parallel, mem_based_max_parallel))
- if local:
- return min(n, _MAX_LONG_WORKERS_LOCAL)
- return n
-
-
- @annotations.lru_cache(scope='global', maxsize=1)
- def _max_short_worker_parallism(mem_size_gb: float,
- long_worker_parallism: int) -> int:
- """Max parallelism for short workers."""
- available_mem = mem_size_gb - (long_worker_parallism * _LONG_WORKER_MEM_GB)
- n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
- return n
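With `start()` now taking a `ServerConfig`, sizing the workers becomes the caller's responsibility. A hypothetical wiring sketch, using only the two public entry points shown in this diff (the actual call site is presumably in `sky/server/server.py`, whose diff is not included in this section):

```python
from sky.server import config as server_config
from sky.server.requests import executor

# Compute parallelism, queue backend, and worker counts once, up front...
cfg = server_config.compute_server_config(deploy=False)
# ...then hand the resulting config to the executor instead of a bare `deploy` flag.
sub_procs = executor.start(cfg)
```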