skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. sky/__init__.py +48 -22
  2. sky/adaptors/aws.py +2 -1
  3. sky/adaptors/azure.py +4 -4
  4. sky/adaptors/cloudflare.py +4 -4
  5. sky/adaptors/kubernetes.py +8 -8
  6. sky/authentication.py +42 -45
  7. sky/backends/backend.py +2 -2
  8. sky/backends/backend_utils.py +108 -221
  9. sky/backends/cloud_vm_ray_backend.py +283 -282
  10. sky/benchmark/benchmark_utils.py +6 -2
  11. sky/check.py +40 -28
  12. sky/cli.py +1213 -1116
  13. sky/client/__init__.py +1 -0
  14. sky/client/cli.py +5644 -0
  15. sky/client/common.py +345 -0
  16. sky/client/sdk.py +1757 -0
  17. sky/cloud_stores.py +12 -6
  18. sky/clouds/__init__.py +0 -2
  19. sky/clouds/aws.py +20 -13
  20. sky/clouds/azure.py +5 -3
  21. sky/clouds/cloud.py +1 -1
  22. sky/clouds/cudo.py +2 -1
  23. sky/clouds/do.py +2 -1
  24. sky/clouds/fluidstack.py +3 -2
  25. sky/clouds/gcp.py +10 -8
  26. sky/clouds/ibm.py +8 -7
  27. sky/clouds/kubernetes.py +7 -6
  28. sky/clouds/lambda_cloud.py +8 -7
  29. sky/clouds/oci.py +4 -3
  30. sky/clouds/paperspace.py +2 -1
  31. sky/clouds/runpod.py +2 -1
  32. sky/clouds/scp.py +8 -7
  33. sky/clouds/service_catalog/__init__.py +3 -3
  34. sky/clouds/service_catalog/aws_catalog.py +7 -1
  35. sky/clouds/service_catalog/common.py +4 -2
  36. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
  37. sky/clouds/utils/oci_utils.py +1 -1
  38. sky/clouds/vast.py +2 -1
  39. sky/clouds/vsphere.py +2 -1
  40. sky/core.py +263 -99
  41. sky/dag.py +4 -0
  42. sky/data/mounting_utils.py +2 -1
  43. sky/data/storage.py +97 -35
  44. sky/data/storage_utils.py +69 -9
  45. sky/exceptions.py +138 -5
  46. sky/execution.py +47 -50
  47. sky/global_user_state.py +105 -22
  48. sky/jobs/__init__.py +12 -14
  49. sky/jobs/client/__init__.py +0 -0
  50. sky/jobs/client/sdk.py +296 -0
  51. sky/jobs/constants.py +30 -1
  52. sky/jobs/controller.py +12 -6
  53. sky/jobs/dashboard/dashboard.py +2 -6
  54. sky/jobs/recovery_strategy.py +22 -29
  55. sky/jobs/server/__init__.py +1 -0
  56. sky/jobs/{core.py → server/core.py} +101 -34
  57. sky/jobs/server/dashboard_utils.py +64 -0
  58. sky/jobs/server/server.py +182 -0
  59. sky/jobs/utils.py +32 -23
  60. sky/models.py +27 -0
  61. sky/optimizer.py +9 -11
  62. sky/provision/__init__.py +6 -3
  63. sky/provision/aws/config.py +2 -2
  64. sky/provision/aws/instance.py +1 -1
  65. sky/provision/azure/instance.py +1 -1
  66. sky/provision/cudo/instance.py +1 -1
  67. sky/provision/do/instance.py +1 -1
  68. sky/provision/do/utils.py +0 -5
  69. sky/provision/fluidstack/fluidstack_utils.py +4 -3
  70. sky/provision/fluidstack/instance.py +4 -2
  71. sky/provision/gcp/instance.py +1 -1
  72. sky/provision/instance_setup.py +2 -2
  73. sky/provision/kubernetes/constants.py +8 -0
  74. sky/provision/kubernetes/instance.py +1 -1
  75. sky/provision/kubernetes/utils.py +67 -76
  76. sky/provision/lambda_cloud/instance.py +3 -15
  77. sky/provision/logging.py +1 -1
  78. sky/provision/oci/instance.py +7 -4
  79. sky/provision/paperspace/instance.py +1 -1
  80. sky/provision/provisioner.py +3 -2
  81. sky/provision/runpod/instance.py +1 -1
  82. sky/provision/vast/instance.py +1 -1
  83. sky/provision/vast/utils.py +2 -1
  84. sky/provision/vsphere/instance.py +2 -11
  85. sky/resources.py +55 -40
  86. sky/serve/__init__.py +6 -10
  87. sky/serve/client/__init__.py +0 -0
  88. sky/serve/client/sdk.py +366 -0
  89. sky/serve/constants.py +3 -0
  90. sky/serve/replica_managers.py +10 -10
  91. sky/serve/serve_utils.py +56 -36
  92. sky/serve/server/__init__.py +0 -0
  93. sky/serve/{core.py → server/core.py} +37 -17
  94. sky/serve/server/server.py +117 -0
  95. sky/serve/service.py +8 -1
  96. sky/server/__init__.py +1 -0
  97. sky/server/common.py +441 -0
  98. sky/server/constants.py +21 -0
  99. sky/server/html/log.html +174 -0
  100. sky/server/requests/__init__.py +0 -0
  101. sky/server/requests/executor.py +462 -0
  102. sky/server/requests/payloads.py +481 -0
  103. sky/server/requests/queues/__init__.py +0 -0
  104. sky/server/requests/queues/mp_queue.py +76 -0
  105. sky/server/requests/requests.py +567 -0
  106. sky/server/requests/serializers/__init__.py +0 -0
  107. sky/server/requests/serializers/decoders.py +192 -0
  108. sky/server/requests/serializers/encoders.py +166 -0
  109. sky/server/server.py +1095 -0
  110. sky/server/stream_utils.py +144 -0
  111. sky/setup_files/MANIFEST.in +1 -0
  112. sky/setup_files/dependencies.py +12 -4
  113. sky/setup_files/setup.py +1 -1
  114. sky/sky_logging.py +9 -13
  115. sky/skylet/autostop_lib.py +2 -2
  116. sky/skylet/constants.py +46 -12
  117. sky/skylet/events.py +5 -6
  118. sky/skylet/job_lib.py +78 -66
  119. sky/skylet/log_lib.py +17 -11
  120. sky/skypilot_config.py +79 -94
  121. sky/task.py +119 -73
  122. sky/templates/aws-ray.yml.j2 +4 -4
  123. sky/templates/azure-ray.yml.j2 +3 -2
  124. sky/templates/cudo-ray.yml.j2 +3 -2
  125. sky/templates/fluidstack-ray.yml.j2 +3 -2
  126. sky/templates/gcp-ray.yml.j2 +3 -2
  127. sky/templates/ibm-ray.yml.j2 +3 -2
  128. sky/templates/jobs-controller.yaml.j2 +1 -12
  129. sky/templates/kubernetes-ray.yml.j2 +3 -2
  130. sky/templates/lambda-ray.yml.j2 +3 -2
  131. sky/templates/oci-ray.yml.j2 +3 -2
  132. sky/templates/paperspace-ray.yml.j2 +3 -2
  133. sky/templates/runpod-ray.yml.j2 +3 -2
  134. sky/templates/scp-ray.yml.j2 +3 -2
  135. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  136. sky/templates/vsphere-ray.yml.j2 +4 -2
  137. sky/templates/websocket_proxy.py +64 -0
  138. sky/usage/constants.py +8 -0
  139. sky/usage/usage_lib.py +45 -11
  140. sky/utils/accelerator_registry.py +33 -53
  141. sky/utils/admin_policy_utils.py +2 -1
  142. sky/utils/annotations.py +51 -0
  143. sky/utils/cli_utils/status_utils.py +33 -3
  144. sky/utils/cluster_utils.py +356 -0
  145. sky/utils/command_runner.py +69 -14
  146. sky/utils/common.py +74 -0
  147. sky/utils/common_utils.py +133 -93
  148. sky/utils/config_utils.py +204 -0
  149. sky/utils/control_master_utils.py +2 -3
  150. sky/utils/controller_utils.py +133 -147
  151. sky/utils/dag_utils.py +72 -24
  152. sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
  153. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  154. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  155. sky/utils/log_utils.py +83 -23
  156. sky/utils/message_utils.py +81 -0
  157. sky/utils/registry.py +127 -0
  158. sky/utils/resources_utils.py +2 -2
  159. sky/utils/rich_utils.py +213 -34
  160. sky/utils/schemas.py +19 -2
  161. sky/{status_lib.py → utils/status_lib.py} +12 -7
  162. sky/utils/subprocess_utils.py +51 -35
  163. sky/utils/timeline.py +7 -2
  164. sky/utils/ux_utils.py +95 -25
  165. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
  166. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
  167. sky/clouds/cloud_registry.py +0 -76
  168. sky/utils/cluster_yaml_utils.py +0 -24
  169. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
  170. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,462 @@
1
+ """Executor for the requests.
2
+
3
+ We start limited number of workers for long-running requests, and
4
+ significantly more workers for short-running requests. This is to optimize the
5
+ resource usage and the latency of the requests.
6
+
7
+ * Long-running requests are those requests that can take a long time to finish
8
+ and more resources are needed, such as cluster launching, starting, job
9
+ submission, managed job submission, etc.
10
+
11
+ * Short-running requests are those requests that can be done quickly, and
12
+ require a quick response, such as status check, job status check, etc.
13
+
14
+ With more short-running workers, we can serve more short-running requests in
15
+ parallel, and reduce the latency.
16
+
17
+ The number of the workers is determined by the system resources.
18
+
19
+ See the [README.md](../README.md) for detailed architecture of the executor.
20
+ """
21
+ import concurrent.futures
22
+ import contextlib
23
+ import dataclasses
24
+ import enum
25
+ import multiprocessing
26
+ import os
27
+ import queue as queue_lib
28
+ import signal
29
+ import sys
30
+ import time
31
+ import traceback
32
+ import typing
33
+ from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
34
+
35
+ import psutil
36
+ import setproctitle
37
+
38
+ from sky import global_user_state
39
+ from sky import models
40
+ from sky import sky_logging
41
+ from sky import skypilot_config
42
+ from sky.server import common as server_common
43
+ from sky.server import constants as server_constants
44
+ from sky.server.requests import payloads
45
+ from sky.server.requests import requests as api_requests
46
+ from sky.server.requests.queues import mp_queue
47
+ from sky.skylet import constants
48
+ from sky.utils import annotations
49
+ from sky.utils import common_utils
50
+ from sky.utils import timeline
51
+ from sky.utils import ux_utils
52
+
53
+ if typing.TYPE_CHECKING:
54
+ import types
55
+
56
+ # pylint: disable=ungrouped-imports
57
+ if sys.version_info >= (3, 10):
58
+ from typing import ParamSpec
59
+ else:
60
+ from typing_extensions import ParamSpec
61
+
62
P = ParamSpec('P')

logger = sky_logging.init_logger(__name__)

# On macOS, the default start method for multiprocessing is 'fork', which
# can cause issues with certain types of resources, including those used in
# the QueueManager in mp_queue.py.
# The 'spawn' start method is generally more compatible across different
# platforms, including macOS.
multiprocessing.set_start_method('spawn', force=True)

# Constants based on profiling the peak memory usage of
# various sky commands. See `tests/load_test/` for details.
# Max memory consumption for each request.
_PER_BLOCKING_REQUEST_MEM_GB = 0.25
_PER_NON_BLOCKING_REQUEST_MEM_GB = 0.15
# To control the number of blocking workers.
_CPU_MULTIPLIER_FOR_BLOCKING_WORKERS = 2
# Cap on blocking workers when the API server is not deployed (local mode).
_MAX_BLOCKING_WORKERS_LOCAL = 4
# Percentage of memory for blocking requests
# from the memory reserved for SkyPilot.
# This is to reserve some memory for non-blocking requests.
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
85
+
86
+
87
class QueueBackend(enum.Enum):
    """Backend used to store and share the request queues."""
    # Queues served by a multiprocessing manager process (see mp_queue.py),
    # so they can be shared across worker processes.
    MULTIPROCESSING = 'multiprocessing'
    # TODO(zhwu): we can add redis backend in the future.
90
+
91
+
92
@dataclasses.dataclass
class RequestWorker:
    """Identity of a worker process that consumes one request queue."""
    id: int
    # The type of queue this worker works on.
    schedule_type: api_requests.ScheduleType

    def __str__(self) -> str:
        return 'Worker(id={}, schedule_type={})'.format(
            self.id, self.schedule_type.value)
100
+
101
+
102
class RequestQueue:
    """A FIFO of scheduled requests, either redis or multiprocessing backed.

    Each element is a ``(request_id, ignore_return_value)`` tuple.
    """

    def __init__(self,
                 schedule_type: api_requests.ScheduleType,
                 backend: Optional[QueueBackend] = None) -> None:
        self.name = schedule_type.value
        self.backend = backend
        # Only the multiprocessing backend is implemented so far.
        assert backend in (None, QueueBackend.MULTIPROCESSING), backend
        self.queue = mp_queue.get_queue(self.name)

    def put(self, request: Tuple[str, bool]) -> None:
        """Enqueue a request.

        Args:
            request: A tuple of request_id and ignore_return_value.
        """
        self.queue.put(request)  # type: ignore

    def get(self) -> Optional[Tuple[str, bool]]:
        """Dequeue one request without blocking.

        Returns:
            A (request_id, ignore_return_value) tuple, or None when the
            queue is currently empty.
        """
        try:
            element = self.queue.get(block=False)
        except queue_lib.Empty:
            return None
        return element

    def __len__(self) -> int:
        """Number of requests currently waiting in the queue."""
        return self.queue.qsize()
141
+
142
+
143
# Backend used for all request queues in this server.
queue_backend = QueueBackend.MULTIPROCESSING


@annotations.lru_cache(scope='global', maxsize=None)
def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
    # Cached so each process builds at most one RequestQueue per schedule
    # type and reuses the underlying queue connection.
    return RequestQueue(schedule_type, backend=queue_backend)
149
+
150
+
151
@contextlib.contextmanager
def override_request_env_and_config(
        request_body: payloads.RequestBody) -> Generator[None, None, None]:
    """Override the environment and SkyPilot config for a request.

    Applies the client-supplied environment variables and SkyPilot config
    override for the duration of the ``with`` block, then restores the
    original process environment. Mutates the process-wide ``os.environ``,
    so only one request should run in a process at a time.

    Args:
        request_body: Payload whose ``env_vars`` and
            ``override_skypilot_config`` are applied.

    Yields:
        None; the overrides are active inside the ``with`` block.
    """
    original_env = os.environ.copy()
    os.environ.update(request_body.env_vars)
    # Record the requesting user in the global state so it is visible to
    # downstream code (e.g. for attribution of clusters/jobs).
    user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
                       name=request_body.env_vars[constants.USER_ENV_VAR])
    global_user_state.add_or_update_user(user)
    # Force color to be enabled.
    os.environ['CLICOLOR_FORCE'] = '1'
    server_common.reload_for_new_request(
        client_entrypoint=request_body.entrypoint,
        client_command=request_body.entrypoint_command)
    try:
        with skypilot_config.override_skypilot_config(
                request_body.override_skypilot_config):
            yield
    finally:
        # We need to call the save_timeline() since atexit will not be
        # triggered as multiple requests can be sharing the same process.
        timeline.save_timeline()
        # Restore the original environment variables, so that a new request
        # won't be affected by the previous request, e.g. SKYPILOT_DEBUG
        # setting, etc. This is necessary as our executor is reusing the
        # same process for multiple requests.
        os.environ.clear()
        os.environ.update(original_env)
179
+
180
+
181
def _redirect_output(file: TextIO) -> Tuple[int, int]:
    """Point the process's stdout and stderr at ``file``.

    Args:
        file: Open file object that will receive stdout/stderr output.

    Returns:
        Duplicated file descriptors of the original stdout and stderr, to
        be passed to ``_restore_output`` later.
    """
    target_fd = file.fileno()
    stdout_fd = sys.stdout.fileno()
    stderr_fd = sys.stderr.fileno()
    # Keep duplicates of the current stdout/stderr so they can be restored.
    saved_stdout = os.dup(stdout_fd)
    saved_stderr = os.dup(stderr_fd)
    # From here on, writes to stdout/stderr land in `file`.
    os.dup2(target_fd, stdout_fd)
    os.dup2(target_fd, stderr_fd)
    return saved_stdout, saved_stderr
192
+
193
+
194
def _restore_output(original_stdout: int, original_stderr: int) -> None:
    """Reattach stdout/stderr to the saved descriptors and close the copies.

    Args:
        original_stdout: Saved stdout fd returned by ``_redirect_output``.
        original_stderr: Saved stderr fd returned by ``_redirect_output``.
    """
    for saved_fd, stream in ((original_stdout, sys.stdout),
                             (original_stderr, sys.stderr)):
        os.dup2(saved_fd, stream.fileno())
        # The duplicate is no longer needed once restored.
        os.close(saved_fd)
202
+
203
+
204
def _request_execution_wrapper(request_id: str,
                               ignore_return_value: bool) -> None:
    """Wrapper for a request execution.

    It wraps the execution of a request to:
    1. Deserialize the request from the request database and serialize the
       return value/exception in the request database;
    2. Update the request status based on the execution result;
    3. Redirect the stdout and stderr of the execution to log file;
    4. Handle the SIGTERM signal to abort the request gracefully.

    Args:
        request_id: ID of the request to run; must exist in the request DB.
        ignore_return_value: If True, a successful entrypoint's return value
            is discarded instead of stored.
    """

    def sigterm_handler(signum: int,
                        frame: Optional['types.FrameType']) -> None:
        # Turn SIGTERM into KeyboardInterrupt so kill and Ctrl-C share the
        # same cancellation path below.
        raise KeyboardInterrupt

    signal.signal(signal.SIGTERM, sigterm_handler)

    pid = multiprocessing.current_process().pid
    logger.info(f'Running request {request_id} with pid {pid}')
    # Mark the request RUNNING and snapshot what we need before executing.
    with api_requests.update_request(request_id) as request_task:
        assert request_task is not None, request_id
        log_path = request_task.log_path
        request_task.pid = pid
        request_task.status = api_requests.RequestStatus.RUNNING
        func = request_task.entrypoint
        request_body = request_task.request_body

    with log_path.open('w', encoding='utf-8') as f:
        # Store copies of the original stdout and stderr file descriptors
        original_stdout, original_stderr = _redirect_output(f)
        # Redirect the stdout/stderr before overriding the environment and
        # config, as there can be some logs during override that needs to be
        # captured in the log file.
        try:
            with override_request_env_and_config(request_body):
                return_value = func(**request_body.to_kwargs())
        except KeyboardInterrupt:
            # NOTE(review): the request status is presumably set to
            # CANCELLED by the cancelling side — confirm; only the output
            # is restored here.
            logger.info(f'Request {request_id} cancelled by user')
            _restore_output(original_stdout, original_stderr)
            return
        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
            # Attach the formatted traceback to the exception so it can be
            # persisted alongside the error.
            with ux_utils.enable_traceback():
                stacktrace = traceback.format_exc()
            setattr(e, 'stacktrace', stacktrace)
            with api_requests.update_request(request_id) as request_task:
                assert request_task is not None, request_id
                request_task.status = api_requests.RequestStatus.FAILED
                request_task.set_error(e)
            _restore_output(original_stdout, original_stderr)
            logger.info(f'Request {request_id} failed due to '
                        f'{common_utils.format_exception(e)}')
            return
        else:
            # Success: persist status (and return value unless suppressed).
            with api_requests.update_request(request_id) as request_task:
                assert request_task is not None, request_id
                request_task.status = api_requests.RequestStatus.SUCCEEDED
                if not ignore_return_value:
                    request_task.set_return_value(return_value)
            _restore_output(original_stdout, original_stderr)
            logger.info(f'Request {request_id} finished')
265
+
266
+
267
def schedule_request(request_id: str,
                     request_name: str,
                     request_body: payloads.RequestBody,
                     func: Callable[P, Any],
                     request_cluster_name: Optional[str] = None,
                     ignore_return_value: bool = False,
                     schedule_type: api_requests.ScheduleType = api_requests.
                     ScheduleType.LONG,
                     is_skypilot_system: bool = False) -> None:
    """Enqueue a request to the request queue.

    Creates the request record (a no-op if the ID already exists) and puts
    the request on the queue for its schedule type.

    Args:
        request_id: Unique ID for the request.
        request_name: Human-readable name; stored with the server's
            request-name prefix.
        request_body: Payload carrying env vars, entrypoint info, etc.
        func: Entrypoint callable executed by a worker.
        request_cluster_name: Cluster associated with the request, if any.
        ignore_return_value: If True, discard the entrypoint's return value
            on success.
        schedule_type: Which queue (LONG or SHORT) the request goes to.
        is_skypilot_system: If True, attribute the request to the internal
            SkyPilot system user instead of the requesting user.
    """
    user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
    if is_skypilot_system:
        # Internal requests run as the reserved system user.
        user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
        global_user_state.add_or_update_user(
            models.User(id=user_id, name=user_id))
    request = api_requests.Request(request_id=request_id,
                                   name=server_constants.REQUEST_NAME_PREFIX +
                                   request_name,
                                   entrypoint=func,
                                   request_body=request_body,
                                   status=api_requests.RequestStatus.PENDING,
                                   created_at=time.time(),
                                   schedule_type=schedule_type,
                                   user_id=user_id,
                                   cluster_name=request_cluster_name)

    # Idempotent: an already-scheduled request ID is not enqueued twice.
    if not api_requests.create_if_not_exists(request):
        logger.debug(f'Request {request_id} already exists.')
        return

    # Create the log file eagerly so readers can tail it immediately.
    request.log_path.touch()
    input_tuple = (request_id, ignore_return_value)

    logger.info(f'Queuing request: {request_id}')
    _get_queue(schedule_type).put(input_tuple)
302
+
303
+
304
def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
    """Worker for the requests.

    Polls the queue matching ``worker.schedule_type`` forever and executes
    each request in a child process from a pool.

    Args:
        worker: Identity (id and schedule type) of this worker.
        max_parallel_size: Maximum number of parallel jobs this worker can
            run.
    """
    logger.info(f'Starting {worker} with pid '
                f'{multiprocessing.current_process().pid}')
    setproctitle.setproctitle(
        f'SkyPilot:worker:{worker.schedule_type.value}-{worker.id}')
    queue = _get_queue(worker.schedule_type)
    # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
    # because the former is more efficient with the support of lazy creation of
    # worker processes.
    # We use executor instead of individual multiprocessing.Process to avoid
    # the overhead of forking a new process for each request, which can be about
    # 1s delay.
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=max_parallel_size) as executor:
        while True:
            request_element = queue.get()
            if request_element is None:
                # Queue is empty: back off briefly before polling again.
                time.sleep(0.1)
                continue
            request_id, ignore_return_value = request_element
            request = api_requests.get_request(request_id)
            # Skip requests cancelled while still queued.
            if request.status == api_requests.RequestStatus.CANCELLED:
                continue
            logger.info(f'[{worker}] Submitting request: {request_id}')
            # Start additional process to run the request, so that it can be
            # cancelled when requested by a user.
            # TODO(zhwu): since the executor is reusing the request process,
            # multiple requests can share the same process pid, which may cause
            # issues with SkyPilot core functions if they rely on the exit of
            # the process, such as subprocess_daemon.py.
            future = executor.submit(_request_execution_wrapper, request_id,
                                     ignore_return_value)

            if worker.schedule_type == api_requests.ScheduleType.LONG:
                # LONG workers run one request at a time: block until the
                # request finishes before pulling the next one.
                try:
                    future.result(timeout=None)
                except Exception as e:  # pylint: disable=broad-except
                    logger.error(f'[{worker}] Request {request_id} failed: {e}')
                logger.info(f'[{worker}] Finished request: {request_id}')
            else:
                # SHORT requests are fire-and-forget: the pool runs up to
                # max_parallel_size of them concurrently.
                logger.info(f'[{worker}] Submitted request: {request_id}')
350
+
351
+
352
+ def _get_cpu_count() -> int:
353
+ """Get the number of CPUs.
354
+
355
+ If the API server is deployed as a pod in k8s cluster, we assume the
356
+ number of CPUs is provided by the downward API.
357
+ """
358
+ cpu_count = os.getenv('SKYPILOT_POD_CPU_CORE_LIMIT')
359
+ if cpu_count is not None:
360
+ try:
361
+ return int(float(cpu_count))
362
+ except ValueError as e:
363
+ with ux_utils.print_exception_no_traceback():
364
+ raise ValueError(
365
+ f'Failed to parse the number of CPUs from {cpu_count}'
366
+ ) from e
367
+ return psutil.cpu_count()
368
+
369
+
370
+ def _get_mem_size_gb() -> float:
371
+ """Get the memory size in GB.
372
+
373
+ If the API server is deployed as a pod in k8s cluster, we assume the
374
+ memory size is provided by the downward API.
375
+ """
376
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
377
+ if mem_size is not None:
378
+ try:
379
+ return float(mem_size)
380
+ except ValueError as e:
381
+ with ux_utils.print_exception_no_traceback():
382
+ raise ValueError(
383
+ f'Failed to parse the memory size from {mem_size}') from e
384
+ return psutil.virtual_memory().total / (1024**3)
385
+
386
+
387
def start(deploy: bool) -> List[multiprocessing.Process]:
    """Start the request workers.

    Sizes the worker pools from system resources, starts the shared queue
    manager process, then starts one process per blocking (LONG) worker and
    a single non-blocking (SHORT) worker process.

    Args:
        deploy: If False (local API server), cap the number of blocking
            workers at _MAX_BLOCKING_WORKERS_LOCAL.

    Returns:
        The started worker processes. NOTE(review): the queue-manager
        process is started but not included in the returned list — confirm
        callers do not need to manage its lifetime.
    """
    # Determine the job capacity of the workers based on the system resources.
    cpu_count = _get_cpu_count()
    mem_size_gb = _get_mem_size_gb()
    # Reserve a fixed amount of memory for the rest of the system first.
    mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
    parallel_for_blocking = _max_parallel_size_for_blocking(
        cpu_count, mem_size_gb)
    if not deploy:
        parallel_for_blocking = min(parallel_for_blocking,
                                    _MAX_BLOCKING_WORKERS_LOCAL)
    max_parallel_for_non_blocking = _max_parallel_size_for_non_blocking(
        mem_size_gb, parallel_for_blocking)
    logger.info(
        f'SkyPilot API server will start {parallel_for_blocking} workers for '
        f'blocking requests and will allow at max '
        f'{max_parallel_for_non_blocking} non-blocking requests in parallel.')

    # Setup the queues.
    if queue_backend == QueueBackend.MULTIPROCESSING:
        logger.info('Creating shared request queues')
        queue_names = [
            schedule_type.value for schedule_type in api_requests.ScheduleType
        ]
        # TODO(aylei): make queue manager port configurable or pick an available
        # port automatically.
        port = mp_queue.DEFAULT_QUEUE_MANAGER_PORT
        if not common_utils.is_port_available(port):
            raise RuntimeError(
                f'SkyPilot API server fails to start as port {port!r} is '
                'already in use by another process.')
        # The queue manager runs in its own process and serves the queues
        # to all worker processes.
        queue_server = multiprocessing.Process(
            target=mp_queue.start_queue_manager, args=(queue_names, port))
        queue_server.start()

        # Block until every named queue is reachable before starting workers.
        mp_queue.wait_for_queues_to_be_ready(queue_names, port=port)

    logger.info('Request queues created')

    worker_procs = []
    for worker_id in range(parallel_for_blocking):
        worker = RequestWorker(id=worker_id,
                               schedule_type=api_requests.ScheduleType.LONG)
        # Each LONG worker runs exactly one request at a time (parallelism 1).
        worker_proc = multiprocessing.Process(target=request_worker,
                                              args=(worker, 1))
        worker_proc.start()
        worker_procs.append(worker_proc)

    # Start a non-blocking worker.
    worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
    worker_proc = multiprocessing.Process(target=request_worker,
                                          args=(worker,
                                                max_parallel_for_non_blocking))
    worker_proc.start()
    worker_procs.append(worker_proc)
    return worker_procs
443
+
444
+
445
@annotations.lru_cache(scope='global', maxsize=1)
def _max_parallel_size_for_blocking(cpu_count: int, mem_size_gb: float) -> int:
    """Max parallelism for blocking requests.

    Returns the tighter of the CPU-based and memory-based limits, but never
    less than one worker.
    """
    by_cpu = cpu_count * _CPU_MULTIPLIER_FOR_BLOCKING_WORKERS
    by_mem = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
                 _PER_BLOCKING_REQUEST_MEM_GB)
    return max(1, min(by_cpu, by_mem))
453
+
454
+
455
@annotations.lru_cache(scope='global', maxsize=1)
def _max_parallel_size_for_non_blocking(mem_size_gb: float,
                                        parallel_size_for_blocking: int) -> int:
    """Max parallelism for non-blocking requests.

    Sizes the pool from whatever memory remains after reserving capacity
    for the blocking workers; always allows at least one request.
    """
    reserved_for_blocking = (parallel_size_for_blocking *
                             _PER_BLOCKING_REQUEST_MEM_GB)
    leftover_mem = mem_size_gb - reserved_for_blocking
    return max(1, int(leftover_mem / _PER_NON_BLOCKING_REQUEST_MEM_GB))