skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,472 @@
1
+ """Executor for the requests.
2
+
3
+ We start a limited number of workers for long-running requests, and
4
+ significantly more workers for short-running requests. This is to optimize the
5
+ resource usage and the latency of the requests.
6
+
7
+ * Long-running requests are those requests that can take a long time to finish
8
+ and more resources are needed, such as cluster launching, starting, job
9
+ submission, managed job submission, etc.
10
+
11
+ * Short-running requests are those requests that can be done quickly, and
12
+ require a quick response, such as status check, job status check, etc.
13
+
14
+ With more short-running workers, we can serve more short-running requests in
15
+ parallel, and reduce the latency.
16
+
17
+ The number of the workers is determined by the system resources.
18
+
19
+ See the [README.md](../README.md) for detailed architecture of the executor.
20
+ """
21
+ import concurrent.futures
22
+ import contextlib
23
+ import dataclasses
24
+ import enum
25
+ import multiprocessing
26
+ import os
27
+ import queue as queue_lib
28
+ import signal
29
+ import sys
30
+ import time
31
+ import traceback
32
+ import typing
33
+ from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
34
+
35
+ import setproctitle
36
+
37
+ from sky import global_user_state
38
+ from sky import models
39
+ from sky import sky_logging
40
+ from sky import skypilot_config
41
+ from sky.server import common as server_common
42
+ from sky.server import constants as server_constants
43
+ from sky.server.requests import payloads
44
+ from sky.server.requests import requests as api_requests
45
+ from sky.server.requests.queues import mp_queue
46
+ from sky.skylet import constants
47
+ from sky.utils import annotations
48
+ from sky.utils import common_utils
49
+ from sky.utils import timeline
50
+ from sky.utils import ux_utils
51
+
52
+ if typing.TYPE_CHECKING:
53
+ import types
54
+
55
+ # pylint: disable=ungrouped-imports
56
+ if sys.version_info >= (3, 10):
57
+ from typing import ParamSpec
58
+ else:
59
+ from typing_extensions import ParamSpec
60
+
61
+ P = ParamSpec('P')
62
+
63
+ logger = sky_logging.init_logger(__name__)
64
+
65
+ # On macOS, the default start method for multiprocessing is 'fork', which
66
+ # can cause issues with certain types of resources, including those used in
67
+ # the QueueManager in mp_queue.py.
68
+ # The 'spawn' start method is generally more compatible across different
69
+ # platforms, including macOS.
70
+ multiprocessing.set_start_method('spawn', force=True)
71
+
72
+ # Constants based on profiling the peak memory usage while serving various
73
+ # sky commands. These estimates depend heavily on usage patterns
74
+ # (clouds enabled, type of requests, etc.; see `tests/load_tests` for details).
75
+ # The profiling covers major clouds and common usage patterns. Users whose
76
+ # usage pattern deviates from these can override the default estimates via
77
+ # environment variables.
78
+ # NOTE(dev): update these constants for each release according to the load
79
+ # test results.
80
+ # TODO(aylei): maintaining these constants is error-prone, we may need to
81
+ # automatically tune parallelism at runtime according to system usage stats
82
+ # in the future.
83
+ _LONG_WORKER_MEM_GB = 0.4
84
+ _SHORT_WORKER_MEM_GB = 0.25
85
+ # To control the number of long workers.
86
+ _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
87
+ # Limit the number of long workers of local API server, since local server is
88
+ # typically:
89
+ # 1. launched automatically in an environment with high resource contention
90
+ # (e.g. Laptop)
91
+ # 2. used by a single user
92
+ _MAX_LONG_WORKERS_LOCAL = 4
93
+ # Percentage of memory for long requests
94
+ # from the memory reserved for SkyPilot.
95
+ # This is to reserve some memory for short requests.
96
+ _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
97
+ # Minimal number of long workers to ensure responsiveness.
98
+ _MIN_LONG_WORKERS = 1
99
+ # Minimal number of short workers, there is a daemon task running on short
100
+ # workers so at least 2 workers are needed to ensure responsiveness.
101
+ _MIN_SHORT_WORKERS = 2
102
+
103
+
104
class QueueBackend(enum.Enum):
    """Backends that can host the request queues.

    Only the multiprocessing backend exists at the moment.
    """
    MULTIPROCESSING = 'multiprocessing'
    # TODO(zhwu): we can add redis backend in the future.
107
+
108
+
109
@dataclasses.dataclass
class RequestWorker:
    """Identity of a worker that consumes requests from one queue."""
    # Numeric id of the worker, assigned by whoever creates it.
    id: int
    # The type of queue this worker works on.
    schedule_type: api_requests.ScheduleType

    def __str__(self) -> str:
        """Return a short label for the worker, used as a log prefix."""
        queue_kind = self.schedule_type.value
        return f'Worker(id={self.id}, schedule_type={queue_kind})'
117
+
118
+
119
class RequestQueue:
    """Queue of pending requests for a single schedule type.

    Only the multiprocessing backend is implemented today. Every element
    stored in the queue is a tuple of (request_id, ignore_return_value).
    """

    def __init__(self,
                 schedule_type: api_requests.ScheduleType,
                 backend: Optional[QueueBackend] = None) -> None:
        """Attach to the shared queue named after `schedule_type`.

        Args:
            schedule_type: Schedule type whose value is used as the queue name.
            backend: Queue backend; must be None or
                QueueBackend.MULTIPROCESSING.
        """
        self.name = schedule_type.value
        self.backend = backend
        # No other backend is implemented, so reject anything else early.
        assert backend in (None, QueueBackend.MULTIPROCESSING), backend
        self.queue = mp_queue.get_queue(self.name)

    def put(self, request: Tuple[str, bool]) -> None:
        """Put a request into the queue.

        Args:
            request: A tuple of request_id and ignore_return_value.
        """
        self.queue.put(request)  # type: ignore

    def get(self) -> Optional[Tuple[str, bool]]:
        """Fetch the next request without blocking.

        Returns:
            A tuple of request_id and ignore_return_value, or None when the
            queue is currently empty.
        """
        try:
            element = self.queue.get(block=False)
        except queue_lib.Empty:
            return None
        return element

    def __len__(self) -> int:
        """Return the queue size as reported by the underlying queue."""
        return self.queue.qsize()
158
+
159
+
160
# Backend used for every request queue created by this module.
queue_backend = QueueBackend.MULTIPROCESSING


@annotations.lru_cache(scope='global', maxsize=None)
def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
    """Return the request queue for `schedule_type`.

    The result goes through annotations.lru_cache, so repeated lookups of the
    same schedule type are expected to reuse one RequestQueue object.
    """
    request_queue = RequestQueue(schedule_type, backend=queue_backend)
    return request_queue
166
+
167
+
168
@contextlib.contextmanager
def override_request_env_and_config(
        request_body: payloads.RequestBody) -> Generator[None, None, None]:
    """Override the environment and SkyPilot config for a request.

    On entry, the request's environment variables are merged into
    `os.environ`, the requesting user is recorded, colored output is forced,
    server state is reloaded for the new request, and the request's SkyPilot
    config override is applied. On exit, the timeline is saved and the
    process environment is restored to its state before entry.

    The environment is restored even if the setup steps fail, because the
    executor reuses the same process for multiple requests and a partially
    applied environment must not leak into the next one.

    Args:
        request_body: The request payload carrying `env_vars`, the entrypoint
            information and the config override.

    Yields:
        None. The body of the `with` statement runs with the overrides active.
    """
    original_env = os.environ.copy()
    try:
        os.environ.update(request_body.env_vars)
        user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
                           name=request_body.env_vars[constants.USER_ENV_VAR])
        global_user_state.add_or_update_user(user)
        # Force color to be enabled.
        os.environ['CLICOLOR_FORCE'] = '1'
        server_common.reload_for_new_request(
            client_entrypoint=request_body.entrypoint,
            client_command=request_body.entrypoint_command,
            using_remote_api_server=request_body.using_remote_api_server)
        try:
            with skypilot_config.override_skypilot_config(
                    request_body.override_skypilot_config):
                yield
        finally:
            # We need to call the save_timeline() since atexit will not be
            # triggered as multiple requests can be sharing the same process.
            timeline.save_timeline()
    finally:
        # Restore the original environment variables, so that a new request
        # won't be affected by the previous request, e.g. SKYPILOT_DEBUG
        # setting, etc. This is necessary as our executor is reusing the
        # same process for multiple requests. Doing it in the outermost
        # `finally` covers failures in the setup above and in save_timeline().
        os.environ.clear()
        os.environ.update(original_env)
197
+
198
+
199
def _redirect_output(file: TextIO) -> Tuple[int, int]:
    """Point the process's stdout and stderr descriptors at `file`.

    Args:
        file: An open file object; its descriptor becomes the target of both
            stdout and stderr.

    Returns:
        Duplicates of the original stdout and stderr file descriptors, in
        that order, so the caller can hand them to `_restore_output`.
    """
    target_fd = file.fileno()
    streams = (sys.stdout, sys.stderr)
    # Keep duplicates of the current descriptors before overwriting them.
    saved_stdout, saved_stderr = (os.dup(stream.fileno()) for stream in streams)

    # Make both standard streams refer to the target file.
    for stream in streams:
        os.dup2(target_fd, stream.fileno())
    return saved_stdout, saved_stderr
210
+
211
+
212
def _restore_output(original_stdout: int, original_stderr: int) -> None:
    """Put stdout and stderr back onto their saved file descriptors.

    Args:
        original_stdout: Saved duplicate of the original stdout descriptor.
        original_stderr: Saved duplicate of the original stderr descriptor.
    """
    saved_pairs = ((original_stdout, sys.stdout), (original_stderr,
                                                   sys.stderr))
    for saved_fd, stream in saved_pairs:
        os.dup2(saved_fd, stream.fileno())

    # The duplicates are no longer needed once copied back; close them.
    for saved_fd, _ in saved_pairs:
        os.close(saved_fd)
220
+
221
+
222
def _request_execution_wrapper(request_id: str,
                               ignore_return_value: bool) -> None:
    """Wrapper for a request execution.

    It wraps the execution of a request to:
    1. Deserialize the request from the request database and serialize the
       return value/exception in the request database;
    2. Update the request status based on the execution result;
    3. Redirect the stdout and stderr of the execution to log file;
    4. Handle the SIGTERM signal to abort the request gracefully.

    stdout/stderr are always restored before this function returns or
    raises, including when recording the result in the request database
    fails, because the executing process is reused for later requests.

    Args:
        request_id: ID of the request to load and execute.
        ignore_return_value: If True, the entrypoint's return value is not
            stored on the request.
    """

    def sigterm_handler(signum: int,
                        frame: Optional['types.FrameType']) -> None:
        # Turn SIGTERM into KeyboardInterrupt so that the cancellation is
        # handled by the `except KeyboardInterrupt` branch below.
        raise KeyboardInterrupt

    signal.signal(signal.SIGTERM, sigterm_handler)

    pid = multiprocessing.current_process().pid
    logger.info(f'Running request {request_id} with pid {pid}')
    with api_requests.update_request(request_id) as request_task:
        assert request_task is not None, request_id
        log_path = request_task.log_path
        request_task.pid = pid
        request_task.status = api_requests.RequestStatus.RUNNING
        func = request_task.entrypoint
        request_body = request_task.request_body

    with log_path.open('w', encoding='utf-8') as f:
        # Store copies of the original stdout and stderr file descriptors
        original_stdout, original_stderr = _redirect_output(f)
        output_restored = False

        def restore_output_once() -> None:
            """Restore stdout/stderr; later calls are no-ops."""
            nonlocal output_restored
            if output_restored:
                return
            output_restored = True
            _restore_output(original_stdout, original_stderr)

        # Redirect the stdout/stderr before overriding the environment and
        # config, as there can be some logs during override that needs to be
        # captured in the log file.
        try:
            try:
                with override_request_env_and_config(request_body):
                    return_value = func(**request_body.to_kwargs())
                    f.flush()
            except KeyboardInterrupt:
                # Logged before restoring, so the message goes to wherever
                # stdout/stderr currently point (the request log).
                logger.info(f'Request {request_id} cancelled by user')
                restore_output_once()
                return
            except (Exception, SystemExit) as e:  # pylint: disable=broad-except
                with ux_utils.enable_traceback():
                    stacktrace = traceback.format_exc()
                setattr(e, 'stacktrace', stacktrace)
                with api_requests.update_request(request_id) as request_task:
                    assert request_task is not None, request_id
                    request_task.status = api_requests.RequestStatus.FAILED
                    request_task.set_error(e)
                restore_output_once()
                logger.info(f'Request {request_id} failed due to '
                            f'{common_utils.format_exception(e)}')
                return
            else:
                with api_requests.update_request(request_id) as request_task:
                    assert request_task is not None, request_id
                    request_task.status = api_requests.RequestStatus.SUCCEEDED
                    if not ignore_return_value:
                        request_task.set_return_value(return_value)
                restore_output_once()
                logger.info(f'Request {request_id} finished')
        finally:
            # Safety net: if any branch above raised before restoring (e.g.
            # the database update failed), do not leave the reused process
            # with stdout/stderr pointing at this request's log file.
            restore_output_once()
284
+
285
+
286
def schedule_request(request_id: str,
                     request_name: str,
                     request_body: payloads.RequestBody,
                     func: Callable[P, Any],
                     request_cluster_name: Optional[str] = None,
                     ignore_return_value: bool = False,
                     schedule_type: api_requests.ScheduleType = api_requests.
                     ScheduleType.LONG,
                     is_skypilot_system: bool = False) -> None:
    """Record a new request and push it onto the matching request queue.

    If a request with the same ID is already recorded, nothing is queued.

    Args:
        request_id: ID of the request.
        request_name: Name of the request; the server-wide request name
            prefix is prepended before it is stored.
        request_body: Payload of the request; its `env_vars` provide the
            user ID.
        func: Entrypoint stored on the request.
        request_cluster_name: Optional cluster name stored on the request.
        ignore_return_value: Queued together with the request ID.
        schedule_type: Selects the queue the request is put on.
        is_skypilot_system: If True, the request is attributed to the
            SkyPilot system user, which is added or updated first.
    """
    owner_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
    if is_skypilot_system:
        owner_id = server_constants.SKYPILOT_SYSTEM_USER_ID
        system_user = models.User(id=owner_id, name=owner_id)
        global_user_state.add_or_update_user(system_user)

    prefixed_name = server_constants.REQUEST_NAME_PREFIX + request_name
    request = api_requests.Request(request_id=request_id,
                                   name=prefixed_name,
                                   entrypoint=func,
                                   request_body=request_body,
                                   status=api_requests.RequestStatus.PENDING,
                                   created_at=time.time(),
                                   schedule_type=schedule_type,
                                   user_id=owner_id,
                                   cluster_name=request_cluster_name)

    newly_created = api_requests.create_if_not_exists(request)
    if not newly_created:
        logger.debug(f'Request {request_id} already exists.')
        return

    # Create the log file before the request becomes visible to workers.
    request.log_path.touch()

    logger.info(f'Queuing request: {request_id}')
    _get_queue(schedule_type).put((request_id, ignore_return_value))
321
+
322
+
323
def executor_initializer(proc_group: str):
    """Set the process title of an executor process.

    Args:
        proc_group: Label embedded in the title, followed by the current
            process's pid.
    """
    pid = multiprocessing.current_process().pid
    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:{pid}')
326
+
327
+
328
def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
    """Worker for the requests.

    Loops forever: polls the queue matching the worker's schedule type and
    submits each request to a process pool. For LONG workers the loop waits
    for the submitted request to finish before polling again; for other
    schedule types the request is submitted and the loop continues at once.

    Args:
        worker: Identity of this worker (id and the queue type it serves).
        max_parallel_size: Maximum number of parallel jobs this worker can run.
    """
    proc_group = f'{worker.schedule_type.value}-{worker.id}'
    setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
    queue = _get_queue(worker.schedule_type)

    def process_request(executor: concurrent.futures.ProcessPoolExecutor):
        """Poll the queue once and submit the request, if there is one."""
        # Bound before the `try` so that the catch-all handler below can
        # always format its message, even when `queue.get()` itself fails
        # before a request ID is known. Otherwise the handler would raise
        # UnboundLocalError and crash the worker loop.
        request_id: Optional[str] = None
        try:
            request_element = queue.get()
            if request_element is None:
                # Queue is empty; back off briefly before polling again.
                time.sleep(0.1)
                return
            request_id, ignore_return_value = request_element
            request = api_requests.get_request(request_id)
            assert request is not None, f'Request with ID {request_id} is None'
            if request.status == api_requests.RequestStatus.CANCELLED:
                return
            logger.info(f'[{worker}] Submitting request: {request_id}')
            # Start additional process to run the request, so that it can be
            # cancelled when requested by a user.
            # TODO(zhwu): since the executor is reusing the request process,
            # multiple requests can share the same process pid, which may cause
            # issues with SkyPilot core functions if they rely on the exit of
            # the process, such as subprocess_daemon.py.
            future = executor.submit(_request_execution_wrapper, request_id,
                                     ignore_return_value)

            if worker.schedule_type == api_requests.ScheduleType.LONG:
                # Long requests are run one at a time per worker: block until
                # the submitted request completes.
                try:
                    future.result(timeout=None)
                except Exception as e:  # pylint: disable=broad-except
                    logger.error(f'[{worker}] Request {request_id} failed: {e}')
                logger.info(f'[{worker}] Finished request: {request_id}')
            else:
                logger.info(f'[{worker}] Submitted request: {request_id}')
        except KeyboardInterrupt:
            # Interrupt the worker process will stop request execution, but
            # the SIGTERM request should be respected anyway since it might
            # be explicitly sent by user.
            # TODO(aylei): crash the API server or recreate the worker process
            # to avoid broken state.
            logger.error(f'[{worker}] Worker process interrupted')
            with ux_utils.print_exception_no_traceback():
                raise
        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
            # Catch any other exceptions to avoid crashing the worker process.
            logger.error(
                f'[{worker}] Error processing request {request_id}: '
                f'{common_utils.format_exception(e, use_bracket=True)}')

    # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
    # because the former is more efficient with the support of lazy creation of
    # worker processes.
    # We use executor instead of individual multiprocessing.Process to avoid
    # the overhead of forking a new process for each request, which can be about
    # 1s delay.
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=max_parallel_size,
            initializer=executor_initializer,
            initargs=(proc_group,)) as executor:
        while True:
            process_request(executor)
394
+
395
+
396
def start(deploy: bool) -> List[multiprocessing.Process]:
    """Start the request workers.

    Args:
        deploy: Passed (negated) as the `local` flag when computing the
            long-worker parallelism, so a non-deploy server gets the local
            cap on long workers.

    Returns:
        The started subprocesses: the queue manager (for the multiprocessing
        backend), one process per long worker, and one short-worker process.

    Raises:
        RuntimeError: If the queue manager port is already in use.
    """
    # Determine the job capacity of the workers based on the system resources.
    cpu_count = common_utils.get_cpu_count()
    total_mem_gb = common_utils.get_mem_size_gb()
    mem_size_gb = max(0, total_mem_gb - server_constants.MIN_AVAIL_MEM_GB)
    max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                       mem_size_gb,
                                                       local=not deploy)
    max_parallel_for_short = _max_short_worker_parallism(
        mem_size_gb, max_parallel_for_long)
    logger.info(
        f'SkyPilot API server will start {max_parallel_for_long} workers for '
        f'long requests and will allow at max '
        f'{max_parallel_for_short} short requests in parallel.')

    sub_procs: List[multiprocessing.Process] = []

    def _launch(target: Callable[..., Any], args: tuple) -> None:
        """Start `target(*args)` in a new process and remember it."""
        proc = multiprocessing.Process(target=target, args=args)
        proc.start()
        sub_procs.append(proc)

    # Setup the queues.
    if queue_backend == QueueBackend.MULTIPROCESSING:
        logger.info('Creating shared request queues')
        queue_names = [
            schedule_type.value for schedule_type in api_requests.ScheduleType
        ]
        # TODO(aylei): make queue manager port configurable or pick an available
        # port automatically.
        port = mp_queue.DEFAULT_QUEUE_MANAGER_PORT
        if not common_utils.is_port_available(port):
            raise RuntimeError(
                f'SkyPilot API server fails to start as port {port!r} is '
                'already in use by another process.')
        _launch(mp_queue.start_queue_manager, (queue_names, port))
        # Workers connect to the queues, so wait until they are reachable.
        mp_queue.wait_for_queues_to_be_ready(queue_names, port=port)

    logger.info('Request queues created')

    # One process per long worker, each running a single request at a time.
    for worker_id in range(max_parallel_for_long):
        long_worker = RequestWorker(
            id=worker_id, schedule_type=api_requests.ScheduleType.LONG)
        _launch(request_worker, (long_worker, 1))

    # Start a worker for short requests.
    short_worker = RequestWorker(id=1,
                                 schedule_type=api_requests.ScheduleType.SHORT)
    _launch(request_worker, (short_worker, max_parallel_for_short))
    return sub_procs
449
+
450
+
451
@annotations.lru_cache(scope='global', maxsize=1)
def _max_long_worker_parallism(cpu_count: int,
                               mem_size_gb: float,
                               local=False) -> int:
    """Max parallelism for long workers.

    The result is the smaller of a CPU-based and a memory-based limit, but
    never below `_MIN_LONG_WORKERS`. When `local` is true it is additionally
    capped at `_MAX_LONG_WORKERS_LOCAL`.

    Args:
        cpu_count: Number of CPUs to size against.
        mem_size_gb: Memory, in GB, available to SkyPilot workers.
        local: Whether to apply the cap for a local API server.
    """
    limit_by_cpu = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
    # Only a share of the memory is given to long workers; the rest is left
    # for short requests.
    long_mem_budget_gb = mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING
    limit_by_mem = int(long_mem_budget_gb / _LONG_WORKER_MEM_GB)
    parallelism = max(_MIN_LONG_WORKERS, min(limit_by_cpu, limit_by_mem))
    return min(parallelism, _MAX_LONG_WORKERS_LOCAL) if local else parallelism
464
+
465
+
466
@annotations.lru_cache(scope='global', maxsize=1)
def _max_short_worker_parallism(mem_size_gb: float,
                                long_worker_parallism: int) -> int:
    """Max parallelism for short workers.

    Short workers get the memory left after reserving `_LONG_WORKER_MEM_GB`
    for each long worker; the result is never below `_MIN_SHORT_WORKERS`.

    Args:
        mem_size_gb: Memory, in GB, available to SkyPilot workers.
        long_worker_parallism: Number of long workers that will be started.
    """
    reserved_for_long_gb = long_worker_parallism * _LONG_WORKER_MEM_GB
    available_mem = mem_size_gb - reserved_for_long_gb
    return max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))