skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/server/common.py ADDED
@@ -0,0 +1,430 @@
1
+ """Common data structures and constants used in the API."""
2
+
3
+ import dataclasses
4
+ import enum
5
+ import functools
6
+ import json
7
+ import os
8
+ import pathlib
9
+ import subprocess
10
+ import sys
11
+ import time
12
+ import typing
13
+ from typing import Any, Dict, Optional
14
+ import uuid
15
+
16
+ import colorama
17
+ import filelock
18
+ import pydantic
19
+ import requests
20
+
21
+ from sky import exceptions
22
+ from sky import sky_logging
23
+ from sky import skypilot_config
24
+ from sky.data import data_utils
25
+ from sky.server import constants as server_constants
26
+ from sky.skylet import constants
27
+ from sky.usage import usage_lib
28
+ from sky.utils import annotations
29
+ from sky.utils import common_utils
30
+ from sky.utils import rich_utils
31
+ from sky.utils import ux_utils
32
+
33
+ if typing.TYPE_CHECKING:
34
+ from sky import dag as dag_lib
35
+
36
+ DEFAULT_SERVER_URL = 'http://127.0.0.1:46580'
37
+ AVAILBLE_LOCAL_API_SERVER_HOSTS = ['0.0.0.0', 'localhost', '127.0.0.1']
38
+ AVAILABLE_LOCAL_API_SERVER_URLS = [
39
+ f'http://{host}:46580' for host in AVAILBLE_LOCAL_API_SERVER_HOSTS
40
+ ]
41
+
42
+ API_SERVER_CMD = '-m sky.server.server'
43
+ # The client dir on the API server for storing user-specific data, such as file
44
+ # mounts, logs, etc. This dir is ephemeral and will be cleaned up when the API
45
+ # server is restarted.
46
+ API_SERVER_CLIENT_DIR = pathlib.Path('~/.sky/api_server/clients')
47
+ RETRY_COUNT_ON_TIMEOUT = 3
48
+
49
+ SKY_API_VERSION_WARNING = (
50
+ f'{colorama.Fore.YELLOW}SkyPilot API server is too old: '
51
+ f'v{{server_version}} (client version is v{{client_version}}). '
52
+ 'Please restart the SkyPilot API server with: '
53
+ 'sky api stop; sky api start'
54
+ f'{colorama.Style.RESET_ALL}')
55
+ RequestId = str
56
+ ApiVersion = Optional[str]
57
+
58
+ logger = sky_logging.init_logger(__name__)
59
+
60
+
61
class ApiServerStatus(enum.Enum):
    """Health states that a probe of the SkyPilot API server can report."""
    # The health endpoint answered with an API version equal to the client's.
    HEALTHY = 'healthy'
    # The server could not be reached, or its response was not usable.
    UNHEALTHY = 'unhealthy'
    # The server answered, but its API version differs from the client's.
    VERSION_MISMATCH = 'version_mismatch'
65
+
66
+
67
@dataclasses.dataclass
class ApiServerInfo:
    """Result of a health probe against the API server."""
    # Outcome of the probe.
    status: ApiServerStatus
    # API version reported by the server; None when the server is unhealthy
    # or did not report a version.
    api_version: ApiVersion
71
+
72
+
73
@annotations.lru_cache(scope='global')
def get_server_url(host: Optional[str] = None) -> str:
    """Resolve the URL of the SkyPilot API server.

    Precedence, highest first: the environment variable named by
    `constants.SKY_API_SERVER_URL_ENV_VAR`, the `api_server.endpoint` entry
    of the SkyPilot config, then a URL built from `host` (or
    `DEFAULT_SERVER_URL` when `host` is None).

    Args:
        host: Optional host used to build the fallback URL on port 46580.

    Returns:
        The resolved URL with any trailing '/' removed.
    """
    fallback = DEFAULT_SERVER_URL if host is None else f'http://{host}:46580'
    configured = skypilot_config.get_nested(('api_server', 'endpoint'),
                                            fallback)
    resolved = os.environ.get(constants.SKY_API_SERVER_URL_ENV_VAR, configured)
    return resolved.rstrip('/')
83
+
84
+
85
@annotations.lru_cache(scope='global')
def is_api_server_local():
    """Return True if the resolved server URL is one of the local URLs."""
    current_url = get_server_url()
    return current_url in AVAILABLE_LOCAL_API_SERVER_URLS
88
+
89
+
90
def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
    """Retrieve the status of the API server.

    This function checks the health of the API server by sending a request
    to the server's health endpoint. It retries the request up to
    RETRY_COUNT_ON_TIMEOUT times when the request times out; any other
    outcome is reported immediately.

    Args:
        endpoint (Optional[str]): The endpoint of the API server.
            If None, the default endpoint will be used.

    Returns:
        ApiServerInfo: An object containing the status and API version
            of the server. The status can be HEALTHY, UNHEALTHY
            or VERSION_MISMATCH.
    """
    server_url = endpoint if endpoint is not None else get_server_url()
    for _ in range(RETRY_COUNT_ON_TIMEOUT):
        try:
            response = requests.get(f'{server_url}/api/health', timeout=2.5)
        except requests.exceptions.Timeout:
            # Keep this handler ahead of ConnectionError: requests'
            # ConnectTimeout derives from both, and timeouts are retried.
            continue
        except requests.exceptions.ConnectionError:
            return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
                                 api_version=None)

        if response.status_code != 200:
            return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
                                 api_version=None)

        try:
            result = response.json()
            api_version = result.get('api_version')
        except (ValueError, AttributeError) as e:
            # ValueError is the common base of every JSON decode error that
            # requests may raise (stdlib json or simplejson), so a non-JSON
            # body is always reported as UNHEALTHY instead of crashing.
            # AttributeError covers a JSON body that is not an object.
            logger.warning('Failed to parse API server response: '
                           f'{str(e)}')
            return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
                                 api_version=None)

        if api_version is None:
            logger.warning(f'API server response missing '
                           f'version info. {server_url} may '
                           f'not be running SkyPilot API server.')
            return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
                                 api_version=None)
        if api_version == server_constants.API_VERSION:
            return ApiServerInfo(status=ApiServerStatus.HEALTHY,
                                 api_version=api_version)
        return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
                             api_version=api_version)

    # Every attempt timed out.
    return ApiServerInfo(status=ApiServerStatus.UNHEALTHY, api_version=None)
146
+
147
+
148
def handle_request_error(response: requests.Response) -> None:
    """Raise RuntimeError unless the response status code is 200.

    Args:
        response: The HTTP response received from the API server.

    Raises:
        RuntimeError: If the status code is not 200; the message includes
            the server URL, the status code and the response text.
    """
    if response.status_code == 200:
        return
    with ux_utils.print_exception_no_traceback():
        raise RuntimeError(
            'Failed to process response from SkyPilot API server at '
            f'{get_server_url()}. '
            f'Response: {response.status_code} '
            f'{response.text}')
156
+
157
+
158
def get_request_id(response: requests.Response) -> RequestId:
    """Extract the request ID from the 'X-Request-ID' response header.

    Args:
        response: The HTTP response received from the API server.

    Returns:
        The value of the 'X-Request-ID' header.

    Raises:
        RuntimeError: If the status code is not 200 (raised by
            `handle_request_error`) or the header is absent.
    """
    handle_request_error(response)
    header_value = response.headers.get('X-Request-ID')
    if header_value is not None:
        return header_value
    with ux_utils.print_exception_no_traceback():
        raise RuntimeError(
            'Failed to get request ID from SkyPilot API server at '
            f'{get_server_url()}. Response: {response.status_code} '
            f'{response.text}')
168
+
169
+
170
def _start_api_server(deploy: bool = False,
                      host: str = '127.0.0.1',
                      foreground: bool = False):
    """Starts a SkyPilot API server locally.

    Args:
        deploy: If True, pass `--deploy` to the server process.
        host: Host passed to the server via `--host=`; the resulting URL must
            be one of AVAILABLE_LOCAL_API_SERVER_URLS.
        foreground: If True, replace the current process with the server
            (this call then never returns). Otherwise the server is launched
            in the background and this function waits until it is healthy.

    Raises:
        RuntimeError: If the resolved server URL is not local, or the server
            does not become healthy within the timeout.
    """
    import shlex  # pylint: disable=import-outside-toplevel

    server_url = get_server_url(host)
    assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
        f'server url {server_url} is not a local url')
    with rich_utils.client_status('Starting SkyPilot API server'):
        logger.info(f'{colorama.Style.DIM}Failed to connect to '
                    f'SkyPilot API server at {server_url}. '
                    'Starting a local server.'
                    f'{colorama.Style.RESET_ALL}')
        if not is_api_server_local():
            raise RuntimeError(f'Cannot start API server: {get_server_url()} '
                               'is not a local URL')

        # Check available memory before starting the server.
        avail_mem_size_gb: float = common_utils.get_mem_size_gb()
        if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
            logger.warning(
                f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
                f'has {avail_mem_size_gb:.1f}GB memory available. '
                f'At least {server_constants.MIN_AVAIL_MEM_GB}GB is '
                'recommended to support higher load with better performance.'
                f'{colorama.Style.RESET_ALL}')

        args = [sys.executable, *API_SERVER_CMD.split()]
        if deploy:
            args += ['--deploy']
        if host is not None:
            args += [f'--host={host}']

        if foreground:
            # Replaces the current process with the API server
            os.execvp(args[0], args)

        log_path = os.path.expanduser(constants.API_SERVER_LOGS)
        os.makedirs(os.path.dirname(log_path), exist_ok=True)
        # The command goes through a shell (for the redirections), so quote
        # every token: the interpreter path or the log path may contain
        # spaces or shell metacharacters.
        quoted_args = ' '.join(shlex.quote(arg) for arg in args)
        cmd = f'{quoted_args} > {shlex.quote(log_path)} 2>&1 < /dev/null'

        # Start the API server process in the background and don't wait for it.
        # If this is called from a CLI invocation, we need
        # start_new_session=True so that SIGINT on the CLI will not also kill
        # the API server.
        subprocess.Popen(cmd, shell=True, start_new_session=True)

        # Wait for the server to start until timeout.
        # Conservative upper time bound for starting the server based on
        # profiling.
        timeout_sec = 12
        start_time = time.time()
        while True:
            api_server_info = get_api_server_status()
            assert api_server_info.status != ApiServerStatus.VERSION_MISMATCH, (
                f'API server version mismatch when starting the server. '
                f'Server version: {api_server_info.api_version} '
                f'Client version: {server_constants.API_VERSION}')
            if api_server_info.status == ApiServerStatus.HEALTHY:
                break
            elif time.time() - start_time >= timeout_sec:
                with ux_utils.print_exception_no_traceback():
                    raise RuntimeError(
                        'Failed to start SkyPilot API server at '
                        f'{get_server_url(host)}'
                        f'\nView logs at: {constants.API_SERVER_LOGS}')
            time.sleep(0.5)
    logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
237
+
238
+
239
def check_server_healthy(endpoint: Optional[str] = None,) -> None:
    """Verify that the API server is reachable and version-compatible.

    Args:
        endpoint (Optional[str]): The endpoint of the API server.
            If None, the default endpoint will be used.

    Raises:
        RuntimeError: If the server's API version does not match the
            client's.
        exceptions.ApiServerConnectionError: If the server is unhealthy.
    """
    if endpoint is None:
        endpoint = get_server_url()
    info = get_api_server_status(endpoint)

    if info.status == ApiServerStatus.UNHEALTHY:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ApiServerConnectionError(endpoint)

    if info.status == ApiServerStatus.VERSION_MISMATCH:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                SKY_API_VERSION_WARNING.format(
                    server_version=info.api_version,
                    client_version=server_constants.API_VERSION))
262
+
263
+
264
def check_server_healthy_or_start_fn(deploy: bool = False,
                                     host: str = '127.0.0.1',
                                     foreground: bool = False):
    """Ensure a healthy API server, starting a local one if necessary.

    If the health check fails with a connection error and the configured
    server URL is local, a local server is started under a file lock.
    If the URL is not local, the connection error is re-raised.

    Args:
        deploy: Forwarded to `_start_api_server`.
        host: Forwarded to `_start_api_server`.
        foreground: Forwarded to `_start_api_server`.
    """
    try:
        check_server_healthy()
    except exceptions.ApiServerConnectionError as exc:
        server_url = get_server_url()
        if not is_api_server_local():
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ApiServerConnectionError(server_url) from exc
        # Serialize server creation across processes; concurrent starts
        # cause issues with database initialization.
        lock_path = os.path.expanduser(constants.API_SERVER_CREATION_LOCK_PATH)
        with filelock.FileLock(lock_path):
            # Another process may have started the server while we were
            # waiting for the lock, so probe once more before starting.
            latest_info = get_api_server_status(server_url)
            if latest_info.status == ApiServerStatus.UNHEALTHY:
                _start_api_server(deploy, host, foreground)
283
+
284
+
285
def check_server_healthy_or_start(func):
    """Decorator that ensures the API server is up before calling `func`.

    The wrapper accepts two extra keyword arguments, `deploy` and `host`,
    which are consumed by the health check and are not passed on to `func`.
    """

    @functools.wraps(func)
    def _ensure_server_then_call(*args,
                                 deploy: bool = False,
                                 host: str = '127.0.0.1',
                                 **kwargs):
        check_server_healthy_or_start_fn(deploy, host)
        result = func(*args, **kwargs)
        return result

    return _ensure_server_then_call
293
+
294
+
295
def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
                                         workdir_only: bool) -> 'dag_lib.Dag':
    """Translates the file mounts path in a task to the path on API server.

    When a task involves file mounts, the client is expected to upload those
    local files to the API server first (NOTE(review): the upload helper
    `upload_mounts_to_api_server` is not defined in this module - confirm
    where it lives). This function then translates the paths in the task to
    be the actual file paths on the API server, based on the
    `file_mounts_mapping` in the task set by the client.

    Args:
        task: The task to be translated, as YAML text (it is written to a
            file and read back with `common_utils.read_yaml_all`).
        env_vars: The environment variables of the task.
        workdir_only: Whether to only translate the workdir, which is used for
            `exec`, as it does not need other files/folders in file_mounts.

    Returns:
        The translated task as a single-task dag.

    Raises:
        ValueError: If a `file_mounts` value is neither a string nor a dict.
        KeyError: If a path to translate is missing from
            `file_mounts_mapping`.
    """
    from sky.utils import dag_utils  # pylint: disable=import-outside-toplevel

    # Per-user directory key; falls back to 'unknown' when the user ID env
    # var is not provided by the client.
    user_hash = env_vars.get(constants.USER_ID_ENV_VAR, 'unknown')

    # We should not use int(time.time()) as there can be multiple requests at
    # the same second.
    task_id = str(uuid.uuid4().hex)
    client_dir = (API_SERVER_CLIENT_DIR.expanduser().resolve() / user_hash)
    client_task_dir = client_dir / 'tasks'
    client_task_dir.mkdir(parents=True, exist_ok=True)

    # Persist the original task YAML so it can be parsed from disk below.
    client_task_path = client_task_dir / f'{task_id}.yaml'
    client_task_path.write_text(task)

    client_file_mounts_dir = client_dir / 'file_mounts'
    client_file_mounts_dir.mkdir(parents=True, exist_ok=True)

    def _get_client_file_mounts_path(
            original_path: str, file_mounts_mapping: Dict[str, str]) -> str:
        # The mapped path is made relative (leading '/' stripped) so that it
        # is joined under the per-user file_mounts directory.
        return str(client_file_mounts_dir /
                   file_mounts_mapping[original_path].lstrip('/'))

    task_configs = common_utils.read_yaml_all(str(client_task_path))
    for task_config in task_configs:
        # Skip empty YAML documents.
        if task_config is None:
            continue
        file_mounts_mapping = task_config.get('file_mounts_mapping', {})
        if not file_mounts_mapping:
            # We did not mount any files to new paths on the remote server
            # so no need to resolve filepaths.
            continue
        if 'workdir' in task_config:
            workdir = task_config['workdir']
            task_config['workdir'] = str(
                client_file_mounts_dir /
                file_mounts_mapping[workdir].lstrip('/'))
        if workdir_only:
            continue
        if 'file_mounts' in task_config:
            file_mounts = task_config['file_mounts']
            for dst, src in file_mounts.items():
                if isinstance(src, str):
                    # Cloud store URLs are left untouched; only local paths
                    # are translated.
                    if not data_utils.is_cloud_store_url(src):
                        file_mounts[dst] = _get_client_file_mounts_path(
                            src, file_mounts_mapping)
                elif isinstance(src, dict):
                    if 'source' in src:
                        source = src['source']
                        if isinstance(source, str):
                            if data_utils.is_cloud_store_url(source):
                                continue
                            src['source'] = _get_client_file_mounts_path(
                                source, file_mounts_mapping)
                        else:
                            # A non-string source is iterated and every item
                            # is translated.
                            new_source = []
                            for src_item in source:
                                new_source.append(
                                    _get_client_file_mounts_path(
                                        src_item, file_mounts_mapping))
                            src['source'] = new_source
                else:
                    raise ValueError(f'Unexpected file_mounts value: {src}')
        if 'service' in task_config:
            service = task_config['service']
            if 'tls' in service:
                tls = service['tls']
                # TLS key/cert paths also go through the mapping.
                for key in ['keyfile', 'certfile']:
                    if key in tls:
                        tls[key] = _get_client_file_mounts_path(
                            tls[key], file_mounts_mapping)

    # We can switch to using string, but this is to make it easier to debug, by
    # persisting the translated task yaml file.
    translated_client_task_path = client_dir / f'{task_id}_translated.yaml'
    common_utils.dump_yaml(str(translated_client_task_path), task_configs)

    dag = dag_utils.load_chain_dag_from_yaml(str(translated_client_task_path))
    return dag
392
+
393
+
394
def api_server_user_logs_dir_prefix(
        user_hash: Optional[str] = None) -> pathlib.Path:
    """Return the per-user 'sky_logs' directory under the client dir.

    Args:
        user_hash: The user hash; when None, the value from
            `common_utils.get_user_hash()` is used.

    Returns:
        `API_SERVER_CLIENT_DIR / <user_hash> / 'sky_logs'`. No `expanduser()`
        is applied here, so the path is as expanded as
        `API_SERVER_CLIENT_DIR` itself.
    """
    effective_hash = (common_utils.get_user_hash()
                      if user_hash is None else user_hash)
    return API_SERVER_CLIENT_DIR.joinpath(effective_hash, 'sky_logs')
399
+
400
+
401
def request_body_to_params(body: pydantic.BaseModel) -> Dict[str, Any]:
    """Convert a request body into a dict of its non-None fields.

    Args:
        body: The request body model; dumped with `model_dump(mode='json')`.

    Returns:
        The dumped fields, with every entry whose value is None removed.
    """
    params: Dict[str, Any] = {}
    for field_name, field_value in body.model_dump(mode='json').items():
        if field_value is None:
            continue
        params[field_name] = field_value
    return params
405
+
406
+
407
def reload_for_new_request(client_entrypoint: Optional[str],
                           client_command: Optional[str],
                           using_remote_api_server: bool):
    """Reload modules, global variables, and usage message for a new request.

    The steps below are order-dependent: caches are cleared before the usage
    message and the logger are reset so that the latest environment
    variables are picked up.

    Args:
        client_entrypoint: Forwarded to `common_utils.set_client_status`.
        client_command: Forwarded to `common_utils.set_client_status`.
        using_remote_api_server: Forwarded to
            `common_utils.set_client_status`.
    """
    # Reset the client entrypoint and command for the usage message.
    common_utils.set_client_status(
        client_entrypoint=client_entrypoint,
        client_command=client_command,
        using_remote_api_server=using_remote_api_server,
    )

    # Clear cache should be called before reload_logger and usage reset,
    # otherwise, the latest env var will not be used.
    for func in annotations.FUNCTIONS_NEED_RELOAD_CACHE:
        func.cache_clear()

    # We need to reset usage message, so that the message is up-to-date with the
    # latest information in the context, e.g. client entrypoint and run id.
    usage_lib.messages.reset(usage_lib.MessageType.USAGE)

    # Make sure the logger takes the new environment variables. This is
    # necessary because the logger is initialized before the environment
    # variables are set, such as SKYPILOT_DEBUG.
    sky_logging.reload_logger()
@@ -0,0 +1,21 @@
1
+ """Constants for the API servers."""
2
+
3
+ # API server version. Bump this whenever a change in the API server requires
4
+ # restarting the local API server, or erroring out when the client does not
5
+ # match the server version.
6
+ API_VERSION = '2'
7
+
8
+ # Prefix for API request names.
9
+ REQUEST_NAME_PREFIX = 'sky.'
10
+ # The user ID of the SkyPilot system.
11
+ SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
12
+ # The memory (GB) that SkyPilot tries to not use to prevent OOM.
13
+ MIN_AVAIL_MEM_GB = 2
14
+ # Default encoder/decoder handler name.
15
+ DEFAULT_HANDLER_NAME = 'default'
16
+ # The path to the API request database.
17
+ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
18
+
19
+ # The interval (seconds) for the cluster status to be refreshed in the
20
+ # background.
21
+ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
@@ -0,0 +1,174 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
<style>
  /* Page chrome: dark background with light monospace text. */
  body {
    margin: 0;
    padding: 10px;
    background: #1e1e1e;
    color: #d4d4d4;
    font-family: monospace;
  }

  /* Log container: keep whitespace, but wrap long lines. */
  #output {
    white-space: pre-wrap;
    word-wrap: break-word;
    font-size: 14px;
    line-height: 1.4;
  }

  /* Standard foreground colors (SGR 30-37). */
  .ansi-black-fg {
    color: #000000;
  }
  .ansi-red-fg {
    color: #cd0000;
  }
  .ansi-green-fg {
    color: #00cd00;
  }
  .ansi-yellow-fg {
    color: #cdcd00;
  }
  .ansi-blue-fg {
    color: #0000ee;
  }
  .ansi-magenta-fg {
    color: #cd00cd;
  }
  .ansi-cyan-fg {
    color: #00cdcd;
  }
  .ansi-white-fg {
    color: #e5e5e5;
  }

  /* Bright foreground colors (SGR 90-97). */
  .ansi-bright-black-fg {
    color: #7f7f7f;
  }
  .ansi-bright-red-fg {
    color: #ff0000;
  }
  .ansi-bright-green-fg {
    color: #00ff00;
  }
  .ansi-bright-yellow-fg {
    color: #ffff00;
  }
  .ansi-bright-blue-fg {
    color: #5c5cff;
  }
  .ansi-bright-magenta-fg {
    color: #ff00ff;
  }
  .ansi-bright-cyan-fg {
    color: #00ffff;
  }
  .ansi-bright-white-fg {
    color: #ffffff;
  }

  /* Text attributes (SGR 1-4). */
  .ansi-bold {
    font-weight: bold;
  }
  .ansi-dim {
    opacity: 0.7;
  }
  .ansi-italic {
    font-style: italic;
  }
  .ansi-underline {
    text-decoration: underline;
  }
</style>
39
+ </head>
40
+ <body>
41
+ <pre id="output"></pre>
42
+ <script>
43
// The <pre id="output"> element that all rendered log text is appended to.
const output = document.querySelector('#output');
44
+
45
// Base color names, indexed by (SGR code - 30) for the standard palette and
// by (SGR code - 90) for the bright palette.
const ANSI_COLOR_NAMES = [
  'black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white',
];

// ANSI escape code parser.
//
// Converts text containing SGR ("Select Graphic Rendition") escape sequences
// into HTML, wrapping styled runs in <span class="ansi-..."> elements. The
// style state and any incomplete trailing escape sequence are kept on the
// instance, so one parser can be fed a stream chunk by chunk.
class AnsiParser {
  constructor() {
    // Incomplete escape sequence left over from the end of the previous
    // chunk; it is prepended to the next chunk passed to parse().
    this.pending = '';
    this.reset();
  }

  // Clear all style attributes (SGR 0). Buffered input is left untouched.
  reset() {
    this.fg = null;
    this.bg = null;
    this.bold = false;
    this.dim = false;
    this.italic = false;
    this.underline = false;
  }

  // Return the space-separated CSS class list for the current style state,
  // or an empty string when no style is active.
  getStyle() {
    const classes = [];
    if (this.fg) classes.push(`ansi-${this.fg}-fg`);
    if (this.bg) classes.push(`ansi-${this.bg}-bg`);
    if (this.bold) classes.push('ansi-bold');
    if (this.dim) classes.push('ansi-dim');
    if (this.italic) classes.push('ansi-italic');
    if (this.underline) classes.push('ansi-underline');
    return classes.join(' ');
  }

  // Escape the characters that would otherwise be interpreted as markup when
  // the result of parse() is inserted as HTML.
  escapeHtml(text) {
    return text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }

  // Apply one SGR parameter list (the part between "ESC[" and "m"), e.g.
  // "1;31". An empty parameter counts as 0 (reset); unknown codes are ignored.
  applySgr(params) {
    const codes = params
      .split(';')
      .map(p => (p === '' ? 0 : parseInt(p, 10)));

    for (let i = 0; i < codes.length; i++) {
      const code = codes[i];
      if (code >= 30 && code <= 37) {
        this.fg = ANSI_COLOR_NAMES[code - 30];
        continue;
      }
      if (code >= 90 && code <= 97) {
        this.fg = `bright-${ANSI_COLOR_NAMES[code - 90]}`;
        continue;
      }
      switch (code) {
        case 0: this.reset(); break;
        case 1: this.bold = true; break;
        case 2: this.dim = true; break;
        case 3: this.italic = true; break;
        case 4: this.underline = true; break;
        case 22: this.bold = false; this.dim = false; break;
        case 23: this.italic = false; break;
        case 24: this.underline = false; break;
        case 39: this.fg = null; break;
        case 49: this.bg = null; break;
        case 38:
        case 48:
          // Extended colors ("38;5;n" or "38;2;r;g;b") are not rendered.
          // Skip their arguments so they are not misread as other codes
          // (e.g. the "2" in "38;2;..." as dim).
          if (codes[i + 1] === 5) i += 2;
          else if (codes[i + 1] === 2) i += 4;
          break;
      }
    }
  }

  // Convert a chunk of text to HTML. Returns a string in which plain text is
  // HTML-escaped and styled runs are wrapped in <span> elements.
  parse(text) {
    let input = this.pending + text;
    this.pending = '';

    // A chunk boundary may fall inside an escape sequence. Hold back an
    // unterminated "ESC" / "ESC[<params>" tail until the next chunk arrives.
    const partial = /\x1b(?:\[[0-?]*[ -\/]*)?$/.exec(input);
    if (partial) {
      this.pending = partial[0];
      input = input.slice(0, partial.index);
    }

    const result = [];
    // Alternative 1: a complete CSI sequence (parameter bytes, intermediate
    // bytes, final byte). Alternative 2: a run of ordinary text.
    const regex = /\x1b\[([0-?]*)[ -\/]*([@-~])|([^\x1b]+)/g;
    let match;

    while ((match = regex.exec(input)) !== null) {
      if (match[2] !== undefined) {
        // Only SGR (final byte "m") changes the style; other CSI sequences
        // (cursor movement, line erase, ...) are dropped from the output.
        if (match[2] === 'm') this.applySgr(match[1]);
      } else {
        const style = this.getStyle();
        const safeText = this.escapeHtml(match[3]);
        result.push(style
          ? `<span class="${style}">${safeText}</span>`
          : safeText);
      }
    }
    return result.join('');
  }
}
119
+
120
// One parser for the whole page, so the ANSI style state carries over from
// one stream chunk to the next.
const parser = new AnsiParser();
// Decodes the streamed bytes as UTF-8 (spelled out; it is also the default).
const decoder = new TextDecoder('utf-8');
122
+
123
// Append a bold red "Error: <message>" line to the output and scroll to it.
//
// The line is built from DOM nodes rather than an HTML string so that the
// message is always inserted as plain text (markup characters in a server
// error cannot be interpreted as HTML) and the styling does not depend on how
// the ANSI parser handles combined codes such as "1;31".
function displayError(message) {
  const errorLine = document.createElement('span');
  errorLine.className = 'ansi-bold ansi-red-fg';
  errorLine.textContent = `Error: ${message}`;
  // Strings passed to append() are inserted as text nodes.
  output.append('\n', errorLine, '\n');
  // Drop any style left active by the log output before the error.
  parser.reset();
  window.scrollTo(0, document.body.scrollHeight);
}
128
+
129
// Fetch the log stream and render it incrementally into the output element.
fetch('{stream_url}')
  .then(response => {
    if (!response.ok) {
      // For HTTP errors, read the error message from the response body.
      return response.text().then(text => {
        let detail = text;
        try {
          // Try to parse as JSON (FastAPI error format).
          const error = JSON.parse(text);
          const parsed = error.detail || error.message;
          if (parsed) {
            detail = typeof parsed === 'string'
              ? parsed
              : JSON.stringify(parsed);
          }
        } catch (e) {
          // Not JSON (or not an object): keep the raw text.
        }
        // Thrown outside the try block so that the catch above cannot
        // swallow it and replace the parsed detail with the raw body.
        throw new Error(detail);
      });
    }
    const reader = response.body.getReader();

    // Render one decoded chunk and keep the view scrolled to the bottom.
    function appendText(text) {
      if (!text) {
        return;
      }
      // insertAdjacentHTML appends without re-parsing the existing log,
      // unlike `innerHTML +=`.
      output.insertAdjacentHTML('beforeend', parser.parse(text));
      window.scrollTo(0, document.body.scrollHeight);
    }

    function readStream() {
      reader.read().then(
        ({ done, value }) => {
          try {
            if (done) {
              // Flush bytes the decoder may still be buffering.
              appendText(decoder.decode());
              return;
            }
            appendText(decoder.decode(value, { stream: true }));
          } catch (err) {
            // Report once and stop reading; rethrowing would make the same
            // error show up a second time as a read failure.
            displayError(`Failed to process stream: ${err.message}`);
            console.error('Stream processing error:', err);
            return;
          }
          readStream();
        },
        err => {
          displayError(`Failed to read stream: ${err.message}`);
          console.error('Stream read error:', err);
        });
    }

    readStream();
  })
  .catch(err => {
    displayError(err.message);
    console.error('Fetch error:', err);
  });
172
+ </script>
173
+ </body>
174
+ </html>
File without changes