skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
File without changes
@@ -0,0 +1,366 @@
1
+ """SDK for SkyServe."""
2
+ import json
3
+ import typing
4
+ from typing import List, Optional, Union
5
+
6
+ import click
7
+ import requests
8
+
9
+ from sky.client import common as client_common
10
+ from sky.server import common as server_common
11
+ from sky.server.requests import payloads
12
+ from sky.usage import usage_lib
13
+ from sky.utils import dag_utils
14
+
15
+ if typing.TYPE_CHECKING:
16
+ import io
17
+
18
+ import sky
19
+ from sky.serve import serve_utils
20
+
21
+
22
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def up(
    task: Union['sky.Task', 'sky.Dag'],
    service_name: str,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False
) -> server_common.RequestId:
    """Spins up a service.

    Please refer to the sky.cli.serve_up for the document.

    The entrypoint is converted to a DAG, validated, and run through the
    optimize request (whose output is streamed) before the optional
    confirmation prompt. The DAG is then serialized to YAML and posted to
    the API server's ``/serve/up`` route.

    Args:
        task: sky.Task to serve up.
        service_name: Name of the service.
        _need_confirmation: (Internal only) Whether to show a confirmation
            prompt before spinning up the service.

    Returns:
        The request ID of the up request.

    Request Returns:
        service_name (str): The name of the service.  Same if passed in as an
            argument.
        endpoint (str): The service endpoint.
    """
    # Avoid circular import.
    from sky.client import sdk  # pylint: disable=import-outside-toplevel

    dag = dag_utils.convert_entrypoint_to_dag(task)
    sdk.validate(dag)
    request_id = sdk.optimize(dag)
    sdk.stream_and_get(request_id)
    if _need_confirmation:
        # abort=True: click aborts the call if the user declines.
        click.confirm(f'Launching a new service {service_name!r}. Proceed?',
                      default=True,
                      abort=True,
                      show_default=True)

    # The helper returns the DAG that should be submitted, so rebind it.
    dag = client_common.upload_mounts_to_api_server(dag)
    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)

    body = payloads.ServeUpBody(
        task=dag_str,
        service_name=service_name,
    )
    response = requests.post(
        f'{server_common.get_server_url()}/serve/up',
        json=json.loads(body.model_dump_json()),
        # (connect timeout, read timeout): 5s to connect, no read limit.
        timeout=(5, None),
    )
    return server_common.get_request_id(response)
75
+
76
+
77
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def update(
    task: Union['sky.Task', 'sky.Dag'],
    service_name: str,
    mode: 'serve_utils.UpdateMode',
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False
) -> server_common.RequestId:
    """Updates an existing service.

    Please refer to the sky.cli.serve_update for the document.

    Args:
        task: sky.Task describing the new version of the service.
        service_name: Name of the service to update.
        mode: Update mode, one of:
            - sky.serve.UpdateMode.ROLLING
            - sky.serve.UpdateMode.BLUE_GREEN
        _need_confirmation: (Internal only) Whether to ask the user to
            confirm before the update request is sent.

    Returns:
        The request ID of the update request.

    Request Returns:
        None
    """
    # Avoid circular import.
    from sky.client import sdk  # pylint: disable=import-outside-toplevel

    service_dag = dag_utils.convert_entrypoint_to_dag(task)
    sdk.validate(service_dag)
    # Stream the optimize request's output before (optionally) prompting.
    sdk.stream_and_get(sdk.optimize(service_dag))
    if _need_confirmation:
        question = f'Updating service {service_name!r}. Proceed?'
        click.confirm(question, default=True, abort=True, show_default=True)

    service_dag = client_common.upload_mounts_to_api_server(service_dag)
    payload = payloads.ServeUpdateBody(
        task=dag_utils.dump_chain_dag_to_yaml_str(service_dag),
        service_name=service_name,
        mode=mode,
    )

    url = f'{server_common.get_server_url()}/serve/update'
    payload_json = json.loads(payload.model_dump_json())
    resp = requests.post(url, json=payload_json, timeout=(5, None))
    return server_common.get_request_id(resp)
133
+
134
+
135
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def down(
    service_names: Optional[Union[str, List[str]]],
    all: bool = False,  # pylint: disable=redefined-builtin
    purge: bool = False
) -> server_common.RequestId:
    """Tears down a service.

    Please refer to the sky.cli.serve_down for the docs.

    Args:
        service_names: Name of the service(s).
        all: Whether to terminate all services.
        purge: Whether to terminate services in a failed status. These
            services may potentially lead to resource leaks.

    Returns:
        The request ID of the down request.

    Request Returns:
        None

    Request Raises:
        sky.exceptions.ClusterNotUpError: if the sky serve controller is not
            up.
        ValueError: if the arguments are invalid.
        RuntimeError: if failed to terminate the service.
    """
    payload = payloads.ServeDownBody(service_names=service_names,
                                     all=all,
                                     purge=purge)
    url = f'{server_common.get_server_url()}/serve/down'
    payload_json = json.loads(payload.model_dump_json())
    # 5 seconds to connect; no limit on waiting for the server's reply.
    resp = requests.post(url, json=payload_json, timeout=(5, None))
    return server_common.get_request_id(resp)
174
+
175
+
176
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def terminate_replica(service_name: str, replica_id: int,
                      purge: bool) -> server_common.RequestId:
    """Tears down a specific replica for the given service.

    Args:
        service_name: Name of the service.
        replica_id: ID of replica to terminate.
        purge: Whether to terminate replicas in a failed status. Such
            replicas may leak resources, so the caller has to pass this
            flag explicitly to acknowledge that risk.

    Returns:
        The request ID of the terminate replica request.

    Request Raises:
        sky.exceptions.ClusterNotUpError: if the sky serve controller is not
            up.
        RuntimeError: if failed to terminate the replica.
    """
    payload = payloads.ServeTerminateReplicaBody(service_name=service_name,
                                                 replica_id=replica_id,
                                                 purge=purge)
    url = f'{server_common.get_server_url()}/serve/terminate-replica'
    payload_json = json.loads(payload.model_dump_json())
    resp = requests.post(url, json=payload_json, timeout=(5, None))
    return server_common.get_request_id(resp)
208
+
209
+
210
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def status(
    service_names: Optional[Union[str,
                                  List[str]]]) -> server_common.RequestId:
    """Gets service statuses.

    When ``service_names`` is given, only those services are returned;
    otherwise all services are returned.

    Each returned service record has the following fields:

    .. code-block:: python

        {
            'name': (str) service name,
            'active_versions': (List[int]) a list of versions that are active,
            'controller_job_id': (int) the job id of the controller,
            'uptime': (int) uptime in seconds,
            'status': (sky.ServiceStatus) service status,
            'controller_port': (Optional[int]) controller port,
            'load_balancer_port': (Optional[int]) load balancer port,
            'endpoint': (Optional[str]) endpoint of the service,
            'policy': (Optional[str]) autoscaling policy description,
            'requested_resources_str': (str) str representation of
              requested resources,
            'load_balancing_policy': (str) load balancing policy name,
            'replica_info': (List[Dict[str, Any]]) replica information,
        }

    Each entry in ``replica_info`` has the following fields:

    .. code-block:: python

        {
            'replica_id': (int) replica id,
            'name': (str) replica name,
            'status': (sky.serve.ReplicaStatus) replica status,
            'version': (int) replica version,
            'launched_at': (int) timestamp of launched,
            'handle': (ResourceHandle) handle of the replica cluster,
            'endpoint': (str) endpoint of the replica,
        }

    For possible service statuses and replica statuses, please refer to
    sky.cli.serve_status.

    Args:
        service_names: a single service name or a list of service names to
            query. If None, query all services.

    Returns:
        The request ID of the status request.

    Request Returns:
        service_records (List[Dict[str, Any]]): A list of dicts, with each
            dict containing the information of a service. If a service is
            not found, it will be omitted from the returned list.

    Request Raises:
        RuntimeError: if failed to get the service status.
        sky.exceptions.ClusterNotUpError: if the sky serve controller is not
            up.
    """
    payload = payloads.ServeStatusBody(service_names=service_names)
    url = f'{server_common.get_server_url()}/serve/status'
    payload_json = json.loads(payload.model_dump_json())
    resp = requests.post(url, json=payload_json, timeout=(5, None))
    return server_common.get_request_id(resp)
280
+
281
+
282
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def tail_logs(service_name: str,
              target: Union[str, 'serve_utils.ServiceComponent'],
              replica_id: Optional[int] = None,
              follow: bool = True,
              output_stream: Optional['io.TextIOBase'] = None) -> None:
    """Tails logs for a service.

    Usage:

    .. code-block:: python

        sky.serve.tail_logs(
            service_name,
            target=<component>,
            follow=False,  # Optionally, default to True
            # replica_id=3, # Must be specified when target is REPLICA.
        )


    ``target`` is an enum of ``sky.serve.ServiceComponent``, which can be one
    of:

    - ``sky.serve.ServiceComponent.CONTROLLER``

    - ``sky.serve.ServiceComponent.LOAD_BALANCER``

    - ``sky.serve.ServiceComponent.REPLICA``

    Passing target as a lower-case string is also supported, e.g.
    ``target='controller'``.
    To use ``sky.serve.ServiceComponent.REPLICA``, you must specify
    ``replica_id``.

    To tail controller logs:

    .. code-block:: python

        # follow default to True
        sky.serve.tail_logs(
            service_name, target=sky.serve.ServiceComponent.CONTROLLER
        )

    To print replica 3 logs:

    .. code-block:: python

        # Passing target as a lower-case string is also supported.
        sky.serve.tail_logs(
            service_name, target='replica',
            follow=False, replica_id=3
        )

    Args:
        service_name: Name of the service.
        target: The component to tail logs.
        replica_id: The ID of the replica to tail logs.
        follow: Whether to follow the logs.
        output_stream: The stream to write the logs to. If None, print to the
            console.

    Returns:
        None. Unlike the other functions in this module, no request ID is
        returned; the response is handed to ``sdk.stream_response`` instead.

    Request Raises:
        sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
        ValueError: arguments not valid, or failed to tail the logs.
    """
    # Avoid circular import.
    from sky.client import sdk  # pylint: disable=import-outside-toplevel

    body = payloads.ServeLogsBody(
        service_name=service_name,
        target=target,
        replica_id=replica_id,
        follow=follow,
    )
    response = requests.post(
        f'{server_common.get_server_url()}/serve/logs',
        json=json.loads(body.model_dump_json()),
        # (connect timeout, read timeout): 5s to connect, no read limit.
        timeout=(5, None),
        # Do not download the whole body up front; it is consumed below.
        stream=True,
    )
    request_id = server_common.get_request_id(response)
    sdk.stream_response(request_id, response, output_stream)
sky/serve/constants.py CHANGED
@@ -12,6 +12,9 @@ PORT_SELECTION_FILE_LOCK_PATH = f'{SKYSERVE_METADATA_DIR}/port_selection.lock'
12
12
  # Signal file path for controller to handle signals.
13
13
  SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}'
14
14
 
15
+ # Time to wait in seconds for controller to setup, this involves the time to run
16
+ # cloud dependencies installation.
17
+ CONTROLLER_SETUP_TIMEOUT_SECONDS = 300
15
18
  # Time to wait in seconds for service to register on the controller.
16
19
  SERVICE_REGISTER_TIMEOUT_SECONDS = 60
17
20
 
@@ -39,8 +42,7 @@ ENDPOINT_PROBE_INTERVAL_SECONDS = 10
39
42
  # The default timeout in seconds for a readiness probe request. We set the
40
43
  # timeout to 15s since using actual generation in LLM services as readiness
41
44
  # probe is very time-consuming (33B, 70B, ...).
42
- # TODO(tian): Expose this option to users in yaml file.
43
- READINESS_PROBE_TIMEOUT_SECONDS = 15
45
+ DEFAULT_READINESS_PROBE_TIMEOUT_SECONDS = 15
44
46
 
45
47
  # Autoscaler window size in seconds for query per second. We calculate qps by
46
48
  # divide the number of queries in last window size by this window size.
@@ -93,4 +95,11 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
93
95
  # change for the serve_utils.ServeCodeGen, we need to bump this version, so that
94
96
  # the user can be notified to update their SkyPilot serve version on the remote
95
97
  # cluster.
96
- SERVE_VERSION = 1
98
+ # Changelog:
99
+ # v1.0 - Introduce rolling update.
100
+ # v2.0 - Added template-replica feature.
101
+ SERVE_VERSION = 2
102
+
103
+ TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
104
+ 'The version of service is outdated and does not support manually '
105
+ 'terminating replicas. Please terminate the service and spin up again.')
sky/serve/controller.py CHANGED
@@ -2,14 +2,16 @@
2
2
 
3
3
  Responsible for autoscaling and replica management.
4
4
  """
5
+ import contextlib
5
6
  import logging
6
- import os
7
7
  import threading
8
8
  import time
9
9
  import traceback
10
10
  from typing import Any, Dict, List
11
11
 
12
+ import colorama
12
13
  import fastapi
14
+ from fastapi import responses
13
15
  import uvicorn
14
16
 
15
17
  from sky import serve
@@ -50,7 +52,14 @@ class SkyServeController:
50
52
  autoscalers.Autoscaler.from_spec(service_name, service_spec))
51
53
  self._host = host
52
54
  self._port = port
53
- self._app = fastapi.FastAPI()
55
+ self._app = fastapi.FastAPI(lifespan=self.lifespan)
56
+
57
+ @contextlib.asynccontextmanager
58
+ async def lifespan(self, _: fastapi.FastAPI):
59
+ uvicorn_access_logger = logging.getLogger('uvicorn.access')
60
+ for handler in uvicorn_access_logger.handlers:
61
+ handler.setFormatter(sky_logging.FORMATTER)
62
+ yield
54
63
 
55
64
  def _run_autoscaler(self):
56
65
  logger.info('Starting autoscaler.')
@@ -58,9 +67,16 @@ class SkyServeController:
58
67
  try:
59
68
  replica_infos = serve_state.get_replica_infos(
60
69
  self._service_name)
70
+ # Use the active versions set by replica manager to make
71
+ # sure we only scale down the outdated replicas that are
72
+ # not used by the load balancer.
73
+ record = serve_state.get_service_from_name(self._service_name)
74
+ assert record is not None, ('No service record found for '
75
+ f'{self._service_name}')
76
+ active_versions = record['active_versions']
61
77
  logger.info(f'All replica info: {replica_infos}')
62
- scaling_options = self._autoscaler.evaluate_scaling(
63
- replica_infos)
78
+ scaling_options = self._autoscaler.generate_scaling_decisions(
79
+ replica_infos, active_versions)
64
80
  for scaling_option in scaling_options:
65
81
  logger.info(f'Scaling option received: {scaling_option}')
66
82
  if (scaling_option.operator ==
@@ -68,15 +84,10 @@ class SkyServeController:
68
84
  assert (scaling_option.target is None or isinstance(
69
85
  scaling_option.target, dict)), scaling_option
70
86
  self._replica_manager.scale_up(scaling_option.target)
71
- elif (scaling_option.operator ==
72
- autoscalers.AutoscalerDecisionOperator.SCALE_DOWN):
87
+ else:
73
88
  assert isinstance(scaling_option.target,
74
89
  int), scaling_option
75
90
  self._replica_manager.scale_down(scaling_option.target)
76
- else:
77
- with ux_utils.enable_traceback():
78
- logger.error('Error in scaling_option.operator: '
79
- f'{scaling_option.operator}')
80
91
  except Exception as e: # pylint: disable=broad-except
81
92
  # No matter what error happens, we should keep the
82
93
  # monitor running.
@@ -89,7 +100,8 @@ class SkyServeController:
89
100
  def run(self) -> None:
90
101
 
91
102
  @self._app.post('/controller/load_balancer_sync')
92
- async def load_balancer_sync(request: fastapi.Request):
103
+ async def load_balancer_sync(
104
+ request: fastapi.Request) -> fastapi.Response:
93
105
  request_data = await request.json()
94
106
  # TODO(MaoZiming): Check aggregator type.
95
107
  request_aggregator: Dict[str, Any] = request_data.get(
@@ -97,18 +109,21 @@ class SkyServeController:
97
109
  timestamps: List[int] = request_aggregator.get('timestamps', [])
98
110
  logger.info(f'Received {len(timestamps)} inflight requests.')
99
111
  self._autoscaler.collect_request_information(request_aggregator)
100
- return {
112
+ return responses.JSONResponse(content={
101
113
  'ready_replica_urls':
102
114
  self._replica_manager.get_active_replica_urls()
103
- }
115
+ },
116
+ status_code=200)
104
117
 
105
118
  @self._app.post('/controller/update_service')
106
- async def update_service(request: fastapi.Request):
119
+ async def update_service(request: fastapi.Request) -> fastapi.Response:
107
120
  request_data = await request.json()
108
121
  try:
109
122
  version = request_data.get('version', None)
110
123
  if version is None:
111
- return {'message': 'Error: version is not specified.'}
124
+ return responses.JSONResponse(
125
+ content={'message': 'Error: version is not specified.'},
126
+ status_code=400)
112
127
  update_mode_str = request_data.get(
113
128
  'mode', serve_utils.DEFAULT_UPDATE_MODE.value)
114
129
  update_mode = serve_utils.UpdateMode(update_mode_str)
@@ -137,40 +152,95 @@ class SkyServeController:
137
152
  self._autoscaler.update_version(version,
138
153
  service,
139
154
  update_mode=update_mode)
140
- return {'message': 'Success'}
155
+ return responses.JSONResponse(content={'message': 'Success'},
156
+ status_code=200)
141
157
  except Exception as e: # pylint: disable=broad-except
142
158
  logger.error(f'Error in update_service: '
143
159
  f'{common_utils.format_exception(e)}')
144
- return {'message': 'Error'}
160
+ return responses.JSONResponse(content={'message': 'Error'},
161
+ status_code=500)
145
162
 
146
- @self._app.on_event('startup')
147
- def configure_logger():
148
- uvicorn_access_logger = logging.getLogger('uvicorn.access')
149
- for handler in uvicorn_access_logger.handlers:
150
- handler.setFormatter(sky_logging.FORMATTER)
163
+ @self._app.post('/controller/terminate_replica')
164
+ async def terminate_replica(
165
+ request: fastapi.Request) -> fastapi.Response:
166
+ request_data = await request.json()
167
+ replica_id = request_data['replica_id']
168
+ assert isinstance(replica_id,
169
+ int), 'Error: replica ID must be an integer.'
170
+ purge = request_data['purge']
171
+ assert isinstance(purge, bool), 'Error: purge must be a boolean.'
172
+ replica_info = serve_state.get_replica_info_from_id(
173
+ self._service_name, replica_id)
174
+ assert replica_info is not None, (f'Error: replica '
175
+ f'{replica_id} does not exist.')
176
+ replica_status = replica_info.status
177
+
178
+ if replica_status == serve_state.ReplicaStatus.SHUTTING_DOWN:
179
+ return responses.JSONResponse(
180
+ status_code=409,
181
+ content={
182
+ 'message':
183
+ f'Replica {replica_id} of service '
184
+ f'{self._service_name!r} is already in the process '
185
+ f'of terminating. Skip terminating now.'
186
+ })
187
+
188
+ if (replica_status in serve_state.ReplicaStatus.failed_statuses()
189
+ and not purge):
190
+ return responses.JSONResponse(
191
+ status_code=409,
192
+ content={
193
+ 'message': f'{colorama.Fore.YELLOW}Replica '
194
+ f'{replica_id} of service '
195
+ f'{self._service_name!r} is in failed '
196
+ f'status ({replica_info.status}). '
197
+ f'Skipping its termination as it could '
198
+ f'lead to a resource leak. '
199
+ f'(Use `sky serve down '
200
+ f'{self._service_name!r} --replica-id '
201
+ f'{replica_id} --purge` to '
202
+ 'forcefully terminate the replica.)'
203
+ f'{colorama.Style.RESET_ALL}'
204
+ })
205
+
206
+ self._replica_manager.scale_down(replica_id, purge=purge)
207
+
208
+ action = 'terminated' if not purge else 'purged'
209
+ message = (f'{colorama.Fore.GREEN}Replica {replica_id} of service '
210
+ f'{self._service_name!r} is scheduled to be '
211
+ f'{action}.{colorama.Style.RESET_ALL}\n'
212
+ f'Please use {ux_utils.BOLD}sky serve status '
213
+ f'{self._service_name}{ux_utils.RESET_BOLD} '
214
+ f'to check the latest status.')
215
+ return responses.JSONResponse(status_code=200,
216
+ content={'message': message})
217
+
218
+ @self._app.exception_handler(Exception)
219
+ async def validation_exception_handler(
220
+ request: fastapi.Request, exc: Exception) -> fastapi.Response:
221
+ with ux_utils.enable_traceback():
222
+ logger.error(f'Error in controller: {exc!r}')
223
+ return responses.JSONResponse(
224
+ status_code=500,
225
+ content={
226
+ 'message':
227
+ (f'Failed method {request.method} at URL {request.url}.'
228
+ f' Exception message is {exc!r}.')
229
+ },
230
+ )
151
231
 
152
232
  threading.Thread(target=self._run_autoscaler).start()
153
233
 
154
234
  logger.info('SkyServe Controller started on '
155
235
  f'http://{self._host}:{self._port}')
156
236
 
157
- uvicorn.run(self._app, host={self._host}, port=self._port)
237
+ uvicorn.run(self._app, host=self._host, port=self._port)
158
238
 
159
239
 
160
240
  # TODO(tian): Probably we should support service that will stop the VM in
161
241
  # specific time period.
162
242
  def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
163
- task_yaml: str, controller_port: int):
164
- # We expose the controller to the public network when running inside a
165
- # kubernetes cluster to allow external load balancers (example, for
166
- # high availability load balancers) to communicate with the controller.
167
- def _get_host():
168
- if 'KUBERNETES_SERVICE_HOST' in os.environ:
169
- return '0.0.0.0'
170
- else:
171
- return 'localhost'
172
-
173
- host = _get_host()
174
- controller = SkyServeController(service_name, service_spec, task_yaml, host,
175
- controller_port)
243
+ task_yaml: str, controller_host: str, controller_port: int):
244
+ controller = SkyServeController(service_name, service_spec, task_yaml,
245
+ controller_host, controller_port)
176
246
  controller.run()