skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
File without changes
@@ -1,6 +1,8 @@
1
1
  """SkyServe core APIs."""
2
2
  import re
3
+ import signal
3
4
  import tempfile
5
+ import threading
4
6
  from typing import Any, Dict, List, Optional, Tuple, Union
5
7
 
6
8
  import colorama
@@ -8,6 +10,7 @@ import colorama
8
10
  import sky
9
11
  from sky import backends
10
12
  from sky import exceptions
13
+ from sky import execution
11
14
  from sky import sky_logging
12
15
  from sky import task as task_lib
13
16
  from sky.backends import backend_utils
@@ -17,6 +20,9 @@ from sky.serve import serve_state
17
20
  from sky.serve import serve_utils
18
21
  from sky.skylet import constants
19
22
  from sky.usage import usage_lib
23
+ from sky.utils import admin_policy_utils
24
+ from sky.utils import command_runner
25
+ from sky.utils import common
20
26
  from sky.utils import common_utils
21
27
  from sky.utils import controller_utils
22
28
  from sky.utils import resources_utils
@@ -63,7 +69,8 @@ def _validate_service_task(task: 'sky.Task') -> None:
63
69
  'SkyServe will replenish preempted spot '
64
70
  f'with {policy_description} instances.')
65
71
 
66
- replica_ingress_port: Optional[int] = None
72
+ replica_ingress_port: Optional[int] = int(
73
+ task.service.ports) if (task.service.ports is not None) else None
67
74
  for requested_resources in task.resources:
68
75
  if (task.service.use_ondemand_fallback and
69
76
  not requested_resources.use_spot):
@@ -72,22 +79,58 @@ def _validate_service_task(task: 'sky.Task') -> None:
72
79
  '`use_ondemand_fallback` is only supported '
73
80
  'for spot resources. Please explicitly specify '
74
81
  '`use_spot: true` in resources for on-demand fallback.')
75
- requested_ports = list(
76
- resources_utils.port_ranges_to_set(requested_resources.ports))
77
- if len(requested_ports) != 1:
78
- with ux_utils.print_exception_no_traceback():
79
- raise ValueError(
80
- 'Must only specify one port in resources. Each replica '
81
- 'will use the port specified as application ingress port.')
82
- service_port = requested_ports[0]
83
- if replica_ingress_port is None:
84
- replica_ingress_port = service_port
85
- elif service_port != replica_ingress_port:
86
- with ux_utils.print_exception_no_traceback():
87
- raise ValueError(
88
- f'Got multiple ports: {service_port} and '
89
- f'{replica_ingress_port} in different resources. '
90
- 'Please specify the same port instead.')
82
+ if task.service.ports is None:
83
+ requested_ports = list(
84
+ resources_utils.port_ranges_to_set(requested_resources.ports))
85
+ if len(requested_ports) != 1:
86
+ with ux_utils.print_exception_no_traceback():
87
+ raise ValueError(
88
+ 'To open multiple ports on the replica, please set the '
89
+ '`service.ports` field to specify a main service port. '
90
+ 'Must only specify one port in resources otherwise. '
91
+ 'Each replica will use the port specified as '
92
+ 'application ingress port.')
93
+ service_port = requested_ports[0]
94
+ if replica_ingress_port is None:
95
+ replica_ingress_port = service_port
96
+ elif service_port != replica_ingress_port:
97
+ with ux_utils.print_exception_no_traceback():
98
+ raise ValueError(
99
+ f'Got multiple ports: {service_port} and '
100
+ f'{replica_ingress_port} in different resources. '
101
+ 'Please specify the same port instead.')
102
+
103
+
104
+ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
105
+ service_name: str, task: 'sky.Task') -> Dict[str, Any]:
106
+ """Rewrite the paths of TLS credentials in the task.
107
+
108
+ Args:
109
+ service_name: Name of the service.
110
+ task: sky.Task to rewrite.
111
+
112
+ Returns:
113
+ The generated template variables for TLS.
114
+ """
115
+ service_spec = task.service
116
+ # Already checked by _validate_service_task
117
+ assert service_spec is not None
118
+ if service_spec.tls_credential is None:
119
+ return {'use_tls': False}
120
+ remote_tls_keyfile = (
121
+ serve_utils.generate_remote_tls_keyfile_name(service_name))
122
+ remote_tls_certfile = (
123
+ serve_utils.generate_remote_tls_certfile_name(service_name))
124
+ tls_template_vars = {
125
+ 'use_tls': True,
126
+ 'remote_tls_keyfile': remote_tls_keyfile,
127
+ 'remote_tls_certfile': remote_tls_certfile,
128
+ 'local_tls_keyfile': service_spec.tls_credential.keyfile,
129
+ 'local_tls_certfile': service_spec.tls_credential.certfile,
130
+ }
131
+ service_spec.tls_credential = serve_utils.TLSCredential(
132
+ remote_tls_keyfile, remote_tls_certfile)
133
+ return tls_template_vars
91
134
 
92
135
 
93
136
  @usage_lib.entrypoint
@@ -95,7 +138,7 @@ def up(
95
138
  task: 'sky.Task',
96
139
  service_name: Optional[str] = None,
97
140
  ) -> Tuple[str, str]:
98
- """Spin up a service.
141
+ """Spins up a service.
99
142
 
100
143
  Please refer to the sky.cli.serve_up for the document.
101
144
 
@@ -108,6 +151,7 @@ def up(
108
151
  argument.
109
152
  endpoint: str; The service endpoint.
110
153
  """
154
+ task.validate()
111
155
  if service_name is None:
112
156
  service_name = serve_utils.generate_service_name()
113
157
 
@@ -123,9 +167,20 @@ def up(
123
167
  f'{constants.CLUSTER_NAME_VALID_REGEX}')
124
168
 
125
169
  _validate_service_task(task)
170
+ # Always apply the policy again here, even though it might have been applied
171
+ # in the CLI. This is to ensure that we apply the policy to the final DAG
172
+ # and get the mutated config.
173
+ dag, mutated_user_config = admin_policy_utils.apply(
174
+ task, use_mutated_config_in_current_request=False)
175
+ task = dag.tasks[0]
176
+
177
+ with rich_utils.safe_status(
178
+ ux_utils.spinner_message('Initializing service')):
179
+ controller_utils.maybe_translate_local_file_mounts_and_sync_up(
180
+ task, task_type='serve')
126
181
 
127
- controller_utils.maybe_translate_local_file_mounts_and_sync_up(task,
128
- path='serve')
182
+ tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
183
+ service_name, task)
129
184
 
130
185
  with tempfile.NamedTemporaryFile(
131
186
  prefix=f'service-task-{service_name}-',
@@ -134,7 +189,7 @@ def up(
134
189
  prefix=f'controller-task-{service_name}-',
135
190
  mode='w',
136
191
  ) as controller_file:
137
- controller_name = serve_utils.SKY_SERVE_CONTROLLER_NAME
192
+ controller_name = common.SKY_SERVE_CONTROLLER_NAME
138
193
  task_config = task.to_yaml_config()
139
194
  common_utils.dump_yaml(service_file.name, task_config)
140
195
  remote_tmp_task_yaml_path = (
@@ -155,9 +210,11 @@ def up(
155
210
  'remote_user_config_path': remote_config_yaml_path,
156
211
  'modified_catalogs':
157
212
  service_catalog_common.get_modified_catalog_file_mounts(),
213
+ **tls_template_vars,
158
214
  **controller_utils.shared_controller_vars_to_fill(
159
215
  controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
160
216
  remote_user_config_path=remote_config_yaml_path,
217
+ local_user_config=mutated_user_config,
161
218
  ),
162
219
  }
163
220
  common_utils.fill_template(serve_constants.CONTROLLER_TEMPLATE,
@@ -192,15 +249,16 @@ def up(
192
249
  # with the current job id, we know the service is up and running
193
250
  # for the first time; otherwise it is a name conflict.
194
251
  idle_minutes_to_autostop = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP
195
- controller_job_id, controller_handle = sky.launch(
196
- task=controller_task,
197
- stream_logs=False,
198
- cluster_name=controller_name,
199
- detach_run=True,
200
- idle_minutes_to_autostop=idle_minutes_to_autostop,
201
- retry_until_up=True,
202
- _disable_controller_check=True,
203
- )
252
+ # Since the controller may be shared among multiple users, launch the
253
+ # controller with the API server's user hash.
254
+ with common.with_server_user_hash():
255
+ controller_job_id, controller_handle = execution.launch(
256
+ task=controller_task,
257
+ cluster_name=controller_name,
258
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
259
+ retry_until_up=True,
260
+ _disable_controller_check=True,
261
+ )
204
262
 
205
263
  style = colorama.Style
206
264
  fore = colorama.Fore
@@ -209,7 +267,8 @@ def up(
209
267
  # TODO(tian): Cache endpoint locally to speedup. Endpoint won't
210
268
  # change after the first time, so there is no consistency issue.
211
269
  with rich_utils.safe_status(
212
- '[cyan]Waiting for the service to register[/]'):
270
+ ux_utils.spinner_message(
271
+ 'Waiting for the service to register')):
213
272
  # This function will check the controller job id in the database
214
273
  # and return the endpoint if the job id matches. Otherwise it will
215
274
  # return None.
@@ -258,44 +317,47 @@ def up(
258
317
  else:
259
318
  lb_port = serve_utils.load_service_initialization_result(
260
319
  lb_port_payload)
261
- endpoint = backend_utils.get_endpoints(
320
+ socket_endpoint = backend_utils.get_endpoints(
262
321
  controller_handle.cluster_name, lb_port,
263
322
  skip_status_check=True).get(lb_port)
264
- assert endpoint is not None, 'Did not get endpoint for controller.'
265
-
266
- sky_logging.print(
323
+ assert socket_endpoint is not None, (
324
+ 'Did not get endpoint for controller.')
325
+ # Already checked by _validate_service_task
326
+ assert task.service is not None
327
+ protocol = ('http'
328
+ if task.service.tls_credential is None else 'https')
329
+ endpoint = f'{protocol}://{socket_endpoint}'
330
+
331
+ logger.info(
267
332
  f'{fore.CYAN}Service name: '
268
333
  f'{style.BRIGHT}{service_name}{style.RESET_ALL}'
269
334
  f'\n{fore.CYAN}Endpoint URL: '
270
335
  f'{style.BRIGHT}{endpoint}{style.RESET_ALL}'
271
- '\nTo see detailed info:\t\t'
272
- f'{backend_utils.BOLD}sky serve status {service_name} '
273
- f'[--endpoint]{backend_utils.RESET_BOLD}'
274
- '\nTo teardown the service:\t'
275
- f'{backend_utils.BOLD}sky serve down {service_name}'
276
- f'{backend_utils.RESET_BOLD}'
277
- '\n'
278
- '\nTo see logs of a replica:\t'
279
- f'{backend_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]'
280
- f'{backend_utils.RESET_BOLD}'
281
- '\nTo see logs of load balancer:\t'
282
- f'{backend_utils.BOLD}sky serve logs --load-balancer {service_name}'
283
- f'{backend_utils.RESET_BOLD}'
284
- '\nTo see logs of controller:\t'
285
- f'{backend_utils.BOLD}sky serve logs --controller {service_name}'
286
- f'{backend_utils.RESET_BOLD}'
287
- '\n'
288
- '\nTo monitor replica status:\t'
289
- f'{backend_utils.BOLD}watch -n10 sky serve status {service_name}'
290
- f'{backend_utils.RESET_BOLD}'
291
- '\nTo send a test request:\t\t'
292
- f'{backend_utils.BOLD}curl {endpoint}'
293
- f'{backend_utils.RESET_BOLD}'
294
- '\n'
295
- f'\n{fore.GREEN}SkyServe is spinning up your service now.'
296
- f'{style.RESET_ALL}'
297
- f'\n{fore.GREEN}The replicas should be ready within a '
298
- f'short time.{style.RESET_ALL}')
336
+ f'\n📋 Useful Commands'
337
+ f'\n{ux_utils.INDENT_SYMBOL}To check service status:\t'
338
+ f'{ux_utils.BOLD}sky serve status {service_name} '
339
+ f'[--endpoint]{ux_utils.RESET_BOLD}'
340
+ f'\n{ux_utils.INDENT_SYMBOL}To teardown the service:\t'
341
+ f'{ux_utils.BOLD}sky serve down {service_name}'
342
+ f'{ux_utils.RESET_BOLD}'
343
+ f'\n{ux_utils.INDENT_SYMBOL}To see replica logs:\t'
344
+ f'{ux_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]'
345
+ f'{ux_utils.RESET_BOLD}'
346
+ f'\n{ux_utils.INDENT_SYMBOL}To see load balancer logs:\t'
347
+ f'{ux_utils.BOLD}sky serve logs --load-balancer {service_name}'
348
+ f'{ux_utils.RESET_BOLD}'
349
+ f'\n{ux_utils.INDENT_SYMBOL}To see controller logs:\t'
350
+ f'{ux_utils.BOLD}sky serve logs --controller {service_name}'
351
+ f'{ux_utils.RESET_BOLD}'
352
+ f'\n{ux_utils.INDENT_SYMBOL}To monitor the status:\t'
353
+ f'{ux_utils.BOLD}watch -n10 sky serve status {service_name}'
354
+ f'{ux_utils.RESET_BOLD}'
355
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}To send a test request:\t'
356
+ f'{ux_utils.BOLD}curl {endpoint}'
357
+ f'{ux_utils.RESET_BOLD}'
358
+ '\n\n' +
359
+ ux_utils.finishing_message('Service is spinning up and replicas '
360
+ 'will be ready shortly.'))
299
361
  return service_name, endpoint
300
362
 
301
363
 
@@ -304,24 +366,43 @@ def update(
304
366
  task: 'sky.Task',
305
367
  service_name: str,
306
368
  mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE) -> None:
307
- """Update an existing service.
369
+ """Updates an existing service.
308
370
 
309
371
  Please refer to the sky.cli.serve_update for the document.
310
372
 
311
373
  Args:
312
374
  task: sky.Task to update.
313
375
  service_name: Name of the service.
376
+ mode: Update mode.
314
377
  """
378
+ task.validate()
315
379
  _validate_service_task(task)
380
+
381
+ # Always apply the policy again here, even though it might have been applied
382
+ # in the CLI. This is to ensure that we apply the policy to the final DAG
383
+ # and get the mutated config.
384
+ # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
385
+ # will not apply the config.
386
+ dag, _ = admin_policy_utils.apply(
387
+ task, use_mutated_config_in_current_request=False)
388
+ task = dag.tasks[0]
389
+
390
+ assert task.service is not None
391
+ if task.service.tls_credential is not None:
392
+ logger.warning('Updating TLS keyfile and certfile is not supported. '
393
+ 'Any updates to the keyfile and certfile will not take '
394
+ 'effect. To update TLS keyfile and certfile, please '
395
+ 'tear down the service and spin up a new one.')
396
+
316
397
  handle = backend_utils.is_controller_accessible(
317
398
  controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
318
399
  stopped_message=
319
400
  'Service controller is stopped. There is no service to update. '
320
- f'To spin up a new service, use {backend_utils.BOLD}'
321
- f'sky serve up{backend_utils.RESET_BOLD}',
401
+ f'To spin up a new service, use {ux_utils.BOLD}'
402
+ f'sky serve up{ux_utils.RESET_BOLD}',
322
403
  non_existent_message='Service does not exist. '
323
404
  'To spin up a new service, '
324
- f'use {backend_utils.BOLD}sky serve up{backend_utils.RESET_BOLD}',
405
+ f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
325
406
  )
326
407
 
327
408
  backend = backend_utils.get_backend_from_handle(handle)
@@ -344,11 +425,11 @@ def update(
344
425
  raise RuntimeError(e.error_msg) from e
345
426
 
346
427
  service_statuses = serve_utils.load_service_status(serve_status_payload)
347
- if len(service_statuses) == 0:
428
+ if not service_statuses:
348
429
  with ux_utils.print_exception_no_traceback():
349
430
  raise RuntimeError(f'Cannot find service {service_name!r}.'
350
- f'To spin up a service, use {backend_utils.BOLD}'
351
- f'sky serve up{backend_utils.RESET_BOLD}')
431
+ f'To spin up a service, use {ux_utils.BOLD}'
432
+ f'sky serve up{ux_utils.RESET_BOLD}')
352
433
 
353
434
  if len(service_statuses) > 1:
354
435
  with ux_utils.print_exception_no_traceback():
@@ -368,8 +449,21 @@ def update(
368
449
  with ux_utils.print_exception_no_traceback():
369
450
  raise RuntimeError(prompt)
370
451
 
371
- controller_utils.maybe_translate_local_file_mounts_and_sync_up(task,
372
- path='serve')
452
+ original_lb_policy = service_record['load_balancing_policy']
453
+ assert task.service is not None, 'Service section not found.'
454
+ if original_lb_policy != task.service.load_balancing_policy:
455
+ logger.warning(
456
+ f'{colorama.Fore.YELLOW}Current load balancing policy '
457
+ f'{original_lb_policy!r} is different from the new policy '
458
+ f'{task.service.load_balancing_policy!r}. Updating the load '
459
+ 'balancing policy is not supported yet and it will be ignored. '
460
+ 'The service will continue to use the current load balancing '
461
+ f'policy.{colorama.Style.RESET_ALL}')
462
+
463
+ with rich_utils.safe_status(
464
+ ux_utils.spinner_message('Initializing service')):
465
+ controller_utils.maybe_translate_local_file_mounts_and_sync_up(
466
+ task, task_type='serve')
373
467
 
374
468
  code = serve_utils.ServeCodeGen.add_version(service_name)
375
469
  returncode, version_string_payload, stderr = backend.run_on_head(
@@ -427,8 +521,8 @@ def update(
427
521
 
428
522
  print(f'{colorama.Fore.GREEN}Service {service_name!r} update scheduled.'
429
523
  f'{colorama.Style.RESET_ALL}\n'
430
- f'Please use {backend_utils.BOLD}sky serve status {service_name} '
431
- f'{backend_utils.RESET_BOLD}to check the latest status.')
524
+ f'Please use {ux_utils.BOLD}sky serve status {service_name} '
525
+ f'{ux_utils.RESET_BOLD}to check the latest status.')
432
526
 
433
527
 
434
528
  @usage_lib.entrypoint
@@ -438,7 +532,7 @@ def down(
438
532
  all: bool = False,
439
533
  purge: bool = False,
440
534
  ) -> None:
441
- """Teardown a service.
535
+ """Tears down a service.
442
536
 
443
537
  Please refer to the sky.cli.serve_down for the docs.
444
538
 
@@ -462,9 +556,9 @@ def down(
462
556
  stopped_message='All services should have terminated.')
463
557
 
464
558
  service_names_str = ','.join(service_names)
465
- if sum([len(service_names) > 0, all]) != 1:
466
- argument_str = f'service_names={service_names_str}' if len(
467
- service_names) > 0 else ''
559
+ if sum([bool(service_names), all]) != 1:
560
+ argument_str = (f'service_names={service_names_str}'
561
+ if service_names else '')
468
562
  argument_str += ' all' if all else ''
469
563
  raise ValueError('Can only specify one of service_names or all. '
470
564
  f'Provided {argument_str!r}.')
@@ -482,7 +576,7 @@ def down(
482
576
  except exceptions.FetchClusterInfoError as e:
483
577
  raise RuntimeError(
484
578
  'Failed to fetch controller IP. Please refresh controller status '
485
- f'by `sky status -r {serve_utils.SKY_SERVE_CONTROLLER_NAME}` '
579
+ f'by `sky status -r {common.SKY_SERVE_CONTROLLER_NAME}` '
486
580
  'and try again.') from e
487
581
 
488
582
  try:
@@ -492,6 +586,53 @@ def down(
492
586
  except exceptions.CommandError as e:
493
587
  raise RuntimeError(e.error_msg) from e
494
588
 
589
+ logger.info(stdout)
590
+
591
+
592
+ @usage_lib.entrypoint
593
+ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
594
+ """Tears down a specific replica for the given service.
595
+
596
+ Args:
597
+ service_name: Name of the service.
598
+ replica_id: ID of replica to terminate.
599
+ purge: Whether to terminate replicas in a failed status. These replicas
600
+ may lead to resource leaks, so we require the user to explicitly
601
+ specify this flag to make sure they are aware of this potential
602
+ resource leak.
603
+
604
+ Raises:
605
+ sky.exceptions.ClusterNotUpError: if the sky serve controller is not up.
606
+ RuntimeError: if failed to terminate the replica.
607
+ """
608
+ handle = backend_utils.is_controller_accessible(
609
+ controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
610
+ stopped_message=
611
+ 'No service is running now. Please spin up a service first.',
612
+ non_existent_message='No service is running now. '
613
+ 'Please spin up a service first.',
614
+ )
615
+
616
+ backend = backend_utils.get_backend_from_handle(handle)
617
+ assert isinstance(backend, backends.CloudVmRayBackend)
618
+
619
+ code = serve_utils.ServeCodeGen.terminate_replica(service_name, replica_id,
620
+ purge)
621
+ returncode, stdout, stderr = backend.run_on_head(handle,
622
+ code,
623
+ require_outputs=True,
624
+ stream_logs=False,
625
+ separate_stderr=True)
626
+
627
+ try:
628
+ subprocess_utils.handle_returncode(returncode,
629
+ code,
630
+ 'Failed to terminate the replica',
631
+ stderr,
632
+ stream_logs=True)
633
+ except exceptions.CommandError as e:
634
+ raise RuntimeError(e.error_msg) from e
635
+
495
636
  sky_logging.print(stdout)
496
637
 
497
638
 
@@ -499,7 +640,7 @@ def down(
499
640
  def status(
500
641
  service_names: Optional[Union[str,
501
642
  List[str]]] = None) -> List[Dict[str, Any]]:
502
- """Get service statuses.
643
+ """Gets service statuses.
503
644
 
504
645
  If service_names is given, return those services. Otherwise, return all
505
646
  services.
@@ -516,11 +657,12 @@ def status(
516
657
  'status': (sky.ServiceStatus) service status,
517
658
  'controller_port': (Optional[int]) controller port,
518
659
  'load_balancer_port': (Optional[int]) load balancer port,
519
- 'policy': (Optional[str]) load balancer policy description,
520
- 'requested_resources': (sky.Resources) requested resources
521
- for replica (deprecated),
660
+ 'endpoint': (Optional[str]) load balancer endpoint,
661
+ 'policy': (Optional[str]) autoscaling policy description,
522
662
  'requested_resources_str': (str) str representation of
523
663
  requested resources,
664
+ 'load_balancing_policy': (str) load balancing policy name,
665
+ 'tls_encrypted': (bool) whether the service is TLS encrypted,
524
666
  'replica_info': (List[Dict[str, Any]]) replica information,
525
667
  }
526
668
 
@@ -535,6 +677,7 @@ def status(
535
677
  'version': (int) replica version,
536
678
  'launched_at': (int) timestamp of launched,
537
679
  'handle': (ResourceHandle) handle of the replica cluster,
680
+ 'endpoint': (str) endpoint of the replica,
538
681
  }
539
682
 
540
683
  For possible service statuses and replica statuses, please refer to
@@ -588,7 +731,24 @@ def status(
588
731
  except exceptions.CommandError as e:
589
732
  raise RuntimeError(e.error_msg) from e
590
733
 
591
- return serve_utils.load_service_status(serve_status_payload)
734
+ service_records = serve_utils.load_service_status(serve_status_payload)
735
+ # Get the endpoint for each service
736
+ for service_record in service_records:
737
+ service_record['endpoint'] = None
738
+ if service_record['load_balancer_port'] is not None:
739
+ try:
740
+ endpoint = backend_utils.get_endpoints(
741
+ cluster=common.SKY_SERVE_CONTROLLER_NAME,
742
+ port=service_record['load_balancer_port']).get(
743
+ service_record['load_balancer_port'], None)
744
+ except exceptions.ClusterNotUpError:
745
+ pass
746
+ else:
747
+ protocol = ('https'
748
+ if service_record['tls_encrypted'] else 'http')
749
+ service_record['endpoint'] = f'{protocol}://{endpoint}'
750
+
751
+ return service_records
592
752
 
593
753
 
594
754
  @usage_lib.entrypoint
@@ -599,7 +759,7 @@ def tail_logs(
599
759
  replica_id: Optional[int] = None,
600
760
  follow: bool = True,
601
761
  ) -> None:
602
- """Tail logs for a service.
762
+ """Tails logs for a service.
603
763
 
604
764
  Usage:
605
765
  sky.serve.tail_logs(
@@ -638,6 +798,7 @@ def tail_logs(
638
798
  with ux_utils.print_exception_no_traceback():
639
799
  raise ValueError(f'`target` must be a string or '
640
800
  f'sky.serve.ServiceComponent, got {type(target)}.')
801
+
641
802
  if target == serve_utils.ServiceComponent.REPLICA:
642
803
  if replica_id is None:
643
804
  with ux_utils.print_exception_no_traceback():
@@ -655,8 +816,28 @@ def tail_logs(
655
816
 
656
817
  backend = backend_utils.get_backend_from_handle(handle)
657
818
  assert isinstance(backend, backends.CloudVmRayBackend), backend
658
- backend.tail_serve_logs(handle,
659
- service_name,
660
- target,
661
- replica_id,
662
- follow=follow)
819
+
820
+ if target != serve_utils.ServiceComponent.REPLICA:
821
+ code = serve_utils.ServeCodeGen.stream_serve_process_logs(
822
+ service_name,
823
+ stream_controller=(
824
+ target == serve_utils.ServiceComponent.CONTROLLER),
825
+ follow=follow)
826
+ else:
827
+ assert replica_id is not None, service_name
828
+ code = serve_utils.ServeCodeGen.stream_replica_logs(
829
+ service_name, replica_id, follow)
830
+
831
+ # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
832
+ # kill the process, so we need to handle it manually here.
833
+ if threading.current_thread() is threading.main_thread():
834
+ signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
835
+ signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
836
+
837
+ # Refer to the notes in
838
+ # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
839
+ backend.run_on_head(handle,
840
+ code,
841
+ stream_logs=True,
842
+ process_stream=False,
843
+ ssh_mode=command_runner.SshMode.INTERACTIVE)