skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """User interface with the SkyServe."""
2
2
  import base64
3
3
  import collections
4
+ import dataclasses
4
5
  import enum
5
6
  import os
6
7
  import pathlib
@@ -23,15 +24,15 @@ import requests
23
24
  from sky import backends
24
25
  from sky import exceptions
25
26
  from sky import global_user_state
26
- from sky import status_lib
27
- from sky.backends import backend_utils
28
27
  from sky.serve import constants
29
28
  from sky.serve import serve_state
30
29
  from sky.skylet import constants as skylet_constants
31
30
  from sky.skylet import job_lib
32
31
  from sky.utils import common_utils
33
32
  from sky.utils import log_utils
33
+ from sky.utils import message_utils
34
34
  from sky.utils import resources_utils
35
+ from sky.utils import status_lib
35
36
  from sky.utils import ux_utils
36
37
 
37
38
  if typing.TYPE_CHECKING:
@@ -39,15 +40,19 @@ if typing.TYPE_CHECKING:
39
40
 
40
41
  from sky.serve import replica_managers
41
42
 
42
- SKY_SERVE_CONTROLLER_NAME: str = (
43
- f'sky-serve-controller-{common_utils.get_user_hash()}')
44
43
  _SYSTEM_MEMORY_GB = psutil.virtual_memory().total // (1024**3)
45
44
  NUM_SERVICE_THRESHOLD = (_SYSTEM_MEMORY_GB //
46
45
  constants.CONTROLLER_MEMORY_USAGE_GB)
47
46
  _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
48
47
 
49
- _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*'
50
- _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*'
48
+ # NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
49
+ # and always appear after a space. Be careful when changing UX as this
50
+ # assumption is used to expand some log files while ignoring others.
51
+ _SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
52
+ _SKYPILOT_PROVISION_LOG_PATTERN = (
53
+ fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
54
+ _SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
55
+
51
56
  # TODO(tian): Find all existing replica id and print here.
52
57
  _FAILED_TO_FIND_REPLICA_MSG = (
53
58
  f'{colorama.Fore.RED}Failed to find replica '
@@ -86,6 +91,19 @@ class UpdateMode(enum.Enum):
86
91
  BLUE_GREEN = 'blue_green'
87
92
 
88
93
 
94
+ @dataclasses.dataclass
95
+ class TLSCredential:
96
+ """TLS credential for the service."""
97
+ keyfile: str
98
+ certfile: str
99
+
100
+ def dump_uvicorn_kwargs(self) -> Dict[str, str]:
101
+ return {
102
+ 'ssl_keyfile': os.path.expanduser(self.keyfile),
103
+ 'ssl_certfile': os.path.expanduser(self.certfile),
104
+ }
105
+
106
+
89
107
  DEFAULT_UPDATE_MODE = UpdateMode.ROLLING
90
108
 
91
109
  _SIGNAL_TO_ERROR = {
@@ -104,7 +122,7 @@ ValueType = TypeVar('ValueType')
104
122
  class ThreadSafeDict(Generic[KeyType, ValueType]):
105
123
  """A thread-safe dict."""
106
124
 
107
- def __init__(self, *args, **kwargs) -> None:
125
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
108
126
  self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs)
109
127
  self._lock = threading.Lock()
110
128
 
@@ -237,6 +255,18 @@ def generate_replica_log_file_name(service_name: str, replica_id: int) -> str:
237
255
  return os.path.join(dir_name, f'replica_{replica_id}.log')
238
256
 
239
257
 
258
+ def generate_remote_tls_keyfile_name(service_name: str) -> str:
259
+ dir_name = generate_remote_service_dir_name(service_name)
260
+ # Don't expand here since it is used for remote machine.
261
+ return os.path.join(dir_name, 'tls_keyfile')
262
+
263
+
264
+ def generate_remote_tls_certfile_name(service_name: str) -> str:
265
+ dir_name = generate_remote_service_dir_name(service_name)
266
+ # Don't expand here since it is used for remote machine.
267
+ return os.path.join(dir_name, 'tls_certfile')
268
+
269
+
240
270
  def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
241
271
  return f'{service_name}-{replica_id}'
242
272
 
@@ -246,9 +276,11 @@ def set_service_status_and_active_versions_from_replica(
246
276
  update_mode: UpdateMode) -> None:
247
277
  record = serve_state.get_service_from_name(service_name)
248
278
  if record is None:
249
- raise ValueError('The service is up-ed in an old version and does not '
250
- 'support update. Please `sky serve down` '
251
- 'it first and relaunch the service.')
279
+ with ux_utils.print_exception_no_traceback():
280
+ raise ValueError(
281
+ 'The service is up-ed in an old version and does not '
282
+ 'support update. Please `sky serve down` '
283
+ 'it first and relaunch the service.')
252
284
  if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
253
285
  # When the service is shutting down, there is a period of time which the
254
286
  # controller still responds to the request, and the replica is not
@@ -289,7 +321,8 @@ def update_service_status() -> None:
289
321
  def update_service_encoded(service_name: str, version: int, mode: str) -> str:
290
322
  service_status = _get_service_status(service_name)
291
323
  if service_status is None:
292
- raise ValueError(f'Service {service_name!r} does not exist.')
324
+ with ux_utils.print_exception_no_traceback():
325
+ raise ValueError(f'Service {service_name!r} does not exist.')
293
326
  controller_port = service_status['controller_port']
294
327
  resp = requests.post(
295
328
  _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
@@ -299,14 +332,54 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
299
332
  'mode': mode,
300
333
  })
301
334
  if resp.status_code == 404:
302
- raise ValueError('The service is up-ed in an old version and does not '
303
- 'support update. Please `sky serve down` '
304
- 'it first and relaunch the service. ')
335
+ with ux_utils.print_exception_no_traceback():
336
+ raise ValueError(
337
+ 'The service is up-ed in an old version and does not '
338
+ 'support update. Please `sky serve down` '
339
+ 'it first and relaunch the service. ')
340
+ elif resp.status_code == 400:
341
+ with ux_utils.print_exception_no_traceback():
342
+ raise ValueError(f'Client error during service update: {resp.text}')
343
+ elif resp.status_code == 500:
344
+ with ux_utils.print_exception_no_traceback():
345
+ raise RuntimeError(
346
+ f'Server error during service update: {resp.text}')
305
347
  elif resp.status_code != 200:
306
- raise ValueError(f'Failed to update service: {resp.text}')
348
+ with ux_utils.print_exception_no_traceback():
349
+ raise ValueError(f'Failed to update service: {resp.text}')
307
350
 
308
351
  service_msg = resp.json()['message']
309
- return common_utils.encode_payload(service_msg)
352
+ return message_utils.encode_payload(service_msg)
353
+
354
+
355
+ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
356
+ service_status = _get_service_status(service_name)
357
+ if service_status is None:
358
+ with ux_utils.print_exception_no_traceback():
359
+ raise ValueError(f'Service {service_name!r} does not exist.')
360
+ replica_info = serve_state.get_replica_info_from_id(service_name,
361
+ replica_id)
362
+ if replica_info is None:
363
+ with ux_utils.print_exception_no_traceback():
364
+ raise ValueError(
365
+ f'Replica {replica_id} for service {service_name} does not '
366
+ 'exist.')
367
+
368
+ controller_port = service_status['controller_port']
369
+ resp = requests.post(
370
+ _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
371
+ '/controller/terminate_replica',
372
+ json={
373
+ 'replica_id': replica_id,
374
+ 'purge': purge,
375
+ })
376
+
377
+ message: str = resp.json()['message']
378
+ if resp.status_code != 200:
379
+ with ux_utils.print_exception_no_traceback():
380
+ raise ValueError(f'Failed to terminate replica {replica_id} '
381
+ f'in {service_name}. Reason:\n{message}')
382
+ return message
310
383
 
311
384
 
312
385
  def _get_service_status(
@@ -334,7 +407,7 @@ def _get_service_status(
334
407
 
335
408
 
336
409
  def get_service_status_encoded(service_names: Optional[List[str]]) -> str:
337
- service_statuses = []
410
+ service_statuses: List[Dict[str, str]] = []
338
411
  if service_names is None:
339
412
  # Get all service names
340
413
  service_names = serve_state.get_glob_service_names(None)
@@ -346,13 +419,28 @@ def get_service_status_encoded(service_names: Optional[List[str]]) -> str:
346
419
  k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
347
420
  for k, v in service_status.items()
348
421
  })
349
- return common_utils.encode_payload(service_statuses)
422
+ # We have to use payload_type here to avoid the issue of
423
+ # message_utils.decode_payload() not being able to correctly decode the
424
+ # message with <sky-payload> tags.
425
+ return message_utils.encode_payload(service_statuses,
426
+ payload_type='service_status')
350
427
 
351
428
 
352
429
  def load_service_status(payload: str) -> List[Dict[str, Any]]:
353
- service_statuses_encoded = common_utils.decode_payload(payload)
354
- service_statuses = []
430
+ try:
431
+ service_statuses_encoded = message_utils.decode_payload(
432
+ payload, payload_type='service_status')
433
+ except ValueError as e:
434
+ if 'Invalid payload string' in str(e):
435
+ # Backward compatibility for serve controller started before #4660
436
+ # where the payload type is not added.
437
+ service_statuses_encoded = message_utils.decode_payload(payload)
438
+ else:
439
+ raise
440
+ service_statuses: List[Dict[str, Any]] = []
355
441
  for service_status in service_statuses_encoded:
442
+ if not isinstance(service_status, dict):
443
+ raise ValueError(f'Invalid service status: {service_status}')
356
444
  service_statuses.append({
357
445
  k: pickle.loads(base64.b64decode(v))
358
446
  for k, v in service_status.items()
@@ -362,16 +450,16 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
362
450
 
363
451
  def add_version_encoded(service_name: str) -> str:
364
452
  new_version = serve_state.add_version(service_name)
365
- return common_utils.encode_payload(new_version)
453
+ return message_utils.encode_payload(new_version)
366
454
 
367
455
 
368
456
  def load_version_string(payload: str) -> str:
369
- return common_utils.decode_payload(payload)
457
+ return message_utils.decode_payload(payload)
370
458
 
371
459
 
372
460
  def _terminate_failed_services(
373
461
  service_name: str,
374
- service_status: serve_state.ServiceStatus) -> Optional[str]:
462
+ service_status: Optional[serve_state.ServiceStatus]) -> Optional[str]:
375
463
  """Terminate service in failed status.
376
464
 
377
465
  Services included in ServiceStatus.failed_statuses() do not have an
@@ -383,7 +471,7 @@ def _terminate_failed_services(
383
471
  A message indicating potential resource leak (if any). If no
384
472
  resource leak is detected, return None.
385
473
  """
386
- remaining_replica_clusters = []
474
+ remaining_replica_clusters: List[str] = []
387
475
  # The controller should have already attempted to terminate those
388
476
  # replicas, so we don't need to try again here.
389
477
  for replica_info in serve_state.get_replica_infos(service_name):
@@ -397,6 +485,7 @@ def _terminate_failed_services(
397
485
  generate_remote_service_dir_name(service_name))
398
486
  shutil.rmtree(service_dir)
399
487
  serve_state.remove_service(service_name)
488
+ serve_state.delete_all_versions(service_name)
400
489
 
401
490
  if not remaining_replica_clusters:
402
491
  return None
@@ -409,26 +498,35 @@ def _terminate_failed_services(
409
498
 
410
499
  def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
411
500
  service_names = serve_state.get_glob_service_names(service_names)
412
- terminated_service_names = []
413
- messages = []
501
+ terminated_service_names: List[str] = []
502
+ messages: List[str] = []
414
503
  for service_name in service_names:
415
504
  service_status = _get_service_status(service_name,
416
505
  with_replica_info=False)
417
- assert service_status is not None
418
- if service_status['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
506
+ if (service_status is not None and service_status['status']
507
+ == serve_state.ServiceStatus.SHUTTING_DOWN):
419
508
  # Already scheduled to be terminated.
420
509
  continue
421
- if (service_status['status']
510
+ # If the `services` and `version_specs` table are not aligned, it might
511
+ # result in a None service status. In this case, the controller process
512
+ # is not functioning as well and we should also use the
513
+ # `_terminate_failed_services` function to clean up the service.
514
+ # This is a safeguard for a rare case, that is accidentally abort
515
+ # between `serve_state.add_service` and
516
+ # `serve_state.add_or_update_version` in service.py.
517
+ if (service_status is None or service_status['status']
422
518
  in serve_state.ServiceStatus.failed_statuses()):
519
+ failed_status = (service_status['status']
520
+ if service_status is not None else None)
423
521
  if purge:
424
522
  message = _terminate_failed_services(service_name,
425
- service_status['status'])
523
+ failed_status)
426
524
  if message is not None:
427
525
  messages.append(message)
428
526
  else:
429
527
  messages.append(
430
528
  f'{colorama.Fore.YELLOW}Service {service_name!r} is in '
431
- f'failed status ({service_status["status"]}). Skipping '
529
+ f'failed status ({failed_status}). Skipping '
432
530
  'its termination as it could lead to a resource leak. '
433
531
  f'(Use `sky serve down {service_name} --purge` to '
434
532
  'forcefully terminate the service.)'
@@ -447,7 +545,7 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
447
545
  f.write(UserSignal.TERMINATE.value)
448
546
  f.flush()
449
547
  terminated_service_names.append(f'{service_name!r}')
450
- if len(terminated_service_names) == 0:
548
+ if not terminated_service_names:
451
549
  messages.append('No service to terminate.')
452
550
  else:
453
551
  identity_str = f'Service {terminated_service_names[0]} is'
@@ -472,7 +570,31 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
472
570
  Encoded load balancer port assigned to the service.
473
571
  """
474
572
  start_time = time.time()
573
+ setup_completed = False
475
574
  while True:
575
+ job_status = job_lib.get_status(job_id)
576
+ if job_status is None or job_status < job_lib.JobStatus.RUNNING:
577
+ # Wait for the controller process to finish setting up. It can be
578
+ # slow if a lot cloud dependencies are being installed.
579
+ if (time.time() - start_time >
580
+ constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
581
+ with ux_utils.print_exception_no_traceback():
582
+ raise RuntimeError(
583
+ f'Failed to start the controller '
584
+ f'process for the service {service_name!r} '
585
+ f'within '
586
+ f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS} seconds.'
587
+ )
588
+ # No need to check the service status as the controller process
589
+ # is still setting up.
590
+ time.sleep(1)
591
+ continue
592
+
593
+ if not setup_completed:
594
+ setup_completed = True
595
+ # Reset the start time to wait for the service to be registered.
596
+ start_time = time.time()
597
+
476
598
  record = serve_state.get_service_from_name(service_name)
477
599
  if record is not None:
478
600
  if job_id != record['controller_job_id']:
@@ -480,12 +602,11 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
480
602
  raise ValueError(
481
603
  f'The service {service_name!r} is already running. '
482
604
  'Please specify a different name for your service. '
483
- 'To update an existing service, run: `sky serve down` '
484
- 'and then `sky serve up` again (in-place update will '
485
- 'be supported in the future).')
605
+ 'To update an existing service, run: sky serve update '
606
+ f'{service_name} <new-service-yaml>')
486
607
  lb_port = record['load_balancer_port']
487
608
  if lb_port is not None:
488
- return common_utils.encode_payload(lb_port)
609
+ return message_utils.encode_payload(lb_port)
489
610
  elif len(serve_state.get_services()) >= NUM_SERVICE_THRESHOLD:
490
611
  with ux_utils.print_exception_no_traceback():
491
612
  raise RuntimeError('Max number of services reached. '
@@ -508,7 +629,7 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
508
629
 
509
630
 
510
631
  def load_service_initialization_result(payload: str) -> int:
511
- return common_utils.decode_payload(payload)
632
+ return message_utils.decode_payload(payload)
512
633
 
513
634
 
514
635
  def check_service_status_healthy(service_name: str) -> Optional[str]:
@@ -539,16 +660,27 @@ def get_latest_version_with_min_replicas(
539
660
  return active_versions[-1] if active_versions else None
540
661
 
541
662
 
542
- def _follow_replica_logs(
543
- file: TextIO,
544
- cluster_name: str,
545
- *,
546
- finish_stream: Callable[[], bool],
547
- exit_if_stream_end: bool = False,
548
- no_new_content_timeout: Optional[int] = None) -> Iterator[str]:
549
- line = ''
550
- log_file = None
551
- no_new_content_cnt = 0
663
+ def _follow_logs_with_provision_expanding(
664
+ file: TextIO,
665
+ cluster_name: str,
666
+ *,
667
+ should_stop: Callable[[], bool],
668
+ stop_on_eof: bool = False,
669
+ idle_timeout_seconds: Optional[int] = None,
670
+ ) -> Iterator[str]:
671
+ """Follows logs and expands any provision.log references found.
672
+
673
+ Args:
674
+ file: Log file to read from.
675
+ cluster_name: Name of the cluster being launched.
676
+ should_stop: Callback that returns True when streaming should stop.
677
+ stop_on_eof: If True, stop when reaching end of file.
678
+ idle_timeout_seconds: If set, stop after these many seconds without
679
+ new content.
680
+
681
+ Yields:
682
+ Log lines, including expanded content from referenced provision logs.
683
+ """
552
684
 
553
685
  def cluster_is_up() -> bool:
554
686
  cluster_record = global_user_state.get_cluster_from_name(cluster_name)
@@ -556,51 +688,51 @@ def _follow_replica_logs(
556
688
  return False
557
689
  return cluster_record['status'] == status_lib.ClusterStatus.UP
558
690
 
559
- while True:
560
- tmp = file.readline()
561
- if tmp is not None and tmp != '':
562
- no_new_content_cnt = 0
563
- line += tmp
564
- if '\n' in line or '\r' in line:
565
- # Tailing detailed progress for user. All logs in skypilot is
566
- # of format `To view detailed progress: tail -n100 -f *.log`.
567
- x = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
568
- if x is not None:
569
- log_file = os.path.expanduser(x.group(1))
570
- elif re.match(_SKYPILOT_LOG_PATTERN, line) is None:
571
- # Not print other logs (file sync logs) since we lack
572
- # utility to determine when these log files are finished
573
- # writing.
574
- # TODO(tian): Not skip these logs since there are small
575
- # chance that error will happen in file sync. Need to find
576
- # a better way to do this.
577
- yield line
578
- # Output next line first since it indicates the process is
579
- # starting. For our launching logs, it's always:
580
- # Launching on <cloud> <region> (<zone>)
581
- if log_file is not None:
582
- with open(log_file, 'r', newline='',
583
- encoding='utf-8') as f:
584
- # We still exit if more than 10 seconds without new
585
- # content to avoid any internal bug that causes
586
- # the launch failed and cluster status remains INIT.
587
- for l in _follow_replica_logs(
588
- f,
589
- cluster_name,
590
- finish_stream=cluster_is_up,
591
- exit_if_stream_end=exit_if_stream_end,
592
- no_new_content_timeout=10):
593
- yield l
594
- log_file = None
595
- line = ''
596
- else:
597
- if exit_if_stream_end or finish_stream():
598
- break
599
- if no_new_content_timeout is not None:
600
- if no_new_content_cnt >= no_new_content_timeout:
601
- break
602
- no_new_content_cnt += 1
603
- time.sleep(1)
691
+ def process_line(line: str) -> Iterator[str]:
692
+ # The line might be directing users to view logs, like
693
+ # `✓ Cluster launched: new-http. View logs at: *.log`
694
+ # We should tail the detailed logs for the user.
695
+ provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
696
+ log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
697
+
698
+ if provision_log_prompt is not None:
699
+ nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
700
+
701
+ try:
702
+ with open(nested_log_path, 'r', newline='',
703
+ encoding='utf-8') as f:
704
+ # We still exit after more than 10 seconds without new content
705
+ # to avoid any internal bug that causes the launch to fail
706
+ # while the cluster status remains INIT.
707
+ yield from log_utils.follow_logs(f,
708
+ should_stop=cluster_is_up,
709
+ stop_on_eof=stop_on_eof,
710
+ idle_timeout_seconds=10)
711
+ except FileNotFoundError:
712
+ yield line
713
+
714
+ yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
715
+ f'Try to expand log file {nested_log_path} but not '
716
+ f'found. Skipping...{colorama.Style.RESET_ALL}')
717
+ pass
718
+ return
719
+
720
+ if log_prompt is not None:
721
+ # For now we skip other logs (file sync logs) since we lack a
721
+ # utility to determine when these log files are finished
722
+ # writing.
723
+ # TODO(tian): We should not skip these logs since there is a
725
+ # small chance that an error will happen in file sync. Need to
726
+ # find a better way to do this.
727
+ return
728
+
729
+ yield line
730
+
731
+ return log_utils.follow_logs(file,
732
+ should_stop=should_stop,
733
+ stop_on_eof=stop_on_eof,
734
+ process_line=process_line,
735
+ idle_timeout_seconds=idle_timeout_seconds)
604
736
 
605
737
 
606
738
  def stream_replica_logs(service_name: str, replica_id: int,
@@ -631,17 +763,21 @@ def stream_replica_logs(service_name: str, replica_id: int,
631
763
  for info in replica_info:
632
764
  if info.replica_id == replica_id:
633
765
  return info.status
634
- raise ValueError(
635
- _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id))
766
+ with ux_utils.print_exception_no_traceback():
767
+ raise ValueError(
768
+ _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id))
636
769
 
637
- finish_stream = (
770
+ replica_provisioned = (
638
771
  lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
639
772
  with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
640
- for line in _follow_replica_logs(f,
641
- replica_cluster_name,
642
- finish_stream=finish_stream,
643
- exit_if_stream_end=not follow):
773
+ for line in _follow_logs_with_provision_expanding(
774
+ f,
775
+ replica_cluster_name,
776
+ should_stop=replica_provisioned,
777
+ stop_on_eof=not follow,
778
+ ):
644
779
  print(line, end='', flush=True)
780
+
645
781
  if (not follow and
646
782
  _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
647
783
  # Early exit if not following the logs.
@@ -666,22 +802,6 @@ def stream_replica_logs(service_name: str, replica_id: int,
666
802
  return ''
667
803
 
668
804
 
669
- def _follow_logs(file: TextIO, *, finish_stream: Callable[[], bool],
670
- exit_if_stream_end: bool) -> Iterator[str]:
671
- line = ''
672
- while True:
673
- tmp = file.readline()
674
- if tmp is not None and tmp != '':
675
- line += tmp
676
- if '\n' in line or '\r' in line:
677
- yield line
678
- line = ''
679
- else:
680
- if exit_if_stream_end or finish_stream():
681
- break
682
- time.sleep(1)
683
-
684
-
685
805
  def stream_serve_process_logs(service_name: str, stream_controller: bool,
686
806
  follow: bool) -> str:
687
807
  msg = check_service_status_healthy(service_name)
@@ -700,9 +820,11 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
700
820
 
701
821
  with open(os.path.expanduser(log_file), 'r', newline='',
702
822
  encoding='utf-8') as f:
703
- for line in _follow_logs(f,
704
- finish_stream=_service_is_terminal,
705
- exit_if_stream_end=not follow):
823
+ for line in log_utils.follow_logs(
824
+ f,
825
+ should_stop=_service_is_terminal,
826
+ stop_on_eof=not follow,
827
+ ):
706
828
  print(line, end='', flush=True)
707
829
  return ''
708
830
 
@@ -721,28 +843,6 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
721
843
  return f'{ready_replica_num}/{total_replica_num}'
722
844
 
723
845
 
724
- def get_endpoint(service_record: Dict[str, Any]) -> str:
725
- # Don't use backend_utils.is_controller_up since it is too slow.
726
- handle = global_user_state.get_handle_from_cluster_name(
727
- SKY_SERVE_CONTROLLER_NAME)
728
- assert isinstance(handle, backends.CloudVmRayResourceHandle)
729
- if handle is None:
730
- return '-'
731
- load_balancer_port = service_record['load_balancer_port']
732
- if load_balancer_port is None:
733
- return '-'
734
- try:
735
- endpoint = backend_utils.get_endpoints(handle.cluster_name,
736
- load_balancer_port).get(
737
- load_balancer_port, None)
738
- except exceptions.ClusterNotUpError:
739
- return '-'
740
- if endpoint is None:
741
- return '-'
742
- assert isinstance(endpoint, str), endpoint
743
- return endpoint
744
-
745
-
746
846
  def format_service_table(service_records: List[Dict[str, Any]],
747
847
  show_all: bool) -> str:
748
848
  if not service_records:
@@ -752,10 +852,12 @@ def format_service_table(service_records: List[Dict[str, Any]],
752
852
  'NAME', 'VERSION', 'UPTIME', 'STATUS', 'REPLICAS', 'ENDPOINT'
753
853
  ]
754
854
  if show_all:
755
- service_columns.extend(['POLICY', 'REQUESTED_RESOURCES'])
855
+ service_columns.extend([
856
+ 'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES'
857
+ ])
756
858
  service_table = log_utils.create_table(service_columns)
757
859
 
758
- replica_infos = []
860
+ replica_infos: List[Dict[str, Any]] = []
759
861
  for record in service_records:
760
862
  for replica in record['replica_info']:
761
863
  replica['service_name'] = record['name']
@@ -770,14 +872,12 @@ def format_service_table(service_records: List[Dict[str, Any]],
770
872
  service_status = record['status']
771
873
  status_str = service_status.colored_str()
772
874
  replicas = _get_replicas(record)
773
- endpoint = get_endpoint(record)
875
+ endpoint = record['endpoint']
876
+ if endpoint is None:
877
+ endpoint = '-'
774
878
  policy = record['policy']
775
- # TODO(tian): Backward compatibility.
776
- # Remove `requested_resources` field after 2 minor release, 0.6.0.
777
- if record.get('requested_resources_str') is None:
778
- requested_resources_str = str(record['requested_resources'])
779
- else:
780
- requested_resources_str = record['requested_resources_str']
879
+ requested_resources_str = record['requested_resources_str']
880
+ load_balancing_policy = record['load_balancing_policy']
781
881
 
782
882
  service_values = [
783
883
  service_name,
@@ -788,7 +888,8 @@ def format_service_table(service_records: List[Dict[str, Any]],
788
888
  endpoint,
789
889
  ]
790
890
  if show_all:
791
- service_values.extend([policy, requested_resources_str])
891
+ service_values.extend(
892
+ [policy, load_balancing_policy, requested_resources_str])
792
893
  service_table.add_row(service_values)
793
894
 
794
895
  replica_table = _format_replica_table(replica_infos, show_all)
@@ -830,7 +931,8 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
830
931
  region = '-'
831
932
  zone = '-'
832
933
 
833
- replica_handle: 'backends.CloudVmRayResourceHandle' = record['handle']
934
+ replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
935
+ 'handle']
834
936
  if replica_handle is not None:
835
937
  resources_str = resources_utils.get_readable_resources_repr(
836
938
  replica_handle, simplify=not show_all)
@@ -902,6 +1004,18 @@ class ServeCodeGen:
902
1004
  ]
903
1005
  return cls._build(code)
904
1006
 
1007
+ @classmethod
1008
+ def terminate_replica(cls, service_name: str, replica_id: int,
1009
+ purge: bool) -> str:
1010
+ code = [
1011
+ f'(lambda: print(serve_utils.terminate_replica({service_name!r}, '
1012
+ f'{replica_id}, {purge}), end="", flush=True) '
1013
+ 'if getattr(constants, "SERVE_VERSION", 0) >= 2 else '
1014
+ f'exec("raise RuntimeError('
1015
+ f'{constants.TERMINATE_REPLICA_VERSION_MISMATCH_ERROR!r})"))()'
1016
+ ]
1017
+ return cls._build(code)
1018
+
905
1019
  @classmethod
906
1020
  def wait_service_registration(cls, service_name: str, job_id: int) -> str:
907
1021
  code = [
@@ -933,21 +1047,18 @@ class ServeCodeGen:
933
1047
  def _build(cls, code: List[str]) -> str:
934
1048
  code = cls._PREFIX + code
935
1049
  generated_code = '; '.join(code)
936
- return (f'{skylet_constants.SKY_PYTHON_CMD} '
1050
+ # Use the local user id to make sure the operation goes to the correct
1051
+ # user.
1052
+ return (f'export {skylet_constants.USER_ID_ENV_VAR}='
1053
+ f'"{common_utils.get_user_hash()}"; '
1054
+ f'{skylet_constants.SKY_PYTHON_CMD} '
937
1055
  f'-u -c {shlex.quote(generated_code)}')
938
1056
 
939
1057
  @classmethod
940
1058
  def update_service(cls, service_name: str, version: int, mode: str) -> str:
941
1059
  code = [
942
- # Backward compatibility for old serve version on the remote
943
- # machine. The `mode` argument was added in #3249, and if the remote
944
- # machine has an old SkyPilot version before that, we need to avoid
945
- # passing the `mode` argument to the job_lib functions.
946
- # TODO(zhwu): Remove this in 0.7.0 release.
947
- f'mode_kwargs = {{"mode": {mode!r}}} '
948
- 'if getattr(constants, "SERVE_VERSION", 0) >= 1 else {}',
949
1060
  f'msg = serve_utils.update_service_encoded({service_name!r}, '
950
- f'{version}, **mode_kwargs)',
1061
+ f'{version}, mode={mode!r})',
951
1062
  'print(msg, end="", flush=True)',
952
1063
  ]
953
1064
  return cls._build(code)