skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +452 -53
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/data_utils.py +21 -1
  26. sky/data/storage.py +12 -0
  27. sky/jobs/__init__.py +3 -0
  28. sky/jobs/client/sdk.py +80 -3
  29. sky/jobs/controller.py +76 -25
  30. sky/jobs/recovery_strategy.py +80 -34
  31. sky/jobs/scheduler.py +68 -20
  32. sky/jobs/server/core.py +228 -136
  33. sky/jobs/server/server.py +40 -0
  34. sky/jobs/state.py +129 -24
  35. sky/jobs/utils.py +109 -51
  36. sky/provision/nebius/constants.py +3 -0
  37. sky/provision/runpod/utils.py +27 -12
  38. sky/py.typed +0 -0
  39. sky/resources.py +16 -12
  40. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  41. sky/serve/autoscalers.py +8 -0
  42. sky/serve/client/impl.py +188 -0
  43. sky/serve/client/sdk.py +12 -82
  44. sky/serve/constants.py +5 -1
  45. sky/serve/controller.py +5 -0
  46. sky/serve/replica_managers.py +112 -37
  47. sky/serve/serve_state.py +16 -6
  48. sky/serve/serve_utils.py +274 -77
  49. sky/serve/server/core.py +8 -525
  50. sky/serve/server/impl.py +709 -0
  51. sky/serve/service.py +13 -9
  52. sky/serve/service_spec.py +74 -4
  53. sky/server/constants.py +1 -1
  54. sky/server/daemons.py +164 -0
  55. sky/server/requests/payloads.py +33 -0
  56. sky/server/requests/requests.py +2 -107
  57. sky/server/requests/serializers/decoders.py +12 -3
  58. sky/server/requests/serializers/encoders.py +13 -2
  59. sky/server/server.py +2 -1
  60. sky/server/uvicorn.py +2 -1
  61. sky/sky_logging.py +30 -0
  62. sky/skylet/constants.py +2 -1
  63. sky/skylet/events.py +9 -0
  64. sky/skypilot_config.py +24 -21
  65. sky/task.py +41 -11
  66. sky/templates/jobs-controller.yaml.j2 +3 -0
  67. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  68. sky/users/server.py +1 -1
  69. sky/utils/command_runner.py +4 -2
  70. sky/utils/controller_utils.py +14 -10
  71. sky/utils/dag_utils.py +4 -2
  72. sky/utils/db/migration_utils.py +2 -4
  73. sky/utils/schemas.py +47 -19
  74. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
  75. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
  76. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
  77. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
  78. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
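The most substantial change below is a refactor of sky/serve/server/core.py: the public SkyServe entrypoints (up, update, down, status) keep their signatures but now delegate to the new sky/serve/server/impl.py (+709 lines), passing pool=False, apparently so the same implementation can also back the cluster-pool work visible elsewhere in this release (e.g. sky/schemas/db/spot_jobs/002_cluster_pool.py). A minimal sketch of that delegation pattern follows; the signatures are simplified assumptions for illustration, and only the forwarding shape with pool=False is taken from the diff below.

# Illustrative sketch only: simplified signatures, not the exact ones in the
# wheel. The pattern shown (thin public wrappers forwarding to impl with
# pool=False) is what the core.py diff below introduces.
from typing import List, Optional, Tuple, Union

from sky.serve.server import impl


def up(task, service_name: Optional[str] = None) -> Tuple[str, str]:
    # Public API stays put; the provisioning logic now lives in impl.up().
    return impl.up(task, service_name, pool=False)


def down(service_names: Optional[Union[str, List[str]]] = None,
         all: bool = False,  # pylint: disable=redefined-builtin
         purge: bool = False) -> None:
    # Same facade pattern for teardown.
    return impl.down(service_names, all, purge, pool=False)

Keeping core.py as a thin facade leaves sky serve callers untouched while the pool-aware logic lands in one place.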
sky/serve/server/core.py CHANGED
@@ -1,83 +1,36 @@
 """SkyServe core APIs."""
 import pathlib
-import re
 import signal
-import tempfile
 import threading
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
-import colorama
-
-import sky
 from sky import backends
 from sky import exceptions
-from sky import execution
 from sky import sky_logging
-from sky import skypilot_config
-from sky import task as task_lib
 from sky.backends import backend_utils
-from sky.catalog import common as service_catalog_common
-from sky.data import storage as storage_lib
-from sky.serve import constants as serve_constants
-from sky.serve import serve_state
 from sky.serve import serve_utils
-from sky.skylet import constants
+from sky.serve.server import impl
 from sky.usage import usage_lib
-from sky.utils import admin_policy_utils
 from sky.utils import command_runner
-from sky.utils import common
-from sky.utils import common_utils
 from sky.utils import controller_utils
-from sky.utils import dag_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-    from sky.backends import cloud_vm_ray_backend
+    import sky
 
 logger = sky_logging.init_logger(__name__)
 
 
-def _rewrite_tls_credential_paths_and_get_tls_env_vars(
-        service_name: str, task: 'sky.Task') -> Dict[str, Any]:
-    """Rewrite the paths of TLS credentials in the task.
-
-    Args:
-        service_name: Name of the service.
-        task: sky.Task to rewrite.
-
-    Returns:
-        The generated template variables for TLS.
-    """
-    service_spec = task.service
-    # Already checked by validate_service_task
-    assert service_spec is not None
-    if service_spec.tls_credential is None:
-        return {'use_tls': False}
-    remote_tls_keyfile = (
-        serve_utils.generate_remote_tls_keyfile_name(service_name))
-    remote_tls_certfile = (
-        serve_utils.generate_remote_tls_certfile_name(service_name))
-    tls_template_vars = {
-        'use_tls': True,
-        'remote_tls_keyfile': remote_tls_keyfile,
-        'remote_tls_certfile': remote_tls_certfile,
-        'local_tls_keyfile': service_spec.tls_credential.keyfile,
-        'local_tls_certfile': service_spec.tls_credential.certfile,
-    }
-    service_spec.tls_credential = serve_utils.TLSCredential(
-        remote_tls_keyfile, remote_tls_certfile)
-    return tls_template_vars
-
-
 def _get_all_replica_targets(
     service_name: str, backend: backends.CloudVmRayBackend,
    handle: backends.CloudVmRayResourceHandle
 ) -> Set[serve_utils.ServiceComponentTarget]:
     """Helper function to get targets for all live replicas."""
-    code = serve_utils.ServeCodeGen.get_service_status([service_name])
+    code = serve_utils.ServeCodeGen.get_service_status([service_name],
+                                                       pool=False)
     returncode, serve_status_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -125,236 +78,7 @@ def up(
             argument.
         endpoint: str; The service endpoint.
     """
-    task.validate()
-    if service_name is None:
-        service_name = serve_utils.generate_service_name()
-
-    # The service name will be used as:
-    # 1. controller cluster name: 'sky-serve-controller-<service_name>'
-    # 2. replica cluster name: '<service_name>-<replica_id>'
-    # In both cases, service name shares the same regex with cluster name.
-    if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Service name {service_name!r} is invalid: '
-                             f'ensure it is fully matched by regex (e.g., '
-                             'only contains lower letters, numbers and dash): '
-                             f'{constants.CLUSTER_NAME_VALID_REGEX}')
-
-    serve_utils.validate_service_task(task)
-    dag = dag_utils.convert_entrypoint_to_dag(task)
-    dag.resolve_and_validate_volumes()
-    # Always apply the policy again here, even though it might have been applied
-    # in the CLI. This is to ensure that we apply the policy to the final DAG
-    # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(dag)
-    dag.pre_mount_volumes()
-    task = dag.tasks[0]
-
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Initializing service')):
-        # Handle file mounts using two-hop approach when cloud storage
-        # unavailable
-        storage_clouds = (
-            storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
-        force_disable_cloud_bucket = skypilot_config.get_nested(
-            ('serve', 'force_disable_cloud_bucket'), False)
-        if storage_clouds and not force_disable_cloud_bucket:
-            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-                task, task_type='serve')
-            local_to_controller_file_mounts = {}
-        else:
-            # Fall back to two-hop file_mount uploading when no cloud storage
-            if task.storage_mounts:
-                raise exceptions.NotSupportedError(
-                    'Cloud-based file_mounts are specified, but no cloud '
-                    'storage is available. Please specify local '
-                    'file_mounts only.')
-            local_to_controller_file_mounts = (
-                controller_utils.translate_local_file_mounts_to_two_hop(task))
-
-    tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
-        service_name, task)
-
-    with tempfile.NamedTemporaryFile(
-            prefix=f'service-task-{service_name}-',
-            mode='w',
-    ) as service_file, tempfile.NamedTemporaryFile(
-            prefix=f'controller-task-{service_name}-',
-            mode='w',
-    ) as controller_file:
-        controller_name = common.SKY_SERVE_CONTROLLER_NAME
-        task_config = task.to_yaml_config()
-        common_utils.dump_yaml(service_file.name, task_config)
-        remote_tmp_task_yaml_path = (
-            serve_utils.generate_remote_tmp_task_yaml_file_name(service_name))
-        remote_config_yaml_path = (
-            serve_utils.generate_remote_config_yaml_file_name(service_name))
-        controller_log_file = (
-            serve_utils.generate_remote_controller_log_file_name(service_name))
-        controller_resources = controller_utils.get_controller_resources(
-            controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-            task_resources=task.resources)
-
-        vars_to_fill = {
-            'remote_task_yaml_path': remote_tmp_task_yaml_path,
-            'local_task_yaml_path': service_file.name,
-            'service_name': service_name,
-            'controller_log_file': controller_log_file,
-            'remote_user_config_path': remote_config_yaml_path,
-            'local_to_controller_file_mounts': local_to_controller_file_mounts,
-            'modified_catalogs':
-                service_catalog_common.get_modified_catalog_file_mounts(),
-            **tls_template_vars,
-            **controller_utils.shared_controller_vars_to_fill(
-                controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-                remote_user_config_path=remote_config_yaml_path,
-                local_user_config=mutated_user_config,
-            ),
-        }
-        common_utils.fill_template(serve_constants.CONTROLLER_TEMPLATE,
-                                   vars_to_fill,
-                                   output_path=controller_file.name)
-        controller_task = task_lib.Task.from_yaml(controller_file.name)
-        # TODO(tian): Probably run another sky.launch after we get the load
-        # balancer port from the controller? So we don't need to open so many
-        # ports here. Or, we should have a nginx traffic control to refuse
-        # any connection to the unregistered ports.
-        controller_resources = {
-            r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
-            for r in controller_resources
-        }
-        controller_task.set_resources(controller_resources)
-
-        # # Set service_name so the backend will know to modify default ray
-        # task CPU usage to custom value instead of default 0.5 vCPU. We need
-        # to set it to a smaller value to support a larger number of services.
-        controller_task.service_name = service_name
-
-        print(f'{colorama.Fore.YELLOW}Launching controller for '
-              f'{service_name!r}...{colorama.Style.RESET_ALL}')
-        # We directly submit the request to the controller and let the
-        # controller to check name conflict. Suppose we have multiple
-        # sky.serve.up() with same service name, the first one will
-        # successfully write its job id to controller service database;
-        # and for all following sky.serve.up(), the controller will throw
-        # an exception (name conflict detected) and exit. Therefore the
-        # controller job id in database could be use as an indicator of
-        # whether the service is already running. If the id is the same
-        # with the current job id, we know the service is up and running
-        # for the first time; otherwise it is a name conflict.
-        # Since the controller may be shared among multiple users, launch the
-        # controller with the API server's user hash.
-        with common.with_server_user():
-            with skypilot_config.local_active_workspace_ctx(
-                    constants.SKYPILOT_DEFAULT_WORKSPACE):
-                controller_job_id, controller_handle = execution.launch(
-                    task=controller_task,
-                    cluster_name=controller_name,
-                    retry_until_up=True,
-                    _disable_controller_check=True,
-                )
-
-        style = colorama.Style
-        fore = colorama.Fore
-
-        assert controller_job_id is not None and controller_handle is not None
-        # TODO(tian): Cache endpoint locally to speedup. Endpoint won't
-        # change after the first time, so there is no consistency issue.
-        with rich_utils.safe_status(
-                ux_utils.spinner_message(
-                    'Waiting for the service to register')):
-            # This function will check the controller job id in the database
-            # and return the endpoint if the job id matches. Otherwise it will
-            # return None.
-            code = serve_utils.ServeCodeGen.wait_service_registration(
-                service_name, controller_job_id)
-            backend = backend_utils.get_backend_from_handle(controller_handle)
-            assert isinstance(backend, backends.CloudVmRayBackend)
-            assert isinstance(controller_handle,
-                              backends.CloudVmRayResourceHandle)
-            returncode, lb_port_payload, _ = backend.run_on_head(
-                controller_handle,
-                code,
-                require_outputs=True,
-                stream_logs=False)
-        try:
-            subprocess_utils.handle_returncode(
-                returncode, code, 'Failed to wait for service initialization',
-                lb_port_payload)
-        except exceptions.CommandError:
-            statuses = backend.get_job_status(controller_handle,
-                                              [controller_job_id],
-                                              stream_logs=False)
-            controller_job_status = list(statuses.values())[0]
-            if controller_job_status == sky.JobStatus.PENDING:
-                # Max number of services reached due to vCPU constraint.
-                # The controller job is pending due to ray job scheduling.
-                # We manually cancel the job here.
-                backend.cancel_jobs(controller_handle, [controller_job_id])
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        'Max number of services reached. '
-                        'To spin up more services, please '
-                        'tear down some existing services.') from None
-            else:
-                # Possible cases:
-                # (1) name conflict;
-                # (2) max number of services reached due to memory
-                # constraint. The job will successfully run on the
-                # controller, but there will be an error thrown due
-                # to memory constraint check in the controller.
-                # See sky/serve/service.py for more details.
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        'Failed to spin up the service. Please '
-                        'check the logs above for more details.') from None
-        else:
-            lb_port = serve_utils.load_service_initialization_result(
-                lb_port_payload)
-            socket_endpoint = backend_utils.get_endpoints(
-                controller_handle.cluster_name, lb_port,
-                skip_status_check=True).get(lb_port)
-            assert socket_endpoint is not None, (
-                'Did not get endpoint for controller.')
-            # Already checked by validate_service_task
-            assert task.service is not None
-            protocol = ('http'
-                        if task.service.tls_credential is None else 'https')
-            socket_endpoint = socket_endpoint.replace('https://', '').replace(
-                'http://', '')
-            endpoint = f'{protocol}://{socket_endpoint}'
-
-        logger.info(
-            f'{fore.CYAN}Service name: '
-            f'{style.BRIGHT}{service_name}{style.RESET_ALL}'
-            f'\n{fore.CYAN}Endpoint URL: '
-            f'{style.BRIGHT}{endpoint}{style.RESET_ALL}'
-            f'\n📋 Useful Commands'
-            f'\n{ux_utils.INDENT_SYMBOL}To check service status:\t'
-            f'{ux_utils.BOLD}sky serve status {service_name} '
-            f'[--endpoint]{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To teardown the service:\t'
-            f'{ux_utils.BOLD}sky serve down {service_name}'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To see replica logs:\t'
-            f'{ux_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To see load balancer logs:\t'
-            f'{ux_utils.BOLD}sky serve logs --load-balancer {service_name}'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To see controller logs:\t'
-            f'{ux_utils.BOLD}sky serve logs --controller {service_name}'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To monitor the status:\t'
-            f'{ux_utils.BOLD}watch -n10 sky serve status {service_name}'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_LAST_SYMBOL}To send a test request:\t'
-            f'{ux_utils.BOLD}curl {endpoint}'
-            f'{ux_utils.RESET_BOLD}'
-            '\n\n' +
-            ux_utils.finishing_message('Service is spinning up and replicas '
-                                       'will be ready shortly.'))
-        return service_name, endpoint
+    return impl.up(task, service_name, pool=False)
 
 
 @usage_lib.entrypoint
@@ -371,153 +95,7 @@ def update(
         service_name: Name of the service.
         mode: Update mode.
     """
-    task.validate()
-    serve_utils.validate_service_task(task)
-
-    # Always apply the policy again here, even though it might have been applied
-    # in the CLI. This is to ensure that we apply the policy to the final DAG
-    # and get the mutated config.
-    # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
-    # will not apply the config.
-    dag, _ = admin_policy_utils.apply(task)
-    task = dag.tasks[0]
-
-    assert task.service is not None
-    if task.service.tls_credential is not None:
-        logger.warning('Updating TLS keyfile and certfile is not supported. '
-                       'Any updates to the keyfile and certfile will not take '
-                       'effect. To update TLS keyfile and certfile, please '
-                       'tear down the service and spin up a new one.')
-
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-        stopped_message=
-        'Service controller is stopped. There is no service to update. '
-        f'To spin up a new service, use {ux_utils.BOLD}'
-        f'sky serve up{ux_utils.RESET_BOLD}',
-        non_existent_message='Service does not exist. '
-        'To spin up a new service, '
-        f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
-    )
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-
-    code = serve_utils.ServeCodeGen.get_service_status([service_name])
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code, 'Failed to get service status '
-                                           'when update service',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_statuses = serve_utils.load_service_status(serve_status_payload)
-    if not service_statuses:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(f'Cannot find service {service_name!r}.'
-                               f'To spin up a service, use {ux_utils.BOLD}'
-                               f'sky serve up{ux_utils.RESET_BOLD}')
-
-    if len(service_statuses) > 1:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(
-                f'Multiple services found for {service_name!r}. ')
-    service_record = service_statuses[0]
-    prompt = None
-    if (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED
-       ):
-        prompt = (f'Service {service_name!r} has a failed controller. '
-                  'Please clean up the service and try again.')
-    elif (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT
-         ):
-        prompt = (f'Service {service_name!r} is still initializing '
-                  'its controller. Please try again later.')
-    if prompt is not None:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(prompt)
-
-    original_lb_policy = service_record['load_balancing_policy']
-    assert task.service is not None, 'Service section not found.'
-    if original_lb_policy != task.service.load_balancing_policy:
-        logger.warning(
-            f'{colorama.Fore.YELLOW}Current load balancing policy '
-            f'{original_lb_policy!r} is different from the new policy '
-            f'{task.service.load_balancing_policy!r}. Updating the load '
-            'balancing policy is not supported yet and it will be ignored. '
-            'The service will continue to use the current load balancing '
-            f'policy.{colorama.Style.RESET_ALL}')
-
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Initializing service')):
-        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-            task, task_type='serve')
-
-    code = serve_utils.ServeCodeGen.add_version(service_name)
-    returncode, version_string_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to add version',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    version_string = serve_utils.load_version_string(version_string_payload)
-    try:
-        current_version = int(version_string)
-    except ValueError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Failed to parse version: {version_string}; '
-                             f'Returncode: {returncode}') from e
-
-    print(f'New version: {current_version}')
-    with tempfile.NamedTemporaryFile(
-            prefix=f'{service_name}-v{current_version}',
-            mode='w') as service_file:
-        task_config = task.to_yaml_config()
-        common_utils.dump_yaml(service_file.name, task_config)
-        remote_task_yaml_path = serve_utils.generate_task_yaml_file_name(
-            service_name, current_version, expand_user=False)
-
-        backend.sync_file_mounts(handle,
-                                 {remote_task_yaml_path: service_file.name},
-                                 storage_mounts=None)
-
-        code = serve_utils.ServeCodeGen.update_service(service_name,
-                                                       current_version,
-                                                       mode=mode.value)
-        returncode, _, stderr = backend.run_on_head(handle,
-                                                    code,
-                                                    require_outputs=True,
-                                                    stream_logs=False,
-                                                    separate_stderr=True)
-        try:
-            subprocess_utils.handle_returncode(returncode,
-                                               code,
-                                               'Failed to update services',
-                                               stderr,
-                                               stream_logs=True)
-        except exceptions.CommandError as e:
-            raise RuntimeError(e.error_msg) from e
-
-    print(f'{colorama.Fore.GREEN}Service {service_name!r} update scheduled.'
-          f'{colorama.Style.RESET_ALL}\n'
-          f'Please use {ux_utils.BOLD}sky serve status {service_name} '
-          f'{ux_utils.RESET_BOLD}to check the latest status.')
+    return impl.update(task, service_name, mode, pool=False)
 
 
 @usage_lib.entrypoint
@@ -542,46 +120,7 @@ def down(
         ValueError: if the arguments are invalid.
         RuntimeError: if failed to terminate the service.
     """
-    if service_names is None:
-        service_names = []
-    if isinstance(service_names, str):
-        service_names = [service_names]
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-        stopped_message='All services should have terminated.')
-
-    service_names_str = ','.join(service_names)
-    if sum([bool(service_names), all]) != 1:
-        argument_str = (f'service_names={service_names_str}'
-                        if service_names else '')
-        argument_str += ' all' if all else ''
-        raise ValueError('Can only specify one of service_names or all. '
-                         f'Provided {argument_str!r}.')
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-    service_names = None if all else service_names
-    code = serve_utils.ServeCodeGen.terminate_services(service_names, purge)
-
-    try:
-        returncode, stdout, _ = backend.run_on_head(handle,
-                                                    code,
-                                                    require_outputs=True,
-                                                    stream_logs=False)
-    except exceptions.FetchClusterInfoError as e:
-        raise RuntimeError(
-            'Failed to fetch controller IP. Please refresh controller status '
-            f'by `sky status -r {common.SKY_SERVE_CONTROLLER_NAME}` '
-            'and try again.') from e
-
-    try:
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to terminate service',
-                                           stdout)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    logger.info(stdout)
+    return impl.down(service_names, all, purge, pool=False)
 
 
 @usage_lib.entrypoint
@@ -690,63 +229,7 @@ def status(
         RuntimeError: if failed to get the service status.
         exceptions.ClusterNotUpError: if the sky serve controller is not up.
     """
-    if service_names is not None:
-        if isinstance(service_names, str):
-            service_names = [service_names]
-
-    try:
-        backend_utils.check_network_connection()
-    except exceptions.NetworkError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(
-                'Failed to refresh service status due to network error.') from e
-
-    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_type,
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-
-    code = serve_utils.ServeCodeGen.get_service_status(service_names)
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to fetch services',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_records = serve_utils.load_service_status(serve_status_payload)
-    # Get the endpoint for each service
-    for service_record in service_records:
-        service_record['endpoint'] = None
-        if service_record['load_balancer_port'] is not None:
-            try:
-                endpoint = backend_utils.get_endpoints(
-                    cluster=common.SKY_SERVE_CONTROLLER_NAME,
-                    port=service_record['load_balancer_port']).get(
-                        service_record['load_balancer_port'], None)
-            except exceptions.ClusterNotUpError:
-                pass
-            else:
-                protocol = ('https'
-                            if service_record['tls_encrypted'] else 'http')
-                if endpoint is not None:
-                    endpoint = endpoint.replace('https://',
-                                                '').replace('http://', '')
-                service_record['endpoint'] = f'{protocol}://{endpoint}'
-
-    return service_records
+    return impl.status(service_names, pool=False)
 
 
 ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]