skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py
CHANGED
@@ -11,6 +11,7 @@ import shlex
 import shutil
 import threading
 import time
+import traceback
 import typing
 from typing import (Any, Callable, DefaultDict, Deque, Dict, Generic, Iterator,
                     List, Optional, TextIO, Type, TypeVar, Union)
@@ -22,7 +23,10 @@ import filelock
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
+from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
+from sky.jobs import state as managed_job_state
 from sky.serve import constants
 from sky.serve import serve_state
 from sky.serve import spot_placer
@@ -47,6 +51,8 @@ else:
     psutil = adaptors_common.LazyImport('psutil')
     requests = adaptors_common.LazyImport('requests')

+logger = sky_logging.init_logger(__name__)
+

 @annotations.lru_cache(scope='request')
 def get_num_service_threshold():
@@ -244,7 +250,22 @@ class RequestTimestamp(RequestsAggregator):
         return f'RequestTimestamp(timestamps={self.timestamps})'


-def validate_service_task(task: 'sky.Task') -> None:
+def get_service_filelock_path(pool: str) -> str:
+    path = (pathlib.Path(constants.SKYSERVE_METADATA_DIR) / pool /
+            'pool.lock').expanduser().absolute()
+    path.parents[0].mkdir(parents=True, exist_ok=True)
+    return str(path)
+
+
+@annotations.lru_cache(scope='request', maxsize=1)
+def is_consolidation_mode() -> bool:
+    consolidation_mode = skypilot_config.get_nested(
+        ('serve', 'controller', 'consolidation_mode'), default_value=False)
+    # _check_consolidation_mode_consistency(consolidation_mode)
+    return consolidation_mode
+
+
+def validate_service_task(task: 'sky.Task', pool: bool) -> None:
     """Validate the task for Sky Serve.

     Args:
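The new get_service_filelock_path helper gives every pool its own lock file under the serve metadata directory, so pool scheduling can be serialized across processes with filelock. A minimal, self-contained sketch of the same pattern (POOL_METADATA_DIR here is a stand-in for constants.SKYSERVE_METADATA_DIR, not a SkyPilot API):

    # Sketch of the per-pool file lock pattern introduced above.
    import pathlib
    import tempfile

    import filelock

    POOL_METADATA_DIR = pathlib.Path(tempfile.gettempdir()) / 'skyserve-demo'


    def pool_lock_path(pool: str) -> str:
        path = (POOL_METADATA_DIR / pool / 'pool.lock').expanduser().absolute()
        path.parent.mkdir(parents=True, exist_ok=True)
        return str(path)


    # Only one process at a time can assign jobs to workers of 'my-pool'.
    with filelock.FileLock(pool_lock_path('my-pool')):
        print('holding the my-pool scheduling lock')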
@@ -267,17 +288,25 @@ def validate_service_task(task: 'sky.Task') -> None:
                 'use `dynamic_ondemand_fallback` or set '
                 'base_ondemand_fallback_replicas.')

+    field_name = 'service' if not pool else 'pool'
     if task.service is None:
         with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Service section not found.')
+            raise RuntimeError(f'{field_name.capitalize()} section not found.')
+
+    if pool != task.service.pool:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'{field_name.capitalize()} section in the YAML '
+                             f'file does not match the pool argument. '
+                             f'To fix, add a valid `{field_name}` field.')

     policy_description = ('on-demand'
                           if task.service.dynamic_ondemand_fallback else 'spot')
     for resource in list(task.resources):
         if resource.job_recovery is not None:
+            sys_name = 'SkyServe' if not pool else 'Cluster Pool'
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('job_recovery is disabled for SkyServe. '
-                                 'SkyServe will replenish preempted spot '
+                raise ValueError(f'job_recovery is disabled for {sys_name}. '
+                                 f'{sys_name} will replenish preempted spot '
                                  f'with {policy_description} instances.')

     # Try to create a spot placer from the task yaml. Check if the task yaml
@@ -300,7 +329,7 @@ def validate_service_task(task: 'sky.Task') -> None:
                 raise ValueError(
                     '`spot_placer` is only supported for spot resources. '
                     'Please explicitly specify `use_spot: true` in resources.')
-    if task.service.ports is None:
+    if not pool and task.service.ports is None:
         requested_ports = list(
             resources_utils.port_ranges_to_set(requested_resources.ports))
         if len(requested_ports) != 1:
@@ -320,10 +349,16 @@ def validate_service_task(task: 'sky.Task') -> None:
                     f'Got multiple ports: {service_port} and '
                     f'{replica_ingress_port} in different resources. '
                     'Please specify the same port instead.')
+    if pool:
+        if (task.service.ports is not None or
+                requested_resources.ports is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify ports in a cluster pool.')


-def generate_service_name():
-    return f'sky-service-{uuid.uuid4().hex[:4]}'
+def generate_service_name(pool: bool = False):
+    noun = 'pool' if pool else 'service'
+    return f'sky-{noun}-{uuid.uuid4().hex[:4]}'


 def generate_remote_service_dir_name(service_name: str) -> str:
@@ -426,6 +461,9 @@ def set_service_status_and_active_versions_from_replica(


 def update_service_status() -> None:
+    if is_consolidation_mode():
+        # TODO(tian): PID-based tracking.
+        return
     services = serve_state.get_services()
     for record in services:
         if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
@@ -440,11 +478,14 @@ def update_service_status() -> None:
             record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED)


-def update_service_encoded(service_name: str, version: int, mode: str) -> str:
-    service_status = _get_service_status(service_name)
+def update_service_encoded(service_name: str, version: int, mode: str,
+                           pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
+    service_status = _get_service_status(service_name, pool=pool)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Service {service_name!r} does not exist.')
+            raise ValueError(f'{capnoun} {service_name!r} does not exist.')
     controller_port = service_status['controller_port']
     resp = requests.post(
         _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
@@ -455,27 +496,30 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
         })
     if resp.status_code == 404:
         with ux_utils.print_exception_no_traceback():
+            # This only happens for services since pool is added after the
+            # update feature is introduced.
             raise ValueError(
                 'The service is up-ed in an old version and does not '
                 'support update. Please `sky serve down` '
                 'it first and relaunch the service. ')
     elif resp.status_code == 400:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Client error during service update: {resp.text}')
+            raise ValueError(f'Client error during {noun} update: {resp.text}')
     elif resp.status_code == 500:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
-                f'Server error during service update: {resp.text}')
+                f'Server error during {noun} update: {resp.text}')
     elif resp.status_code != 200:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Failed to update service: {resp.text}')
+            raise ValueError(f'Failed to update {noun}: {resp.text}')

     service_msg = resp.json()['message']
     return message_utils.encode_payload(service_msg)


 def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
-    service_status = _get_service_status(service_name)
+    # TODO(tian): Currently pool does not support terminating replica.
+    service_status = _get_service_status(service_name, pool=False)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Service {service_name!r} does not exist.')
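update_service_encoded now threads the noun ('pool' or 'service') through every error message; the surrounding HTTP handling is unchanged. Reduced to its skeleton, the controller call maps status codes to user-facing errors roughly like this (a sketch with a stand-in URL path; the real endpoint and port come from _CONTROLLER_URL and the service record):

    import requests


    def post_update(controller_port: int, version: int, mode: str) -> str:
        # Assumed endpoint shape for illustration only.
        url = f'http://localhost:{controller_port}/controller/update_service'
        resp = requests.post(url, json={'version': version, 'mode': mode})
        if resp.status_code == 404:
            # Old controllers predate the update endpoint entirely.
            raise ValueError('Controller does not support update; relaunch it.')
        if resp.status_code == 400:
            raise ValueError(f'Client error during update: {resp.text}')
        if resp.status_code == 500:
            raise RuntimeError(f'Server error during update: {resp.text}')
        if resp.status_code != 200:
            raise ValueError(f'Failed to update: {resp.text}')
        return resp.json()['message']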
@@ -506,6 +550,7 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:

 def _get_service_status(
     service_name: str,
+    pool: bool,
     with_replica_info: bool = True) -> Optional[Dict[str, Any]]:
     """Get the status dict of the service.

@@ -520,27 +565,63 @@ def _get_service_status(
     record = serve_state.get_service_from_name(service_name)
     if record is None:
         return None
+    if record['pool'] != pool:
+        return None
+
+    record['pool_yaml'] = ''
+    if record['pool']:
+        latest_yaml_path = generate_task_yaml_file_name(service_name,
+                                                        record['version'])
+        original_config = common_utils.read_yaml(latest_yaml_path)
+        original_config.pop('run', None)
+        svc: Dict[str, Any] = original_config.pop('service')
+        if svc is not None:
+            svc.pop('pool', None)
+            original_config['pool'] = svc
+        record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
+
+    record['target_num_replicas'] = 0
+    try:
+        controller_port = record['controller_port']
+        resp = requests.get(
+            _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
+            '/autoscaler/info')
+        record['target_num_replicas'] = resp.json()['target_num_replicas']
+    except requests.exceptions.RequestException:
+        record['target_num_replicas'] = None
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to get autoscaler info for {service_name}: '
+                     f'{common_utils.format_exception(e)}\n'
+                     f'Traceback: {traceback.format_exc()}')
+
     if with_replica_info:
         record['replica_info'] = [
-            info.to_info_dict(with_handle=True)
+            info.to_info_dict(with_handle=True, with_url=not pool)
             for info in serve_state.get_replica_infos(service_name)
         ]
+        if pool:
+            for replica_info in record['replica_info']:
+                job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
+                    service_name, replica_info['name'])
+                replica_info['used_by'] = job_ids[0] if job_ids else None
     return record


-def get_service_status_encoded(service_names: Optional[List[str]]) -> str:
+def get_service_status_encoded(service_names: Optional[List[str]],
+                               pool: bool) -> str:
     service_statuses: List[Dict[str, str]] = []
     if service_names is None:
         # Get all service names
         service_names = serve_state.get_glob_service_names(None)
     for service_name in service_names:
-        service_status = _get_service_status(service_name)
+        service_status = _get_service_status(service_name, pool=pool)
         if service_status is None:
             continue
         service_statuses.append({
             k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
             for k, v in service_status.items()
         })
+    service_statuses = sorted(service_statuses, key=lambda x: x['name'])
     # We have to use payload_type here to avoid the issue of
     # message_utils.decode_payload() not being able to correctly decode the
     # message with <sky-payload> tags.
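The pool_yaml rewrite above just reshapes the stored task YAML: drop the run section, strip the pool flag, and rename service to pool. The same transform with plain PyYAML (a sketch; SkyPilot uses its own common_utils YAML helpers):

    import yaml

    original_config = yaml.safe_load("""
    run: python train.py
    service:
      pool: true
      replicas: 2
    """)
    original_config.pop('run', None)
    svc = original_config.pop('service', None)
    if svc is not None:
        svc.pop('pool', None)
        original_config['pool'] = svc
    print(yaml.safe_dump(original_config))  # -> "pool:\n  replicas: 2\n"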
@@ -579,6 +660,71 @@ def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)


+def num_replicas(service_name: str) -> int:
+    logger.info(f'Get number of replicas for pool {service_name!r}')
+    return len(serve_state.get_replica_infos(service_name))
+
+
+def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
+    """Get the next available cluster name from idle replicas.
+
+    Args:
+        service_name: The name of the service.
+        job_id: Optional job ID to associate with the acquired cluster.
+            If None, a placeholder will be used.
+
+    Returns:
+        The cluster name if an idle replica is found, None otherwise.
+    """
+    # Check if service exists
+    service_status = _get_service_status(service_name,
+                                         pool=True,
+                                         with_replica_info=False)
+    if service_status is None:
+        logger.error(f'Service {service_name!r} does not exist.')
+        return None
+    if not service_status['pool']:
+        logger.error(f'Service {service_name!r} is not a cluster pool.')
+        return None
+    with filelock.FileLock(get_service_filelock_path(service_name)):
+
+        logger.debug(f'Get next cluster name for pool {service_name!r}')
+        ready_replicas = [
+            info for info in serve_state.get_replica_infos(service_name)
+            if info.status == serve_state.ReplicaStatus.READY
+        ]
+        idle_replicas: List['replica_managers.ReplicaInfo'] = []
+        for replica_info in ready_replicas:
+            jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
+                service_name, replica_info.cluster_name)
+            # TODO(tian): Make it resources aware. Currently we allow and only
+            # allow one job per replica. In the following PR, we should:
+            # i) When the replica is launched with `any_of` resources (
+            # replicas can have different resources), we should check if
+            # the resources that jobs require are available on the replica.
+            # e.g., if a job requires A100:1 on a {L4:1, A100:1} pool, it
+            # should only goes to replica with A100.
+            # ii) When a job only requires a subset of the resources on the
+            # replica, each replica should be able to handle multiple jobs
+            # at the same time. e.g., if a job requires A100:1 on a A100:8
+            # pool, it should be able to run 4 jobs at the same time.
+            if not jobs_on_replica:
+                idle_replicas.append(replica_info)
+        if not idle_replicas:
+            logger.info(f'No idle replicas found for pool {service_name!r}')
+            return None
+
+        # Select the first idle replica.
+        # TODO(tian): "Load balancing" policy.
+        replica_info = idle_replicas[0]
+        logger.info(f'Selected replica {replica_info.replica_id} with cluster '
+                    f'{replica_info.cluster_name!r} for job {job_id!r} in pool '
+                    f'{service_name!r}')
+        managed_job_state.set_current_cluster_name(job_id,
+                                                   replica_info.cluster_name)
+        return replica_info.cluster_name
+
+
 def _terminate_failed_services(
     service_name: str,
     service_status: Optional[serve_state.ServiceStatus]) -> Optional[str]:
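The selection logic in get_next_cluster_name is first-fit: a READY worker counts as idle only if no nonterminal job is assigned to its cluster, and the first idle one wins. Modeled in isolation (all names here are illustrative, not SkyPilot APIs):

    from typing import Dict, List, Optional


    def pick_idle_cluster(ready_clusters: List[str],
                          jobs_by_cluster: Dict[str, List[int]]) -> Optional[str]:
        """Return the first READY cluster with no nonterminal jobs, else None."""
        for cluster in ready_clusters:
            if not jobs_by_cluster.get(cluster):
                return cluster  # first idle worker wins; no load balancing yet
        return None


    assert pick_idle_cluster(['w-1', 'w-2'], {'w-1': [7]}) == 'w-2'
    assert pick_idle_cluster(['w-1'], {'w-1': [7]}) is None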
@@ -618,17 +764,38 @@ def _terminate_failed_services(
                 f'controller: {remaining_identity}{colorama.Style.RESET_ALL}')


-def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
+def terminate_services(service_names: Optional[List[str]], purge: bool,
+                       pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
     service_names = serve_state.get_glob_service_names(service_names)
     terminated_service_names: List[str] = []
     messages: List[str] = []
     for service_name in service_names:
         service_status = _get_service_status(service_name,
+                                             pool=pool,
                                              with_replica_info=False)
+        if service_status is None:
+            continue
         if (service_status is not None and service_status['status']
                 == serve_state.ServiceStatus.SHUTTING_DOWN):
             # Already scheduled to be terminated.
             continue
+        if pool:
+            nonterminal_job_ids = (
+                managed_job_state.get_nonterminal_job_ids_by_pool(service_name))
+            if nonterminal_job_ids:
+                nonterminal_job_ids_str = ','.join(
+                    str(job_id) for job_id in nonterminal_job_ids)
+                num_nonterminal_jobs = len(nonterminal_job_ids)
+                messages.append(
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} has '
+                    f'{num_nonterminal_jobs} nonterminal jobs: '
+                    f'{nonterminal_job_ids_str}. To terminate the {noun}, '
+                    f'please run `sky jobs cancel --pool {service_name}` to '
+                    'cancel all jobs in the pool first.'
+                    f'{colorama.Style.RESET_ALL}')
+                continue
         # If the `services` and `version_specs` table are not aligned, it might
         # result in a None service status. In this case, the controller process
         # is not functioning as well and we should also use the
@@ -636,10 +803,11 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
         # This is a safeguard for a rare case, that is accidentally abort
         # between `serve_state.add_service` and
         # `serve_state.add_or_update_version` in service.py.
-        if (service_status is None or service_status['status']
+        purge_cmd = (f'sky jobs pool down {service_name} --purge'
+                     if pool else f'sky serve down {service_name} --purge')
+        if (service_status['status']
                 in serve_state.ServiceStatus.failed_statuses()):
-            failed_status = (service_status['status']
-                             if service_status is not None else None)
+            failed_status = service_status['status']
             if purge:
                 message = _terminate_failed_services(service_name,
                                                      failed_status)
@@ -647,11 +815,10 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
                 messages.append(message)
             else:
                 messages.append(
-                    f'{colorama.Fore.YELLOW}Service {service_name!r} is in '
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} is in '
                     f'failed status ({failed_status}). Skipping '
                     'its termination as it could lead to a resource leak. '
-                    f'(Use `sky serve down {service_name} --purge` to '
-                    'forcefully terminate the service.)'
+                    f'(Use `{purge_cmd}` to forcefully terminate the {noun}.)'
                     f'{colorama.Style.RESET_ALL}')
                 # Don't add to terminated_service_names since it's not
                 # actually terminated.
@@ -668,12 +835,12 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
             f.flush()
         terminated_service_names.append(f'{service_name!r}')
     if not terminated_service_names:
-        messages.append('No service to terminate.')
+        messages.append(f'No {noun} to terminate.')
     else:
-        identity_str = f'Service {terminated_service_names[0]} is'
+        identity_str = f'{capnoun} {terminated_service_names[0]} is'
         if len(terminated_service_names) > 1:
             terminated_service_names_str = ', '.join(terminated_service_names)
-            identity_str = f'Services {terminated_service_names_str} are'
+            identity_str = f'{capnoun}s {terminated_service_names_str} are'
         messages.append(f'{identity_str} scheduled to be terminated.')
     return '\n'.join(messages)
@@ -694,32 +861,35 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
     start_time = time.time()
     setup_completed = False
     while True:
-        job_status = job_lib.get_status(job_id)
-        if job_status is None or job_status < job_lib.JobStatus.RUNNING:
-            ... (old lines 699-713 collapsed by the diff viewer; they match
-                 the guarded block below at one level less indentation)
+        # TODO(tian): PID-based tracking.
+        if not is_consolidation_mode():
+            job_status = job_lib.get_status(job_id)
+            if job_status is None or job_status < job_lib.JobStatus.RUNNING:
+                # Wait for the controller process to finish setting up. It
+                # can be slow if a lot cloud dependencies are being installed.
+                if (time.time() - start_time >
+                        constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError(
+                            f'Failed to start the controller process for '
+                            f'the service {service_name!r} within '
+                            f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
+                            f' seconds.')
+                # No need to check the service status as the controller process
+                # is still setting up.
+                time.sleep(1)
+                continue

-        ... (old lines 715-718 collapsed by the diff viewer)
+        if not setup_completed:
+            setup_completed = True
+            # Reset the start time to wait for the service to be registered.
+            start_time = time.time()

         record = serve_state.get_service_from_name(service_name)
         if record is not None:
-            if job_id != record['controller_job_id']:
+            # TODO(tian): PID-based tracking.
+            if (not is_consolidation_mode() and
+                    job_id != record['controller_job_id']):
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
                         f'The service {service_name!r} is already running. '
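The reworked wait loop is a two-phase timeout: the clock first bounds controller setup, then resets once the controller job reaches RUNNING and bounds service registration. The shape of it, with the status checks abstracted into callables (a sketch, not the SkyPilot implementation):

    import time
    from typing import Callable, Optional


    def wait_two_phase(controller_running: Callable[[], bool],
                       get_record: Callable[[], Optional[dict]],
                       timeout: float = 90.0) -> dict:
        start = time.time()
        setup_completed = False
        while True:
            if not controller_running():
                if time.time() - start > timeout:
                    raise RuntimeError('controller setup timed out')
                time.sleep(1)
                continue
            if not setup_completed:
                setup_completed = True
                start = time.time()  # reset the clock for the second phase
            record = get_record()
            if record is not None:
                return record
            if time.time() - start > timeout:
                raise RuntimeError('service registration timed out')
            time.sleep(1)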
@@ -1059,18 +1229,25 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
     return f'{ready_replica_num}/{total_replica_num}'


-def format_service_table(service_records: List[Dict[str, Any]],
-                         show_all: bool) -> str:
+def format_service_table(service_records: List[Dict[str, Any]], show_all: bool,
+                         pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
     if not service_records:
-        return 'No existing services.'
+        return f'No existing {noun}s.'

     service_columns = [
-        'NAME', 'VERSION', 'UPTIME', 'STATUS', 'REPLICAS', 'ENDPOINT'
+        'NAME', 'VERSION', 'UPTIME', 'STATUS',
+        'REPLICAS' if not pool else 'WORKERS'
     ]
+    if not pool:
+        service_columns.append('ENDPOINT')
     if show_all:
         service_columns.extend([
             'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES'
         ])
+        if pool:
+            # Remove the load balancing policy column for pools.
+            service_columns.pop(-2)
     service_table = log_utils.create_table(service_columns)

     replica_infos: List[Dict[str, Any]] = []
@@ -1101,35 +1278,44 @@ def format_service_table(service_records: List[Dict[str, Any]],
             uptime,
             status_str,
             replicas,
-            endpoint,
         ]
+        if not pool:
+            service_values.append(endpoint)
         if show_all:
             service_values.extend(
                 [policy, load_balancing_policy, requested_resources_str])
+            if pool:
+                service_values.pop(-2)
         service_table.add_row(service_values)

-    replica_table = _format_replica_table(replica_infos, show_all)
+    replica_table = _format_replica_table(replica_infos, show_all, pool)
+    replica_noun = 'Pool Workers' if pool else 'Service Replicas'
     return (f'{service_table}\n'
             f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-            f'Service Replicas{colorama.Style.RESET_ALL}\n'
+            f'{replica_noun}{colorama.Style.RESET_ALL}\n'
             f'{replica_table}')


-def _format_replica_table(replica_records: List[Dict[str, Any]],
-                          show_all: bool) -> str:
+def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
+                          pool: bool) -> str:
+    noun = 'worker' if pool else 'replica'
     if not replica_records:
-        return 'No existing replicas.'
+        return f'No existing {noun}s.'

     replica_columns = [
-        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'INFRA',
-        'RESOURCES', 'STATUS'
+        'POOL_NAME' if pool else 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT',
+        'LAUNCHED', 'INFRA', 'RESOURCES', 'STATUS'
     ]
+    if pool:
+        replica_columns.append('USED_BY')
+        # Remove the endpoint column for pool workers.
+        replica_columns.pop(3)
     replica_table = log_utils.create_table(replica_columns)

     truncate_hint = ''
     if not show_all:
         if len(replica_records) > _REPLICA_TRUNC_NUM:
-            truncate_hint = '\n... (use --all to show all replicas)'
+            truncate_hint = f'\n... (use --all to show all {noun}s)'
         replica_records = replica_records[:_REPLICA_TRUNC_NUM]

     for record in replica_records:
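The column plumbing for pools is positional: WORKERS replaces REPLICAS, ENDPOINT is only appended for services, and under --all the LOAD_BALANCING_POLICY entry is popped again for pools. A compact restatement of that logic (a sketch mirroring the diff, not the actual helper):

    from typing import List


    def service_columns(pool: bool, show_all: bool) -> List[str]:
        cols = ['NAME', 'VERSION', 'UPTIME', 'STATUS',
                'WORKERS' if pool else 'REPLICAS']
        if not pool:
            cols.append('ENDPOINT')
        if show_all:
            cols.extend(['AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY',
                         'REQUESTED_RESOURCES'])
            if pool:
                cols.pop(-2)  # pools have no load balancer
        return cols


    assert 'LOAD_BALANCING_POLICY' not in service_columns(pool=True, show_all=True)
    assert service_columns(pool=False, show_all=False)[-1] == 'ENDPOINT'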
@@ -1143,6 +1329,8 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             resources_str = '-'
         replica_status = record['status']
         status_str = replica_status.colored_str()
+        used_by = record.get('used_by', None)
+        used_by_str = str(used_by) if used_by is not None else '-'

         replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
             'handle']
@@ -1161,6 +1349,9 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             resources_str,
             status_str,
         ]
+        if pool:
+            replica_values.append(used_by_str)
+            replica_values.pop(3)
         replica_table.add_row(replica_values)

     return f'{replica_table}{truncate_hint}'
@@ -1185,13 +1376,16 @@ class ServeCodeGen:
         'from sky.serve import serve_state',
         'from sky.serve import serve_utils',
         'from sky.serve import constants',
+        'serve_version = constants.SERVE_VERSION',
     ]

     @classmethod
-    def get_service_status(cls, service_names: Optional[List[str]]) -> str:
+    def get_service_status(cls, service_names: Optional[List[str]],
+                           pool: bool) -> str:
         code = [
-            f'msg = serve_utils.get_service_status_encoded({service_names!r})',
-            'print(msg, end="", flush=True)'
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.get_service_status_encoded({service_names!r}, '
+            '**kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
@@ -1204,11 +1398,12 @@ class ServeCodeGen:
         return cls._build(code)

     @classmethod
-    def terminate_services(cls, service_names: Optional[List[str]],
-                           purge: bool) -> str:
+    def terminate_services(cls, service_names: Optional[List[str]], purge: bool,
+                           pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
             f'msg = serve_utils.terminate_services({service_names!r}, '
-            f'purge={purge})', 'print(msg, end="", flush=True)'
+            f'purge={purge}, **kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
@@ -1253,6 +1448,17 @@ class ServeCodeGen:
         ]
         return cls._build(code)

+    @classmethod
+    def update_service(cls, service_name: str, version: int, mode: str,
+                       pool: bool) -> str:
+        code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.update_service_encoded({service_name!r}, '
+            f'{version}, mode={mode!r}, **kwargs)',
+            'print(msg, end="", flush=True)',
+        ]
+        return cls._build(code)
+
     @classmethod
     def _build(cls, code: List[str]) -> str:
         code = cls._PREFIX + code
@@ -1263,12 +1469,3 @@ class ServeCodeGen:
             f'"{common_utils.get_user_hash()}"; '
             f'{skylet_constants.SKY_PYTHON_CMD} '
             f'-u -c {shlex.quote(generated_code)}')
-
-    @classmethod
-    def update_service(cls, service_name: str, version: int, mode: str) -> str:
-        code = [
-            f'msg = serve_utils.update_service_encoded({service_name!r}, '
-            f'{version}, mode={mode!r})',
-            'print(msg, end="", flush=True)',
-        ]
-        return cls._build(code)
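All three codegen methods now share the same backward-compatibility trick: the generated snippet reads SERVE_VERSION from the remote constants module and only passes the new pool kwarg when the controller is new enough. Stripped of the codegen plumbing, the generated code behaves like this (a sketch; SERVE_VERSION is hard-coded here to simulate an old controller):

    SERVE_VERSION = 2  # pretend the remote controller predates the pool kwarg


    def get_service_status_encoded(service_names, pool=False):
        # Stand-in for the remote serve_utils function.
        return f'names={service_names}, pool={pool}'


    kwargs = {} if SERVE_VERSION < 3 else {'pool': True}
    msg = get_service_status_encoded(['svc-a'], **kwargs)
    print(msg)  # on an old controller, pool silently stays False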