skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -1
- sky/adaptors/common.py +6 -2
- sky/backends/backend.py +9 -4
- sky/backends/backend_utils.py +13 -16
- sky/backends/cloud_vm_ray_backend.py +207 -161
- sky/backends/local_docker_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +5 -4
- sky/cli.py +128 -31
- sky/clouds/service_catalog/aws_catalog.py +6 -7
- sky/clouds/service_catalog/common.py +4 -3
- sky/clouds/service_catalog/cudo_catalog.py +11 -1
- sky/core.py +4 -2
- sky/data/storage.py +44 -32
- sky/data/storage_utils.py +12 -7
- sky/exceptions.py +5 -0
- sky/execution.py +10 -24
- sky/jobs/__init__.py +2 -0
- sky/jobs/core.py +87 -7
- sky/jobs/utils.py +35 -19
- sky/optimizer.py +50 -37
- sky/provision/aws/config.py +15 -6
- sky/provision/azure/config.py +14 -3
- sky/provision/azure/instance.py +15 -9
- sky/provision/kubernetes/instance.py +3 -1
- sky/provision/kubernetes/utils.py +25 -0
- sky/provision/provisioner.py +63 -74
- sky/serve/core.py +42 -40
- sky/sky_logging.py +9 -5
- sky/skylet/log_lib.py +5 -4
- sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
- sky/utils/cli_utils/status_utils.py +168 -21
- sky/utils/command_runner.py +11 -11
- sky/utils/common_utils.py +22 -5
- sky/utils/controller_utils.py +78 -29
- sky/utils/env_options.py +22 -7
- sky/utils/log_utils.py +39 -24
- sky/utils/resources_utils.py +23 -0
- sky/utils/rich_utils.py +55 -5
- sky/utils/ux_utils.py +63 -4
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +46 -46
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
sky/provision/azure/config.py
CHANGED
@@ -5,16 +5,18 @@ a cluster to be launched.
 """
 import hashlib
 import json
-import logging
 from pathlib import Path
 import random
 import time
 from typing import Any, Callable
 
+from sky import exceptions
+from sky import sky_logging
 from sky.adaptors import azure
 from sky.provision import common
+from sky.utils import common_utils
 
-logger = logging.getLogger(__name__)
+logger = sky_logging.init_logger(__name__)
 
 UNIQUE_ID_LEN = 4
 _DEPLOYMENT_NAME = 'skypilot-config'
@@ -92,10 +94,19 @@ def bootstrap_instances(
                 retry += 1
                 continue
             raise
+        except azure.exceptions().ClientAuthenticationError as e:
+            message = (
+                'Failed to authenticate with Azure. Please check your Azure '
+                f'credentials. Error: {common_utils.format_exception(e)}'
+            ).replace('\n', ' ')
+            logger.error(message)
+            raise exceptions.NoClusterLaunchedError(message) from e
     else:
-        raise TimeoutError(
+        message = (
             f'Timed out waiting for resource group {resource_group} to be '
             'deleted.')
+        logger.error(message)
+        raise TimeoutError(message)
 
     # load the template file
     current_path = Path(__file__).parent
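Two things are worth noting in the new `except` clause above: `azure.exceptions()` is a lazy adaptor call that defers importing the Azure SDK until it is actually needed, and `NoClusterLaunchedError` (added in `sky/exceptions.py`, +5 above) tells the caller that nothing was provisioned. A minimal sketch of the lazy-import idea — the `LazyImport` class below is illustrative, not SkyPilot's actual `sky/adaptors/common.py` implementation:

```python
import importlib


class LazyImport:
    """Defers a module import until an attribute is first accessed."""

    def __init__(self, module_name: str):
        self._module_name = module_name
        self._module = None

    def __getattr__(self, name):
        if self._module is None:
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, name)


# Resolves azure.core.exceptions on first use, so importing the provisioning
# module does not require the Azure SDK to be installed.
_azure_exceptions = LazyImport('azure.core.exceptions')


def exceptions():
    return _azure_exceptions

# Usage mirrors the hunk above:
#   except exceptions().ClientAuthenticationError as e: ...
```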
sky/provision/azure/instance.py
CHANGED
@@ -441,15 +441,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     if to_start_count > 0:
         resource_client = azure.get_client('resource', subscription_id)
         logger.debug(f'run_instances: Creating {to_start_count} instances.')
-        created_instances = _create_instances(
-            compute_client=compute_client,
-            resource_client=resource_client,
-            cluster_name_on_cloud=cluster_name_on_cloud,
-            resource_group=resource_group,
-            provider_config=provider_config,
-            node_config=config.node_config,
-            tags=tags,
-            count=to_start_count)
+        try:
+            created_instances = _create_instances(
+                compute_client=compute_client,
+                resource_client=resource_client,
+                cluster_name_on_cloud=cluster_name_on_cloud,
+                resource_group=resource_group,
+                provider_config=provider_config,
+                node_config=config.node_config,
+                tags=tags,
+                count=to_start_count)
+        except Exception as e:
+            err_message = common_utils.format_exception(
+                e, use_bracket=True).replace('\n', ' ')
+            logger.error(f'Failed to create instances: {err_message}')
+            raise
         created_instance_ids = [inst.name for inst in created_instances]
 
         non_running_instance_statuses = list(
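The `format_exception(e, use_bracket=True)` plus `.replace('\n', ' ')` combination collapses a multi-line exception message into a single log line. A rough stand-in for what such a helper does (the real `common_utils.format_exception` may differ in detail):

```python
def format_exception(e: Exception, use_bracket: bool = False) -> str:
    """Renders an exception as '[Type] message' or 'Type: message'."""
    name = type(e).__name__
    return f'[{name}] {e}' if use_bracket else f'{name}: {e}'


try:
    raise ValueError('quota exceeded\nin region eastus')
except ValueError as e:
    # One-line log record, as in the hunk above.
    msg = format_exception(e, use_bracket=True).replace('\n', ' ')
    print(f'Failed to create instances: {msg}')
```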
sky/provision/kubernetes/instance.py
CHANGED
@@ -632,7 +632,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     try:
         return _create_pods(region, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
-        logger.warning(f'run_instances: Error occurred when creating pods: {e}')
+        e_msg = common_utils.format_exception(e).replace('\n', ' ')
+        logger.warning('run_instances: Error occurred when creating pods: '
+                       f'{e_msg}')
         raise
 
 
sky/provision/kubernetes/utils.py
CHANGED
@@ -1998,3 +1998,28 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
         # we need to use in-cluster auth.
         context = None
     return context
+
+
+def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
+    """Gets all SkyPilot pods in the Kubernetes cluster.
+
+    Args:
+        context: Kubernetes context to use. If None, uses the current context.
+
+    Returns:
+        A list of Kubernetes pod objects.
+    """
+    if context is None:
+        context = get_current_kube_config_context_name()
+
+    try:
+        pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
+            label_selector='skypilot-cluster',
+            _request_timeout=kubernetes.API_TIMEOUT).items
+    except kubernetes.max_retry_error():
+        raise exceptions.ResourcesUnavailableError(
+            'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
+            'Please check if the cluster is healthy and retry. To debug, run: '
+            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+        ) from None
+    return pods
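A possible use of the new helper: grouping SkyPilot pods by the cluster they belong to. The `skypilot-cluster` label key comes from the hunk above; reading it via `metadata.labels` is standard Kubernetes client usage, and the import path is assumed:

```python
import collections

from sky.provision.kubernetes import utils as kubernetes_utils


def pods_by_cluster() -> dict:
    """Groups SkyPilot pod names by the cluster they belong to."""
    grouped = collections.defaultdict(list)
    for pod in kubernetes_utils.get_skypilot_pods():
        labels = pod.metadata.labels or {}
        cluster = labels.get('skypilot-cluster', '<unknown>')
        grouped[cluster].append(pod.metadata.name)
    return dict(grouped)
```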
sky/provision/provisioner.py
CHANGED
@@ -14,6 +14,7 @@ import colorama
 
 import sky
 from sky import clouds
+from sky import exceptions
 from sky import provision
 from sky import sky_logging
 from sky import status_lib
@@ -42,76 +43,50 @@ _TITLE = '\n\n' + '=' * 20 + ' {} ' + '=' * 20 + '\n'
 def _bulk_provision(
     cloud: clouds.Cloud,
     region: clouds.Region,
-    zones: Optional[List[clouds.Zone]],
     cluster_name: resources_utils.ClusterName,
     bootstrap_config: provision_common.ProvisionConfig,
 ) -> provision_common.ProvisionRecord:
     provider_name = repr(cloud)
     region_name = region.name
 
-    style = colorama.Style
-
-    if not zones:
-        # For Azure, zones is always an empty list.
-        zone_str = 'all zones'
-    else:
-        zone_str = ','.join(z.name for z in zones)
-
-    if isinstance(cloud, clouds.Kubernetes):
-        # Omit the region name for Kubernetes.
-        logger.info(f'{style.BRIGHT}Launching on {cloud}{style.RESET_ALL} '
-                    f'{cluster_name!r}.')
-    else:
-        logger.info(f'{style.BRIGHT}Launching on {cloud} '
-                    f'{region_name}{style.RESET_ALL} ({zone_str})')
-
     start = time.time()
-    with rich_utils.safe_status('[bold cyan]Launching[/]') as status:
+    # TODO(suquark): Should we cache the bootstrapped result?
+    # Currently it is not necessary as bootstrapping takes
+    # only ~3s, caching it seems over-engineering and could
+    # cause other issues like the cache is not synced
+    # with the cloud configuration.
+    config = provision.bootstrap_instances(provider_name, region_name,
+                                           cluster_name.name_on_cloud,
+                                           bootstrap_config)
+
+    provision_record = provision.run_instances(provider_name,
+                                               region_name,
+                                               cluster_name.name_on_cloud,
+                                               config=config)
+
+    backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
+    logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching - Checking instance status',
+                                 str(provision_logging.config.log_path)))
+    # AWS would take a very short time (<<1s) updating the state of the
+    # instance.
+    time.sleep(1)
+    for retry_cnt in range(_MAX_RETRY):
         try:
-            # TODO(suquark): Should we cache the bootstrapped result?
-            # Currently it is not necessary as bootstrapping takes
-            # only ~3s, caching it seems over-engineering and could
-            # cause other issues like the cache is not synced
-            # with the cloud configuration.
-            config = provision.bootstrap_instances(provider_name, region_name,
-                                                   cluster_name.name_on_cloud,
-                                                   bootstrap_config)
-        except Exception as e:
-            logger.error(f'Failed to bootstrap instances of {cluster_name!r} '
-                         'on the cloud provider: '
-                         f'{common_utils.format_exception(e)}')
-            raise
-
-        provision_record = provision.run_instances(provider_name,
-                                                   region_name,
-                                                   cluster_name.name_on_cloud,
-                                                   config=config)
-
-        backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
-        logger.debug(
-            f'\nWaiting for instances of {cluster_name!r} to be ready...')
-        status.update('[bold cyan]Launching - Checking instance status[/]')
-        # AWS would take a very short time (<<1s) updating the state of the
-        # instance.
-        time.sleep(1)
-        for retry_cnt in range(_MAX_RETRY):
-            try:
-                provision.wait_instances(provider_name,
-                                         region_name,
-                                         cluster_name.name_on_cloud,
-                                         state=status_lib.ClusterStatus.UP)
-                break
-            except (aws.botocore_exceptions().WaiterError, RuntimeError):
-                time.sleep(backoff.current_backoff())
-        else:
-            raise RuntimeError(
-                f'Failed to wait for instances of {cluster_name!r} to be '
-                f'ready on the cloud provider after max retries {_MAX_RETRY}.')
-        logger.debug(
-            f'Instances of {cluster_name!r} are ready after {retry_cnt} '
-            'retries.')
+            provision.wait_instances(provider_name,
+                                     region_name,
+                                     cluster_name.name_on_cloud,
+                                     state=status_lib.ClusterStatus.UP)
+            break
+        except (aws.botocore_exceptions().WaiterError, RuntimeError):
+            time.sleep(backoff.current_backoff())
+    else:
+        raise RuntimeError(
+            f'Failed to wait for instances of {cluster_name!r} to be '
+            f'ready on the cloud provider after max retries {_MAX_RETRY}.')
+    logger.debug(f'Instances of {cluster_name!r} are ready after {retry_cnt} '
                 'retries.')
 
     logger.debug(
         f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
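The rewritten wait loop is a plain `for`/`else` with exponential backoff: the `else` branch runs only if no attempt `break`s, i.e. after `_MAX_RETRY` consecutive failures. A self-contained sketch of the same pattern; the `Backoff` class below imitates what `common_utils.Backoff` appears to do, not its actual code:

```python
import random
import time


class Backoff:
    """Exponential backoff with jitter, capped by a maximum factor."""

    def __init__(self, initial_backoff: float = 1,
                 max_backoff_factor: int = 3):
        self._initial = initial_backoff
        self._max = initial_backoff * max_backoff_factor
        self._attempt = 0

    def current_backoff(self) -> float:
        delay = min(self._initial * (2**self._attempt), self._max)
        self._attempt += 1
        return delay * random.uniform(0.8, 1.2)


def wait_until_ready(check, max_retry: int = 5) -> int:
    """Calls `check` until it stops raising, sleeping between attempts."""
    backoff = Backoff(initial_backoff=1, max_backoff_factor=3)
    for retry_cnt in range(max_retry):
        try:
            check()
            break
        except RuntimeError:
            time.sleep(backoff.current_backoff())
    else:  # No break fired: every attempt failed.
        raise RuntimeError(f'not ready after {max_retry} retries')
    return retry_cnt
```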
@@ -162,8 +137,11 @@ def bulk_provision(
         logger.debug(
             'Provision config:\n'
             f'{json.dumps(dataclasses.asdict(bootstrap_config), indent=2)}')
-        return _bulk_provision(cloud, region, zones, cluster_name,
+        return _bulk_provision(cloud, region, cluster_name,
                                bootstrap_config)
+    except exceptions.NoClusterLaunchedError:
+        # Skip the teardown if the cluster was never launched.
+        raise
     except Exception:  # pylint: disable=broad-except
         zone_str = 'all zones'
         if zones:
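The ordering of the `except` clauses is the point of this hunk: `NoClusterLaunchedError` is re-raised before the broad `except Exception` handler that (below this hunk) tears down partially created resources, so the authentication failure raised in `config.py` above never triggers a teardown. Schematically, with placeholder `provision`/`teardown` callables:

```python
class NoClusterLaunchedError(Exception):
    """Nothing was provisioned, so there is nothing to clean up."""


def bulk_provision_sketch(provision, teardown):
    # provision() and teardown() are placeholders for the real logic.
    try:
        return provision()
    except NoClusterLaunchedError:
        # Skip the teardown if the cluster was never launched.
        raise
    except Exception:
        # Roll back partially provisioned resources, then re-raise.
        teardown()
        raise
```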
@@ -440,23 +418,30 @@ def _post_provision_setup(
     # We don't set docker_user here, as we are configuring the VM itself.
     ssh_credentials = backend_utils.ssh_credential_from_yaml(
         cluster_yaml, ssh_user=cluster_info.ssh_user)
+    docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-            '[bold cyan]Launching - Waiting for SSH access[/]') as status:
+            ux_utils.spinner_message(
+                'Launching - Waiting for SSH access',
+                provision_logging.config.log_path)) as status:
 
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
-        logger.debug(f'SSH is ready for {cluster_name!r}.')
+        logger.debug(f'SSH Connection ready for {cluster_name!r}')
+        vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
         plural = '' if len(cluster_info.instances) == 1 else 's'
-        logger.info(f'{colorama.Fore.GREEN}Successfully provisioned '
-                    f'or found existing instance{plural}.'
-                    f'{colorama.Style.RESET_ALL}')
+        verb = 'is' if len(cluster_info.instances) == 1 else 'are'
+        indent_str = (ux_utils.INDENT_SYMBOL
+                      if docker_config else ux_utils.INDENT_LAST_SYMBOL)
+        logger.info(f'{indent_str}{colorama.Style.DIM}{vm_str}{plural} {verb} '
+                    f'up.{colorama.Style.RESET_ALL}')
 
-        docker_config = config_from_yaml.get('docker', {})
         if docker_config:
             status.update(
-                '[bold cyan]Launching - Initializing docker container[/]')
+                ux_utils.spinner_message(
+                    'Launching - Initializing docker container',
+                    provision_logging.config.log_path))
             docker_user = instance_setup.initialize_docker(
                 cluster_name.name_on_cloud,
                 docker_config=docker_config,
@@ -470,6 +455,8 @@ def _post_provision_setup(
         cluster_info.docker_user = docker_user
         ssh_credentials['docker_user'] = docker_user
         logger.debug(f'Docker user: {docker_user}')
+        logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
+                    f'Docker container is up.{colorama.Style.RESET_ALL}')
 
     # We mount the metadata with sky wheel for speedup.
     # NOTE: currently we mount all credentials for all nodes, because
@@ -482,8 +469,9 @@ def _post_provision_setup(
     # for later.
     file_mounts = config_from_yaml.get('file_mounts', {})
 
-    runtime_preparation_str = (
-        '[bold cyan]Preparing SkyPilot runtime ({step}/3 - {step_name})[/]')
+    runtime_preparation_str = (ux_utils.spinner_message(
+        'Preparing SkyPilot runtime ({step}/3 - {step_name})',
+        provision_logging.config.log_path))
     status.update(
         runtime_preparation_str.format(step=1, step_name='initializing'))
     instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
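`runtime_preparation_str` is a template with `{step}`/`{step_name}` placeholders that is re-`format()`ed as setup advances, updating one spinner line instead of printing three separate ones. A rough illustration with `print` standing in for the live status object; the step names after 'initializing' are guesses, not the exact strings SkyPilot uses:

```python
runtime_preparation_str = (
    'Preparing SkyPilot runtime ({step}/3 - {step_name})')

# In the real code each iteration is a status.update(...) on the spinner.
for step, step_name in enumerate(
        ('initializing', 'dependencies', 'runtime'), start=1):
    print(runtime_preparation_str.format(step=step, step_name=step_name))
```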
@@ -551,8 +539,9 @@ def _post_provision_setup(
     instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
                                              cluster_info, ssh_credentials)
 
-    logger.info(
-        f'{colorama.Fore.GREEN}Successfully provisioned cluster: {cluster_name}{colorama.Style.RESET_ALL}')
+    logger.info(
+        ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
+                                   provision_logging.config.log_path))
     return cluster_info
 
 
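`ux_utils.spinner_message(...)` and `ux_utils.finishing_message(...)` come from this release's `sky/utils/ux_utils.py` changes (+63 -4 above). Judging purely from the call sites, they decorate a message and optionally append a pointer to the provision log; the sketch below is a guess at their shape under those assumptions, not the actual implementation:

```python
from typing import Optional

import colorama

# Assumed constants: tree-drawing prefixes used by the new log lines.
INDENT_SYMBOL = '├── '
INDENT_LAST_SYMBOL = '└── '


def spinner_message(message: str, log_path: Optional[str] = None) -> str:
    """Text shown next to a live spinner, with an optional log pointer."""
    hint = f'  View logs at: {log_path}' if log_path else ''
    return f'{colorama.Fore.CYAN}{message}{colorama.Style.RESET_ALL}{hint}'


def finishing_message(message: str, log_path: Optional[str] = None) -> str:
    """Green check-marked line printed when a stage completes."""
    hint = f'  View logs at: {log_path}' if log_path else ''
    return (f'{colorama.Fore.GREEN}✓ {message}'
            f'{colorama.Style.RESET_ALL}{hint}')
```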
sky/serve/core.py
CHANGED
@@ -129,8 +129,10 @@ def up(
         task, use_mutated_config_in_current_request=False)
     task = dag.tasks[0]
 
-    controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-        task, path='serve')
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Initializing service')):
+        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+            task, path='serve')
 
     with tempfile.NamedTemporaryFile(
         prefix=f'service-task-{service_name}-',
@@ -215,7 +217,8 @@ def up(
         # TODO(tian): Cache endpoint locally to speedup. Endpoint won't
         # change after the first time, so there is no consistency issue.
         with rich_utils.safe_status(
-                '[cyan]Waiting for the service to register[/]'):
+                ux_utils.spinner_message(
+                    'Waiting for the service to register')):
             # This function will check the controller job id in the database
             # and return the endpoint if the job id matches. Otherwise it will
             # return None.
@@ -274,34 +277,31 @@ def up(
             f'{style.BRIGHT}{service_name}{style.RESET_ALL}'
             f'\n{fore.CYAN}Endpoint URL: '
             f'{style.BRIGHT}{endpoint}{style.RESET_ALL}'
-            '\nTo see detailed info:\t\t'
-            f'{backend_utils.BOLD}sky serve status {service_name} '
-            f'[--endpoint]{backend_utils.RESET_BOLD}'
-            '\nTo teardown the service:\t'
-            f'{backend_utils.BOLD}sky serve down {service_name}'
-            f'{backend_utils.RESET_BOLD}'
-            '\n'
-            '\nTo see replica logs:\t\t'
-            f'{backend_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]'
-            f'{backend_utils.RESET_BOLD}'
-            '\nTo see load balancer logs:\t'
-            f'{backend_utils.BOLD}sky serve logs --load-balancer {service_name}'
-            f'{backend_utils.RESET_BOLD}'
-            '\nTo see controller logs:\t\t'
-            f'{backend_utils.BOLD}sky serve logs --controller {service_name}'
-            f'{backend_utils.RESET_BOLD}'
-            '\n'
-            '\nTo monitor replica status:\t'
-            f'{backend_utils.BOLD}watch -n10 sky serve status {service_name}'
-            f'{backend_utils.RESET_BOLD}'
-            '\nTo send a test request:\t\t'
-            f'{backend_utils.BOLD}curl {endpoint}'
-            f'{backend_utils.RESET_BOLD}'
-            '\n'
-            f'{style.RESET_ALL}'
-            f'\n{fore.GREEN}The replicas should be ready within a '
-            f'short time.{style.RESET_ALL}')
+            f'\n📋 Useful Commands'
+            f'\n{ux_utils.INDENT_SYMBOL}To check service status:\t'
+            f'{ux_utils.BOLD}sky serve status {service_name} '
+            f'[--endpoint]{ux_utils.RESET_BOLD}'
+            f'\n{ux_utils.INDENT_SYMBOL}To teardown the service:\t'
+            f'{ux_utils.BOLD}sky serve down {service_name}'
+            f'{ux_utils.RESET_BOLD}'
+            f'\n{ux_utils.INDENT_SYMBOL}To see replica logs:\t'
+            f'{ux_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]'
+            f'{ux_utils.RESET_BOLD}'
+            f'\n{ux_utils.INDENT_SYMBOL}To see load balancer logs:\t'
+            f'{ux_utils.BOLD}sky serve logs --load-balancer {service_name}'
+            f'{ux_utils.RESET_BOLD}'
+            f'\n{ux_utils.INDENT_SYMBOL}To see controller logs:\t'
+            f'{ux_utils.BOLD}sky serve logs --controller {service_name}'
+            f'{ux_utils.RESET_BOLD}'
+            f'\n{ux_utils.INDENT_SYMBOL}To monitor the status:\t'
+            f'{ux_utils.BOLD}watch -n10 sky serve status {service_name}'
+            f'{ux_utils.RESET_BOLD}'
+            f'\n{ux_utils.INDENT_LAST_SYMBOL}To send a test request:\t'
+            f'{ux_utils.BOLD}curl {endpoint}'
+            f'{ux_utils.RESET_BOLD}'
+            '\n\n' +
+            ux_utils.finishing_message('Service is spinning up and replicas '
+                                       'will be ready shortly.'))
     return service_name, endpoint
 
 
@@ -323,11 +323,11 @@ def update(
         controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
         stopped_message=
         'Service controller is stopped. There is no service to update. '
-        f'To spin up a new service, use {backend_utils.BOLD}'
-        f'sky serve up{backend_utils.RESET_BOLD}',
+        f'To spin up a new service, use {ux_utils.BOLD}'
+        f'sky serve up{ux_utils.RESET_BOLD}',
         non_existent_message='Service does not exist. '
         'To spin up a new service, '
-        f'use {backend_utils.BOLD}sky serve up{backend_utils.RESET_BOLD}',
+        f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
     )
 
     backend = backend_utils.get_backend_from_handle(handle)
@@ -353,8 +353,8 @@ def update(
     if len(service_statuses) == 0:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(f'Cannot find service {service_name!r}.'
-                               f'To spin up a service, use {backend_utils.BOLD}'
-                               f'sky serve up{backend_utils.RESET_BOLD}')
+                               f'To spin up a service, use {ux_utils.BOLD}'
+                               f'sky serve up{ux_utils.RESET_BOLD}')
 
     if len(service_statuses) > 1:
         with ux_utils.print_exception_no_traceback():
@@ -374,8 +374,10 @@ def update(
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(prompt)
 
-    controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-        task, path='serve')
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Initializing service')):
+        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+            task, path='serve')
 
     code = serve_utils.ServeCodeGen.add_version(service_name)
     returncode, version_string_payload, stderr = backend.run_on_head(
@@ -433,8 +435,8 @@ def update(
 
     print(f'{colorama.Fore.GREEN}Service {service_name!r} update scheduled.'
           f'{colorama.Style.RESET_ALL}\n'
-          f'Please use {backend_utils.BOLD}sky serve status {service_name} '
-          f'{backend_utils.RESET_BOLD}to check the latest status.')
+          f'Please use {ux_utils.BOLD}sky serve status {service_name} '
+          f'{ux_utils.RESET_BOLD}to check the latest status.')
 
 
 @usage_lib.entrypoint
sky/sky_logging.py
CHANGED
@@ -10,10 +10,10 @@ import colorama
 from sky.utils import env_options
 from sky.utils import rich_utils
 
-# If the SKYPILOT_MINIMIZE_LOGGING environment variable is set, remove
-# the logging prefixes while printing the logs.
-_FORMAT = (None if env_options.Options.MINIMIZE_LOGGING.get() else
-           '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
+# UX: Should we show logging prefixes and some extra information in optimizer?
+_show_logging_prefix = (env_options.Options.SHOW_DEBUG_INFO.get() or
+                        not env_options.Options.MINIMIZE_LOGGING.get())
+_FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
 _DATE_FORMAT = '%m-%d %H:%M:%S'
 
 
@@ -45,6 +45,7 @@ _root_logger = logging.getLogger('sky')
 _default_handler = None
 _logging_config = threading.local()
 
+NO_PREFIX_FORMATTER = NewLineFormatter(None, datefmt=_DATE_FORMAT)
 FORMATTER = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
 DIM_FORMATTER = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT, dim=True)
 
@@ -67,7 +68,10 @@ def _setup_logger():
     else:
         _default_handler.setLevel(logging.INFO)
     _root_logger.addHandler(_default_handler)
-    _default_handler.setFormatter(FORMATTER)
+    if _show_logging_prefix:
+        _default_handler.setFormatter(FORMATTER)
+    else:
+        _default_handler.setFormatter(NO_PREFIX_FORMATTER)
     # Setting this will avoid the message
     # being propagated to the parent logger.
     _root_logger.propagate = False
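Net effect of the formatter switch: with prefixes, each line carries severity, timestamp, file name and line number; with `SKYPILOT_MINIMIZE_LOGGING` set, only the message remains. A stdlib-only demonstration of the same selection logic (`logging.Formatter(None)` falls back to plain `%(message)s`; the sample output lines are illustrative):

```python
import logging
import os
import sys

_FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
_DATE_FORMAT = '%m-%d %H:%M:%S'

minimize = os.environ.get('SKYPILOT_MINIMIZE_LOGGING', '0') == '1'
handler = logging.StreamHandler(sys.stdout)
# fmt=None makes the stdlib Formatter default to '%(message)s'.
handler.setFormatter(
    logging.Formatter(None if minimize else _FORMAT, datefmt=_DATE_FORMAT))

logger = logging.getLogger('demo')
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info('Cluster launched.')
# With prefixes:  I 10-13 12:00:00 demo.py:17] Cluster launched.
# Minimized:      Cluster launched.
```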
sky/skylet/log_lib.py
CHANGED
@@ -21,6 +21,7 @@ from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.utils import log_utils
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
 
 _SKY_LOG_WAITING_GAP_SECONDS = 1
 _SKY_LOG_WAITING_MAX_RETRY = 5
@@ -377,7 +378,9 @@ def _follow_job_logs(file,
                 wait_last_logs = False
                 continue
             status_str = status.value if status is not None else 'None'
-            print(f'INFO: Job finished (status: {status_str}).')
+            print(
+                ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).'))
             return
 
         time.sleep(_SKY_LOG_TAILING_GAP_SECONDS)
@@ -412,8 +415,6 @@ def tail_logs(job_id: Optional[int],
         return
     logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
                  f'{managed_job_id}.')
-    logger.info(f'{colorama.Fore.YELLOW}Start streaming logs for {job_str}.'
-                f'{colorama.Style.RESET_ALL}')
     log_path = os.path.join(log_dir, 'run.log')
     log_path = os.path.expanduser(log_path)
 
@@ -437,7 +438,7 @@ def tail_logs(job_id: Optional[int],
             time.sleep(_SKY_LOG_WAITING_GAP_SECONDS)
         status = job_lib.update_job_status([job_id], silent=True)[0]
 
-    start_stream_at = 'INFO: Tip: use Ctrl-C to exit log'
+    start_stream_at = 'Waiting for task resources on '
     if follow and status in [
         job_lib.JobStatus.SETTING_UP,
         job_lib.JobStatus.PENDING,
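`start_stream_at` is a marker string: the tailer skips everything before the first line containing it, so users see output from the resource-wait phase onward instead of the whole provisioning preamble. A simplified version of that behavior:

```python
from typing import Iterable, Iterator


def stream_from_marker(lines: Iterable[str], marker: str) -> Iterator[str]:
    """Yields lines starting at the first one containing `marker`."""
    started = False
    for line in lines:
        if not started and marker in line:
            started = True
        if started:
            yield line


log = ['setup: installing dependencies',
       'Waiting for task resources on AWS us-east-1.',
       'task output: hello']
print('\n'.join(stream_from_marker(log, 'Waiting for task resources on ')))
# Prints only the last two lines.
```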
sky/skylet/providers/lambda_cloud/node_provider.py
CHANGED
@@ -25,7 +25,7 @@ _TAG_PATH_PREFIX = '~/.sky/generated/lambda_cloud/metadata'
 _REMOTE_SSH_KEY_NAME = '~/.lambda_cloud/ssh_key_name'
 _REMOTE_RAY_SSH_KEY = '~/ray_bootstrap_key.pem'
 _REMOTE_RAY_YAML = '~/ray_bootstrap_config.yaml'
-_GET_INTERNAL_IP_CMD = 'ip -4 -br addr show | grep UP | grep -Eo "(10\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|172\.(1[6-9]|2[0-9]|3[0-1]))\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"'
+_GET_INTERNAL_IP_CMD = 's=$(ip -4 -br addr show | grep UP); echo "$s"; echo "$s" | grep -Eo "(10\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|172\.(1[6-9]|2[0-9]|3[0-1])|104\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"'
 
 logger = logging.getLogger(__name__)
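The Lambda Cloud change does two things: it echoes the unfiltered `ip -4 -br addr show | grep UP` output before grepping (so a failed match still leaves evidence in the logs), and it extends the accepted address ranges with `104.x`, which some Lambda instances appear to use internally. A quick check of the new pattern, ported to a Python regex:

```python
import re

# The grep -Eo pattern from the new _GET_INTERNAL_IP_CMD, as a Python regex.
_OCTET = r'(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
PATTERN = re.compile(
    rf'(10\.{_OCTET}|172\.(1[6-9]|2[0-9]|3[0-1])|104\.{_OCTET})'
    rf'\.{_OCTET}\.{_OCTET}')

for addr in ('10.0.1.5', '172.16.9.1', '104.171.200.12', '192.168.1.1'):
    print(addr, bool(PATTERN.fullmatch(addr)))
# 192.168.x.x does not match: only the 10/172.16-31/104 ranges are kept.
```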