skypilot-nightly 1.0.0.dev20250108__py3-none-any.whl → 1.0.0.dev20250110__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/authentication.py +20 -8
- sky/backends/cloud_vm_ray_backend.py +146 -0
- sky/cli.py +19 -7
- sky/clouds/aws.py +26 -11
- sky/jobs/__init__.py +2 -0
- sky/jobs/core.py +46 -0
- sky/jobs/state.py +27 -0
- sky/jobs/utils.py +9 -0
- sky/provision/aws/config.py +11 -3
- sky/templates/jobs-controller.yaml.j2 +34 -4
- sky/utils/controller_utils.py +3 -0
- {skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/METADATA +11 -2
- {skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/RECORD +18 -18
- {skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'fd1ac0e6fbde830ac6be9e42f523a0e460fd84e8'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250108'
+__version__ = '1.0.0.dev20250110'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/authentication.py
CHANGED
@@ -408,14 +408,26 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     secret = k8s.client.V1Secret(
         metadata=k8s.client.V1ObjectMeta(**secret_metadata),
         string_data={secret_field_name: public_key})
-
-
-
-    secret_name
-
-
-
-
+    try:
+        if kubernetes_utils.check_secret_exists(secret_name, namespace,
+                                                context):
+            logger.debug(f'Key {secret_name} exists in the cluster, '
+                         'patching it...')
+            kubernetes.core_api(context).patch_namespaced_secret(
+                secret_name, namespace, secret)
+        else:
+            logger.debug(f'Key {secret_name} does not exist in the cluster, '
+                         'creating it...')
+            kubernetes.core_api(context).create_namespaced_secret(
+                namespace, secret)
+    except kubernetes.api_exception() as e:
+        if e.status == 409 and e.reason == 'AlreadyExists':
+            logger.debug(f'Key {secret_name} was created concurrently, '
+                         'patching it...')
+            kubernetes.core_api(context).patch_namespaced_secret(
+                secret_name, namespace, secret)
+        else:
+            raise e
 
     private_key_path, _ = get_or_generate_keys()
     if network_mode == nodeport_mode:
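The added try/except closes a time-of-check-to-time-of-use race: two processes can both observe that the secret is missing and both attempt to create it, so the loser now catches the 409 conflict and patches the existing secret instead. Below is a create-first variant of the same upsert pattern sketched against the official `kubernetes` Python client; the function name and the `ssh-publickey` field are illustrative, not SkyPilot code.

    from kubernetes import client, config


    def upsert_secret(name: str, namespace: str, public_key: str) -> None:
        # Illustrative upsert: try to create, fall back to patch on conflict.
        # Assumes a kubeconfig is available locally.
        config.load_kube_config()
        api = client.CoreV1Api()
        secret = client.V1Secret(
            metadata=client.V1ObjectMeta(name=name),
            string_data={'ssh-publickey': public_key})
        try:
            api.create_namespaced_secret(namespace, secret)
        except client.exceptions.ApiException as e:
            if e.status == 409:
                # Another writer won the race; patch the existing secret.
                api.patch_namespaced_secret(name, namespace, secret)
            else:
                raise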
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -3891,6 +3891,152 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             stdin=subprocess.DEVNULL,
         )
 
+    def sync_down_managed_job_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int] = None,
+            job_name: Optional[str] = None,
+            controller: bool = False,
+            local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
+        """Sync down logs for a managed job.
+
+        Args:
+            handle: The handle to the cluster.
+            job_id: The job ID to sync down logs for.
+            job_name: The job name to sync down logs for.
+            controller: Whether to sync down logs for the controller.
+            local_dir: The local directory to sync down logs to.
+
+        Returns:
+            A dictionary mapping job_id to log path.
+        """
+        # if job_name is not None, job_id should be None
+        assert job_name is None or job_id is None, (job_name, job_id)
+        if job_id is None and job_name is not None:
+            # generate code to get the job_id
+            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                job_name=job_name)
+            returncode, run_timestamps, stderr = self.run_on_head(
+                handle,
+                code,
+                stream_logs=False,
+                require_outputs=True,
+                separate_stderr=True)
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync down logs.',
+                                               stderr)
+            job_ids = common_utils.decode_payload(run_timestamps)
+            if not job_ids:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching job found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
+            elif len(job_ids) > 1:
+                logger.info(
+                    f'{colorama.Fore.YELLOW}'
+                    f'Multiple jobs IDs found under the name {job_name}. '
+                    'Downloading the latest job logs.'
+                    f'{colorama.Style.RESET_ALL}')
+                job_ids = [job_ids[0]]  # descending order
+        else:
+            job_ids = [job_id]
+
+        # get the run_timestamp
+        # the function takes in [job_id]
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(job_ids)
+        returncode, run_timestamps, stderr = self.run_on_head(
+            handle,
+            code,
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        subprocess_utils.handle_returncode(returncode, code,
+                                           'Failed to sync logs.', stderr)
+        # returns with a dict of {job_id: run_timestamp}
+        run_timestamps = common_utils.decode_payload(run_timestamps)
+        if not run_timestamps:
+            logger.info(f'{colorama.Fore.YELLOW}'
+                        'No matching log directories found'
+                        f'{colorama.Style.RESET_ALL}')
+            return {}
+
+        run_timestamp = list(run_timestamps.values())[0]
+        job_id = list(run_timestamps.keys())[0]
+        local_log_dir = ''
+        if controller:  # download controller logs
+            remote_log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                          run_timestamp)
+            local_log_dir = os.path.expanduser(
+                os.path.join(local_dir, run_timestamp))
+
+            logger.info(f'{colorama.Fore.CYAN}'
+                        f'Job {job_ids} local logs: {local_log_dir}'
+                        f'{colorama.Style.RESET_ALL}')
+
+            runners = handle.get_command_runners()
+
+            def _rsync_down(args) -> None:
+                """Rsync down logs from remote nodes.
+
+                Args:
+                    args: A tuple of (runner, local_log_dir, remote_log_dir)
+                """
+                (runner, local_log_dir, remote_log_dir) = args
+                try:
+                    os.makedirs(local_log_dir, exist_ok=True)
+                    runner.rsync(
+                        source=f'{remote_log_dir}/',
+                        target=local_log_dir,
+                        up=False,
+                        stream_logs=False,
+                    )
+                except exceptions.CommandError as e:
+                    if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
+                        # Raised by rsync_down. Remote log dir may not exist
+                        # since the job can be run on some part of the nodes.
+                        logger.debug(
+                            f'{runner.node_id} does not have the tasks/*.')
+                    else:
+                        raise
+
+            parallel_args = [[runner, *item]
+                             for item in zip([local_log_dir], [remote_log_dir])
+                             for runner in runners]
+            subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
+        else:  # download job logs
+            local_log_dir = os.path.expanduser(
+                os.path.join(local_dir, 'managed_jobs', run_timestamp))
+            os.makedirs(os.path.dirname(local_log_dir), exist_ok=True)
+            log_file = os.path.join(local_log_dir, 'run.log')
+
+            code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
+                                                              job_id=job_id,
+                                                              follow=False,
+                                                              controller=False)
+
+            # With the stdin=subprocess.DEVNULL, the ctrl-c will not
+            # kill the process, so we need to handle it manually here.
+            if threading.current_thread() is threading.main_thread():
+                signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+                signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+            # We redirect the output to the log file
+            # and disable the STDOUT and STDERR
+            self.run_on_head(
+                handle,
+                code,
+                log_path=log_file,
+                stream_logs=False,
+                process_stream=False,
+                ssh_mode=command_runner.SshMode.INTERACTIVE,
+                stdin=subprocess.DEVNULL,
+            )
+
+        logger.info(f'{colorama.Fore.CYAN}'
+                    f'Job {job_id} logs: {local_log_dir}'
+                    f'{colorama.Style.RESET_ALL}')
+        return {str(job_id): local_log_dir}
+
     def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
                         service_name: str, target: serve_lib.ServiceComponent,
                         replica_id: Optional[int], follow: bool) -> None:
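The controller branch fans the per-node download out through `subprocess_utils.run_in_parallel`, and treats a missing remote log directory as benign since a job may only have run on a subset of nodes. A stdlib-only sketch of that fan-out follows; `hosts`, the rsync flags, and the use of exit code 23 as the missing-files case are assumptions standing in for SkyPilot's CommandRunner and RSYNC_FILE_NOT_FOUND_CODE.

    import os
    import subprocess
    from concurrent.futures import ThreadPoolExecutor


    def sync_down_all(hosts: list, remote_dir: str, local_dir: str) -> None:
        os.makedirs(local_dir, exist_ok=True)

        def _one(host: str) -> None:
            proc = subprocess.run(
                ['rsync', '-az', f'{host}:{remote_dir}/', local_dir],
                capture_output=True)
            # Assumption: rsync exit code 23 (partial transfer, e.g. the
            # remote directory does not exist) is tolerated; anything else
            # is a real failure.
            if proc.returncode not in (0, 23):
                raise RuntimeError(proc.stderr.decode())

        with ThreadPoolExecutor() as pool:
            # list() drains the iterator so worker exceptions propagate.
            list(pool.map(_one, hosts))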
sky/cli.py
CHANGED
@@ -3933,17 +3933,29 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
     required=False,
     help='Query the latest job logs, restarting the jobs controller if stopped.'
 )
+@click.option('--sync-down',
+              '-s',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Download logs for all jobs shown in the queue.')
 @click.argument('job_id', required=False, type=int)
 @usage_lib.entrypoint
 def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool, refresh: bool):
-    """Tail the log of a managed job."""
+              controller: bool, refresh: bool, sync_down: bool):
+    """Tail or sync down the log of a managed job."""
     try:
-
-
-
-
-
+        if sync_down:
+            managed_jobs.sync_down_logs(name=name,
+                                        job_id=job_id,
+                                        controller=controller,
+                                        refresh=refresh)
+        else:
+            managed_jobs.tail_logs(name=name,
+                                   job_id=job_id,
+                                   follow=follow,
+                                   controller=controller,
+                                   refresh=refresh)
     except exceptions.ClusterNotUpError:
         with ux_utils.print_exception_no_traceback():
             raise
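Typical invocations of the new flag, with a placeholder job ID and name (`-n/--name` and `--controller` are assumed to be the pre-existing options on this command):

    sky jobs logs --sync-down 42       # download logs for managed job 42
    sky jobs logs -s -n my-job         # download logs for the latest job named my-job
    sky jobs logs -s --controller 42   # download the controller logs for job 42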
sky/clouds/aws.py
CHANGED
@@ -95,6 +95,10 @@ class AWSIdentityType(enum.Enum):
 
     CONTAINER_ROLE = 'container-role'
 
+    CUSTOM_PROCESS = 'custom-process'
+
+    ASSUME_ROLE = 'assume-role'
+
     # Name Value Type Location
     # ---- ----- ---- --------
     # profile <not set> None None
@@ -614,10 +618,27 @@ class AWS(clouds.Cloud):
             hints = f'AWS IAM role is set.{single_cloud_hint}'
         elif identity_type == AWSIdentityType.CONTAINER_ROLE:
             # Similar to the IAM ROLE, an ECS container may not store credentials
-            # in the~/.aws/credentials file. So we don't check for the existence of
+            # in the ~/.aws/credentials file. So we don't check for the existence of
             # the file. i.e. the container will be assigned the IAM role of the
             # task: skypilot-v1.
             hints = f'AWS container-role is set.{single_cloud_hint}'
+        elif identity_type == AWSIdentityType.CUSTOM_PROCESS:
+            # Similar to the IAM ROLE, a custom process may not store credentials
+            # in the ~/.aws/credentials file. So we don't check for the existence of
+            # the file. i.e. the custom process will be assigned the IAM role of the
+            # task: skypilot-v1.
+            hints = f'AWS custom-process is set.{single_cloud_hint}'
+        elif identity_type == AWSIdentityType.ASSUME_ROLE:
+            # When using ASSUME ROLE, the credentials are coming from a different
+            # source profile. So we don't check for the existence of ~/.aws/credentials.
+            # i.e. the assumed role will be assigned the IAM role of the
+            # task: skypilot-v1.
+            hints = f'AWS assume-role is set.{single_cloud_hint}'
+        elif identity_type == AWSIdentityType.ENV:
+            # When using ENV vars, the credentials are coming from the environment
+            # variables. So we don't check for the existence of ~/.aws/credentials.
+            # i.e. the identity is not determined by the file.
+            hints = f'AWS env is set.{single_cloud_hint}'
         else:
             # This file is required because it is required by the VMs launched on
             # other clouds to access private s3 buckets and resources like EC2.
@@ -669,16 +690,10 @@
                     f'Unexpected `aws configure list` output:\n{output}')
             return len(results) == 1
 
-
-
-
-
-        elif _is_access_key_of_type(AWSIdentityType.CONTAINER_ROLE.value):
-            return AWSIdentityType.CONTAINER_ROLE
-        elif _is_access_key_of_type(AWSIdentityType.ENV.value):
-            return AWSIdentityType.ENV
-        else:
-            return AWSIdentityType.SHARED_CREDENTIALS_FILE
+        for identity_type in AWSIdentityType:
+            if _is_access_key_of_type(identity_type.value):
+                return identity_type
+        return AWSIdentityType.SHARED_CREDENTIALS_FILE
 
     @classmethod
     @functools.lru_cache(maxsize=1)
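The rewritten detection replaces a hard-coded elif chain with a loop over the enum. Since `enum.Enum` iterates members in definition order, precedence is now just the order of declaration, with `SHARED_CREDENTIALS_FILE` kept as the explicit fallback, so adding an identity type such as `CUSTOM_PROCESS` no longer touches the detection logic. A toy sketch of that behavior; the members and the `output` string are placeholders for the parsed `aws configure list` output:

    import enum


    class IdentityType(enum.Enum):
        IAM_ROLE = 'iam-role'
        CUSTOM_PROCESS = 'custom-process'
        SHARED_CREDENTIALS_FILE = 'shared-credentials-file'


    def detect(output: str) -> IdentityType:
        # Definition order decides precedence; the first match wins.
        for identity_type in IdentityType:
            if identity_type.value in output:
                return identity_type
        return IdentityType.SHARED_CREDENTIALS_FILE


    print(detect('credentials   custom-process'))  # IdentityType.CUSTOM_PROCESS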
sky/jobs/__init__.py
CHANGED
@@ -9,6 +9,7 @@ from sky.jobs.core import cancel
 from sky.jobs.core import launch
 from sky.jobs.core import queue
 from sky.jobs.core import queue_from_kubernetes_pod
+from sky.jobs.core import sync_down_logs
 from sky.jobs.core import tail_logs
 from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
 from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
@@ -37,6 +38,7 @@ __all__ = [
     'queue',
     'queue_from_kubernetes_pod',
     'tail_logs',
+    'sync_down_logs',
     # utils
     'ManagedJobCodeGen',
     'format_job_table',
sky/jobs/core.py
CHANGED
@@ -427,6 +427,52 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
                       controller=controller)
 
 
+@usage_lib.entrypoint
+def sync_down_logs(
+        name: Optional[str],
+        job_id: Optional[int],
+        refresh: bool,
+        controller: bool,
+        local_dir: str = skylet_constants.SKY_LOGS_DIRECTORY) -> None:
+    """Sync down logs of managed jobs.
+
+    Please refer to sky.cli.job_logs for documentation.
+
+    Raises:
+        ValueError: invalid arguments.
+        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
+    """
+    # TODO(zhwu): Automatically restart the jobs controller
+    if name is not None and job_id is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Cannot specify both name and job_id.')
+
+    jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
+    job_name_or_id_str = ''
+    if job_id is not None:
+        job_name_or_id_str = str(job_id)
+    elif name is not None:
+        job_name_or_id_str = f'-n {name}'
+    else:
+        job_name_or_id_str = ''
+    handle = _maybe_restart_controller(
+        refresh,
+        stopped_message=(
+            f'{jobs_controller_type.value.name.capitalize()} is stopped. To '
+            f'get the logs, run: {colorama.Style.BRIGHT}sky jobs logs '
+            f'-r --sync-down {job_name_or_id_str}{colorama.Style.RESET_ALL}'),
+        spinner_message='Retrieving job logs')
+
+    backend = backend_utils.get_backend_from_handle(handle)
+    assert isinstance(backend, backends.CloudVmRayBackend), backend
+
+    backend.sync_down_managed_job_logs(handle,
+                                       job_id=job_id,
+                                       job_name=name,
+                                       controller=controller,
+                                       local_dir=local_dir)
+
+
 spot_launch = common_utils.deprecated_function(
     launch,
     name='sky.jobs.launch',
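From Python the new entrypoint mirrors the CLI path; per the signature above, `local_dir` defaults to the skylet logs directory. The job name below is a placeholder:

    import sky.jobs

    # Download logs for the most recent managed job named 'my-job'; pass
    # refresh=True to restart a stopped jobs controller first.
    sky.jobs.sync_down_logs(name='my-job', job_id=None, refresh=False,
                            controller=False)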
sky/jobs/state.py
CHANGED
@@ -564,6 +564,33 @@ def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
     return job_ids
 
 
+def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
+    """Get all job ids by name."""
+    name_filter = ''
+    field_values = []
+    if name is not None:
+        # We match the job name from `job_info` for the jobs submitted after
+        # #1982, and from `spot` for the jobs submitted before #1982, whose
+        # job_info is not available.
+        name_filter = ('WHERE (job_info.name=(?) OR '
+                       '(job_info.name IS NULL AND spot.task_name=(?)))')
+        field_values = [name, name]
+
+    # Left outer join is used here instead of join, because the job_info does
+    # not contain the managed jobs submitted before #1982.
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            f"""\
+            SELECT DISTINCT spot.spot_job_id
+            FROM spot
+            LEFT OUTER JOIN job_info
+            ON spot.spot_job_id=job_info.spot_job_id
+            {name_filter}
+            ORDER BY spot.spot_job_id DESC""", field_values).fetchall()
+        job_ids = [row[0] for row in rows if row[0] is not None]
+        return job_ids
+
+
 def _get_all_task_ids_statuses(
         job_id: int) -> List[Tuple[int, ManagedJobStatus]]:
     with db_utils.safe_cursor(_DB_PATH) as cursor:
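The LEFT OUTER JOIN is load-bearing: jobs submitted before #1982 have a `spot` row but no `job_info` row, and an inner join would silently drop them, which is also why the filter accepts `job_info.name IS NULL` together with a `task_name` match. A self-contained sqlite3 illustration with a toy two-row dataset:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.executescript("""
        CREATE TABLE spot (spot_job_id INTEGER, task_name TEXT);
        CREATE TABLE job_info (spot_job_id INTEGER, name TEXT);
        INSERT INTO spot VALUES (1, 'train');  -- old-style job: no job_info row
        INSERT INTO spot VALUES (2, 'train');  -- new-style job: has job_info row
        INSERT INTO job_info VALUES (2, 'train');
    """)
    rows = conn.execute(
        """SELECT DISTINCT spot.spot_job_id
           FROM spot
           LEFT OUTER JOIN job_info ON spot.spot_job_id = job_info.spot_job_id
           WHERE (job_info.name = ? OR
                  (job_info.name IS NULL AND spot.task_name = ?))
           ORDER BY spot.spot_job_id DESC""", ('train', 'train')).fetchall()
    print([r[0] for r in rows])  # [2, 1]: both generations found, newest first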
sky/jobs/utils.py
CHANGED
@@ -855,6 +855,15 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)
 
+    @classmethod
+    def get_all_job_ids_by_name(cls, job_name: str) -> str:
+        code = textwrap.dedent(f"""\
+        from sky.utils import common_utils
+        job_id = managed_job_state.get_all_job_ids_by_name({job_name!r})
+        print(common_utils.encode_payload(job_id), end="", flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def stream_logs(cls,
                     job_name: Optional[str],
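Note the `{job_name!r}` interpolation: the generated snippet is shipped to the controller as source text, so the name must land in it as a correctly quoted Python literal, which `!r` guarantees even for names containing quotes. A standalone illustration (the callee is a placeholder):

    import textwrap

    job_name = "alice's job"
    code = textwrap.dedent(f"""\
        job_ids = get_all_job_ids_by_name({job_name!r})
        """)
    print(code)  # job_ids = get_all_job_ids_by_name("alice's job")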
sky/provision/aws/config.py
CHANGED
@@ -383,10 +383,13 @@ def _usable_subnets(
         raise exc
 
     if not subnets:
+        vpc_msg = (f'Does a default VPC exist in region '
+                   f'{ec2.meta.client.meta.region_name}? ') if (
+                       vpc_id_of_sg is None) else ''
         _skypilot_log_error_and_exit_for_failover(
-            'No usable subnets found
-            'manually creating an instance in your specified region to '
-            'populate the list of subnets and
+            f'No usable subnets found. {vpc_msg}'
+            'Try manually creating an instance in your specified region to '
+            'populate the list of subnets and try again. '
             'Note that the subnet must map public IPs '
            'on instance launch unless you set `use_internal_ips: true` in '
             'the `provider` config.')
@@ -495,6 +498,11 @@ def _get_subnet_and_vpc_id(ec2, security_group_ids: Optional[List[str]],
         vpc_id_of_sg = None
 
     all_subnets = list(ec2.subnets.all())
+    # If no VPC is specified, use the default VPC.
+    # We filter only for default VPCs to avoid using subnets that users may
+    # not want SkyPilot to use.
+    if vpc_id_of_sg is None:
+        all_subnets = [s for s in all_subnets if s.vpc.is_default]
     subnets, vpc_id = _usable_subnets(
         ec2,
         user_specified_subnets=None,
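The added filter narrows the candidates to default-VPC subnets whenever the user has not pinned a VPC through security groups, so SkyPilot avoids launching into subnets that happen to exist in unrelated VPCs. A hedged boto3 sketch of the same filter; the region is an example, and each `s.vpc.is_default` access may lazily load the subnet's VPC:

    import boto3

    ec2 = boto3.resource('ec2', region_name='us-east-1')  # example region
    default_vpc_subnets = [s for s in ec2.subnets.all() if s.vpc.is_default]
    print([s.id for s in default_vpc_subnets])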
sky/templates/jobs-controller.yaml.j2
CHANGED
@@ -26,10 +26,40 @@ setup: |
   echo 'export SKYPILOT_DEV=1' >> ~/.bashrc
   {% endif %}
 
-  #
-
-
-
+  # Create systemd service file
+  mkdir -p ~/.config/systemd/user/
+
+  # Create systemd user service file
+  cat << EOF > ~/.config/systemd/user/skypilot-dashboard.service
+  [Unit]
+  Description=SkyPilot Jobs Dashboard
+  After=network.target
+
+  [Service]
+  Environment="PATH={{ sky_python_env_path }}:\$PATH"
+  Environment="SKYPILOT_USER_ID={{controller_envs.SKYPILOT_USER_ID}}"
+  Environment="SKYPILOT_USER={{controller_envs.SKYPILOT_USER}}"
+  Restart=always
+  StandardOutput=append:/home/$USER/.sky/job-dashboard.log
+  StandardError=append:/home/$USER/.sky/job-dashboard.log
+  ExecStart={{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard
+
+  [Install]
+  WantedBy=default.target
+  EOF
+
+  if command -v systemctl &>/dev/null && systemctl --user show &>/dev/null; then
+    systemctl --user daemon-reload
+    systemctl --user enable --now skypilot-dashboard
+  else
+    echo "Systemd user services not found. Setting up SkyPilot dashboard manually."
+    # Kill any old dashboard processes
+    ps aux | grep -v nohup | grep -v grep | grep -- '-m sky.jobs.dashboard.dashboard' \
+      | awk '{print $2}' | xargs kill > /dev/null 2>&1 || true
+    # Launch the dashboard in the background if not already running
+    (ps aux | grep -v nohup | grep -v grep | grep -q -- '-m sky.jobs.dashboard.dashboard') || \
+      (nohup {{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard >> ~/.sky/job-dashboard.log 2>&1 &)
+  fi
 
 run: |
   {{ sky_activate_python_env }}
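When the systemd path is taken, the dashboard becomes a regular user unit: the user manager restarts it on failure and starts it with the user session via `WantedBy=default.target`; otherwise the script falls back to a `nohup` background process. Assuming the unit name and log path from the template, its health can be checked with:

    systemctl --user status skypilot-dashboard
    tail ~/.sky/job-dashboard.log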
sky/utils/controller_utils.py
CHANGED
@@ -206,6 +206,9 @@ def _get_cloud_dependencies_installation_commands(
     # installed, so we don't check that.
     python_packages: Set[str] = set()
 
+    # add flask to the controller dependencies for dashboard
+    python_packages.add('flask')
+
     step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
     commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
                     f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')
{skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250108
+Version: 1.0.0.dev20250110
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -146,6 +146,15 @@ Requires-Dist: pydo>=0.3.0; extra == "all"
 Requires-Dist: azure-core>=1.24.0; extra == "all"
 Requires-Dist: azure-common; extra == "all"
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
+Dynamic: author
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: summary
 
 <p align="center">
   <img alt="SkyPilot" src="https://raw.githubusercontent.com/skypilot-org/skypilot/master/docs/source/images/skypilot-wide-light-1k.png" width=55%>
{skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=gyZh8lvbKyHrmnJe3wuoL4bFMxus_ERhfhmLop8pq3s,5944
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
-sky/authentication.py,sha256=
+sky/authentication.py,sha256=LXUDABKP1FJCS256xTTDJa40WXwHKF5x49S-4hZbD1M,21501
 sky/check.py,sha256=s8deMVL-k9y8gd519K7NWZc3DqWsEySwiAr0uH3Vvcc,9459
-sky/cli.py,sha256=
+sky/cli.py,sha256=ra3u-Erv8TwalWFU1Fw4_ix0oUWfVAd9eQsruQRx_Lc,214915
 sky/cloud_stores.py,sha256=PcLT57_8SZy7o6paAluElfBynaLkbaOq3l-8dNg1AVM,23672
 sky/core.py,sha256=CPwNZQlC5WKLzTb2Tjo2Uogg0EvOt-yLCRlegqK_92A,38598
 sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
@@ -32,7 +32,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=iBs5gnMaaUoH2OIQ3xhAjWdrJWqj8T61Za9TGsBFpvQ,7515
 sky/backends/backend_utils.py,sha256=Eeew8YV0VYSYxozqzadNMZrjhEMjlE3yuzTRP7YSl50,137348
-sky/backends/cloud_vm_ray_backend.py,sha256=
+sky/backends/cloud_vm_ray_backend.py,sha256=OZGMcazLq9bWEmWHk0Fkdj_SLjxp8GYuu2sL__D-rls,247424
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=nSYCjms3HOPjPNOrcCqsUKm1WV3AAovRFjEQ7hcEXW4,17021
 sky/backends/wheel_utils.py,sha256=5BUzBqfYz7p1ME6_0PXGmcsAkLVb8NrFt317p7a4X8s,8278
@@ -41,7 +41,7 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
 sky/benchmark/benchmark_utils.py,sha256=mP8Ox2WiKfthq6LcUAZnHknFQ0n8v9o_rCh1LXLgkqc,26192
 sky/clouds/__init__.py,sha256=iORaV6auqMxa6-6FKgt1C9f3UqRk1GASUtakua3tb9A,1395
-sky/clouds/aws.py,sha256=
+sky/clouds/aws.py,sha256=6mwI6wb1ry11KTMMdRVJ6W5cQuGF_v3gmRs4axJxEQw,53806
 sky/clouds/azure.py,sha256=KtnnNZn4ZEr7xndBHxX91v0YXSI1QWPgIefuM1zDUBA,30784
 sky/clouds/cloud.py,sha256=5_ZduUcyCEY1JnX_h0PrJ5xwtPP4oor4jf6cICgSArc,35370
 sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
@@ -96,13 +96,13 @@ sky/data/data_utils.py,sha256=HjcgMDuWRR_fNQ9gjuROi9GgPVvTGApiJwxGtdb2_UU,28860
 sky/data/mounting_utils.py,sha256=FfOYpu4Rvj8WT4NNLAZPP8nj5k9MQQCXlEgmoid_lus,14455
 sky/data/storage.py,sha256=07ccD5YaQ9j6R_zPkvNk7qXnW3awDkCn9V-Sx-KXGvo,201715
 sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
-sky/jobs/__init__.py,sha256=
+sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
 sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
 sky/jobs/controller.py,sha256=DDt92Sa0TV3VULnEyM5QopUowciH6PE9u0yTDumFatM,28538
-sky/jobs/core.py,sha256=
+sky/jobs/core.py,sha256=AVbboohNCUDqfK_7DDkc-wJOg87nE7L6Vw0wbPTelIA,20022
 sky/jobs/recovery_strategy.py,sha256=eP9CLy5qiNTyMJTWWzAxdQ4YolUZWL1g3cLMH7tw8Es,27312
-sky/jobs/state.py,sha256=
-sky/jobs/utils.py,sha256=
+sky/jobs/state.py,sha256=1NeW0SVtfVd02MnS9OzvV-OV9Plch8QLH-ZZnttaLCg,27598
+sky/jobs/utils.py,sha256=G-3f0qxJEep4Rl52UxnXLcVmjt2uLYn0qUja1pClwmw,39031
 sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
 sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
 sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
@@ -115,7 +115,7 @@ sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
 sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
 sky/provision/provisioner.py,sha256=ZOgFOO0NB4QZVPwd4qikRqi615Bq67n0Vcl3cTDVxNE,29153
 sky/provision/aws/__init__.py,sha256=mxq8PeWJqUtalDozTNpbtENErRZ1ktEs8uf2aG9UUgU,731
-sky/provision/aws/config.py,sha256=
+sky/provision/aws/config.py,sha256=_8jvi8UVMtIVChDDnv5uHV2CoPyKvKqvxJ4xIEBYdDc,24629
 sky/provision/aws/instance.py,sha256=eCslJ2XfJo_pkQMnKFQqhGnUIRvwKiT12oxBY5-klss,40750
 sky/provision/aws/utils.py,sha256=m49pS-SHGW7Au3bhDeTPsL8N5iRzbwOXzyEWRCc1Vho,3238
 sky/provision/azure/__init__.py,sha256=87cgk1_Ws7n9rqaDDPv-HpfrkVeSQMdFQnhnXwyx9g4,548
@@ -237,7 +237,7 @@ sky/templates/do-ray.yml.j2,sha256=sRKpn0tC-uPYtSZ20OB4fMzE7RbPQUr8kOCIbuJ4b4Q,4
 sky/templates/fluidstack-ray.yml.j2,sha256=t8TCULgiErCZdtFmBZVsA8ZdcqR7ccwsmQhuDFTBEAU,3541
 sky/templates/gcp-ray.yml.j2,sha256=y95B-Nk6hFxm6vEIaxI1wFzAIcy_GcKC3XMYo9m-ThI,9662
 sky/templates/ibm-ray.yml.j2,sha256=RMBUqPId8i4CnVwcyfK3DbRapF1jFMuGQlY0E0PFbMU,6669
-sky/templates/jobs-controller.yaml.j2,sha256=
+sky/templates/jobs-controller.yaml.j2,sha256=SDC4VzQ-difQ1pSh6YensI14GDVJjeKMBMjl7gibq7A,2597
 sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
 sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8wyVEbRNFHRsBuLsw,626
 sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=iw7mypHszg6Ggq9MbyiYMFOkSlXaQZulaxqC5IWYGCc,3381
@@ -262,7 +262,7 @@ sky/utils/command_runner.py,sha256=ewDjFxcCOv0OeG2aUOIfVWmTls65up9DvSnAXURvGfM,3
 sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
 sky/utils/common_utils.py,sha256=Kh0iymQl9I4HXxYSc3TTcv-xeso27pU_1hGNOc9Xw2o,25370
 sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR26H5I,1416
-sky/utils/controller_utils.py,sha256=
+sky/utils/controller_utils.py,sha256=g4wvp6BrXUcwjRbMvy_LBtZPMPOzHXeRWyEoXORoZrU,44381
 sky/utils/dag_utils.py,sha256=R1yhJssvzDg13p6PJIC8OkYFBiR64eIx5xQeRpAG9n4,6099
 sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
 sky/utils/env_options.py,sha256=E5iwRFBUY2Iq6e0y0c1Mv5OSQ4MRNdk0-p38xUyVerc,1366
@@ -288,9 +288,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20250110.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250110.dist-info/METADATA,sha256=JylfOy73kl-oaZXLQ4wZQXK3YFZoFDn6-Ch-_z6h8-U,20632
+skypilot_nightly-1.0.0.dev20250110.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+skypilot_nightly-1.0.0.dev20250110.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250110.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250110.dist-info/RECORD,,
{skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/LICENSE
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/entry_points.txt
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20250108.dist-info → skypilot_nightly-1.0.0.dev20250110.dist-info}/top_level.txt
RENAMED
File without changes