skypilot-nightly 1.0.0.dev20241107__py3-none-any.whl → 1.0.0.dev20241109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +12 -6
- sky/backends/wheel_utils.py +5 -1
- sky/cli.py +28 -4
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +3 -1
- sky/core.py +3 -2
- sky/dag.py +1 -0
- sky/data/mounting_utils.py +4 -16
- sky/exceptions.py +4 -1
- sky/execution.py +10 -8
- sky/jobs/core.py +3 -1
- sky/jobs/dashboard/dashboard.py +2 -1
- sky/jobs/recovery_strategy.py +16 -5
- sky/jobs/state.py +94 -79
- sky/jobs/utils.py +18 -10
- sky/provision/aws/config.py +25 -5
- sky/provision/instance_setup.py +1 -0
- sky/provision/runpod/instance.py +6 -1
- sky/serve/core.py +11 -1
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +10 -3
- sky/skylet/log_lib.py +77 -8
- sky/templates/kubernetes-ray.yml.j2 +3 -1
- sky/utils/admin_policy_utils.py +1 -0
- sky/utils/command_runner.py +14 -2
- sky/utils/control_master_utils.py +49 -0
- {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/RECORD +32 -31
- {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '42c79e1d0a5e018e275705ada53957573f9a0181'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20241109'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -3262,6 +3262,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     ) -> None:
         """Executes generated code on the head node."""
         style = colorama.Style
+        fore = colorama.Fore
 
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
         remote_log_dir = self.log_dir
@@ -3373,9 +3374,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         controller = controller_utils.Controllers.from_name(name)
         if controller == controller_utils.Controllers.JOBS_CONTROLLER:
             logger.info(
-                f'\n
-                f'\nManaged Job ID: '
+                f'\n{fore.CYAN}Managed Job ID: '
                 f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
+                f'\n📋 Useful Commands'
                 f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t\t'
                 f'{ux_utils.BOLD}sky jobs cancel {job_id}'
                 f'{ux_utils.RESET_BOLD}'
@@ -3392,8 +3393,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f'dashboard:\t{ux_utils.BOLD}sky jobs dashboard'
                 f'{ux_utils.RESET_BOLD}')
         elif controller is None:
-            logger.info(f'\n
-                        f'
+            logger.info(f'\n{fore.CYAN}Job ID: '
+                        f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
+                        f'\n📋 Useful Commands'
                         f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t'
                         f'{ux_utils.BOLD}sky cancel {name} {job_id}'
                         f'{ux_utils.RESET_BOLD}'
@@ -3709,7 +3711,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                   handle: CloudVmRayResourceHandle,
                   job_id: Optional[int],
                   managed_job_id: Optional[int] = None,
-                  follow: bool = True
+                  follow: bool = True,
+                  tail: int = 0) -> int:
         """Tail the logs of a job.
 
         Args:
@@ -3717,10 +3720,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             job_id: The job ID to tail the logs of.
             managed_job_id: The managed job ID for display purpose only.
             follow: Whether to follow the logs.
+            tail: The number of lines to display from the end of the
+                log file. If 0, print all lines.
         """
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
-                                               follow=follow
+                                               follow=follow,
+                                               tail=tail)
         if job_id is None and managed_job_id is None:
             logger.info(
                 'Job ID not provided. Streaming the logs of the latest job.')
sky/backends/wheel_utils.py
CHANGED
@@ -129,7 +129,11 @@ def _build_sky_wheel() -> pathlib.Path:
 
     wheel_dir = WHEEL_DIR / hash_of_latest_wheel
     wheel_dir.mkdir(parents=True, exist_ok=True)
-    shutil.move
+    # shutil.move will fail when the file already exists and is being
+    # moved across filesystems.
+    if not os.path.exists(
+            os.path.join(wheel_dir, os.path.basename(wheel_path))):
+        shutil.move(str(wheel_path), wheel_dir)
     return wheel_dir / wheel_path.name
 
 
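
The guard above exists because `shutil.move` falls back to copy-and-delete when the source and destination live on different filesystems, and it raises if a same-named file is already present in the destination directory. A minimal sketch of the pattern outside SkyPilot (paths are illustrative):

    import os
    import shutil

    def move_if_absent(src: str, dst_dir: str) -> str:
        # shutil.move raises when dst_dir already contains a file with
        # src's basename, so check before moving.
        dst = os.path.join(dst_dir, os.path.basename(src))
        if not os.path.exists(dst):
            shutil.move(src, dst_dir)
        return dst
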
sky/cli.py
CHANGED
@@ -46,6 +46,7 @@ from rich import progress as rich_progress
 import yaml
 
 import sky
+from sky import admin_policy
 from sky import backends
 from sky import check as sky_check
 from sky import clouds as sky_clouds
@@ -67,6 +68,7 @@ from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import admin_policy_utils
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import dag_utils
@@ -582,6 +584,15 @@ def _launch_with_confirm(
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(f'{colorama.Fore.YELLOW}{e}'
                                f'{colorama.Style.RESET_ALL}') from e
+    dag, _ = admin_policy_utils.apply(
+        dag,
+        request_options=admin_policy.RequestOptions(
+            cluster_name=cluster,
+            idle_minutes_to_autostop=idle_minutes_to_autostop,
+            down=down,
+            dryrun=dryrun,
+        ),
+    )
     dag = sky.optimize(dag)
     task = dag.tasks[0]
 
@@ -2011,6 +2022,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
     help=('Follow the logs of a job. '
           'If --no-follow is specified, print the log so far and exit. '
          '[default: --follow]'))
+@click.option(
+    '--tail',
+    default=0,
+    type=int,
+    help=('The number of lines to display from the end of the log file. '
+          'Default is 0, which means print all lines.'))
 @click.argument('cluster',
                 required=True,
                 type=str,
@@ -2024,6 +2041,7 @@ def logs(
     sync_down: bool,
     status: bool,  # pylint: disable=redefined-outer-name
     follow: bool,
+    tail: int,
 ):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail the log of a job.
@@ -2090,7 +2108,7 @@ def logs(
         click.secho(f'Job {id_str}not found', fg='red')
         sys.exit(1)
 
-    core.tail_logs(cluster, job_id, follow)
+    core.tail_logs(cluster, job_id, follow, tail)
 
 
 @cli.command()
@@ -3036,9 +3054,9 @@ def show_gpus(
     and spot instances. There may be multiple regions with the same lowest
     price.
 
-    If ``--cloud kubernetes`` is specified, it will show the
-    of the GPU available on a single node and the real-time
-    the GPU across all nodes in the Kubernetes cluster.
+    If ``--cloud kubernetes`` or ``--cloud k8s`` is specified, it will show the
+    maximum quantities of the GPU available on a single node and the real-time
+    availability of the GPU across all nodes in the Kubernetes cluster.
 
     Definitions of certain fields:
 
@@ -3667,6 +3685,8 @@ def jobs_launch(
 
     click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
                 fg='cyan')
+    dag, _ = admin_policy_utils.apply(
+        dag, use_mutated_config_in_current_request=False)
     dag = sky.optimize(dag)
 
     if not yes:
@@ -4145,6 +4165,8 @@ def serve_up(
                 fg='cyan')
     with sky.Dag() as dag:
         dag.add(task)
+    dag, _ = admin_policy_utils.apply(
+        dag, use_mutated_config_in_current_request=False)
     sky.optimize(dag)
 
     if not yes:
@@ -4261,6 +4283,8 @@ def serve_update(
                 fg='cyan')
     with sky.Dag() as dag:
         dag.add(task)
+    dag, _ = admin_policy_utils.apply(
+        dag, use_mutated_config_in_current_request=False)
     sky.optimize(dag)
 
     if not yes:
|
|
20
20
|
|
21
21
|
# List of all possible regions.
|
22
22
|
REGIONS = [
|
23
|
-
'australia-southeast-1',
|
24
23
|
'europe-central-1',
|
25
24
|
'asia-south-1',
|
26
25
|
'me-west-1',
|
@@ -28,9 +27,12 @@ REGIONS = [
|
|
28
27
|
'asia-northeast-1',
|
29
28
|
'asia-northeast-2',
|
30
29
|
'us-east-1',
|
30
|
+
'us-east-2',
|
31
31
|
'us-west-2',
|
32
32
|
'us-west-1',
|
33
33
|
'us-south-1',
|
34
|
+
'us-south-2',
|
35
|
+
'us-south-3',
|
34
36
|
'us-west-3',
|
35
37
|
'us-midwest-1',
|
36
38
|
]
|
sky/core.py
CHANGED
@@ -742,7 +742,8 @@ def cancel(
 @usage_lib.entrypoint
 def tail_logs(cluster_name: str,
               job_id: Optional[int],
-              follow: bool = True
+              follow: bool = True,
+              tail: int = 0) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail the logs of a job.
 
@@ -775,7 +776,7 @@ def tail_logs(cluster_name: str,
                 f'{colorama.Style.RESET_ALL}')
 
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    backend.tail_logs(handle, job_id, follow=follow)
+    backend.tail_logs(handle, job_id, follow=follow, tail=tail)
 
 
 @usage_lib.entrypoint
sky/dag.py
CHANGED
sky/data/mounting_utils.py
CHANGED
@@ -276,23 +276,11 @@ def get_mounting_command(
     script = get_mounting_script(mount_path, mount_cmd, install_cmd,
                                  version_check_cmd)
 
-    # TODO(romilb): Get direct bash script to work like so:
-    # command = f'bash <<-\EOL' \
-    #           f'{script}' \
-    #           'EOL'
-
-    # TODO(romilb): This heredoc should have EOF after script, but it
-    # fails with sky's ssh pipeline. Instead, we don't use EOF and use )
-    # as the end of heredoc. This raises a warning (here-document delimited
-    # by end-of-file) that can be safely ignored.
-
     # While these commands are run sequentially for each storage object,
     # we add random int to be on the safer side and avoid collisions.
     script_path = f'~/.sky/mount_{random.randint(0, 1000000)}.sh'
-
-
-    f'{
-    f'
-    f' && bash {script_path}'
-    f' && rm {script_path}')
+    command = (f'echo {shlex.quote(script)} > {script_path} && '
+               f'chmod +x {script_path} && '
+               f'bash {script_path} && '
+               f'rm {script_path}')
     return command
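
The replacement command drops the old heredoc approach entirely: the script is passed through `shlex.quote` so the whole multi-line body survives the shell as a single `echo` argument. A standalone sketch (script body and path are illustrative):

    import shlex

    script = '#!/bin/bash\necho "mounting..."\n'  # illustrative body
    script_path = '~/.sky/mount_12345.sh'  # illustrative path
    # shlex.quote turns the multi-line script into one safely escaped
    # shell word, so no heredoc (and no EOF-delimiter pitfalls) is needed.
    command = (f'echo {shlex.quote(script)} > {script_path} && '
               f'chmod +x {script_path} && '
               f'bash {script_path} && '
               f'rm {script_path}')
    print(command)
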
sky/exceptions.py
CHANGED
@@ -3,6 +3,8 @@ import enum
 import typing
 from typing import List, Optional, Sequence
 
+from sky.utils import env_options
+
 if typing.TYPE_CHECKING:
     from sky import status_lib
     from sky.backends import backend
@@ -104,7 +106,8 @@ class CommandError(Exception):
         if not command:
             message = error_msg
         else:
-            if len(command) > 100
+            if (len(command) > 100 and
+                    not env_options.Options.SHOW_DEBUG_INFO.get()):
                 # Chunck the command to avoid overflow.
                 command = command[:100] + '...'
             message = (f'Command {command} failed with return code '
sky/execution.py
CHANGED
@@ -160,14 +160,16 @@ def _execute(
     """
 
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
-[... removed lines 163-170 not shown in this diff view ...]
+    if not dag.policy_applied:
+        dag, _ = admin_policy_utils.apply(
+            dag,
+            request_options=admin_policy.RequestOptions(
+                cluster_name=cluster_name,
+                idle_minutes_to_autostop=idle_minutes_to_autostop,
+                down=down,
+                dryrun=dryrun,
+            ),
+        )
     assert len(dag) == 1, f'We support 1 task for now. {dag}'
     task = dag.tasks[0]
 
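
The new `dag.policy_applied` check makes policy application idempotent: a DAG that already went through `admin_policy_utils.apply` in the CLI is not mutated a second time. A generic sketch of the flag pattern (the classes and helpers here are hypothetical):

    class Dag:
        def __init__(self) -> None:
            self.policy_applied = False  # set once a policy has run

    def apply_policy(dag: Dag) -> Dag:
        # ... mutate the dag according to the admin policy ...
        dag.policy_applied = True
        return dag

    def execute(dag: Dag) -> None:
        if not dag.policy_applied:  # idempotence guard, as in _execute()
            dag = apply_policy(dag)
        # ... optimize and launch ...
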
sky/jobs/core.py
CHANGED
@@ -59,8 +59,10 @@ def launch(
     """
     entrypoint = task
     dag_uuid = str(uuid.uuid4().hex[:4])
-
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+    # Always apply the policy again here, even though it might have been applied
+    # in the CLI. This is to ensure that we apply the policy to the final DAG
+    # and get the mutated config.
     dag, mutated_user_config = admin_policy_utils.apply(
         dag, use_mutated_config_in_current_request=False)
     if not dag.is_chain():
sky/jobs/dashboard/dashboard.py
CHANGED
@@ -26,7 +26,8 @@ def _is_running_on_jobs_controller() -> bool:
     """
     if pathlib.Path('~/.sky/sky_ray.yml').expanduser().exists():
         config = yaml.safe_load(
-            pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text(
+            pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text(
+                encoding='utf-8'))
         cluster_name = config.get('cluster_name', '')
         candidate_controller_names = (
             controller_utils.Controllers.JOBS_CONTROLLER.value.
sky/jobs/recovery_strategy.py
CHANGED
@@ -36,6 +36,11 @@ DEFAULT_RECOVERY_STRATEGY = None
 # 10 * JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 10 * 5 = 50 seconds
 MAX_JOB_CHECKING_RETRY = 10
 
+# Minutes to job cluster autodown. This should be significantly larger than
+# managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
+# cluster before its status can be updated by the job controller.
+_AUTODOWN_MINUTES = 5
+
 
 def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
     """Terminate the cluster."""
@@ -302,11 +307,17 @@ class StrategyExecutor:
                 usage_lib.messages.usage.set_internal()
                 # Detach setup, so that the setup failure can be detected
                 # by the controller process (job_status -> FAILED_SETUP).
-                sky.launch(
-[... removed lines 306-309 not shown in this diff view ...]
+                sky.launch(
+                    self.dag,
+                    cluster_name=self.cluster_name,
+                    # We expect to tear down the cluster as soon as the job is
+                    # finished. However, in case the controller dies, set
+                    # autodown to try and avoid a resource leak.
+                    idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                    down=True,
+                    detach_setup=True,
+                    detach_run=True,
+                    _is_launched_by_jobs_controller=True)
                 logger.info('Managed job cluster launched.')
             except (exceptions.InvalidClusterNameError,
                     exceptions.NoCloudAccessError,
sky/jobs/state.py
CHANGED
@@ -12,6 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import colorama
 
 from sky import sky_logging
+from sky.utils import common_utils
 from sky.utils import db_utils
 
 if typing.TYPE_CHECKING:
@@ -22,23 +23,6 @@ CallbackType = Callable[[str], None]
 logger = sky_logging.init_logger(__name__)
 
 
-def _get_db_path() -> str:
-    """Workaround to collapse multi-step Path ops for type checker.
-    Ensures _DB_PATH is str, avoiding Union[Path, str] inference.
-    """
-    path = pathlib.Path('~/.sky/spot_jobs.db')
-    path = path.expanduser().absolute()
-    path.parents[0].mkdir(parents=True, exist_ok=True)
-    return str(path)
-
-
-_DB_PATH = _get_db_path()
-
-# Module-level connection/cursor; thread-safe as the module is only imported
-# once.
-_CONN = sqlite3.connect(_DB_PATH)
-_CURSOR = _CONN.cursor()
-
 # === Database schema ===
 # `spot` table contains all the finest-grained tasks, including all the
 # tasks of a managed job (called spot for legacy reason, as it is generalized
@@ -50,68 +34,99 @@ _CURSOR = _CONN.cursor()
 # identifier/primary key for all the tasks. We will use `spot_job_id`
 # to identify the spot job.
 # TODO(zhwu): schema migration may be needed.
-[... removed lines 53-98 not shown in this diff view ...]
-db_utils.add_column_to_table(
-[... removed lines 100-114 not shown in this diff view ...]
+def create_table(cursor, conn):
+    # Enable WAL mode to avoid locking issues.
+    # See: issue #3863, #1441 and PR #1509
+    # https://github.com/microsoft/WSL/issues/2395
+    # TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
+    # This may cause the database locked problem from WSL issue #1441.
+    if not common_utils.is_wsl():
+        try:
+            cursor.execute('PRAGMA journal_mode=WAL')
+        except sqlite3.OperationalError as e:
+            if 'database is locked' not in str(e):
+                raise
+            # If the database is locked, it is OK to continue, as the WAL mode
+            # is not critical and is likely to be enabled by other processes.
+
+    cursor.execute("""\
+        CREATE TABLE IF NOT EXISTS spot (
+        job_id INTEGER PRIMARY KEY AUTOINCREMENT,
+        job_name TEXT,
+        resources TEXT,
+        submitted_at FLOAT,
+        status TEXT,
+        run_timestamp TEXT CANDIDATE KEY,
+        start_at FLOAT DEFAULT NULL,
+        end_at FLOAT DEFAULT NULL,
+        last_recovered_at FLOAT DEFAULT -1,
+        recovery_count INTEGER DEFAULT 0,
+        job_duration FLOAT DEFAULT 0,
+        failure_reason TEXT,
+        spot_job_id INTEGER,
+        task_id INTEGER DEFAULT 0,
+        task_name TEXT,
+        specs TEXT)""")
+    conn.commit()
+
+    db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
+    # Create a new column `spot_job_id`, which is the same for tasks of the
+    # same managed job.
+    # The original `job_id` no longer has an actual meaning, but only a legacy
+    # identifier for all tasks in database.
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'spot_job_id',
+                                 'INTEGER',
+                                 copy_from='job_id')
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'task_id',
+                                 'INTEGER DEFAULT 0',
+                                 value_to_replace_existing_entries=0)
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'task_name',
+                                 'TEXT',
+                                 copy_from='job_name')
+
+    # Specs is some useful information about the task, e.g., the
+    # max_restarts_on_errors value. It is stored in JSON format.
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'specs',
+                                 'TEXT',
+                                 value_to_replace_existing_entries=json.dumps({
+                                     'max_restarts_on_errors': 0,
+                                 }))
+
+    # `job_info` contains the mapping from job_id to the job_name.
+    # In the future, it may contain more information about each job.
+    cursor.execute("""\
+        CREATE TABLE IF NOT EXISTS job_info (
+        spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
+        name TEXT)""")
+    conn.commit()
+
+
+# Module-level connection/cursor; thread-safe as the module is only imported
+# once.
+def _get_db_path() -> str:
+    """Workaround to collapse multi-step Path ops for type checker.
+    Ensures _DB_PATH is str, avoiding Union[Path, str] inference.
+    """
+    path = pathlib.Path('~/.sky/spot_jobs.db')
+    path = path.expanduser().absolute()
+    path.parents[0].mkdir(parents=True, exist_ok=True)
+    return str(path)
+
+
+_DB_PATH = _get_db_path()
+db_utils.SQLiteConn(_DB_PATH, create_table)
 
 # job_duration is the time a job actually runs (including the
 # setup duration) before last_recover, excluding the provision
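
The new `create_table` callback enables SQLite's write-ahead log before creating tables and tolerates a concurrent 'database is locked' error, since another process may enable WAL first. A runnable sketch of just that part (the database path is illustrative):

    import sqlite3

    def create_table(cursor: sqlite3.Cursor, conn: sqlite3.Connection) -> None:
        # WAL lets readers coexist with a writer, reducing
        # 'database is locked' errors across processes.
        try:
            cursor.execute('PRAGMA journal_mode=WAL')
        except sqlite3.OperationalError as e:
            # Harmless if another process holds the lock; it has likely
            # enabled WAL already. Re-raise anything else.
            if 'database is locked' not in str(e):
                raise
        cursor.execute('CREATE TABLE IF NOT EXISTS spot '
                       '(job_id INTEGER PRIMARY KEY AUTOINCREMENT)')
        conn.commit()

    conn = sqlite3.connect('/tmp/example_spot_jobs.db')  # illustrative
    create_table(conn.cursor(), conn)
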
sky/jobs/utils.py
CHANGED
@@ -14,7 +14,7 @@ import shutil
 import textwrap
 import time
 import typing
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import colorama
 import filelock
@@ -487,6 +487,7 @@ def stream_logs(job_id: Optional[int],
         job_id = managed_job_state.get_latest_job_id()
         if job_id is None:
             return 'No managed job found.'
+
     if controller:
         if job_id is None:
             assert job_name is not None
@@ -494,16 +495,22 @@ def stream_logs(job_id: Optional[int],
             # We manually filter the jobs by name, instead of using
             # get_nonterminal_job_ids_by_name, as with `controller=True`, we
             # should be able to show the logs for jobs in terminal states.
-[... removed lines 497-499 not shown in this diff view ...]
+            managed_job_ids: Set[int] = {
+                job['job_id']
+                for job in managed_jobs
+                if job['job_name'] == job_name
+            }
+            if len(managed_job_ids) == 0:
                 return f'No managed job found with name {job_name!r}.'
-            if len(
-                job_ids_str = ', '.join(
-[... removed lines 503-506 not shown in this diff view ...]
+            if len(managed_job_ids) > 1:
+                job_ids_str = ', '.join(
+                    str(job_id) for job_id in managed_job_ids)
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Multiple managed jobs found with name {job_name!r} '
+                        f'(Job IDs: {job_ids_str}). Please specify the job_id '
+                        'instead.')
+            job_id = managed_job_ids.pop()
     assert job_id is not None, (job_id, job_name)
     # TODO: keep the following code sync with
     # job_lib.JobLibCodeGen.tail_logs, we do not directly call that function
@@ -849,6 +856,7 @@ class ManagedJobCodeGen:
 
         from sky.skylet import job_lib, log_lib
         from sky.skylet import constants
+        from sky.utils import ux_utils
         try:
            from sky.jobs.utils import stream_logs_by_id
        except ImportError:
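
The set comprehension above collapses the per-task rows of a managed job into unique job IDs before resolving a name. The same logic in isolation (the data shape is assumed from the diff):

    from typing import Any, Dict, List, Set

    def resolve_job_id(managed_jobs: List[Dict[str, Any]],
                       job_name: str) -> int:
        # One managed job may have many task rows; a set deduplicates.
        job_ids: Set[int] = {
            job['job_id']
            for job in managed_jobs
            if job['job_name'] == job_name
        }
        if not job_ids:
            raise ValueError(f'No managed job found with name {job_name!r}.')
        if len(job_ids) > 1:
            ids = ', '.join(str(i) for i in sorted(job_ids))
            raise ValueError(f'Multiple managed jobs named {job_name!r} '
                             f'(Job IDs: {ids}); specify the job_id instead.')
        return job_ids.pop()
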
sky/provision/aws/config.py
CHANGED
@@ -42,8 +42,9 @@ def _skypilot_log_error_and_exit_for_failover(error: str) -> None:
     Mainly used for handling VPC/subnet errors before nodes are launched.
     """
     # NOTE: keep. The backend looks for this to know no nodes are launched.
-
-
+    full_error = f'SKYPILOT_ERROR_NO_NODES_LAUNCHED: {error}'
+    logger.error(full_error)
+    raise RuntimeError(full_error)
 
 
 def bootstrap_instances(
@@ -222,10 +223,27 @@ def _configure_iam_role(iam) -> Dict[str, Any]:
 
 
 @functools.lru_cache(maxsize=128)  # Keep bounded.
-def _get_route_tables(ec2, vpc_id: Optional[str],
+def _get_route_tables(ec2, vpc_id: Optional[str], region: str,
+                      main: bool) -> List[Any]:
+    """Get route tables associated with a VPC and region
+
+    Args:
+        ec2: ec2 resource object
+        vpc_id: vpc_id is optional, if not provided, all route tables in the
+            region will be returned
+        region: region is mandatory to allow the lru cache
+            to return the corect results
+        main: if True, only main route tables will be returned otherwise
+            only non-main route tables will be returned
+
+    Returns:
+        A list of route tables associated with the options VPC and region
+    """
     filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
     if vpc_id is not None:
         filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
+    logger.debug(
+        f'Getting route tables with filters: {filters} in region: {region}')
     return ec2.meta.client.describe_route_tables(Filters=filters).get(
         'RouteTables', [])
 
@@ -238,7 +256,8 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
     https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Internet_Gateway.html
     """
     # Get the route tables associated with the subnet
-
+    region = ec2.meta.client.meta.region_name
+    all_route_tables = _get_route_tables(ec2, vpc_id, region, main=False)
     route_tables = [
         rt for rt in all_route_tables
         # An RT can be associated with multiple subnets, i.e.,
@@ -267,7 +286,8 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
     # subnets. Since the associations are implicit, the filter above won't find
     # any. Check there exists a main route table with routes pointing to an IGW.
     logger.debug('Checking main route table')
-
+    region = ec2.meta.client.meta.region_name
+    main_route_tables = _get_route_tables(ec2, vpc_id, region, main=True)
     return _has_igw_route(main_route_tables)
 
 
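
Adding `region` to `_get_route_tables` matters because `functools.lru_cache` keys only on the call arguments: any input that can change the result must appear in the signature, or stale entries are returned. A toy illustration (the query is a stand-in for the EC2 call):

    import functools

    @functools.lru_cache(maxsize=128)
    def describe_route_tables(vpc_id: str, region: str) -> tuple:
        # Stand-in for the real describe_route_tables call; returns a
        # hashable value so it can be cached.
        print(f'querying {vpc_id} in {region}')
        return (f'rtb-for-{vpc_id}-in-{region}',)

    describe_route_tables('vpc-1', 'us-east-1')  # miss: runs the query
    describe_route_tables('vpc-1', 'us-east-1')  # hit: served from cache
    describe_route_tables('vpc-1', 'eu-west-1')  # miss: region is in the key
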
sky/provision/instance_setup.py
CHANGED
@@ -264,6 +264,7 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
                f'--disable-usage-stats '
                f'--port={constants.SKY_REMOTE_RAY_PORT} '
                f'--dashboard-port={constants.SKY_REMOTE_RAY_DASHBOARD_PORT} '
+               f'--min-worker-port 11002 '
                f'--object-manager-port=8076 '
                f'--temp-dir={constants.SKY_REMOTE_RAY_TEMPDIR}')
     if custom_resource:
sky/provision/runpod/instance.py
CHANGED
@@ -232,7 +232,12 @@ def query_ports(
     instances = _filter_instances(cluster_name_on_cloud,
                                   None,
                                   head_only=True)
-    assert len(instances)
+    assert len(instances) <= 1
+    # It is possible that the instance is terminated on console by
+    # the user. In this case, the instance will not be found and we
+    # should return an empty dict.
+    if not instances:
+        return {}
     head_inst = list(instances.values())[0]
     ready_ports: Dict[int, List[common.Endpoint]] = {
         port: [common.SocketEndpoint(**endpoint)]
sky/serve/core.py
CHANGED
@@ -124,7 +124,9 @@ def up(
                              f'{constants.CLUSTER_NAME_VALID_REGEX}')
 
     _validate_service_task(task)
-
+    # Always apply the policy again here, even though it might have been applied
+    # in the CLI. This is to ensure that we apply the policy to the final DAG
+    # and get the mutated config.
     dag, mutated_user_config = admin_policy_utils.apply(
         task, use_mutated_config_in_current_request=False)
     task = dag.tasks[0]
@@ -319,6 +321,14 @@ def update(
         service_name: Name of the service.
     """
     _validate_service_task(task)
+    # Always apply the policy again here, even though it might have been applied
+    # in the CLI. This is to ensure that we apply the policy to the final DAG
+    # and get the mutated config.
+    # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
+    # will not apply the config.
+    dag, _ = admin_policy_utils.apply(
+        task, use_mutated_config_in_current_request=False)
+    task = dag.tasks[0]
     handle = backend_utils.is_controller_accessible(
         controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
         stopped_message=
sky/skylet/constants.py
CHANGED
@@ -79,7 +79,7 @@ SKYLET_VERSION = '8'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
-SKYLET_LIB_VERSION =
+SKYLET_LIB_VERSION = 2
 SKYLET_VERSION_FILE = '~/.sky/skylet_version'
 
 # `sky jobs dashboard`-related
sky/skylet/job_lib.py
CHANGED
@@ -29,6 +29,7 @@ if typing.TYPE_CHECKING:
 
 logger = sky_logging.init_logger(__name__)
 
+_LINUX_NEW_LINE = '\n'
 _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
 
 
@@ -602,6 +603,7 @@ def update_job_status(job_ids: List[int],
             # the pending table until appearing in ray jobs. For jobs
             # submitted outside of the grace period, we will consider the
             # ray job status.
+
             if not (pending_job['submit'] > 0 and pending_job['submit'] <
                     ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
                 # Reset the job status to PENDING even though it may not
@@ -903,14 +905,19 @@ class JobLibCodeGen:
     def tail_logs(cls,
                   job_id: Optional[int],
                   managed_job_id: Optional[int],
-                  follow: bool = True
+                  follow: bool = True,
+                  tail: int = 0) -> str:
         # pylint: disable=line-too-long
+
         code = [
+            # We use != instead of is not because 1 is not None will print a warning:
+            # <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
            f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
            'run_timestamp = job_lib.get_run_timestamp(job_id)',
            f'log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)',
-           f'
-           f'
+           f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
+           f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
+           f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
        ]
        return cls._build(code)
 
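
The generated code checks `SKYLET_LIB_VERSION` on the remote side so the new `tail` keyword is only passed to clusters whose skylet is new enough to accept it. The gating idea in isolation (the function and names are illustrative):

    def build_tail_kwargs(remote_lib_version: int, job_id: int,
                          follow: bool, tail: int) -> dict:
        # Older remotes do not know the `tail` keyword, so only send it
        # when the advertised library version supports it (version 2+).
        kwargs = {'job_id': job_id, 'follow': follow}
        if remote_lib_version > 1:
            kwargs['tail'] = tail
        return kwargs

    print(build_tail_kwargs(1, 7, True, 100))  # no 'tail' key
    print(build_tail_kwargs(2, 7, True, 100))  # includes 'tail': 100
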
sky/skylet/log_lib.py
CHANGED
@@ -2,6 +2,7 @@
 
 This is a remote utility module that provides logging functionality.
 """
+import collections
 import copy
 import io
 import multiprocessing.pool
@@ -12,7 +13,8 @@ import sys
 import tempfile
 import textwrap
 import time
-from typing import Dict, Iterator, List, Optional,
+from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
+                    Tuple, Union)
 
 import colorama
 
@@ -26,6 +28,9 @@ from sky.utils import ux_utils
 _SKY_LOG_WAITING_GAP_SECONDS = 1
 _SKY_LOG_WAITING_MAX_RETRY = 5
 _SKY_LOG_TAILING_GAP_SECONDS = 0.2
+# Peek the head of the lines to check if we need to start
+# streaming when tail > 0.
+PEEK_HEAD_LINES_FOR_START_STREAM = 20
 
 logger = sky_logging.init_logger(__name__)
 
@@ -330,6 +335,7 @@ def run_bash_command_with_log(bash_command: str,
 
 def _follow_job_logs(file,
                      job_id: int,
+                     start_streaming: bool,
                      start_streaming_at: str = '') -> Iterator[str]:
     """Yield each line from a file as they are written.
 
@@ -338,7 +344,6 @@ def _follow_job_logs(file,
     # No need to lock the status here, as the while loop can handle
     # the older status.
     status = job_lib.get_status_no_lock(job_id)
-    start_streaming = False
     wait_last_logs = True
     while True:
         tmp = file.readline()
@@ -378,10 +383,45 @@ def _follow_job_logs(file,
             status = job_lib.get_status_no_lock(job_id)
 
 
+def _peek_head_lines(log_file: TextIO) -> List[str]:
+    """Peek the head of the file."""
+    lines = [
+        log_file.readline() for _ in range(PEEK_HEAD_LINES_FOR_START_STREAM)
+    ]
+    # Reset the file pointer to the beginning
+    log_file.seek(0, os.SEEK_SET)
+    return [line for line in lines if line]
+
+
+def _should_stream_the_whole_tail_lines(head_lines_of_log_file: List[str],
+                                        tail_lines: Deque[str],
+                                        start_stream_at: str) -> bool:
+    """Check if the entire tail lines should be streamed."""
+    # See comment:
+    # https://github.com/skypilot-org/skypilot/pull/4241#discussion_r1833611567
+    # for more details.
+    # Case 1: If start_stream_at is found at the head of the tail lines,
+    # we should not stream the whole tail lines.
+    for index, line in enumerate(tail_lines):
+        if index >= PEEK_HEAD_LINES_FOR_START_STREAM:
+            break
+        if start_stream_at in line:
+            return False
+    # Case 2: If start_stream_at is found at the head of log file, but not at
+    # the tail lines, we need to stream the whole tail lines.
+    for line in head_lines_of_log_file:
+        if start_stream_at in line:
+            return True
+    # Case 3: If start_stream_at is not at the head, and not found at the tail
+    # lines, we should not stream the whole tail lines.
+    return False
+
+
 def tail_logs(job_id: Optional[int],
               log_dir: Optional[str],
               managed_job_id: Optional[int] = None,
-              follow: bool = True
+              follow: bool = True,
+              tail: int = 0) -> None:
     """Tail the logs of a job.
 
     Args:
@@ -390,6 +430,8 @@ def tail_logs(job_id: Optional[int],
         managed_job_id: The managed job id (for logging info only to avoid
             confusion).
         follow: Whether to follow the logs or print the logs so far and exit.
+        tail: The number of lines to display from the end of the log file,
+            if 0, print all lines.
     """
     if job_id is None:
         # This only happens when job_lib.get_latest_job_id() returns None,
@@ -430,6 +472,8 @@ def tail_logs(job_id: Optional[int],
     status = job_lib.update_job_status([job_id], silent=True)[0]
 
     start_stream_at = 'Waiting for task resources on '
+    # Explicitly declare the type to avoid mypy warning.
+    lines: Iterable[str] = []
     if follow and status in [
             job_lib.JobStatus.SETTING_UP,
             job_lib.JobStatus.PENDING,
@@ -440,18 +484,43 @@ def tail_logs(job_id: Optional[int],
         with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
             # Using `_follow` instead of `tail -f` to streaming the whole
             # log and creating a new process for tail.
+            start_streaming = False
+            if tail > 0:
+                head_lines_of_log_file = _peek_head_lines(log_file)
+                lines = collections.deque(log_file, maxlen=tail)
+                start_streaming = _should_stream_the_whole_tail_lines(
+                    head_lines_of_log_file, lines, start_stream_at)
+                for line in lines:
+                    if start_stream_at in line:
+                        start_streaming = True
+                    if start_streaming:
+                        print(line, end='')
+                # Flush the last n lines
+                print(end='', flush=True)
+            # Now, the cursor is at the end of the last lines
+            # if tail > 0
             for line in _follow_job_logs(log_file,
                                          job_id=job_id,
+                                         start_streaming=start_streaming,
                                          start_streaming_at=start_stream_at):
                 print(line, end='', flush=True)
     else:
         try:
-
-            with open(log_path, 'r', encoding='utf-8') as
-
+            start_streaming = False
+            with open(log_path, 'r', encoding='utf-8') as log_file:
+                if tail > 0:
+                    # If tail > 0, we need to read the last n lines.
+                    # We use double ended queue to rotate the last n lines.
+                    head_lines_of_log_file = _peek_head_lines(log_file)
+                    lines = collections.deque(log_file, maxlen=tail)
+                    start_streaming = _should_stream_the_whole_tail_lines(
+                        head_lines_of_log_file, lines, start_stream_at)
+                else:
+                    lines = log_file
+                for line in lines:
                    if start_stream_at in line:
-
-                       if
+                        start_streaming = True
+                    if start_streaming:
                        print(line, end='', flush=True)
        except FileNotFoundError:
            print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
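
The core of the new tail support is `collections.deque(log_file, maxlen=tail)`: feeding a file iterator through a bounded deque keeps only the last `tail` lines in memory. Stripped of the streaming logic, the idea is:

    import collections
    from typing import List

    def tail_lines(path: str, tail: int) -> List[str]:
        # With maxlen set, the deque discards old lines as new ones
        # arrive, so only the last `tail` lines are retained.
        with open(path, 'r', encoding='utf-8') as f:
            if tail > 0:
                return list(collections.deque(f, maxlen=tail))
            return list(f)  # tail == 0 means all lines
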
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -324,6 +324,8 @@ available_node_types:
           command: ["/bin/bash", "-c", "--"]
           args:
             - |
+              function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
+
               # Tails file and checks every 5 sec for
               # open file handlers with write access
               # closes if none exist
@@ -333,7 +335,7 @@ available_node_types:
               while kill -0 $TAIL_PID 2> /dev/null; do
                 # only two PIDs should be accessing the file
                 # the log appender and log tailer
-                if [ $(
+                if [ $(mylsof $file | wc -l) -lt 2 ]; then
                   kill $TAIL_PID
                   break
                 fi
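
The `mylsof` shell function removes the pod's dependency on `lsof` by walking `/proc/<pid>/fd` symlinks directly. A rough Python equivalent of the same idea (Linux-only; assumes permission to read the fd links):

    import os
    from typing import List

    def pids_with_file_open(target: str) -> List[int]:
        target = os.path.realpath(target)
        pids = []
        for pid in filter(str.isdigit, os.listdir('/proc')):
            fd_dir = f'/proc/{pid}/fd'
            try:
                fds = os.listdir(fd_dir)
            except (PermissionError, FileNotFoundError):
                continue  # process exited or is not readable
            for fd in fds:
                try:
                    if os.path.realpath(os.path.join(fd_dir, fd)) == target:
                        pids.append(int(pid))
                        break
                except OSError:
                    continue
        return pids
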
sky/utils/admin_policy_utils.py
CHANGED
sky/utils/command_runner.py
CHANGED
@@ -11,6 +11,7 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.skylet import log_lib
 from sky.utils import common_utils
+from sky.utils import control_master_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 
@@ -104,13 +105,22 @@ def ssh_options_list(
     }
     # SSH Control will have a severe delay when using docker_ssh_proxy_command.
     # TODO(tian): Investigate why.
+    #
+    # We disable ControlMaster when ssh_proxy_command is used, because the
+    # master connection will be idle although the connection might be shared
+    # by other ssh commands that is not idle. In that case, user's custom proxy
+    # command may drop the connection due to idle timeout, since it will only
+    # see the idle master connection. It is an issue even with the
+    # ServerAliveInterval set, since the keepalive message may not be recognized
+    # by the custom proxy command, such as AWS SSM Session Manager.
+    #
     # We also do not use ControlMaster when we use `kubectl port-forward`
     # to access Kubernetes pods over SSH+Proxycommand. This is because the
     # process running ProxyCommand is kept running as long as the ssh session
     # is running and the ControlMaster keeps the session, which results in
     # 'ControlPersist' number of seconds delay per ssh commands ran.
     if (ssh_control_name is not None and docker_ssh_proxy_command is None and
-            not disable_control_master):
+            ssh_proxy_command is None and not disable_control_master):
         arg_dict.update({
             # Control path: important optimization as we do multiple ssh in one
             # sky.launch().
@@ -459,7 +469,9 @@ class SSHCommandRunner(CommandRunner):
             None if ssh_control_name is None else hashlib.md5(
                 ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
         self._ssh_proxy_command = ssh_proxy_command
-        self.disable_control_master =
+        self.disable_control_master = (
+            disable_control_master or
+            control_master_utils.should_disable_control_master())
         if docker_user is not None:
             assert port is None or port == 22, (
                 f'port must be None or 22 for docker_user, got {port}.')
sky/utils/control_master_utils.py
ADDED
@@ -0,0 +1,49 @@
+"""Utils to check if the ssh control master should be disabled."""
+
+import functools
+
+from sky import sky_logging
+from sky.utils import subprocess_utils
+
+logger = sky_logging.init_logger(__name__)
+
+
+def is_tmp_9p_filesystem() -> bool:
+    """Check if the /tmp filesystem is 9p.
+
+    Returns:
+        bool: True if the /tmp filesystem is 9p, False otherwise.
+    """
+
+    result = subprocess_utils.run(['df', '-T', '/tmp'],
+                                  capture_output=True,
+                                  text=True,
+                                  shell=None,
+                                  check=False,
+                                  executable=None)
+
+    if result.returncode != 0:
+        return False
+
+    filesystem_infos = result.stdout.strip().split('\n')
+    if len(filesystem_infos) < 2:
+        return False
+    filesystem_types = filesystem_infos[1].split()
+    if len(filesystem_types) < 2:
+        return False
+    return filesystem_types[1].lower() == '9p'
+
+
+@functools.lru_cache
+def should_disable_control_master() -> bool:
+    """Whether disable ssh control master based on file system.
+
+    Returns:
+        bool: True if the ssh control master should be disabled,
+        False otherwise.
+    """
+    if is_tmp_9p_filesystem():
+        return True
+    # there may be additional criteria to disable ssh control master
+    # in the future. They should be checked here
+    return False
{skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/RECORD
RENAMED
@@ -1,13 +1,13 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=vuaxCFFtQHJTriSEGG_wKshl6nmhDcnt70q66x1rkvA,5882
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
 sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
-sky/cli.py,sha256=
+sky/cli.py,sha256=jEjXs5Z0u263eJIsTHoKyG9oOY6giqw19s2di9kEv1s,212088
 sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
-sky/core.py,sha256=
-sky/dag.py,sha256=
-sky/exceptions.py,sha256=
-sky/execution.py,sha256=
+sky/core.py,sha256=0-4W_DKJZgbwXuzNZKQ2R_qJxqxbqqNfyi0U0PQBKvQ,38230
+sky/dag.py,sha256=O9g8NnO8L1SGUEDyqW9W341AH4Wvd3nJs54niR-pkrk,2822
+sky/exceptions.py,sha256=E3C2Ejcc8RUDAUQn7ar_Jr97C_AxD2rKKMmJOfLJ9d0,8965
+sky/execution.py,sha256=TwcorzFxR_0m8uazPdeKltU3g3ikgUSqqzcSBrHp7K4,26070
 sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
 sky/optimizer.py,sha256=tXGrFpc6xNtKH34qjBAMd4jTuWcDZTPnGFwEtuCQFmk,59702
 sky/resources.py,sha256=Zt8mCCmdvZ5ZCqY-l3KXlx_lkUesAopRtaEcEsrRFZo,68465
@@ -31,10 +31,10 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
 sky/backends/backend_utils.py,sha256=2myfryj1zG9xxPaX6XYYJruxAOGNGbpsy2ckT4A77sE,121813
-sky/backends/cloud_vm_ray_backend.py,sha256=
+sky/backends/cloud_vm_ray_backend.py,sha256=6Ew9Ej92KGlumlCnyDcGSEbHInj7g2Shqwx4oxRkWVQ,233122
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
-sky/backends/wheel_utils.py,sha256=
+sky/backends/wheel_utils.py,sha256=CUVOwlBtQjOMv-RSDGx2jMQ0M1D0w9ZPm0TDafJwBDI,8180
 sky/backends/monkey_patches/monkey_patch_ray_up.py,sha256=76-y2fCaE3JINj8lEwHT1eirYzCbpD8O1ySsysuGu8o,3450
 sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
@@ -79,7 +79,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=L1JsX1YrhpyI7ylzE
 sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
 sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
 sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=mDAN98T58h1g_LLyppSEUVDlsbLhk2454Nhmg5-aw0Q,32670
-sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=
+sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=yOPmmckiQ0HU6bKXWd7YdTrsF2sql3Bs_jYNpuxlo0I,4942
 sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
 sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
@@ -90,29 +90,29 @@ sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas
 sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
 sky/data/data_transfer.py,sha256=MBmjey9_p2L3IKNKTi8um09SlZe32n4wK3CkVnlTVvo,7346
 sky/data/data_utils.py,sha256=-P5GsDH_m4slrCz4vHdgiFezIys8ufzvhEKePJwfjFc,28597
-sky/data/mounting_utils.py,sha256=
+sky/data/mounting_utils.py,sha256=HwBGg1NmX-2IJZV_6h2r1U3ajTGOyfmA3MqboA7znqU,11004
 sky/data/storage.py,sha256=OQ_kznF-P50Jq0feO5FBqm97QGhfbsZ2dX-Ar3sVWr4,163903
 sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
 sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
 sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
 sky/jobs/controller.py,sha256=sirpi730_GfKfPZeZ2PvCXnJWger0r6AyLSOx2sLd6A,27368
-sky/jobs/core.py,sha256=
-sky/jobs/recovery_strategy.py,sha256=
-sky/jobs/state.py,sha256=
-sky/jobs/utils.py,sha256=
-sky/jobs/dashboard/dashboard.py,sha256=
+sky/jobs/core.py,sha256=Lk_zKizc9a7O-8WHhh4-VXBS5kT0jRpwmNNA7S4ueIo,17347
+sky/jobs/recovery_strategy.py,sha256=O_DouAfWx8FNdQxXsr2msMwlKCIodS99cW6V4Lf1vMo,27219
+sky/jobs/state.py,sha256=DE02bCZc9bPbbuayb3Zml553mb4pEV7Z8t1pt8IGbYM,25252
+sky/jobs/utils.py,sha256=Ff3TttIEdVeM1_kOVkviqIDjeVfBPIXVE8i-yP1VDM8,37976
+sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
 sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
 sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
 sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,6262
 sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
 sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
 sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
-sky/provision/instance_setup.py,sha256=
+sky/provision/instance_setup.py,sha256=gI739UMCqtPqdA522D92bPu5sA3OHBMDmIGmqqxsIwY,23652
 sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
 sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
 sky/provision/provisioner.py,sha256=mTvtBjS-Xz64LJcyeHx_-wdM8Gin8D49YRaV_TADaz4,25334
 sky/provision/aws/__init__.py,sha256=mxq8PeWJqUtalDozTNpbtENErRZ1ktEs8uf2aG9UUgU,731
-sky/provision/aws/config.py,sha256=
+sky/provision/aws/config.py,sha256=dbwulPxXGIJjKJddv85PbtlXOjwLemaD65j3DISNsK0,24214
 sky/provision/aws/instance.py,sha256=eCslJ2XfJo_pkQMnKFQqhGnUIRvwKiT12oxBY5-klss,40750
 sky/provision/aws/utils.py,sha256=m49pS-SHGW7Au3bhDeTPsL8N5iRzbwOXzyEWRCc1Vho,3238
 sky/provision/azure/__init__.py,sha256=87cgk1_Ws7n9rqaDDPv-HpfrkVeSQMdFQnhnXwyx9g4,548
@@ -154,7 +154,7 @@ sky/provision/paperspace/instance.py,sha256=q_V01DZSMXLfy63Zwt6AQotq02JuXQZb5CHS
 sky/provision/paperspace/utils.py,sha256=uOmxbDKjV6skFizC4gYXSxRuEqso5ck2kF7MbtNmhEs,9580
 sky/provision/runpod/__init__.py,sha256=6HYvHI27EaLrX1SS0vWVhdLu5HDBeZCdvAeDJuwM5pk,556
 sky/provision/runpod/config.py,sha256=9ulZJVL7nHuxhTdoj8D7lNn7SdicJ5zc6FIcHIG9tcg,321
-sky/provision/runpod/instance.py,sha256=
+sky/provision/runpod/instance.py,sha256=AIWzTHuAu2dw8Rk-AHc7-14hUAYPEKh_UMzAhMzjDh0,9807
 sky/provision/runpod/utils.py,sha256=ZjrcpjKzwS2nXQ21dW405PLxBl_V9awcfRjucGB3alw,6795
 sky/provision/vsphere/__init__.py,sha256=5icB8-kfs926S9DVfNJSCBVr7z7cmCEDr04-YHX89_4,788
 sky/provision/vsphere/config.py,sha256=f_ojGmi_vbnwJ8Ri48cqhZHBOuIkj41j9bFbq-ldPOo,504
@@ -175,7 +175,7 @@ sky/serve/__init__.py,sha256=gFZt7W3UPMi4qvYe2xgkHg1VxbR1WGavKyWLBUD3mpg,1731
 sky/serve/autoscalers.py,sha256=khY1oZ22PRaUQNsLCoNKH178X_NiJw0LSLOKr7_LNgY,30275
 sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
 sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
-sky/serve/core.py,sha256=
+sky/serve/core.py,sha256=hszs95BwtC4wIJujGNokvFC46VjojgRz1BbYOIIPh6k,31601
 sky/serve/load_balancer.py,sha256=aUfDsgUT_fYrchCwJCeunMPXmAkwJAY58BEu-IN2FaA,11571
 sky/serve/load_balancing_policies.py,sha256=ExdwH_pxPYpJ6CkoTQCOPSa4lzwbq1LFFMKzmIu8ryk,2331
 sky/serve/replica_managers.py,sha256=1xYDK9Te5wFEF5hUK0gyNIUib0MY-HScLHUBDlTSl-k,57774
@@ -190,10 +190,10 @@ sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
 sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
 sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
-sky/skylet/constants.py,sha256=
+sky/skylet/constants.py,sha256=w05Enrg9RhGp99P1WDYMKK_ki0M-e0bS8Wr-VZR0Vn8,14468
 sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
-sky/skylet/job_lib.py,sha256
-sky/skylet/log_lib.py,sha256=
+sky/skylet/job_lib.py,sha256=FD1n9vE0daOEUKSH3lnccfBh7Vs81R8s4ILZyKu2o7M,37275
+sky/skylet/log_lib.py,sha256=BmhAgcLvlin3szhj33IH0kbdCALacVisF2x61BQpZdY,21888
 sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
 sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
 sky/skylet/subprocess_daemon.py,sha256=IJwGAzOdERrhWJS7VYKAUByNsLyIkKkB0w5nk06okG8,2818
@@ -228,7 +228,7 @@ sky/templates/jobs-controller.yaml.j2,sha256=Gu3ogFxFYr09VEXP-6zEbrCUOFo1aYxWEjA
 sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
 sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8wyVEbRNFHRsBuLsw,626
 sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
-sky/templates/kubernetes-ray.yml.j2,sha256=
+sky/templates/kubernetes-ray.yml.j2,sha256=dsWlkX-0b1igeZI4c0u0Jzia5I_9gezCiewR6pX1LlY,18374
 sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
 sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
 sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
@@ -243,11 +243,12 @@ sky/usage/constants.py,sha256=8xpg9vhDU9A3eObtpkNFjwa42oCazqGEv4yw_vJSO7U,590
 sky/usage/usage_lib.py,sha256=mxsbwUMEQjesUOIv4Yne-Ze7rVxSQYr3_wBXruifGRA,17898
 sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/accelerator_registry.py,sha256=BO4iYH5bV80Xyp4EPfO0n1D3LL0FvESCy7xm59Je3_o,3798
-sky/utils/admin_policy_utils.py,sha256=
+sky/utils/admin_policy_utils.py,sha256=_Vt_jTTYCXmMdryj0vrrumFPewa93qHnzUqBDXjAhRU,5981
 sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
-sky/utils/command_runner.py,sha256=
+sky/utils/command_runner.py,sha256=GHTZxoJQ3V8WVSRAaOA4JpRTxtCtuq36H9U8kOfWUwc,36450
 sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
 sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
+sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR26H5I,1416
 sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
 sky/utils/dag_utils.py,sha256=pVX3lGDDcYTcGoH_1jEWzl9767Y4mwlIEYIzoyHO6gM,6105
 sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
@@ -274,9 +275,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20241109.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20241109.dist-info/METADATA,sha256=YM8C71GXOj5CoHQlj5yNYhL8UkZ75DL-qMMTPXCOmXY,19708
+skypilot_nightly-1.0.0.dev20241109.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+skypilot_nightly-1.0.0.dev20241109.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20241109.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20241109.dist-info/RECORD,,
{skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/LICENSE
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/WHEEL
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/entry_points.txt
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/top_level.txt
RENAMED
File without changes