skypilot-nightly 1.0.0.dev20241107__py3-none-any.whl → 1.0.0.dev20241109__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +12 -6
  3. sky/backends/wheel_utils.py +5 -1
  4. sky/cli.py +28 -4
  5. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +3 -1
  6. sky/core.py +3 -2
  7. sky/dag.py +1 -0
  8. sky/data/mounting_utils.py +4 -16
  9. sky/exceptions.py +4 -1
  10. sky/execution.py +10 -8
  11. sky/jobs/core.py +3 -1
  12. sky/jobs/dashboard/dashboard.py +2 -1
  13. sky/jobs/recovery_strategy.py +16 -5
  14. sky/jobs/state.py +94 -79
  15. sky/jobs/utils.py +18 -10
  16. sky/provision/aws/config.py +25 -5
  17. sky/provision/instance_setup.py +1 -0
  18. sky/provision/runpod/instance.py +6 -1
  19. sky/serve/core.py +11 -1
  20. sky/skylet/constants.py +1 -1
  21. sky/skylet/job_lib.py +10 -3
  22. sky/skylet/log_lib.py +77 -8
  23. sky/templates/kubernetes-ray.yml.j2 +3 -1
  24. sky/utils/admin_policy_utils.py +1 -0
  25. sky/utils/command_runner.py +14 -2
  26. sky/utils/control_master_utils.py +49 -0
  27. {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/METADATA +1 -1
  28. {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/RECORD +32 -31
  29. {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/LICENSE +0 -0
  30. {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/WHEEL +0 -0
  31. {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/entry_points.txt +0 -0
  32. {skypilot_nightly-1.0.0.dev20241107.dist-info → skypilot_nightly-1.0.0.dev20241109.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = 'fe2ce9a262c059722ddce46f5594fc2ca2370c0d'
8
+ _SKYPILOT_COMMIT_SHA = '42c79e1d0a5e018e275705ada53957573f9a0181'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241107'
38
+ __version__ = '1.0.0.dev20241109'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -3262,6 +3262,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3262
3262
  ) -> None:
3263
3263
  """Executes generated code on the head node."""
3264
3264
  style = colorama.Style
3265
+ fore = colorama.Fore
3265
3266
 
3266
3267
  script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3267
3268
  remote_log_dir = self.log_dir
@@ -3373,9 +3374,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3373
3374
  controller = controller_utils.Controllers.from_name(name)
3374
3375
  if controller == controller_utils.Controllers.JOBS_CONTROLLER:
3375
3376
  logger.info(
3376
- f'\n📋 Useful Commands'
3377
- f'\nManaged Job ID: '
3377
+ f'\n{fore.CYAN}Managed Job ID: '
3378
3378
  f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
3379
+ f'\n📋 Useful Commands'
3379
3380
  f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t\t'
3380
3381
  f'{ux_utils.BOLD}sky jobs cancel {job_id}'
3381
3382
  f'{ux_utils.RESET_BOLD}'
@@ -3392,8 +3393,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3392
3393
  f'dashboard:\t{ux_utils.BOLD}sky jobs dashboard'
3393
3394
  f'{ux_utils.RESET_BOLD}')
3394
3395
  elif controller is None:
3395
- logger.info(f'\n📋 Useful Commands'
3396
- f'\nJob ID: {job_id}'
3396
+ logger.info(f'\n{fore.CYAN}Job ID: '
3397
+ f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
3398
+ f'\n📋 Useful Commands'
3397
3399
  f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t'
3398
3400
  f'{ux_utils.BOLD}sky cancel {name} {job_id}'
3399
3401
  f'{ux_utils.RESET_BOLD}'
@@ -3709,7 +3711,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3709
3711
  handle: CloudVmRayResourceHandle,
3710
3712
  job_id: Optional[int],
3711
3713
  managed_job_id: Optional[int] = None,
3712
- follow: bool = True) -> int:
3714
+ follow: bool = True,
3715
+ tail: int = 0) -> int:
3713
3716
  """Tail the logs of a job.
3714
3717
 
3715
3718
  Args:
@@ -3717,10 +3720,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3717
3720
  job_id: The job ID to tail the logs of.
3718
3721
  managed_job_id: The managed job ID for display purpose only.
3719
3722
  follow: Whether to follow the logs.
3723
+ tail: The number of lines to display from the end of the
3724
+ log file. If 0, print all lines.
3720
3725
  """
3721
3726
  code = job_lib.JobLibCodeGen.tail_logs(job_id,
3722
3727
  managed_job_id=managed_job_id,
3723
- follow=follow)
3728
+ follow=follow,
3729
+ tail=tail)
3724
3730
  if job_id is None and managed_job_id is None:
3725
3731
  logger.info(
3726
3732
  'Job ID not provided. Streaming the logs of the latest job.')
@@ -129,7 +129,11 @@ def _build_sky_wheel() -> pathlib.Path:
129
129
 
130
130
  wheel_dir = WHEEL_DIR / hash_of_latest_wheel
131
131
  wheel_dir.mkdir(parents=True, exist_ok=True)
132
- shutil.move(str(wheel_path), wheel_dir)
132
+ # shutil.move will fail when the file already exists and is being
133
+ # moved across filesystems.
134
+ if not os.path.exists(
135
+ os.path.join(wheel_dir, os.path.basename(wheel_path))):
136
+ shutil.move(str(wheel_path), wheel_dir)
133
137
  return wheel_dir / wheel_path.name
134
138
 
135
139
 
sky/cli.py CHANGED
@@ -46,6 +46,7 @@ from rich import progress as rich_progress
46
46
  import yaml
47
47
 
48
48
  import sky
49
+ from sky import admin_policy
49
50
  from sky import backends
50
51
  from sky import check as sky_check
51
52
  from sky import clouds as sky_clouds
@@ -67,6 +68,7 @@ from sky.skylet import constants
67
68
  from sky.skylet import job_lib
68
69
  from sky.skylet import log_lib
69
70
  from sky.usage import usage_lib
71
+ from sky.utils import admin_policy_utils
70
72
  from sky.utils import common_utils
71
73
  from sky.utils import controller_utils
72
74
  from sky.utils import dag_utils
@@ -582,6 +584,15 @@ def _launch_with_confirm(
582
584
  with ux_utils.print_exception_no_traceback():
583
585
  raise RuntimeError(f'{colorama.Fore.YELLOW}{e}'
584
586
  f'{colorama.Style.RESET_ALL}') from e
587
+ dag, _ = admin_policy_utils.apply(
588
+ dag,
589
+ request_options=admin_policy.RequestOptions(
590
+ cluster_name=cluster,
591
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
592
+ down=down,
593
+ dryrun=dryrun,
594
+ ),
595
+ )
585
596
  dag = sky.optimize(dag)
586
597
  task = dag.tasks[0]
587
598
 
@@ -2011,6 +2022,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
2011
2022
  help=('Follow the logs of a job. '
2012
2023
  'If --no-follow is specified, print the log so far and exit. '
2013
2024
  '[default: --follow]'))
2025
+ @click.option(
2026
+ '--tail',
2027
+ default=0,
2028
+ type=int,
2029
+ help=('The number of lines to display from the end of the log file. '
2030
+ 'Default is 0, which means print all lines.'))
2014
2031
  @click.argument('cluster',
2015
2032
  required=True,
2016
2033
  type=str,
@@ -2024,6 +2041,7 @@ def logs(
2024
2041
  sync_down: bool,
2025
2042
  status: bool, # pylint: disable=redefined-outer-name
2026
2043
  follow: bool,
2044
+ tail: int,
2027
2045
  ):
2028
2046
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2029
2047
  """Tail the log of a job.
@@ -2090,7 +2108,7 @@ def logs(
2090
2108
  click.secho(f'Job {id_str}not found', fg='red')
2091
2109
  sys.exit(1)
2092
2110
 
2093
- core.tail_logs(cluster, job_id, follow)
2111
+ core.tail_logs(cluster, job_id, follow, tail)
2094
2112
 
2095
2113
 
2096
2114
  @cli.command()
@@ -3036,9 +3054,9 @@ def show_gpus(
3036
3054
  and spot instances. There may be multiple regions with the same lowest
3037
3055
  price.
3038
3056
 
3039
- If ``--cloud kubernetes`` is specified, it will show the maximum quantities
3040
- of the GPU available on a single node and the real-time availability of
3041
- the GPU across all nodes in the Kubernetes cluster.
3057
+ If ``--cloud kubernetes`` or ``--cloud k8s`` is specified, it will show the
3058
+ maximum quantities of the GPU available on a single node and the real-time
3059
+ availability of the GPU across all nodes in the Kubernetes cluster.
3042
3060
 
3043
3061
  Definitions of certain fields:
3044
3062
 
@@ -3667,6 +3685,8 @@ def jobs_launch(
3667
3685
 
3668
3686
  click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
3669
3687
  fg='cyan')
3688
+ dag, _ = admin_policy_utils.apply(
3689
+ dag, use_mutated_config_in_current_request=False)
3670
3690
  dag = sky.optimize(dag)
3671
3691
 
3672
3692
  if not yes:
@@ -4145,6 +4165,8 @@ def serve_up(
4145
4165
  fg='cyan')
4146
4166
  with sky.Dag() as dag:
4147
4167
  dag.add(task)
4168
+ dag, _ = admin_policy_utils.apply(
4169
+ dag, use_mutated_config_in_current_request=False)
4148
4170
  sky.optimize(dag)
4149
4171
 
4150
4172
  if not yes:
@@ -4261,6 +4283,8 @@ def serve_update(
4261
4283
  fg='cyan')
4262
4284
  with sky.Dag() as dag:
4263
4285
  dag.add(task)
4286
+ dag, _ = admin_policy_utils.apply(
4287
+ dag, use_mutated_config_in_current_request=False)
4264
4288
  sky.optimize(dag)
4265
4289
 
4266
4290
  if not yes:
@@ -20,7 +20,6 @@ DEFAULT_LAMBDA_KEYS_PATH = os.path.expanduser('~/.lambda_cloud/lambda_keys')
20
20
 
21
21
  # List of all possible regions.
22
22
  REGIONS = [
23
- 'australia-southeast-1',
24
23
  'europe-central-1',
25
24
  'asia-south-1',
26
25
  'me-west-1',
@@ -28,9 +27,12 @@ REGIONS = [
28
27
  'asia-northeast-1',
29
28
  'asia-northeast-2',
30
29
  'us-east-1',
30
+ 'us-east-2',
31
31
  'us-west-2',
32
32
  'us-west-1',
33
33
  'us-south-1',
34
+ 'us-south-2',
35
+ 'us-south-3',
34
36
  'us-west-3',
35
37
  'us-midwest-1',
36
38
  ]
sky/core.py CHANGED
@@ -742,7 +742,8 @@ def cancel(
742
742
  @usage_lib.entrypoint
743
743
  def tail_logs(cluster_name: str,
744
744
  job_id: Optional[int],
745
- follow: bool = True) -> None:
745
+ follow: bool = True,
746
+ tail: int = 0) -> None:
746
747
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
747
748
  """Tail the logs of a job.
748
749
 
@@ -775,7 +776,7 @@ def tail_logs(cluster_name: str,
775
776
  f'{colorama.Style.RESET_ALL}')
776
777
 
777
778
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
778
- backend.tail_logs(handle, job_id, follow=follow)
779
+ backend.tail_logs(handle, job_id, follow=follow, tail=tail)
779
780
 
780
781
 
781
782
  @usage_lib.entrypoint
sky/dag.py CHANGED
@@ -23,6 +23,7 @@ class Dag:
23
23
 
24
24
  self.graph = nx.DiGraph()
25
25
  self.name: Optional[str] = None
26
+ self.policy_applied: bool = False
26
27
 
27
28
  def add(self, task: 'task.Task') -> None:
28
29
  self.graph.add_node(task)
@@ -276,23 +276,11 @@ def get_mounting_command(
276
276
  script = get_mounting_script(mount_path, mount_cmd, install_cmd,
277
277
  version_check_cmd)
278
278
 
279
- # TODO(romilb): Get direct bash script to work like so:
280
- # command = f'bash <<-\EOL' \
281
- # f'{script}' \
282
- # 'EOL'
283
-
284
- # TODO(romilb): This heredoc should have EOF after script, but it
285
- # fails with sky's ssh pipeline. Instead, we don't use EOF and use )
286
- # as the end of heredoc. This raises a warning (here-document delimited
287
- # by end-of-file) that can be safely ignored.
288
-
289
279
  # While these commands are run sequentially for each storage object,
290
280
  # we add random int to be on the safer side and avoid collisions.
291
281
  script_path = f'~/.sky/mount_{random.randint(0, 1000000)}.sh'
292
- first_line = r'(cat <<-\EOF > {}'.format(script_path)
293
- command = (f'{first_line}'
294
- f'{script}'
295
- f') && chmod +x {script_path}'
296
- f' && bash {script_path}'
297
- f' && rm {script_path}')
282
+ command = (f'echo {shlex.quote(script)} > {script_path} && '
283
+ f'chmod +x {script_path} && '
284
+ f'bash {script_path} && '
285
+ f'rm {script_path}')
298
286
  return command
sky/exceptions.py CHANGED
@@ -3,6 +3,8 @@ import enum
3
3
  import typing
4
4
  from typing import List, Optional, Sequence
5
5
 
6
+ from sky.utils import env_options
7
+
6
8
  if typing.TYPE_CHECKING:
7
9
  from sky import status_lib
8
10
  from sky.backends import backend
@@ -104,7 +106,8 @@ class CommandError(Exception):
104
106
  if not command:
105
107
  message = error_msg
106
108
  else:
107
- if len(command) > 100:
109
+ if (len(command) > 100 and
110
+ not env_options.Options.SHOW_DEBUG_INFO.get()):
108
111
  # Chunck the command to avoid overflow.
109
112
  command = command[:100] + '...'
110
113
  message = (f'Command {command} failed with return code '
sky/execution.py CHANGED
@@ -160,14 +160,16 @@ def _execute(
160
160
  """
161
161
 
162
162
  dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
163
- dag, _ = admin_policy_utils.apply(
164
- dag,
165
- request_options=admin_policy.RequestOptions(
166
- cluster_name=cluster_name,
167
- idle_minutes_to_autostop=idle_minutes_to_autostop,
168
- down=down,
169
- dryrun=dryrun,
170
- ))
163
+ if not dag.policy_applied:
164
+ dag, _ = admin_policy_utils.apply(
165
+ dag,
166
+ request_options=admin_policy.RequestOptions(
167
+ cluster_name=cluster_name,
168
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
169
+ down=down,
170
+ dryrun=dryrun,
171
+ ),
172
+ )
171
173
  assert len(dag) == 1, f'We support 1 task for now. {dag}'
172
174
  task = dag.tasks[0]
173
175
 
sky/jobs/core.py CHANGED
@@ -59,8 +59,10 @@ def launch(
59
59
  """
60
60
  entrypoint = task
61
61
  dag_uuid = str(uuid.uuid4().hex[:4])
62
-
63
62
  dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
63
+ # Always apply the policy again here, even though it might have been applied
64
+ # in the CLI. This is to ensure that we apply the policy to the final DAG
65
+ # and get the mutated config.
64
66
  dag, mutated_user_config = admin_policy_utils.apply(
65
67
  dag, use_mutated_config_in_current_request=False)
66
68
  if not dag.is_chain():
@@ -26,7 +26,8 @@ def _is_running_on_jobs_controller() -> bool:
26
26
  """
27
27
  if pathlib.Path('~/.sky/sky_ray.yml').expanduser().exists():
28
28
  config = yaml.safe_load(
29
- pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text())
29
+ pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text(
30
+ encoding='utf-8'))
30
31
  cluster_name = config.get('cluster_name', '')
31
32
  candidate_controller_names = (
32
33
  controller_utils.Controllers.JOBS_CONTROLLER.value.
@@ -36,6 +36,11 @@ DEFAULT_RECOVERY_STRATEGY = None
36
36
  # 10 * JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 10 * 5 = 50 seconds
37
37
  MAX_JOB_CHECKING_RETRY = 10
38
38
 
39
+ # Minutes to job cluster autodown. This should be significantly larger than
40
+ # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
41
+ # cluster before its status can be updated by the job controller.
42
+ _AUTODOWN_MINUTES = 5
43
+
39
44
 
40
45
  def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
41
46
  """Terminate the cluster."""
@@ -302,11 +307,17 @@ class StrategyExecutor:
302
307
  usage_lib.messages.usage.set_internal()
303
308
  # Detach setup, so that the setup failure can be detected
304
309
  # by the controller process (job_status -> FAILED_SETUP).
305
- sky.launch(self.dag,
306
- cluster_name=self.cluster_name,
307
- detach_setup=True,
308
- detach_run=True,
309
- _is_launched_by_jobs_controller=True)
310
+ sky.launch(
311
+ self.dag,
312
+ cluster_name=self.cluster_name,
313
+ # We expect to tear down the cluster as soon as the job is
314
+ # finished. However, in case the controller dies, set
315
+ # autodown to try and avoid a resource leak.
316
+ idle_minutes_to_autostop=_AUTODOWN_MINUTES,
317
+ down=True,
318
+ detach_setup=True,
319
+ detach_run=True,
320
+ _is_launched_by_jobs_controller=True)
310
321
  logger.info('Managed job cluster launched.')
311
322
  except (exceptions.InvalidClusterNameError,
312
323
  exceptions.NoCloudAccessError,
sky/jobs/state.py CHANGED
@@ -12,6 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
12
12
  import colorama
13
13
 
14
14
  from sky import sky_logging
15
+ from sky.utils import common_utils
15
16
  from sky.utils import db_utils
16
17
 
17
18
  if typing.TYPE_CHECKING:
@@ -22,23 +23,6 @@ CallbackType = Callable[[str], None]
22
23
  logger = sky_logging.init_logger(__name__)
23
24
 
24
25
 
25
- def _get_db_path() -> str:
26
- """Workaround to collapse multi-step Path ops for type checker.
27
- Ensures _DB_PATH is str, avoiding Union[Path, str] inference.
28
- """
29
- path = pathlib.Path('~/.sky/spot_jobs.db')
30
- path = path.expanduser().absolute()
31
- path.parents[0].mkdir(parents=True, exist_ok=True)
32
- return str(path)
33
-
34
-
35
- _DB_PATH = _get_db_path()
36
-
37
- # Module-level connection/cursor; thread-safe as the module is only imported
38
- # once.
39
- _CONN = sqlite3.connect(_DB_PATH)
40
- _CURSOR = _CONN.cursor()
41
-
42
26
  # === Database schema ===
43
27
  # `spot` table contains all the finest-grained tasks, including all the
44
28
  # tasks of a managed job (called spot for legacy reason, as it is generalized
@@ -50,68 +34,99 @@ _CURSOR = _CONN.cursor()
50
34
  # identifier/primary key for all the tasks. We will use `spot_job_id`
51
35
  # to identify the spot job.
52
36
  # TODO(zhwu): schema migration may be needed.
53
- _CURSOR.execute("""\
54
- CREATE TABLE IF NOT EXISTS spot (
55
- job_id INTEGER PRIMARY KEY AUTOINCREMENT,
56
- job_name TEXT,
57
- resources TEXT,
58
- submitted_at FLOAT,
59
- status TEXT,
60
- run_timestamp TEXT CANDIDATE KEY,
61
- start_at FLOAT DEFAULT NULL,
62
- end_at FLOAT DEFAULT NULL,
63
- last_recovered_at FLOAT DEFAULT -1,
64
- recovery_count INTEGER DEFAULT 0,
65
- job_duration FLOAT DEFAULT 0,
66
- failure_reason TEXT,
67
- spot_job_id INTEGER,
68
- task_id INTEGER DEFAULT 0,
69
- task_name TEXT,
70
- specs TEXT)""")
71
- _CONN.commit()
72
-
73
- db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
74
- # Create a new column `spot_job_id`, which is the same for tasks of the
75
- # same managed job.
76
- # The original `job_id` no longer has an actual meaning, but only a legacy
77
- # identifier for all tasks in database.
78
- db_utils.add_column_to_table(_CURSOR,
79
- _CONN,
80
- 'spot',
81
- 'spot_job_id',
82
- 'INTEGER',
83
- copy_from='job_id')
84
- db_utils.add_column_to_table(_CURSOR,
85
- _CONN,
86
- 'spot',
87
- 'task_id',
88
- 'INTEGER DEFAULT 0',
89
- value_to_replace_existing_entries=0)
90
- db_utils.add_column_to_table(_CURSOR,
91
- _CONN,
92
- 'spot',
93
- 'task_name',
94
- 'TEXT',
95
- copy_from='job_name')
96
-
97
- # Specs is some useful information about the task, e.g., the
98
- # max_restarts_on_errors value. It is stored in JSON format.
99
- db_utils.add_column_to_table(_CURSOR,
100
- _CONN,
101
- 'spot',
102
- 'specs',
103
- 'TEXT',
104
- value_to_replace_existing_entries=json.dumps({
105
- 'max_restarts_on_errors': 0,
106
- }))
107
-
108
- # `job_info` contains the mapping from job_id to the job_name.
109
- # In the future, it may contain more information about each job.
110
- _CURSOR.execute("""\
111
- CREATE TABLE IF NOT EXISTS job_info (
112
- spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
113
- name TEXT)""")
114
- _CONN.commit()
37
+ def create_table(cursor, conn):
38
+ # Enable WAL mode to avoid locking issues.
39
+ # See: issue #3863, #1441 and PR #1509
40
+ # https://github.com/microsoft/WSL/issues/2395
41
+ # TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
42
+ # This may cause the database locked problem from WSL issue #1441.
43
+ if not common_utils.is_wsl():
44
+ try:
45
+ cursor.execute('PRAGMA journal_mode=WAL')
46
+ except sqlite3.OperationalError as e:
47
+ if 'database is locked' not in str(e):
48
+ raise
49
+ # If the database is locked, it is OK to continue, as the WAL mode
50
+ # is not critical and is likely to be enabled by other processes.
51
+
52
+ cursor.execute("""\
53
+ CREATE TABLE IF NOT EXISTS spot (
54
+ job_id INTEGER PRIMARY KEY AUTOINCREMENT,
55
+ job_name TEXT,
56
+ resources TEXT,
57
+ submitted_at FLOAT,
58
+ status TEXT,
59
+ run_timestamp TEXT CANDIDATE KEY,
60
+ start_at FLOAT DEFAULT NULL,
61
+ end_at FLOAT DEFAULT NULL,
62
+ last_recovered_at FLOAT DEFAULT -1,
63
+ recovery_count INTEGER DEFAULT 0,
64
+ job_duration FLOAT DEFAULT 0,
65
+ failure_reason TEXT,
66
+ spot_job_id INTEGER,
67
+ task_id INTEGER DEFAULT 0,
68
+ task_name TEXT,
69
+ specs TEXT)""")
70
+ conn.commit()
71
+
72
+ db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
73
+ # Create a new column `spot_job_id`, which is the same for tasks of the
74
+ # same managed job.
75
+ # The original `job_id` no longer has an actual meaning, but only a legacy
76
+ # identifier for all tasks in database.
77
+ db_utils.add_column_to_table(cursor,
78
+ conn,
79
+ 'spot',
80
+ 'spot_job_id',
81
+ 'INTEGER',
82
+ copy_from='job_id')
83
+ db_utils.add_column_to_table(cursor,
84
+ conn,
85
+ 'spot',
86
+ 'task_id',
87
+ 'INTEGER DEFAULT 0',
88
+ value_to_replace_existing_entries=0)
89
+ db_utils.add_column_to_table(cursor,
90
+ conn,
91
+ 'spot',
92
+ 'task_name',
93
+ 'TEXT',
94
+ copy_from='job_name')
95
+
96
+ # Specs is some useful information about the task, e.g., the
97
+ # max_restarts_on_errors value. It is stored in JSON format.
98
+ db_utils.add_column_to_table(cursor,
99
+ conn,
100
+ 'spot',
101
+ 'specs',
102
+ 'TEXT',
103
+ value_to_replace_existing_entries=json.dumps({
104
+ 'max_restarts_on_errors': 0,
105
+ }))
106
+
107
+ # `job_info` contains the mapping from job_id to the job_name.
108
+ # In the future, it may contain more information about each job.
109
+ cursor.execute("""\
110
+ CREATE TABLE IF NOT EXISTS job_info (
111
+ spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
112
+ name TEXT)""")
113
+ conn.commit()
114
+
115
+
116
+ # Module-level connection/cursor; thread-safe as the module is only imported
117
+ # once.
118
+ def _get_db_path() -> str:
119
+ """Workaround to collapse multi-step Path ops for type checker.
120
+ Ensures _DB_PATH is str, avoiding Union[Path, str] inference.
121
+ """
122
+ path = pathlib.Path('~/.sky/spot_jobs.db')
123
+ path = path.expanduser().absolute()
124
+ path.parents[0].mkdir(parents=True, exist_ok=True)
125
+ return str(path)
126
+
127
+
128
+ _DB_PATH = _get_db_path()
129
+ db_utils.SQLiteConn(_DB_PATH, create_table)
115
130
 
116
131
  # job_duration is the time a job actually runs (including the
117
132
  # setup duration) before last_recover, excluding the provision
sky/jobs/utils.py CHANGED
@@ -14,7 +14,7 @@ import shutil
14
14
  import textwrap
15
15
  import time
16
16
  import typing
17
- from typing import Any, Dict, List, Optional, Tuple, Union
17
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
18
18
 
19
19
  import colorama
20
20
  import filelock
@@ -487,6 +487,7 @@ def stream_logs(job_id: Optional[int],
487
487
  job_id = managed_job_state.get_latest_job_id()
488
488
  if job_id is None:
489
489
  return 'No managed job found.'
490
+
490
491
  if controller:
491
492
  if job_id is None:
492
493
  assert job_name is not None
@@ -494,16 +495,22 @@ def stream_logs(job_id: Optional[int],
494
495
  # We manually filter the jobs by name, instead of using
495
496
  # get_nonterminal_job_ids_by_name, as with `controller=True`, we
496
497
  # should be able to show the logs for jobs in terminal states.
497
- managed_jobs = list(
498
- filter(lambda job: job['job_name'] == job_name, managed_jobs))
499
- if len(managed_jobs) == 0:
498
+ managed_job_ids: Set[int] = {
499
+ job['job_id']
500
+ for job in managed_jobs
501
+ if job['job_name'] == job_name
502
+ }
503
+ if len(managed_job_ids) == 0:
500
504
  return f'No managed job found with name {job_name!r}.'
501
- if len(managed_jobs) > 1:
502
- job_ids_str = ', '.join(job['job_id'] for job in managed_jobs)
503
- raise ValueError(
504
- f'Multiple managed jobs found with name {job_name!r} (Job '
505
- f'IDs: {job_ids_str}). Please specify the job_id instead.')
506
- job_id = managed_jobs[0]['job_id']
505
+ if len(managed_job_ids) > 1:
506
+ job_ids_str = ', '.join(
507
+ str(job_id) for job_id in managed_job_ids)
508
+ with ux_utils.print_exception_no_traceback():
509
+ raise ValueError(
510
+ f'Multiple managed jobs found with name {job_name!r} '
511
+ f'(Job IDs: {job_ids_str}). Please specify the job_id '
512
+ 'instead.')
513
+ job_id = managed_job_ids.pop()
507
514
  assert job_id is not None, (job_id, job_name)
508
515
  # TODO: keep the following code sync with
509
516
  # job_lib.JobLibCodeGen.tail_logs, we do not directly call that function
@@ -849,6 +856,7 @@ class ManagedJobCodeGen:
849
856
 
850
857
  from sky.skylet import job_lib, log_lib
851
858
  from sky.skylet import constants
859
+ from sky.utils import ux_utils
852
860
  try:
853
861
  from sky.jobs.utils import stream_logs_by_id
854
862
  except ImportError:
@@ -42,8 +42,9 @@ def _skypilot_log_error_and_exit_for_failover(error: str) -> None:
42
42
  Mainly used for handling VPC/subnet errors before nodes are launched.
43
43
  """
44
44
  # NOTE: keep. The backend looks for this to know no nodes are launched.
45
- prefix = 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
46
- raise RuntimeError(prefix + error)
45
+ full_error = f'SKYPILOT_ERROR_NO_NODES_LAUNCHED: {error}'
46
+ logger.error(full_error)
47
+ raise RuntimeError(full_error)
47
48
 
48
49
 
49
50
  def bootstrap_instances(
@@ -222,10 +223,27 @@ def _configure_iam_role(iam) -> Dict[str, Any]:
222
223
 
223
224
 
224
225
  @functools.lru_cache(maxsize=128) # Keep bounded.
225
- def _get_route_tables(ec2, vpc_id: Optional[str], main: bool) -> List[Any]:
226
+ def _get_route_tables(ec2, vpc_id: Optional[str], region: str,
227
+ main: bool) -> List[Any]:
228
+ """Get route tables associated with a VPC and region
229
+
230
+ Args:
231
+ ec2: ec2 resource object
232
+ vpc_id: vpc_id is optional, if not provided, all route tables in the
233
+ region will be returned
234
+ region: region is mandatory to allow the lru cache
235
+ to return the corect results
236
+ main: if True, only main route tables will be returned otherwise
237
+ only non-main route tables will be returned
238
+
239
+ Returns:
240
+ A list of route tables associated with the options VPC and region
241
+ """
226
242
  filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
227
243
  if vpc_id is not None:
228
244
  filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
245
+ logger.debug(
246
+ f'Getting route tables with filters: {filters} in region: {region}')
229
247
  return ec2.meta.client.describe_route_tables(Filters=filters).get(
230
248
  'RouteTables', [])
231
249
 
@@ -238,7 +256,8 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
238
256
  https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Internet_Gateway.html
239
257
  """
240
258
  # Get the route tables associated with the subnet
241
- all_route_tables = _get_route_tables(ec2, vpc_id, main=False)
259
+ region = ec2.meta.client.meta.region_name
260
+ all_route_tables = _get_route_tables(ec2, vpc_id, region, main=False)
242
261
  route_tables = [
243
262
  rt for rt in all_route_tables
244
263
  # An RT can be associated with multiple subnets, i.e.,
@@ -267,7 +286,8 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
267
286
  # subnets. Since the associations are implicit, the filter above won't find
268
287
  # any. Check there exists a main route table with routes pointing to an IGW.
269
288
  logger.debug('Checking main route table')
270
- main_route_tables = _get_route_tables(ec2, vpc_id, main=True)
289
+ region = ec2.meta.client.meta.region_name
290
+ main_route_tables = _get_route_tables(ec2, vpc_id, region, main=True)
271
291
  return _has_igw_route(main_route_tables)
272
292
 
273
293
 
@@ -264,6 +264,7 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
264
264
  f'--disable-usage-stats '
265
265
  f'--port={constants.SKY_REMOTE_RAY_PORT} '
266
266
  f'--dashboard-port={constants.SKY_REMOTE_RAY_DASHBOARD_PORT} '
267
+ f'--min-worker-port 11002 '
267
268
  f'--object-manager-port=8076 '
268
269
  f'--temp-dir={constants.SKY_REMOTE_RAY_TEMPDIR}')
269
270
  if custom_resource:
@@ -232,7 +232,12 @@ def query_ports(
232
232
  instances = _filter_instances(cluster_name_on_cloud,
233
233
  None,
234
234
  head_only=True)
235
- assert len(instances) == 1
235
+ assert len(instances) <= 1
236
+ # It is possible that the instance is terminated on console by
237
+ # the user. In this case, the instance will not be found and we
238
+ # should return an empty dict.
239
+ if not instances:
240
+ return {}
236
241
  head_inst = list(instances.values())[0]
237
242
  ready_ports: Dict[int, List[common.Endpoint]] = {
238
243
  port: [common.SocketEndpoint(**endpoint)]
sky/serve/core.py CHANGED
@@ -124,7 +124,9 @@ def up(
124
124
  f'{constants.CLUSTER_NAME_VALID_REGEX}')
125
125
 
126
126
  _validate_service_task(task)
127
-
127
+ # Always apply the policy again here, even though it might have been applied
128
+ # in the CLI. This is to ensure that we apply the policy to the final DAG
129
+ # and get the mutated config.
128
130
  dag, mutated_user_config = admin_policy_utils.apply(
129
131
  task, use_mutated_config_in_current_request=False)
130
132
  task = dag.tasks[0]
@@ -319,6 +321,14 @@ def update(
319
321
  service_name: Name of the service.
320
322
  """
321
323
  _validate_service_task(task)
324
+ # Always apply the policy again here, even though it might have been applied
325
+ # in the CLI. This is to ensure that we apply the policy to the final DAG
326
+ # and get the mutated config.
327
+ # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
328
+ # will not apply the config.
329
+ dag, _ = admin_policy_utils.apply(
330
+ task, use_mutated_config_in_current_request=False)
331
+ task = dag.tasks[0]
322
332
  handle = backend_utils.is_controller_accessible(
323
333
  controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
324
334
  stopped_message=
sky/skylet/constants.py CHANGED
@@ -79,7 +79,7 @@ SKYLET_VERSION = '8'
79
79
  # The version of the lib files that skylet/jobs use. Whenever there is an API
80
80
  # change for the job_lib or log_lib, we need to bump this version, so that the
81
81
  # user can be notified to update their SkyPilot version on the remote cluster.
82
- SKYLET_LIB_VERSION = 1
82
+ SKYLET_LIB_VERSION = 2
83
83
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
84
84
 
85
85
  # `sky jobs dashboard`-related
sky/skylet/job_lib.py CHANGED
@@ -29,6 +29,7 @@ if typing.TYPE_CHECKING:
29
29
 
30
30
  logger = sky_logging.init_logger(__name__)
31
31
 
32
+ _LINUX_NEW_LINE = '\n'
32
33
  _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
33
34
 
34
35
 
@@ -602,6 +603,7 @@ def update_job_status(job_ids: List[int],
602
603
  # the pending table until appearing in ray jobs. For jobs
603
604
  # submitted outside of the grace period, we will consider the
604
605
  # ray job status.
606
+
605
607
  if not (pending_job['submit'] > 0 and pending_job['submit'] <
606
608
  ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
607
609
  # Reset the job status to PENDING even though it may not
@@ -903,14 +905,19 @@ class JobLibCodeGen:
903
905
  def tail_logs(cls,
904
906
  job_id: Optional[int],
905
907
  managed_job_id: Optional[int],
906
- follow: bool = True) -> str:
908
+ follow: bool = True,
909
+ tail: int = 0) -> str:
907
910
  # pylint: disable=line-too-long
911
+
908
912
  code = [
913
+ # We use != instead of is not because 1 is not None will print a warning:
914
+ # <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
909
915
  f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
910
916
  'run_timestamp = job_lib.get_run_timestamp(job_id)',
911
917
  f'log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)',
912
- f'log_lib.tail_logs(job_id=job_id, log_dir=log_dir, '
913
- f'managed_job_id={managed_job_id!r}, follow={follow})',
918
+ f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
919
+ f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
920
+ f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
914
921
  ]
915
922
  return cls._build(code)
916
923
 
sky/skylet/log_lib.py CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  This is a remote utility module that provides logging functionality.
4
4
  """
5
+ import collections
5
6
  import copy
6
7
  import io
7
8
  import multiprocessing.pool
@@ -12,7 +13,8 @@ import sys
12
13
  import tempfile
13
14
  import textwrap
14
15
  import time
15
- from typing import Dict, Iterator, List, Optional, Tuple, Union
16
+ from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
17
+ Tuple, Union)
16
18
 
17
19
  import colorama
18
20
 
@@ -26,6 +28,9 @@ from sky.utils import ux_utils
26
28
  _SKY_LOG_WAITING_GAP_SECONDS = 1
27
29
  _SKY_LOG_WAITING_MAX_RETRY = 5
28
30
  _SKY_LOG_TAILING_GAP_SECONDS = 0.2
31
+ # Peek the head of the lines to check if we need to start
32
+ # streaming when tail > 0.
33
+ PEEK_HEAD_LINES_FOR_START_STREAM = 20
29
34
 
30
35
  logger = sky_logging.init_logger(__name__)
31
36
 
@@ -330,6 +335,7 @@ def run_bash_command_with_log(bash_command: str,
330
335
 
331
336
  def _follow_job_logs(file,
332
337
  job_id: int,
338
+ start_streaming: bool,
333
339
  start_streaming_at: str = '') -> Iterator[str]:
334
340
  """Yield each line from a file as they are written.
335
341
 
@@ -338,7 +344,6 @@ def _follow_job_logs(file,
338
344
  # No need to lock the status here, as the while loop can handle
339
345
  # the older status.
340
346
  status = job_lib.get_status_no_lock(job_id)
341
- start_streaming = False
342
347
  wait_last_logs = True
343
348
  while True:
344
349
  tmp = file.readline()
@@ -378,10 +383,45 @@ def _follow_job_logs(file,
378
383
  status = job_lib.get_status_no_lock(job_id)
379
384
 
380
385
 
386
def _peek_head_lines(log_file: TextIO) -> List[str]:
    """Read the leading lines of ``log_file`` and rewind it.

    Exactly ``PEEK_HEAD_LINES_FOR_START_STREAM`` ``readline()`` calls are
    issued. Empty results (what ``readline()`` yields once no more data is
    available) are discarded.

    Args:
        log_file: An open text file. Its position is reset to offset 0
            before returning, regardless of where it was on entry.

    Returns:
        The non-empty lines that were read, in file order.
    """
    head_lines: List[str] = []
    for _ in range(PEEK_HEAD_LINES_FOR_START_STREAM):
        current = log_file.readline()
        if current:
            head_lines.append(current)
    # Rewind so the caller can consume the file from the very beginning.
    log_file.seek(0, os.SEEK_SET)
    return head_lines
394
+
395
+
396
def _should_stream_the_whole_tail_lines(head_lines_of_log_file: List[str],
                                        tail_lines: Deque[str],
                                        start_stream_at: str) -> bool:
    """Decide whether every retained tail line should be streamed.

    See the discussion at
    https://github.com/skypilot-org/skypilot/pull/4241#discussion_r1833611567
    for more details.

    Args:
        head_lines_of_log_file: Lines peeked from the start of the log file.
        tail_lines: The last lines of the log file kept for display.
        start_stream_at: Marker substring that signals where streaming
            begins.

    Returns:
        True only when the marker occurs in ``head_lines_of_log_file`` but
        not within the first ``PEEK_HEAD_LINES_FOR_START_STREAM`` entries of
        ``tail_lines``; False otherwise.
    """
    # Case 1: the marker sits near the start of the tail lines, so the
    # caller's own marker scan decides where to start; do not stream the
    # whole tail. `zip` with a bounded range limits the scan to the first
    # PEEK_HEAD_LINES_FOR_START_STREAM tail lines.
    marker_near_tail_start = any(
        start_stream_at in tail_line for _, tail_line in zip(
            range(PEEK_HEAD_LINES_FOR_START_STREAM), tail_lines))
    if marker_near_tail_start:
        return False
    # Case 2: the marker is at the head of the log file but not at the start
    # of the tail lines -> stream the whole tail.
    # Case 3: the marker is in neither place -> do not stream the whole tail.
    return any(
        start_stream_at in head_line for head_line in head_lines_of_log_file)
418
+
419
+
381
420
  def tail_logs(job_id: Optional[int],
382
421
  log_dir: Optional[str],
383
422
  managed_job_id: Optional[int] = None,
384
- follow: bool = True) -> None:
423
+ follow: bool = True,
424
+ tail: int = 0) -> None:
385
425
  """Tail the logs of a job.
386
426
 
387
427
  Args:
@@ -390,6 +430,8 @@ def tail_logs(job_id: Optional[int],
390
430
  managed_job_id: The managed job id (for logging info only to avoid
391
431
  confusion).
392
432
  follow: Whether to follow the logs or print the logs so far and exit.
433
+ tail: The number of lines to display from the end of the log file,
434
+ if 0, print all lines.
393
435
  """
394
436
  if job_id is None:
395
437
  # This only happens when job_lib.get_latest_job_id() returns None,
@@ -430,6 +472,8 @@ def tail_logs(job_id: Optional[int],
430
472
  status = job_lib.update_job_status([job_id], silent=True)[0]
431
473
 
432
474
  start_stream_at = 'Waiting for task resources on '
475
+ # Explicitly declare the type to avoid mypy warning.
476
+ lines: Iterable[str] = []
433
477
  if follow and status in [
434
478
  job_lib.JobStatus.SETTING_UP,
435
479
  job_lib.JobStatus.PENDING,
@@ -440,18 +484,43 @@ def tail_logs(job_id: Optional[int],
440
484
  with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
441
485
  # Using `_follow` instead of `tail -f` to streaming the whole
442
486
  # log and creating a new process for tail.
487
+ start_streaming = False
488
+ if tail > 0:
489
+ head_lines_of_log_file = _peek_head_lines(log_file)
490
+ lines = collections.deque(log_file, maxlen=tail)
491
+ start_streaming = _should_stream_the_whole_tail_lines(
492
+ head_lines_of_log_file, lines, start_stream_at)
493
+ for line in lines:
494
+ if start_stream_at in line:
495
+ start_streaming = True
496
+ if start_streaming:
497
+ print(line, end='')
498
+ # Flush the last n lines
499
+ print(end='', flush=True)
500
+ # Now, the cursor is at the end of the last lines
501
+ # if tail > 0
443
502
  for line in _follow_job_logs(log_file,
444
503
  job_id=job_id,
504
+ start_streaming=start_streaming,
445
505
  start_streaming_at=start_stream_at):
446
506
  print(line, end='', flush=True)
447
507
  else:
448
508
  try:
449
- start_stream = False
450
- with open(log_path, 'r', encoding='utf-8') as f:
451
- for line in f.readlines():
509
+ start_streaming = False
510
+ with open(log_path, 'r', encoding='utf-8') as log_file:
511
+ if tail > 0:
512
+ # If tail > 0, we need to read the last n lines.
513
+ # We use double ended queue to rotate the last n lines.
514
+ head_lines_of_log_file = _peek_head_lines(log_file)
515
+ lines = collections.deque(log_file, maxlen=tail)
516
+ start_streaming = _should_stream_the_whole_tail_lines(
517
+ head_lines_of_log_file, lines, start_stream_at)
518
+ else:
519
+ lines = log_file
520
+ for line in lines:
452
521
  if start_stream_at in line:
453
- start_stream = True
454
- if start_stream:
522
+ start_streaming = True
523
+ if start_streaming:
455
524
  print(line, end='', flush=True)
456
525
  except FileNotFoundError:
457
526
  print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
@@ -324,6 +324,8 @@ available_node_types:
324
324
  command: ["/bin/bash", "-c", "--"]
325
325
  args:
326
326
  - |
327
+ function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
328
+
327
329
  # Tails file and checks every 5 sec for
328
330
  # open file handlers with write access
329
331
  # closes if none exist
@@ -333,7 +335,7 @@ available_node_types:
333
335
  while kill -0 $TAIL_PID 2> /dev/null; do
334
336
  # only two PIDs should be accessing the file
335
337
  # the log appender and log tailer
336
- if [ $(lsof -w $file | wc -l) -lt 3 ]; then
338
+ if [ $(mylsof $file | wc -l) -lt 2 ]; then
337
339
  kill $TAIL_PID
338
340
  break
339
341
  fi
@@ -142,4 +142,5 @@ def apply(
142
142
  importlib.reload(skypilot_config)
143
143
 
144
144
  logger.debug(f'Mutated user request: {mutated_user_request}')
145
+ mutated_dag.policy_applied = True
145
146
  return mutated_dag, mutated_config
@@ -11,6 +11,7 @@ from sky import sky_logging
11
11
  from sky.skylet import constants
12
12
  from sky.skylet import log_lib
13
13
  from sky.utils import common_utils
14
+ from sky.utils import control_master_utils
14
15
  from sky.utils import subprocess_utils
15
16
  from sky.utils import timeline
16
17
 
@@ -104,13 +105,22 @@ def ssh_options_list(
104
105
  }
105
106
  # SSH Control will have a severe delay when using docker_ssh_proxy_command.
106
107
  # TODO(tian): Investigate why.
108
+ #
109
+ # We disable ControlMaster when ssh_proxy_command is used, because the
110
+ # master connection will be idle although the connection might be shared
111
+ # by other ssh commands that are not idle. In that case, user's custom proxy
112
+ # command may drop the connection due to idle timeout, since it will only
113
+ # see the idle master connection. It is an issue even with the
114
+ # ServerAliveInterval set, since the keepalive message may not be recognized
115
+ # by the custom proxy command, such as AWS SSM Session Manager.
116
+ #
107
117
  # We also do not use ControlMaster when we use `kubectl port-forward`
108
118
  # to access Kubernetes pods over SSH+Proxycommand. This is because the
109
119
  # process running ProxyCommand is kept running as long as the ssh session
110
120
  # is running and the ControlMaster keeps the session, which results in
111
121
  # 'ControlPersist' number of seconds delay per ssh commands ran.
112
122
  if (ssh_control_name is not None and docker_ssh_proxy_command is None and
113
- not disable_control_master):
123
+ ssh_proxy_command is None and not disable_control_master):
114
124
  arg_dict.update({
115
125
  # Control path: important optimization as we do multiple ssh in one
116
126
  # sky.launch().
@@ -459,7 +469,9 @@ class SSHCommandRunner(CommandRunner):
459
469
  None if ssh_control_name is None else hashlib.md5(
460
470
  ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
461
471
  self._ssh_proxy_command = ssh_proxy_command
462
- self.disable_control_master = disable_control_master
472
+ self.disable_control_master = (
473
+ disable_control_master or
474
+ control_master_utils.should_disable_control_master())
463
475
  if docker_user is not None:
464
476
  assert port is None or port == 22, (
465
477
  f'port must be None or 22 for docker_user, got {port}.')
@@ -0,0 +1,49 @@
1
+ """Utils to check if the ssh control master should be disabled."""
2
+
3
+ import functools
4
+
5
+ from sky import sky_logging
6
+ from sky.utils import subprocess_utils
7
+
8
+ logger = sky_logging.init_logger(__name__)
9
+
10
+
11
def is_tmp_9p_filesystem() -> bool:
    """Check if the /tmp filesystem is 9p.

    Runs ``df -T /tmp`` and inspects the second whitespace-separated field
    of the second output line.

    Returns:
        bool: True if that field equals ``9p`` (case-insensitive). False if
            ``df`` exits with a non-zero status, or if its output has fewer
            than two lines or fewer than two fields on the second line.
    """
    df_result = subprocess_utils.run(['df', '-T', '/tmp'],
                                     capture_output=True,
                                     text=True,
                                     shell=None,
                                     check=False,
                                     executable=None)
    if df_result.returncode != 0:
        return False

    # Presumably row 0 is the `df` header and row 1 describes /tmp; only
    # row 1 is examined.
    output_rows = df_result.stdout.strip().split('\n')
    tmp_row_fields = output_rows[1].split() if len(output_rows) >= 2 else []
    return len(tmp_row_fields) >= 2 and tmp_row_fields[1].lower() == '9p'
35
+
36
+
37
# Use the call form of lru_cache: the bare `@functools.lru_cache` decorator is
# only accepted on Python >= 3.8 and raises TypeError on older interpreters.
# For a zero-argument function the cache holds a single entry either way.
@functools.lru_cache(maxsize=None)
def should_disable_control_master() -> bool:
    """Whether to disable the ssh control master based on the file system.

    The result is memoized, so the underlying filesystem check is performed
    at most once per process.

    Returns:
        bool: True if the ssh control master should be disabled,
            False otherwise.
    """
    if is_tmp_9p_filesystem():
        return True
    # There may be additional criteria to disable the ssh control master in
    # the future. They should be checked here.
    return False
+ return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241107
3
+ Version: 1.0.0.dev20241109
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,13 +1,13 @@
1
- sky/__init__.py,sha256=LDYVc006Bm6m_yCUJiTKF3oPp3_O3ODjp1KhoU5meCE,5882
1
+ sky/__init__.py,sha256=vuaxCFFtQHJTriSEGG_wKshl6nmhDcnt70q66x1rkvA,5882
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
4
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
5
- sky/cli.py,sha256=6umPcFovU5sHIUdC0B9lfOstzWLA0DPS5x6dg1EOkeQ,211193
5
+ sky/cli.py,sha256=jEjXs5Z0u263eJIsTHoKyG9oOY6giqw19s2di9kEv1s,212088
6
6
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
7
- sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
8
- sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
9
- sky/exceptions.py,sha256=KBIEJHgrw6OMBL8H65o-Gk6qYQEV1SR9gBwMjnMnxxg,8858
10
- sky/execution.py,sha256=HF76sz-gCEZPGkuL48jJaLOTqjuHg0KysgKaPw-hn84,25997
7
+ sky/core.py,sha256=0-4W_DKJZgbwXuzNZKQ2R_qJxqxbqqNfyi0U0PQBKvQ,38230
8
+ sky/dag.py,sha256=O9g8NnO8L1SGUEDyqW9W341AH4Wvd3nJs54niR-pkrk,2822
9
+ sky/exceptions.py,sha256=E3C2Ejcc8RUDAUQn7ar_Jr97C_AxD2rKKMmJOfLJ9d0,8965
10
+ sky/execution.py,sha256=TwcorzFxR_0m8uazPdeKltU3g3ikgUSqqzcSBrHp7K4,26070
11
11
  sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
12
12
  sky/optimizer.py,sha256=tXGrFpc6xNtKH34qjBAMd4jTuWcDZTPnGFwEtuCQFmk,59702
13
13
  sky/resources.py,sha256=Zt8mCCmdvZ5ZCqY-l3KXlx_lkUesAopRtaEcEsrRFZo,68465
@@ -31,10 +31,10 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
31
31
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
32
32
  sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
33
33
  sky/backends/backend_utils.py,sha256=2myfryj1zG9xxPaX6XYYJruxAOGNGbpsy2ckT4A77sE,121813
34
- sky/backends/cloud_vm_ray_backend.py,sha256=jlX1atSF4L31ZMzC_tnBaWnxvc2Wb8DRwt5G_ukrlJk,232799
34
+ sky/backends/cloud_vm_ray_backend.py,sha256=6Ew9Ej92KGlumlCnyDcGSEbHInj7g2Shqwx4oxRkWVQ,233122
35
35
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
36
36
  sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
37
- sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
37
+ sky/backends/wheel_utils.py,sha256=CUVOwlBtQjOMv-RSDGx2jMQ0M1D0w9ZPm0TDafJwBDI,8180
38
38
  sky/backends/monkey_patches/monkey_patch_ray_up.py,sha256=76-y2fCaE3JINj8lEwHT1eirYzCbpD8O1ySsysuGu8o,3450
39
39
  sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
@@ -79,7 +79,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=L1JsX1YrhpyI7ylzE
79
79
  sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
80
80
  sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
81
81
  sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=mDAN98T58h1g_LLyppSEUVDlsbLhk2454Nhmg5-aw0Q,32670
82
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=B7H14so38zayuJGgUrD1PJYJKiVZHGnwH6JJop3F7o0,4918
82
+ sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=yOPmmckiQ0HU6bKXWd7YdTrsF2sql3Bs_jYNpuxlo0I,4942
83
83
  sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
84
84
  sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
85
  sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
@@ -90,29 +90,29 @@ sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas
90
90
  sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
91
91
  sky/data/data_transfer.py,sha256=MBmjey9_p2L3IKNKTi8um09SlZe32n4wK3CkVnlTVvo,7346
92
92
  sky/data/data_utils.py,sha256=-P5GsDH_m4slrCz4vHdgiFezIys8ufzvhEKePJwfjFc,28597
93
- sky/data/mounting_utils.py,sha256=44YkYIIgArEkyvxCtfmXXumybrU8bmn1TfLXWv_eldI,11480
93
+ sky/data/mounting_utils.py,sha256=HwBGg1NmX-2IJZV_6h2r1U3ajTGOyfmA3MqboA7znqU,11004
94
94
  sky/data/storage.py,sha256=OQ_kznF-P50Jq0feO5FBqm97QGhfbsZ2dX-Ar3sVWr4,163903
95
95
  sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
96
96
  sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
97
97
  sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
98
98
  sky/jobs/controller.py,sha256=sirpi730_GfKfPZeZ2PvCXnJWger0r6AyLSOx2sLd6A,27368
99
- sky/jobs/core.py,sha256=w7PancHi8_-afLKZQ3HHMD1sEDoepm1vEMxyDlXdo64,17155
100
- sky/jobs/recovery_strategy.py,sha256=FpPK6e2PT61cZPDUJqIfo6g53uSRTBh7dOTbfR1DLVE,26672
101
- sky/jobs/state.py,sha256=exN6BdJlLBzFTccJCSHN4dNjVeYFgTgqgxOaHwLw2IQ,24307
102
- sky/jobs/utils.py,sha256=pF4Kyl4v1M_Bmm2jIRlXGTSdII5BJ3f4qwex_oCFgBk,37742
103
- sky/jobs/dashboard/dashboard.py,sha256=FXVQAWjAuQQTfAGlTCD-Xb9LckC5I4NhGwiBZy8Avo8,3186
99
+ sky/jobs/core.py,sha256=Lk_zKizc9a7O-8WHhh4-VXBS5kT0jRpwmNNA7S4ueIo,17347
100
+ sky/jobs/recovery_strategy.py,sha256=O_DouAfWx8FNdQxXsr2msMwlKCIodS99cW6V4Lf1vMo,27219
101
+ sky/jobs/state.py,sha256=DE02bCZc9bPbbuayb3Zml553mb4pEV7Z8t1pt8IGbYM,25252
102
+ sky/jobs/utils.py,sha256=Ff3TttIEdVeM1_kOVkviqIDjeVfBPIXVE8i-yP1VDM8,37976
103
+ sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
104
104
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
105
105
  sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
106
106
  sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,6262
107
107
  sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
108
108
  sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
109
109
  sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
110
- sky/provision/instance_setup.py,sha256=c6i_NC6GrW4hXAQIU5_dUBbnThjZQNS3cL2M6yMtzes,23616
110
+ sky/provision/instance_setup.py,sha256=gI739UMCqtPqdA522D92bPu5sA3OHBMDmIGmqqxsIwY,23652
111
111
  sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
112
112
  sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
113
113
  sky/provision/provisioner.py,sha256=mTvtBjS-Xz64LJcyeHx_-wdM8Gin8D49YRaV_TADaz4,25334
114
114
  sky/provision/aws/__init__.py,sha256=mxq8PeWJqUtalDozTNpbtENErRZ1ktEs8uf2aG9UUgU,731
115
- sky/provision/aws/config.py,sha256=ApEh63RR_KyCp9nPXX35z6jBREoulJPQ5st8K9Jlclo,23385
115
+ sky/provision/aws/config.py,sha256=dbwulPxXGIJjKJddv85PbtlXOjwLemaD65j3DISNsK0,24214
116
116
  sky/provision/aws/instance.py,sha256=eCslJ2XfJo_pkQMnKFQqhGnUIRvwKiT12oxBY5-klss,40750
117
117
  sky/provision/aws/utils.py,sha256=m49pS-SHGW7Au3bhDeTPsL8N5iRzbwOXzyEWRCc1Vho,3238
118
118
  sky/provision/azure/__init__.py,sha256=87cgk1_Ws7n9rqaDDPv-HpfrkVeSQMdFQnhnXwyx9g4,548
@@ -154,7 +154,7 @@ sky/provision/paperspace/instance.py,sha256=q_V01DZSMXLfy63Zwt6AQotq02JuXQZb5CHS
154
154
  sky/provision/paperspace/utils.py,sha256=uOmxbDKjV6skFizC4gYXSxRuEqso5ck2kF7MbtNmhEs,9580
155
155
  sky/provision/runpod/__init__.py,sha256=6HYvHI27EaLrX1SS0vWVhdLu5HDBeZCdvAeDJuwM5pk,556
156
156
  sky/provision/runpod/config.py,sha256=9ulZJVL7nHuxhTdoj8D7lNn7SdicJ5zc6FIcHIG9tcg,321
157
- sky/provision/runpod/instance.py,sha256=ucmFQEzapbxylsl6K9EUo7bHTZYzvfECo6tpJc-MFrw,9577
157
+ sky/provision/runpod/instance.py,sha256=AIWzTHuAu2dw8Rk-AHc7-14hUAYPEKh_UMzAhMzjDh0,9807
158
158
  sky/provision/runpod/utils.py,sha256=ZjrcpjKzwS2nXQ21dW405PLxBl_V9awcfRjucGB3alw,6795
159
159
  sky/provision/vsphere/__init__.py,sha256=5icB8-kfs926S9DVfNJSCBVr7z7cmCEDr04-YHX89_4,788
160
160
  sky/provision/vsphere/config.py,sha256=f_ojGmi_vbnwJ8Ri48cqhZHBOuIkj41j9bFbq-ldPOo,504
@@ -175,7 +175,7 @@ sky/serve/__init__.py,sha256=gFZt7W3UPMi4qvYe2xgkHg1VxbR1WGavKyWLBUD3mpg,1731
175
175
  sky/serve/autoscalers.py,sha256=khY1oZ22PRaUQNsLCoNKH178X_NiJw0LSLOKr7_LNgY,30275
176
176
  sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
177
177
  sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
178
- sky/serve/core.py,sha256=jwrgglvtFqbD9Y4pzXmuso5hKc0OQcTWJ-AkvypiQII,30986
178
+ sky/serve/core.py,sha256=hszs95BwtC4wIJujGNokvFC46VjojgRz1BbYOIIPh6k,31601
179
179
  sky/serve/load_balancer.py,sha256=aUfDsgUT_fYrchCwJCeunMPXmAkwJAY58BEu-IN2FaA,11571
180
180
  sky/serve/load_balancing_policies.py,sha256=ExdwH_pxPYpJ6CkoTQCOPSa4lzwbq1LFFMKzmIu8ryk,2331
181
181
  sky/serve/replica_managers.py,sha256=1xYDK9Te5wFEF5hUK0gyNIUib0MY-HScLHUBDlTSl-k,57774
@@ -190,10 +190,10 @@ sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
190
  sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
191
191
  sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
192
192
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
193
- sky/skylet/constants.py,sha256=TL-O0ZoxA1ZeNvKXzzA_UyIMXsma7flbsDZ1N_o9dKg,14468
193
+ sky/skylet/constants.py,sha256=w05Enrg9RhGp99P1WDYMKK_ki0M-e0bS8Wr-VZR0Vn8,14468
194
194
  sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
195
- sky/skylet/job_lib.py,sha256=-SCbpJRiWMSwvhDjUwfwnvBap7Y5B3ol1l_PDPra3XI,36860
196
- sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
195
+ sky/skylet/job_lib.py,sha256=FD1n9vE0daOEUKSH3lnccfBh7Vs81R8s4ILZyKu2o7M,37275
196
+ sky/skylet/log_lib.py,sha256=BmhAgcLvlin3szhj33IH0kbdCALacVisF2x61BQpZdY,21888
197
197
  sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
198
198
  sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
199
199
  sky/skylet/subprocess_daemon.py,sha256=IJwGAzOdERrhWJS7VYKAUByNsLyIkKkB0w5nk06okG8,2818
@@ -228,7 +228,7 @@ sky/templates/jobs-controller.yaml.j2,sha256=Gu3ogFxFYr09VEXP-6zEbrCUOFo1aYxWEjA
228
228
  sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
229
229
  sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8wyVEbRNFHRsBuLsw,626
230
230
  sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
231
- sky/templates/kubernetes-ray.yml.j2,sha256=Wq9luXc6-t141uyHbtOy1IDmLMM0PBbePTZfZEtAKw0,18160
231
+ sky/templates/kubernetes-ray.yml.j2,sha256=dsWlkX-0b1igeZI4c0u0Jzia5I_9gezCiewR6pX1LlY,18374
232
232
  sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
233
233
  sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
234
234
  sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
@@ -243,11 +243,12 @@ sky/usage/constants.py,sha256=8xpg9vhDU9A3eObtpkNFjwa42oCazqGEv4yw_vJSO7U,590
243
243
  sky/usage/usage_lib.py,sha256=mxsbwUMEQjesUOIv4Yne-Ze7rVxSQYr3_wBXruifGRA,17898
244
244
  sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
245
245
  sky/utils/accelerator_registry.py,sha256=BO4iYH5bV80Xyp4EPfO0n1D3LL0FvESCy7xm59Je3_o,3798
246
- sky/utils/admin_policy_utils.py,sha256=zFCu1OFIrZRfQNY0JFRO1502WFfdqZhwAU_QgM4fO9U,5943
246
+ sky/utils/admin_policy_utils.py,sha256=_Vt_jTTYCXmMdryj0vrrumFPewa93qHnzUqBDXjAhRU,5981
247
247
  sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
248
- sky/utils/command_runner.py,sha256=seU7uX9CrxiC8WOWBKHW94m67-V6DYghqRXhYdUIdQI,35756
248
+ sky/utils/command_runner.py,sha256=GHTZxoJQ3V8WVSRAaOA4JpRTxtCtuq36H9U8kOfWUwc,36450
249
249
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
250
250
  sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
251
+ sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR26H5I,1416
251
252
  sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
252
253
  sky/utils/dag_utils.py,sha256=pVX3lGDDcYTcGoH_1jEWzl9767Y4mwlIEYIzoyHO6gM,6105
253
254
  sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
@@ -274,9 +275,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
274
275
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
275
276
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
276
277
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
277
- skypilot_nightly-1.0.0.dev20241107.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
- skypilot_nightly-1.0.0.dev20241107.dist-info/METADATA,sha256=ICnKtcpMVvZVf_1H6k63r29XgS_-heZ4BcgH-p5J5s4,19708
279
- skypilot_nightly-1.0.0.dev20241107.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
- skypilot_nightly-1.0.0.dev20241107.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
- skypilot_nightly-1.0.0.dev20241107.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
- skypilot_nightly-1.0.0.dev20241107.dist-info/RECORD,,
278
+ skypilot_nightly-1.0.0.dev20241109.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
279
+ skypilot_nightly-1.0.0.dev20241109.dist-info/METADATA,sha256=YM8C71GXOj5CoHQlj5yNYhL8UkZ75DL-qMMTPXCOmXY,19708
280
+ skypilot_nightly-1.0.0.dev20241109.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
281
+ skypilot_nightly-1.0.0.dev20241109.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
282
+ skypilot_nightly-1.0.0.dev20241109.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
283
+ skypilot_nightly-1.0.0.dev20241109.dist-info/RECORD,,