skypilot-nightly 1.0.0.dev20250114-py3-none-any.whl → 1.0.0.dev20250116-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '35f0cf4cf8fee06aadcac639740d25c7493b5534'
+ _SKYPILOT_COMMIT_SHA = 'e71e5a92ccd90a654662121d6f08c4e100377bbf'


  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250114'
+ __version__ = '1.0.0.dev20250116'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))


sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -35,7 +35,6 @@ from sky import jobs as managed_jobs
  from sky import optimizer
  from sky import provision as provision_lib
  from sky import resources as resources_lib
- from sky import serve as serve_lib
  from sky import sky_logging
  from sky import status_lib
  from sky import task as task_lib
@@ -4037,43 +4036,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                           f'{colorama.Style.RESET_ALL}')
          return {str(job_id): local_log_dir}

-     def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
-                         service_name: str, target: serve_lib.ServiceComponent,
-                         replica_id: Optional[int], follow: bool) -> None:
-         """Tail the logs of a service.
-
-         Args:
-             handle: The handle to the sky serve controller.
-             service_name: The name of the service.
-             target: The component to tail the logs of. Could be controller,
-                 load balancer, or replica.
-             replica_id: The replica ID to tail the logs of. Only used when
-                 target is replica.
-             follow: Whether to follow the logs.
-         """
-         if target != serve_lib.ServiceComponent.REPLICA:
-             code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                 service_name,
-                 stream_controller=(
-                     target == serve_lib.ServiceComponent.CONTROLLER),
-                 follow=follow)
-         else:
-             assert replica_id is not None, service_name
-             code = serve_lib.ServeCodeGen.stream_replica_logs(
-                 service_name, replica_id, follow)
-
-         signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-         signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-         self.run_on_head(
-             handle,
-             code,
-             stream_logs=True,
-             process_stream=False,
-             ssh_mode=command_runner.SshMode.INTERACTIVE,
-             stdin=subprocess.DEVNULL,
-         )
-
      def teardown_no_lock(self,
                           handle: CloudVmRayResourceHandle,
                           terminate: bool,
sky/clouds/service_catalog/kubernetes_catalog.py CHANGED
@@ -115,6 +115,16 @@ def _list_accelerators(

      If the user does not have sufficient permissions to list pods in all
      namespaces, the function will return free GPUs as -1.
+
+     Returns:
+         A tuple of three dictionaries:
+         - qtys_map: Dict mapping accelerator names to lists of InstanceTypeInfo
+           objects with quantity information.
+         - total_accelerators_capacity: Dict mapping accelerator names to their
+           total capacity in the cluster.
+         - total_accelerators_available: Dict mapping accelerator names to their
+           current availability. Returns -1 for each accelerator if
+           realtime=False or if insufficient permissions.
      """
      # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
      # function from kubernetes_utils.
@@ -243,6 +253,10 @@ def _list_accelerators(

          accelerators_available = accelerator_count - allocated_qty

+         # Initialize the entry if it doesn't exist yet
+         if accelerator_name not in total_accelerators_available:
+             total_accelerators_available[accelerator_name] = 0
+
          if accelerators_available >= min_quantity_filter:
              quantized_availability = min_quantity_filter * (
                  accelerators_available // min_quantity_filter)
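Note (illustrative, not part of the diff): the added guard above makes sure every accelerator name gets an entry in total_accelerators_available even when its free count falls below the requested quantity, so it is reported as 0 rather than omitted. A minimal standalone sketch of that pattern, with made-up names (summarize_availability, node_gpus) rather than the actual SkyPilot helpers:

from typing import Dict, List, Tuple

def summarize_availability(node_gpus: List[Tuple[str, int]],
                           min_quantity_filter: int) -> Dict[str, int]:
    """Aggregate free accelerators per type, quantized to the filter size."""
    total_available: Dict[str, int] = {}
    for acc_name, free in node_gpus:
        # Ensure the key exists even when this node contributes nothing,
        # mirroring the initialization added in the hunk above.
        if acc_name not in total_available:
            total_available[acc_name] = 0
        if free >= min_quantity_filter:
            total_available[acc_name] += (
                min_quantity_filter * (free // min_quantity_filter))
    return total_available

print(summarize_availability([('H100', 3), ('A100', 8)], min_quantity_filter=4))
# Prints {'H100': 0, 'A100': 8}: H100 shows up as 0 instead of being missing.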
sky/data/storage.py CHANGED
@@ -3968,7 +3968,7 @@ class OciStore(AbstractStore):

      def __init__(self,
                   name: str,
-                  source: str,
+                  source: Optional[SourceType],
                   region: Optional[str] = None,
                   is_sky_managed: Optional[bool] = None,
                   sync_on_reconstruction: Optional[bool] = True,
@@ -3980,13 +3980,53 @@ class OciStore(AbstractStore):
          self.compartment: str
          self.namespace: str

-         # Bucket region should be consistence with the OCI config file
-         region = oci.get_oci_config()['region']
+         # Region is from the specified name in <bucket>@<region> format.
+         # Another case is name can also be set by the source, for example:
+         #   /datasets-storage:
+         #       source: oci://RAGData@us-sanjose-1
+         # The name in above mount will be set to RAGData@us-sanjose-1
+         region_in_name = None
+         if name is not None and '@' in name:
+             self._validate_bucket_expr(name)
+             name, region_in_name = name.split('@')
+
+         # Region is from the specified source in oci://<bucket>@<region> format
+         region_in_source = None
+         if isinstance(source,
+                       str) and source.startswith('oci://') and '@' in source:
+             self._validate_bucket_expr(source)
+             source, region_in_source = source.split('@')
+
+         if region_in_name is not None and region_in_source is not None:
+             # This should never happen because name and source will never be
+             # the remote bucket at the same time.
+             assert region_in_name == region_in_source, (
+                 f'Mismatch region specified. Region in name {region_in_name}, '
+                 f'but region in source is {region_in_source}')
+
+         if region_in_name is not None:
+             region = region_in_name
+         elif region_in_source is not None:
+             region = region_in_source
+
+         # Default region set to what specified in oci config.
+         if region is None:
+             region = oci.get_oci_config()['region']
+
+         # So far from now on, the name and source are canonical, means there
+         # is no region (@<region> suffix) associated with them anymore.

          super().__init__(name, source, region, is_sky_managed,
                           sync_on_reconstruction, _bucket_sub_path)
          # TODO(zpoint): add _bucket_sub_path to the sync/mount/delete commands

+     def _validate_bucket_expr(self, bucket_expr: str):
+         pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$'
+         if not re.match(pattern, bucket_expr):
+             raise ValueError(
+                 'The format for the bucket portion is <bucket>@<region> '
+                 'when specify a region with a bucket.')
+
      def _validate(self):
          if self.source is not None and isinstance(self.source, str):
              if self.source.startswith('oci://'):
@@ -4137,7 +4177,8 @@ class OciStore(AbstractStore):
          sync_command = (
              'oci os object bulk-upload --no-follow-symlinks --overwrite '
              f'--bucket-name {self.name} --namespace-name {self.namespace} '
-             f'--src-dir "{base_dir_path}" {includes}')
+             f'--region {self.region} --src-dir "{base_dir_path}" '
+             f'{includes}')

          return sync_command

@@ -4157,8 +4198,8 @@ class OciStore(AbstractStore):
          sync_command = (
              'oci os object bulk-upload --no-follow-symlinks --overwrite '
              f'--bucket-name {self.name} --namespace-name {self.namespace} '
-             f'--object-prefix "{dest_dir_name}" --src-dir "{src_dir_path}" '
-             f'{excludes} ')
+             f'--region {self.region} --object-prefix "{dest_dir_name}" '
+             f'--src-dir "{src_dir_path}" {excludes}')

          return sync_command

@@ -4289,7 +4330,8 @@ class OciStore(AbstractStore):
          def get_file_download_command(remote_path, local_path):
              download_command = (f'oci os object get --bucket-name {self.name} '
                                  f'--namespace-name {self.namespace} '
-                                  f'--name {remote_path} --file {local_path}')
+                                  f'--region {self.region} --name {remote_path} '
+                                  f'--file {local_path}')

              return download_command

@@ -4346,6 +4388,7 @@ class OciStore(AbstractStore):
          @oci.with_oci_env
          def get_bucket_delete_command(bucket_name):
              remove_command = (f'oci os bucket delete --bucket-name '
+                               f'--region {self.region} '
                                f'{bucket_name} --empty --force')

              return remove_command

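Note (illustrative, not part of the diff): the OciStore changes above accept a region suffix on either the store name or an oci:// source (<bucket>@<region>) and fall back to the region in the OCI config. A rough standalone sketch of that precedence; resolve_bucket_and_region is a hypothetical helper, not the OciStore implementation:

import re
from typing import Optional, Tuple

# Same bucket-expression pattern as the validation added in the diff above.
_BUCKET_EXPR = re.compile(r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$')

def resolve_bucket_and_region(name: str,
                              source: Optional[str],
                              default_region: str) -> Tuple[str, str]:
    """Split '<bucket>@<region>' from name/source; fall back to the default."""
    region_in_name = region_in_source = None
    if '@' in name:
        assert _BUCKET_EXPR.match(name), name
        name, region_in_name = name.split('@')
    if source and source.startswith('oci://') and '@' in source:
        assert _BUCKET_EXPR.match(source), source
        source, region_in_source = source.split('@')
    # Name takes precedence, then source, then the configured default.
    region = region_in_name or region_in_source or default_region
    return name, region

print(resolve_bucket_and_region('RAGData@us-sanjose-1', None, 'us-ashburn-1'))
# Prints ('RAGData', 'us-sanjose-1')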
sky/jobs/controller.py CHANGED
@@ -256,9 +256,7 @@ class JobsController:
                      task.num_nodes == 1):
                  continue

-             if job_status in [
-                     job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-             ]:
+             if job_status in job_lib.JobStatus.user_code_failure_states():
                  # Add a grace period before the check of preemption to avoid
                  # false alarm for job failure.
                  time.sleep(5)
@@ -288,9 +286,7 @@
              if job_status is not None and not job_status.is_terminal():
                  # The multi-node job is still running, continue monitoring.
                  continue
-             elif job_status in [
-                     job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-             ]:
+             elif job_status in job_lib.JobStatus.user_code_failure_states():
                  # The user code has probably crashed, fail immediately.
                  end_time = managed_job_utils.get_job_timestamp(
                      self._backend, cluster_name, get_end_time=True)
@@ -493,6 +489,7 @@ def start(job_id, dag_yaml, retry_until_up):
      """Start the controller."""
      controller_process = None
      cancelling = False
+     task_id = None
      try:
          _handle_signal(job_id)
          # TODO(suquark): In theory, we should make controller process a
@@ -511,6 +508,7 @@ def start(job_id, dag_yaml, retry_until_up):
      except exceptions.ManagedJobUserCancelledError:
          dag, _ = _get_dag_and_name(dag_yaml)
          task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
+         assert task_id is not None, job_id
          logger.info(
              f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
          managed_job_state.set_cancelling(
@@ -542,6 +540,7 @@ def start(job_id, dag_yaml, retry_until_up):
          logger.info(f'Cluster of managed job {job_id} has been cleaned up.')

      if cancelling:
+         assert task_id is not None, job_id  # Since it's set with cancelling
          managed_job_state.set_cancelled(
              job_id=job_id,
              callback_func=managed_job_utils.event_callback_func(
sky/jobs/state.py CHANGED
@@ -620,10 +620,12 @@ def get_latest_task_id_status(
      id_statuses = _get_all_task_ids_statuses(job_id)
      if not id_statuses:
          return None, None
-     task_id, status = id_statuses[-1]
-     for task_id, status in id_statuses:
-         if not status.is_terminal():
-             break
+     task_id, status = next(
+         ((tid, st) for tid, st in id_statuses if not st.is_terminal()),
+         id_statuses[-1],
+     )
+     # Unpack the tuple first, or it triggers a Pylint's bug on recognizing
+     # the return type.
      return task_id, status

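Note (illustrative, not part of the diff): the rewrite above replaces a scan-and-break loop with next() over a generator expression plus a default, returning the first non-terminal task or, if all are terminal, the last one. The same pattern in isolation with toy data:

# Each tuple is (task_id, is_terminal).
id_statuses = [(0, True), (1, True), (2, False), (3, False)]

# First non-terminal entry, else the last entry overall.
task_id, terminal = next(
    ((tid, done) for tid, done in id_statuses if not done),
    id_statuses[-1],
)
print(task_id, terminal)  # Prints: 2 False

# With every task terminal, the default (last entry) is returned.
all_done = [(0, True), (1, True)]
print(next(((t, d) for t, d in all_done if not d), all_done[-1]))  # (1, True)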
sky/jobs/utils.py CHANGED
@@ -398,32 +398,15 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
          job_statuses = backend.get_job_status(handle, stream_logs=False)
          job_status = list(job_statuses.values())[0]
          assert job_status is not None, 'No job found.'
+         assert task_id is not None, job_id
+
          if job_status != job_lib.JobStatus.CANCELLED:
-             assert task_id is not None, job_id
-             if task_id < num_tasks - 1 and follow:
-                 # The log for the current job is finished. We need to
-                 # wait until next job to be started.
-                 logger.debug(
-                     f'INFO: Log for the current task ({task_id}) '
-                     'is finished. Waiting for the next task\'s log '
-                     'to be started.')
-                 # Add a newline to avoid the status display below
-                 # removing the last line of the task output.
-                 print()
-                 status_display.update(
-                     ux_utils.spinner_message(
-                         f'Waiting for the next task: {task_id + 1}'))
-                 status_display.start()
-                 original_task_id = task_id
-                 while True:
-                     task_id, managed_job_status = (
-                         managed_job_state.get_latest_task_id_status(
-                             job_id))
-                     if original_task_id != task_id:
-                         break
-                     time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
-                 continue
-             else:
+             if not follow:
+                 break
+
+             # Logs for retrying failed tasks.
+             if (job_status
+                     in job_lib.JobStatus.user_code_failure_states()):
                  task_specs = managed_job_state.get_task_specs(
                      job_id, task_id)
                  if task_specs.get('max_restarts_on_errors', 0) == 0:
@@ -436,15 +419,51 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                          ux_utils.spinner_message(
                              'Waiting for next restart for the failed task'))
                      status_display.start()
-                     while True:
-                         _, managed_job_status = (
-                             managed_job_state.get_latest_task_id_status(
-                                 job_id))
-                         if (managed_job_status !=
-                                 managed_job_state.ManagedJobStatus.RUNNING):
-                             break
+
+                     def is_managed_job_status_updated(
+                             status: Optional[managed_job_state.ManagedJobStatus]
+                     ) -> bool:
+                         """Check if local managed job status reflects remote
+                         job failure.
+
+                         Ensures synchronization between remote cluster
+                         failure detection (JobStatus.FAILED) and controller
+                         retry logic.
+                         """
+                         return (status !=
+                                 managed_job_state.ManagedJobStatus.RUNNING)
+
+                     while not is_managed_job_status_updated(
+                             managed_job_status :=
+                             managed_job_state.get_status(job_id)):
                          time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
                      continue
+
+             if task_id == num_tasks - 1:
+                 break
+
+             # The log for the current job is finished. We need to
+             # wait until next job to be started.
+             logger.debug(
+                 f'INFO: Log for the current task ({task_id}) '
+                 'is finished. Waiting for the next task\'s log '
+                 'to be started.')
+             # Add a newline to avoid the status display below
+             # removing the last line of the task output.
+             print()
+             status_display.update(
+                 ux_utils.spinner_message(
+                     f'Waiting for the next task: {task_id + 1}'))
+             status_display.start()
+             original_task_id = task_id
+             while True:
+                 task_id, managed_job_status = (
+                     managed_job_state.get_latest_task_id_status(job_id))
+                 if original_task_id != task_id:
+                     break
+                 time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+             continue
+
          # The job can be cancelled by the user or the controller (when
          # the cluster is partially preempted).
          logger.debug(
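Note (illustrative, not part of the diff): the new wait loop above uses an assignment expression (walrus operator) in the while condition so the most recent status is both tested and kept after the loop. A self-contained sketch of that polling shape; get_status and the canned statuses are stand-ins, not the managed-jobs API:

import itertools
import time

# Fake status stream: RUNNING twice, then FAILED forever.
_fake_statuses = itertools.chain(['RUNNING', 'RUNNING'], itertools.repeat('FAILED'))

def get_status() -> str:
    """Stand-in for the controller-side status lookup."""
    return next(_fake_statuses)

def is_status_updated(status: str) -> bool:
    """True once the tracked status is no longer RUNNING."""
    return status != 'RUNNING'

while not is_status_updated(status := get_status()):
    time.sleep(0.01)  # stands in for JOB_STATUS_CHECK_GAP_SECONDS
print(status)  # Prints FAILED; the walrus keeps the last polled value in scope.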
sky/serve/core.py CHANGED
@@ -1,6 +1,9 @@
  """SkyServe core APIs."""
  import re
+ import signal
+ import subprocess
  import tempfile
+ import threading
  from typing import Any, Dict, List, Optional, Tuple, Union

  import colorama
@@ -18,6 +21,7 @@ from sky.serve import serve_utils
  from sky.skylet import constants
  from sky.usage import usage_lib
  from sky.utils import admin_policy_utils
+ from sky.utils import command_runner
  from sky.utils import common_utils
  from sky.utils import controller_utils
  from sky.utils import resources_utils
@@ -731,8 +735,29 @@ def tail_logs(

      backend = backend_utils.get_backend_from_handle(handle)
      assert isinstance(backend, backends.CloudVmRayBackend), backend
-     backend.tail_serve_logs(handle,
-                             service_name,
-                             target,
-                             replica_id,
-                             follow=follow)
+
+     if target != serve_utils.ServiceComponent.REPLICA:
+         code = serve_utils.ServeCodeGen.stream_serve_process_logs(
+             service_name,
+             stream_controller=(
+                 target == serve_utils.ServiceComponent.CONTROLLER),
+             follow=follow)
+     else:
+         assert replica_id is not None, service_name
+         code = serve_utils.ServeCodeGen.stream_replica_logs(
+             service_name, replica_id, follow)
+
+     # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
+     # kill the process, so we need to handle it manually here.
+     if threading.current_thread() is threading.main_thread():
+         signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+         signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+     # Refer to the notes in
+     # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+     backend.run_on_head(handle,
+                         code,
+                         stream_logs=True,
+                         process_stream=False,
+                         ssh_mode=command_runner.SshMode.INTERACTIVE,
+                         stdin=subprocess.DEVNULL)
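Note (illustrative, not part of the diff): signal.signal() may only be called from the main thread, which is why the relocated log-streaming code above guards the handler registration. A minimal illustration with a dummy handler in place of backend_utils.interrupt_handler:

import signal
import threading

def _interrupt_handler(signum, frame):
    # Hypothetical stand-in for backend_utils.interrupt_handler.
    raise KeyboardInterrupt

# signal.signal() raises ValueError when called off the main thread, so the
# registration is guarded exactly like the tail_logs change above.
if threading.current_thread() is threading.main_thread():
    signal.signal(signal.SIGINT, _interrupt_handler)
    signal.signal(signal.SIGTSTP, _interrupt_handler)  # SIGTSTP is POSIX-only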
sky/serve/replica_managers.py CHANGED
@@ -998,9 +998,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                  # Re-raise the exception if it is not preempted.
                  raise
              job_status = list(job_statuses.values())[0]
-             if job_status in [
-                     job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-             ]:
+             if job_status in job_lib.JobStatus.user_code_failure_states():
                  info.status_property.user_app_failed = True
                  serve_state.add_or_update_replica(self._service_name,
                                                    info.replica_id, info)
sky/skylet/job_lib.py CHANGED
@@ -12,7 +12,7 @@ import signal
  import sqlite3
  import subprocess
  import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Sequence

  import colorama
  import filelock
@@ -162,13 +162,17 @@ class JobStatus(enum.Enum):
      def nonterminal_statuses(cls) -> List['JobStatus']:
          return [cls.INIT, cls.SETTING_UP, cls.PENDING, cls.RUNNING]

-     def is_terminal(self):
+     def is_terminal(self) -> bool:
          return self not in self.nonterminal_statuses()

-     def __lt__(self, other):
+     @classmethod
+     def user_code_failure_states(cls) -> Sequence['JobStatus']:
+         return (cls.FAILED, cls.FAILED_SETUP)
+
+     def __lt__(self, other: 'JobStatus') -> bool:
          return list(JobStatus).index(self) < list(JobStatus).index(other)

-     def colored_str(self):
+     def colored_str(self) -> str:
          color = _JOB_STATUS_TO_COLOR[self]
          return f'{color}{self.value}{colorama.Style.RESET_ALL}'

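Note (illustrative, not part of the diff): callers in this release now test membership against JobStatus.user_code_failure_states() instead of repeating the two-element list. A stripped-down sketch of the enum pattern, using a toy JobStatus rather than the full SkyPilot enum:

import enum
from typing import Sequence

class JobStatus(enum.Enum):
    # Toy subset of states; the real enum has more members.
    RUNNING = 'RUNNING'
    FAILED = 'FAILED'
    FAILED_SETUP = 'FAILED_SETUP'
    SUCCEEDED = 'SUCCEEDED'

    @classmethod
    def user_code_failure_states(cls) -> Sequence['JobStatus']:
        # Single source of truth for "the user's code crashed" states.
        return (cls.FAILED, cls.FAILED_SETUP)

job_status = JobStatus.FAILED_SETUP
if job_status in JobStatus.user_code_failure_states():
    print('user code failed')  # the membership check replaces ad-hoc lists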
sky/utils/db_utils.py CHANGED
@@ -4,11 +4,27 @@ import sqlite3
  import threading
  from typing import Any, Callable, Optional

+ # This parameter (passed to sqlite3.connect) controls how long we will wait to
+ # obtains a database lock (not necessarily during connection, but whenever it is
+ # needed). It is not a connection timeout.
+ # Even in WAL mode, only a single writer is allowed at a time. Other writers
+ # will block until the write lock can be obtained. This behavior is described in
+ # the SQLite documentation for WAL: https://www.sqlite.org/wal.html
+ # Python's default timeout is 5s. In normal usage, lock contention is very low,
+ # and this is more than sufficient. However, in some highly concurrent cases,
+ # such as a jobs controller suddenly recovering thousands of jobs at once, we
+ # can see a small number of processes that take much longer to obtain the lock.
+ # In contrived highly contentious cases, around 0.1% of transactions will take
+ # >30s to take the lock. We have not seen cases that take >60s. For cases up to
+ # 1000x parallelism, this is thus thought to be a conservative setting.
+ # For more info, see the PR description for #4552.
+ _DB_TIMEOUT_S = 60
+

  @contextlib.contextmanager
  def safe_cursor(db_path: str):
      """A newly created, auto-committing, auto-closing cursor."""
-     conn = sqlite3.connect(db_path)
+     conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
      cursor = conn.cursor()
      try:
          yield cursor
@@ -79,8 +95,6 @@ class SQLiteConn(threading.local):
      def __init__(self, db_path: str, create_table: Callable):
          super().__init__()
          self.db_path = db_path
-         # NOTE: We use a timeout of 10 seconds to avoid database locked
-         # errors. This is a hack, but it works.
-         self.conn = sqlite3.connect(db_path, timeout=10)
+         self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
          self.cursor = self.conn.cursor()
          create_table(self.cursor, self.conn)
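Note (illustrative, not part of the diff): the timeout passed to sqlite3.connect above is SQLite's busy timeout (how long a connection waits for the write lock), not a connect-time limit. A short standalone illustration, assuming a throwaway database and WAL journaling as described in the comment block:

import os
import sqlite3
import tempfile

_DB_TIMEOUT_S = 60  # mirrors the value added in db_utils.py

db_path = os.path.join(tempfile.mkdtemp(), 'state.db')

# timeout= makes this connection wait up to 60s for the write lock instead of
# immediately raising 'database is locked' under contention.
conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
# WAL lets readers proceed concurrently with the single active writer.
conn.execute('PRAGMA journal_mode=WAL;')
conn.execute('CREATE TABLE IF NOT EXISTS jobs (id INTEGER PRIMARY KEY, status TEXT)')
conn.execute("INSERT INTO jobs (status) VALUES ('PENDING')")
conn.commit()
print(conn.execute('SELECT COUNT(*) FROM jobs').fetchone()[0])  # Prints 1
conn.close()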
sky/utils/kubernetes/deploy_remote_cluster.sh CHANGED
@@ -93,11 +93,11 @@ cleanup_agent_node() {

  check_gpu() {
      local NODE_IP=$1
-     run_remote "$NODE_IP" "
-         if command -v nvidia-smi &> /dev/null; then
-             nvidia-smi --list-gpus | grep 'GPU 0'
-         fi
-     "
+     if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
+         return 0  # GPU detected
+     else
+         return 1  # No GPU detected
+     fi
  }

  # Pre-flight checks
skypilot_nightly-1.0.0.dev20250116.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: skypilot-nightly
- Version: 1.0.0.dev20250114
+ Version: 1.0.0.dev20250116
  Summary: SkyPilot: An intercloud broker for the clouds
  Author: SkyPilot Team
  License: Apache 2.0
@@ -351,6 +351,8 @@ Read the research:
  - [Sky Computing vision paper](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s02-stoica.pdf) (HotOS 2021)
  - [Policy for Managed Spot Jobs](https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao) (NSDI 2024)

+ SkyPilot was initially started at the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley and has since gained many industry contributors. Read more about the project's origin [here](https://docs.skypilot.co/en/latest/sky-computing.html).
+
  ## Support and Questions
  We are excited to hear your feedback!
  * For issues and feature requests, please [open a GitHub issue](https://github.com/skypilot-org/skypilot/issues/new).
skypilot_nightly-1.0.0.dev20250116.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
- sky/__init__.py,sha256=8Saq3iWLIsCns9jB3ywi08wxlNICyDj6uFxY4ej12OA,5944
+ sky/__init__.py,sha256=NOjVo4cLFc0FCj6F1rqnD041xOezBxAjAjjUQqKcaqE,5944
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
  sky/authentication.py,sha256=LXUDABKP1FJCS256xTTDJa40WXwHKF5x49S-4hZbD1M,21501
  sky/check.py,sha256=s8deMVL-k9y8gd519K7NWZc3DqWsEySwiAr0uH3Vvcc,9459
@@ -32,7 +32,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
  sky/backends/backend.py,sha256=iBs5gnMaaUoH2OIQ3xhAjWdrJWqj8T61Za9TGsBFpvQ,7515
  sky/backends/backend_utils.py,sha256=Eeew8YV0VYSYxozqzadNMZrjhEMjlE3yuzTRP7YSl50,137348
- sky/backends/cloud_vm_ray_backend.py,sha256=ANDYIisCZ-IKWHIdQ2-XoJzxaASaOZxVxdTBI4f2Yo0,247430
+ sky/backends/cloud_vm_ray_backend.py,sha256=rW8YHJsnYwefIXRdIAAiDWEh9NUV7GZ89pmT4iMq0zY,245876
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
  sky/backends/local_docker_backend.py,sha256=nSYCjms3HOPjPNOrcCqsUKm1WV3AAovRFjEQ7hcEXW4,17021
  sky/backends/wheel_utils.py,sha256=5BUzBqfYz7p1ME6_0PXGmcsAkLVb8NrFt317p7a4X8s,8278
@@ -68,7 +68,7 @@ sky/clouds/service_catalog/do_catalog.py,sha256=Cug2QaQlSN6nFhba7f1ksyzs6z0ICTj6
  sky/clouds/service_catalog/fluidstack_catalog.py,sha256=21-cvrYEYTIi7n3ZNF2e7_0QX-PF4BkhlVJUWQOvKrY,5059
  sky/clouds/service_catalog/gcp_catalog.py,sha256=jJEfWjZ4ItsE657LjIf9mruJVZERFegCD5Qtu20AFNc,24542
  sky/clouds/service_catalog/ibm_catalog.py,sha256=1iK0KvbI82U7sySb7chr-qm_16x3tTnZ6nIo7o76ouc,4493
- sky/clouds/service_catalog/kubernetes_catalog.py,sha256=D0DvhVlK6Z6HJcZHPOWqRNAbXgFaQOKUnS_xkmqzukA,12550
+ sky/clouds/service_catalog/kubernetes_catalog.py,sha256=449eTIw-ZIwliMWGPx6ENAYuX8nW2M4kO4mh5V3cea4,13268
  sky/clouds/service_catalog/lambda_catalog.py,sha256=2R-ccu63BbdvO6X80MtxiniA-jLewXb6I0Ye1rYD9fY,5302
  sky/clouds/service_catalog/oci_catalog.py,sha256=cyA6ZqwHGOKuPxUl_dKmFGdeWdQGMrvl_-o2MtyF998,8580
  sky/clouds/service_catalog/paperspace_catalog.py,sha256=MOlfoGRChjEwMzu4nRAho8DrIwwUJ3QlRzrMA1RLqvE,3789
@@ -94,15 +94,15 @@ sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
  sky/data/data_transfer.py,sha256=wixC4_3_JaeJFdGKOp-O5ulcsMugDSgrCR0SnPpugGc,8946
  sky/data/data_utils.py,sha256=HjcgMDuWRR_fNQ9gjuROi9GgPVvTGApiJwxGtdb2_UU,28860
  sky/data/mounting_utils.py,sha256=tJHBPEDP1Wg_r3oSGBwFhMDLnPCMPSFRz26O0QkDd0Y,14908
- sky/data/storage.py,sha256=07ccD5YaQ9j6R_zPkvNk7qXnW3awDkCn9V-Sx-KXGvo,201715
+ sky/data/storage.py,sha256=jOo3veWVL8JMTP2SVmcsXt-ZpfNbReWdOvEgoeCZIic,203768
  sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
  sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
  sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
- sky/jobs/controller.py,sha256=DDt92Sa0TV3VULnEyM5QopUowciH6PE9u0yTDumFatM,28538
+ sky/jobs/controller.py,sha256=nJKd_5cPJ14RPm_Gg-JgPquj5W9zfVQ1m6hJxxTKNpE,28577
  sky/jobs/core.py,sha256=AVbboohNCUDqfK_7DDkc-wJOg87nE7L6Vw0wbPTelIA,20022
  sky/jobs/recovery_strategy.py,sha256=eP9CLy5qiNTyMJTWWzAxdQ4YolUZWL1g3cLMH7tw8Es,27312
- sky/jobs/state.py,sha256=1NeW0SVtfVd02MnS9OzvV-OV9Plch8QLH-ZZnttaLCg,27598
- sky/jobs/utils.py,sha256=G-3f0qxJEep4Rl52UxnXLcVmjt2uLYn0qUja1pClwmw,39031
+ sky/jobs/state.py,sha256=CaOzoU0mPiXwioyupXol0XsNJsvDC8ApgDyKKE_fIRs,27694
+ sky/jobs/utils.py,sha256=0HlO8H1hzTr40XK7xJXseMoeIMQYA01qVuPAuEQFgAE,39596
  sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
  sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
@@ -190,10 +190,10 @@ sky/serve/__init__.py,sha256=Bqw8nB9u1QF3ryjbV797SPZq0DWAcjT94E_5B8J24ag,1808
  sky/serve/autoscalers.py,sha256=N7yRGT9Ay5_yJUOkqaBGC7jG3eIdzA5d66i8kskGxZc,30351
  sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
  sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
- sky/serve/core.py,sha256=j2pyYi_DPHndVe-lQ_WdLaI0_NBgH3tHosi8vV6fCBg,32303
+ sky/serve/core.py,sha256=UAbbnxmOZ8GBT7vaeFvtFC7_qXu05TFsNIFcLrdT3Oo,33341
  sky/serve/load_balancer.py,sha256=nNvDPJPRIrBc_qsBYJz1zzKa_fXDgfi0VDUf4SJEuW8,12990
  sky/serve/load_balancing_policies.py,sha256=XVj76qBgqh7h6wfx53RKQFzBefDWTE4TCdCEtFLLtI4,5398
- sky/serve/replica_managers.py,sha256=mNlIOdyd1Mo_PTGazHOQHA-Ql778TdDHZQ7V1yTMSiY,57764
+ sky/serve/replica_managers.py,sha256=SFvK7ewilc3NVRcqXg63WtU1WmhJKPtJd27JfKR2aow,57716
  sky/serve/serve_state.py,sha256=MAx63zlGOXaIgXedP9fUFlRxDKiez1shmyMetrJK6yQ,19756
  sky/serve/serve_utils.py,sha256=WgPcqEw3WyMOdgRTFg8DSsWyIG1xnRbRkI1-f09tNKg,39741
  sky/serve/service.py,sha256=7bvK9R9D48PZSYcOKSievXQ2mHUMk1d3AAIxtra7WOI,12083
@@ -208,7 +208,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
  sky/skylet/constants.py,sha256=1h5nhXsAvryo9THpfQ0wQKPSDjXcY9GeN6oX378yAyM,16021
  sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
- sky/skylet/job_lib.py,sha256=9YO4N_0cSn4Pp8nia1iTGESWUd1eO06H4vvjr-s0UCE,43840
+ sky/skylet/job_lib.py,sha256=Vp8rpRYioKYEwV9NkMArMfw-uOUaJMy8eQ2sUwZy-Kc,44014
  sky/skylet/log_lib.py,sha256=fcQzEe4OK8exsNVBhbdYe4uIq2cdSHszsKZTtX8a3-Q,20453
  sky/skylet/log_lib.pyi,sha256=VpA_VoL970Noj-YrBkKqLxFi34JVMY7KLrOQ3o4AqEI,4336
  sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -264,7 +264,7 @@ sky/utils/common_utils.py,sha256=Kh0iymQl9I4HXxYSc3TTcv-xeso27pU_1hGNOc9Xw2o,253
  sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR26H5I,1416
  sky/utils/controller_utils.py,sha256=g4wvp6BrXUcwjRbMvy_LBtZPMPOzHXeRWyEoXORoZrU,44381
  sky/utils/dag_utils.py,sha256=R1yhJssvzDg13p6PJIC8OkYFBiR64eIx5xQeRpAG9n4,6099
- sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
+ sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
  sky/utils/env_options.py,sha256=E5iwRFBUY2Iq6e0y0c1Mv5OSQ4MRNdk0-p38xUyVerc,1366
  sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
  sky/utils/log_utils.py,sha256=xEbUZfDiIiZkyWoLHXwIcqVMCBDEENsLCiogEXMDLt0,14139
@@ -280,7 +280,7 @@ sky/utils/cli_utils/status_utils.py,sha256=2HrH6IBJCJ__AbuZ0ooIEgarBKIVIA5M3myE5
  sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnvvb0SHUFno,7773
  sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
- sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=LzrhZbZ8W399U_0IktTi3Elb0w91oq267e4Wk5oUfb4,8471
+ sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=dj_q2LHFgq03bXJWJhtMFFWOpcWnWAYKfFQyMv7Gr5A,8551
  sky/utils/kubernetes/generate_kind_config.py,sha256=_TNLnifA_r7-CRq083IP1xjelYqiLjzQX9ohuqYpDH8,3187
  sky/utils/kubernetes/generate_kubeconfig.sh,sha256=MBvXJio0PeujZSCXiRKE_pa6HCTiU9qBzR1WrXccVSY,10477
  sky/utils/kubernetes/gpu_labeler.py,sha256=4px7FyfsukacPEvKwTLUNb3WwacMIUrHWjP93qTi3kE,6998
@@ -288,9 +288,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
  sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
- skypilot_nightly-1.0.0.dev20250114.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
- skypilot_nightly-1.0.0.dev20250114.dist-info/METADATA,sha256=NRUZuDh7YqD-D3dPuCRRLGK19a88kgP5BQr-UOkJHcQ,20632
- skypilot_nightly-1.0.0.dev20250114.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- skypilot_nightly-1.0.0.dev20250114.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
- skypilot_nightly-1.0.0.dev20250114.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
- skypilot_nightly-1.0.0.dev20250114.dist-info/RECORD,,
+ skypilot_nightly-1.0.0.dev20250116.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+ skypilot_nightly-1.0.0.dev20250116.dist-info/METADATA,sha256=8MvuzpqLuZac2S1Dx80sAsED2p3DgJo6EYa3YpI4GSU,20884
+ skypilot_nightly-1.0.0.dev20250116.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ skypilot_nightly-1.0.0.dev20250116.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+ skypilot_nightly-1.0.0.dev20250116.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+ skypilot_nightly-1.0.0.dev20250116.dist-info/RECORD,,